# -*- coding: utf-8 -*-
import requests
from pyquery import PyQuery as pq
from goose import Goose
from goose.text import StopWordsChinese
import json
import time
class ItSlaw(object):
    """Keyword search client for the itslaw.com ``caseFiles`` JSON API.

    Typical use: call ``reset(keyword)``, then ``fetch()``; the entries
    collected by the last ``fetch()`` are also available from
    ``get_result()``.
    """

    # Prefix prepended to a case's relative path to build its page URL.
    BASE_URL = 'http://www.itslaw.com/'

    # NOTE(review): the response layout is NOT established anywhere in this
    # file. The search response is assumed to carry its hits as a list of
    # dicts (each with 'id' and 'title') under the key path below, and a
    # case page is assumed to live at the relative path template below.
    # Confirm both against a live response before relying on them.
    CASE_LIST_PATH = ('data', 'searchResult', 'judgements')
    CASE_PATH_TEMPLATE = 'detail?judgementId={case_id}'

    # Proxy mapping used when the caller does not supply one. The host is
    # given without scheme or port, exactly as the original code had it.
    DEFAULT_PROXIES = {"http": "14.111.148.1"}

    # Seconds to wait for the search endpoint before giving up; without a
    # timeout a stalled connection would block fetch() indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, proxies=None, delay=3):
        """Set up the URL template, request headers and HTTP session.

        Args:
            proxies: optional requests-style proxy mapping; when omitted,
                a copy of ``DEFAULT_PROXIES`` is used.
            delay: seconds to sleep before each search request.
        """
        self.url = 'http://www.itslaw.com/api/v1/caseFiles?startIndex=0&countPerPage=20&sortType=1&conditions=searchWord+{keyword}+1+{keywordcopy}'
        # Browser-like headers sent with the search request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Host": "www.itslaw.com",
            "If-Modified-Since": "Mon, 26 Jul 1997 05:00:00 GMT",
            "Pragma": "no-cache",
            "Referer": "http://www.itslaw.com",
        }
        self.result = None
        self.keyword = None
        if proxies is None:
            proxies = dict(self.DEFAULT_PROXIES)
        self.proxies = proxies
        self.delay = delay
        self.session = requests.Session()

    def reset(self, keyword):
        """Store a new search keyword and discard any previous result."""
        self.keyword = keyword
        self.result = None

    def _search_url(self):
        """Return the search URL with the current keyword filled in.

        Raises:
            ValueError: if no keyword has been set via ``reset``.
        """
        if not self.keyword:
            raise ValueError('no keyword set; call reset(keyword) before fetch()')
        # Substitute the keyword's VALUE; the template needs it twice.
        return self.url.format(keyword=self.keyword,
                               keywordcopy=self.keyword)

    def _parse_cases(self, payload):
        """Extract ``(title, relative_path)`` pairs from a decoded response.

        Walks ``CASE_LIST_PATH`` through ``payload``; any deviation from
        the assumed layout (see the NOTE on the class constants) yields an
        empty list rather than an exception. Entries that are not dicts or
        that lack an ``'id'`` are skipped.
        """
        node = payload
        for key in self.CASE_LIST_PATH:
            if not isinstance(node, dict):
                return []
            node = node.get(key)
        if not isinstance(node, list):
            return []

        cases = []
        for entry in node:
            if not isinstance(entry, dict) or entry.get('id') is None:
                continue
            relative_path = self.CASE_PATH_TEMPLATE.format(case_id=entry['id'])
            cases.append((entry.get('title'), relative_path))
        return cases

    def fetch(self):
        """Run the search for the current keyword and collect case texts.

        Sleeps ``self.delay`` seconds, requests the search URL with the
        configured headers and proxies, then extracts the cleaned article
        text of every case found using Goose with Chinese stop words.

        Returns:
            A list of dicts with keys ``'title'``, ``'url'`` (the full
            case page URL) and ``'content'``; empty when the response
            holds no recognizable cases. The same list is stored on
            ``self.result``.

        Raises:
            ValueError: if no keyword has been set via ``reset``.
        """
        search_url = self._search_url()
        time.sleep(self.delay)
        r = self.session.get(search_url,
                             headers=self.headers,
                             proxies=self.proxies,
                             timeout=self.REQUEST_TIMEOUT)
        print(r.status_code)
        print('@@' * 20)

        try:
            payload = json.loads(r.text)
        except ValueError:
            # An error page or other non-JSON body: report it and fall
            # through to an empty result instead of crashing mid-fetch.
            print('itslaw: response body is not valid JSON')
            payload = None

        res = []
        cases = self._parse_cases(payload)
        if cases:
            # One extractor instance is enough for all articles.
            g = Goose({'stopwords_class': StopWordsChinese})
            for title, relative_path in cases:
                completed_url = self.BASE_URL + relative_path
                article = g.extract(url=completed_url)
                res.append({'title': title,
                            'url': completed_url,
                            'content': article.cleaned_text})
        self.result = res
        return self.result

    def get_result(self):
        """Return the result of the last ``fetch()`` (``None`` after ``reset``)."""
        return self.result
if __name__ == '__main__':
    # Demo run: search one company name and print whatever was collected.
    crawler = ItSlaw()
    crawler.reset('九州通医药集团股份有限公司')
    crawler.fetch()
    print(crawler.get_result())