# -*- coding: utf-8 -*-
"""Query the itslaw.com case-file search API and extract page text with Goose."""
import json
import time

import requests
from goose import Goose
from goose.text import StopWordsChinese
from pyquery import PyQuery as pq


class ItSlaw(object):
    """Small client around the itslaw.com ``caseFiles`` search endpoint.

    Typical use: ``reset(keyword)``, then ``fetch()``, then ``get_result()``.
    """

    # Proxy mapping used when the caller does not supply one.
    DEFAULT_PROXIES = {"http": "14.111.148.1"}
    # Seconds to wait for the search request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, proxies=None):
        """Set up the URL template, request headers and HTTP session.

        Args:
            proxies: optional ``requests``-style proxy mapping. When omitted,
                a copy of ``DEFAULT_PROXIES`` is used.
        """
        self.url = 'http://www.itslaw.com/api/v1/caseFiles?startIndex=0&countPerPage=20&sortType=1&conditions=searchWord+{keyword}+1+{keywordcopy}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Host": "www.itslaw.com",
            "If-Modified-Since": "Mon, 26 Jul 1997 05:00:00 GMT",
            "Pragma": "no-cache",
            "Referer": "http://www.itslaw.com",
        }
        self.result = None
        self.keyword = None
        self.session = requests.Session()
        if proxies is None:
            # Copy so that mutating one instance never affects the class default.
            proxies = dict(self.DEFAULT_PROXIES)
        self.proxies = proxies

    def reset(self, keyword):
        """Store a new search keyword and discard any previous result."""
        self.keyword = keyword
        self.result = None

    def fetch(self):
        """Run the search request and extract one article with Goose.

        Returns:
            A one-element list holding a dict with the keys ``'title'``,
            ``'url'`` (the search API URL that was requested) and
            ``'content'`` (Goose's cleaned text). The same list is stored
            on ``self.result``.

        Raises:
            ValueError: if no keyword has been set via ``reset``.
        """
        if self.keyword is None:
            raise ValueError('no keyword set; call reset(keyword) before fetch()')

        # Substitute the actual keyword (the previous code interpolated the
        # literal text 'self.keyword').
        url = self.url.format(keyword=self.keyword, keywordcopy=self.keyword)
        res = []

        # Fixed pause before hitting the API (presumably to throttle
        # requests -- confirm with the site's usage limits).
        time.sleep(3)
        r = self.session.get(
            url,
            headers=self.headers,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(r.status_code)
        print('@@' * 20)

        # NOTE(review): the path appended here is the literal string 'url',
        # not a value taken from the search response. It was presumably meant
        # to be a per-case link, but the response schema is not visible in
        # this file -- TODO confirm and replace.
        completed_url = 'http://www.itslaw.com/' + 'url'
        g = Goose({'stopwords_class': StopWordsChinese})
        article = g.extract(url=completed_url)
        content = article.cleaned_text
        # Use the title Goose extracted; ``title`` was previously an
        # undefined name and raised NameError on every call.
        title = article.title
        res.append({'title': title, 'url': url, 'content': content})
        self.result = res
        return self.result

    def get_result(self):
        """Return the result of the last ``fetch`` (``None`` if none yet)."""
        return self.result


def main():
    """Run a sample search and print whatever was collected."""
    search = ItSlaw()
    search.reset('九州通医药集团股份有限公司')
    search.fetch()
    info = search.get_result()
    print(info)


if __name__ == '__main__':
    main()