The URL manager module keeps track of the URLs that are waiting to be crawled and the URLs that have already been crawled.

# coding=utf-8

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    # add a single new URL
    def _add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # add URLs in batch
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self._add_new_url(url)

    # check whether there are still URLs to crawl
    def has_new_url(self):
        return len(self.new_urls) != 0

    # fetch a new URL and mark it as crawled
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
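
A minimal sketch of how the manager is typically driven. The module filename url_manager.py and the seed URL are assumptions, not part of the original code:

# sketch only: assumes the UrlManager class above lives in url_manager.py
from url_manager import UrlManager

manager = UrlManager()
manager.add_new_urls(["https://baike.baidu.com/item/Python"])  # seed URL, placeholder
while manager.has_new_url():
    url = manager.get_new_url()   # moves the URL from new_urls to old_urls
    print(url)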

The URL download module is responsible for downloading the content of a web page; it uses the urllib2 library (the code targets Python 2).

# coding=utf-8

import urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # only return the page content when the request succeeded
        if response.getcode() != 200:
            return None
        return response.read()
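
A quick usage sketch. The module filename html_downloader.py and the entry URL are only assumptions used for illustration:

# sketch only: assumes the class above lives in html_downloader.py
from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download("https://baike.baidu.com/item/Python")  # example entry URL
if html is not None:
    print(len(html))  # size of the downloaded page in bytes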

The URL parsing module uses BeautifulSoup to extract the remaining entry links and the page summary from the downloaded page.

# coding: utf-8

import urlparse
from bs4 import BeautifulSoup
import re


class HtmlParser(object):

    def parser(self, page_url, html_content):
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    # collect the other entry links (<a href="/item/...">) found on the page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r'/item/'))
        for link in links:
            new_url = link["href"]
            new_url_full = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_url_full)
        return new_urls

    # extract the entry title and summary
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # title node: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # summary node: <div class="lemma-summary">
        summary_node = soup.find("div", class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
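
A small sketch of feeding a downloaded page into the parser. The module filenames html_downloader.py / html_parser.py and the entry URL are assumptions:

# sketch only: download one entry page and parse it
from html_downloader import HtmlDownloader
from html_parser import HtmlParser

page_url = "https://baike.baidu.com/item/Python"   # example entry URL
html = HtmlDownloader().download(page_url)
new_urls, new_data = HtmlParser().parser(page_url, html)
print(len(new_urls))        # number of /item/ links found on the page
print(new_data['title'])    # entry title extracted from the page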

The output module is responsible for saving the parsed page content.

# coding=utf-8

class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    # collect one parsed record
    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # write the collected records into an HTML table
    def outpute_html(self):
        fout = open("outputer.html", 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
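
The four modules above need a small driver to wire them together. The sketch below is one possible main loop, not the original scheduler: the module filenames, the SpiderMain/craw names, the seed URL, and the crawl limit are all assumptions.

# coding=utf-8
# sketch only: a possible main loop tying the four modules together
from url_manager import UrlManager
from html_downloader import HtmlDownloader
from html_parser import HtmlParser
from html_outputer import HtmlOutputer


class SpiderMain(object):

    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url, max_count=10):
        self.urls.add_new_urls([root_url])
        count = 0
        while self.urls.has_new_url() and count < max_count:
            try:
                new_url = self.urls.get_new_url()
                html_content = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parser(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                count += 1
            except Exception:
                pass  # skip pages that fail to download or parse
        self.outputer.outpute_html()


if __name__ == "__main__":
    SpiderMain().craw("https://baike.baidu.com/item/Python")  # seed URL is an example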