download:极客大学-云原生训练营
# coding=utf8
class UrlManager(object):
    """URL manager: tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        # Pending URLs, not yet handed out for download.
        self.new_urls = set()
        # URLs already handed out; used for de-duplication.
        self.old_urls = set()

    # Add a single new URL (internal helper).
    def _add_new_url(self, url):
        """Queue *url* unless it is None or already known (pending or done)."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Add a batch of URLs.
    def add_new_urls(self, urls):
        """Queue every URL in *urls*; a None or empty batch is a no-op."""
        if urls is None or len(urls) == 0:
            return
        for candidate in urls:
            self._add_new_url(candidate)

    # Is there any pending URL?
    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return len(self.new_urls) != 0

    # Hand out the next pending URL.
    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it as crawled, and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked
URL下载模块,负责把网页的内容下载下来,使用urllib2库进行下载。
# coding=utf8
import urllib2
class HtmlDownloader(object):
    """Page downloader built on urllib2 (Python 2)."""

    def download(self, url):
        """Fetch *url* and return the response body as a string.

        Returns None when *url* is None or the HTTP status is not 200.
        """
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # BUG FIX: the original wrote `if response.getcode == 200: return None`,
        # which compares the bound method object to 200 (always False) and, had
        # the call been intended, would have returned None exactly on success.
        # Call getcode() and bail out only on non-200 statuses.
        if response.getcode() != 200:
            return None
        return response.read()
URL解析模块，使用 BeautifulSoup 把网页中的其余链接和网页简介解析出来。
# coding: utf-8
import urlparse
from bs4 import BeautifulSoup
import re
class HtmlParser(object):
    """Parses a downloaded page: extracts follow-up links and a data record."""

    def parser(self, page_url, html_content):
        """Parse *html_content* fetched from *page_url*.

        Returns a (new_urls, new_data) tuple, or None (bare return) when
        either argument is None.
        """
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of item links ('/item/...') found in *soup*."""
        found = set()
        for anchor in soup.find_all('a', href=re.compile(r'/item/')):
            # Resolve relative hrefs against the page that linked to them.
            found.add(urlparse.urljoin(page_url, anchor["href"]))
        return found

    def _get_new_data(self, page_url, soup):
        """Extract the entry title and summary into a dict.

        NOTE(review): assumes the markup of the target encyclopedia pages
        (e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>); both
        find() calls raise AttributeError if the nodes are absent — confirm
        against the crawled site.
        """
        res_data = {}
        res_data['url'] = page_url
        # Title node.
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # Summary paragraph.
        summary_node = soup.find("div", class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
输出模块,负责把解析的网页内容保存起来。
# coding=utf-8
class HtmlOutputer(object):
    """Collects parsed page records and writes them out as an HTML report."""

    def __init__(self):
        # Accumulated records; each is a dict with 'url', 'title', 'summary'.
        self.datas = []

    def collect_data(self, data):
        """Store one parsed record; a None record is silently ignored."""
        if data is None:
            return
        self.datas.append(data)

    def outpute_html(self):
        """Write every collected record to 'outputer.html'.

        BUG FIX: the original opened the file and never closed it; a
        with-block guarantees the handle is released even on error.
        NOTE(review): the surrounding markup strings appear to have been
        stripped to "" during copy/paste — only the data values survive;
        confirm the intended HTML tags against the original source.
        """
        with open("outputer.html", 'w') as fout:
            fout.write("")
            fout.write("")
            fout.write("")
            for data in self.datas:
                fout.write("")
                fout.write("%s" % data['url'])
                fout.write("%s" % data['title'].encode('utf-8'))
                fout.write("%s" % data['summary'].encode('utf-8'))
                fout.write("")
            fout.write("")
            fout.write("")
            fout.write("")