import re
import requests
from lxml import etree
class Spider(object):
    """Scrape AcWing problem-list pages and print the problems marked as solved.

    Each page is fetched with the headers configured in ``__init__``; table
    rows containing a ``span`` titled "已通过这道题目" are treated as solved,
    and the link text of those rows is printed and counted.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the whole run.
    request_timeout = 10

    def __init__(self):
        """Set up the URL template, the request headers and the solved counter."""
        self.base_url = 'https://www.acwing.com/problem/{}/'
        # NOTE(review): the session cookie below is hard-coded in source. It
        # identifies one specific login and will expire; consider loading it
        # from configuration or the environment instead.
        self.headers = {
            'Cookie': 'csrftoken=mixU7wxaV35yyyCDhqbXcIoW3z3Ms0NH31jbbqH; sessionid='
                      '344bo4nowvp9misa9suynjiwz2i5jcof; file_2922585_readed=""; file_2302034_readed=""',
            'Referer': 'https://www.acwing.com/about/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        }
        # Running total of solved problems printed by run().
        self.count = 0

    def get_html(self, url):
        """Fetch ``url`` with the configured headers and return the body text.

        Raises whatever ``requests`` raises on connection failure or when the
        server does not answer within ``request_timeout`` seconds.
        """
        response = requests.get(
            url=url,
            headers=self.headers,
            timeout=self.request_timeout,
        )
        return response.text

    def xpath_func(self, html):
        """Return the stripped link texts of all rows marked as solved in ``html``."""
        name_bds = '//tbody/tr[./td/span[@title="已通过这道题目"]]/td/a/text()'
        base_obj = etree.HTML(html)
        return [name.strip() for name in base_obj.xpath(name_bds)]

    def re_func(self, html, re_bds):
        """Return every match of the pattern ``re_bds`` in ``html``.

        The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
        """
        return re.compile(re_bds, re.S).findall(html)

    def parse_html(self, url):
        """Download ``url`` and return the list of solved problem names on it."""
        return self.xpath_func(self.get_html(url))

    def run(self):
        """Ask for confirmation, then print and count the solved problems.

        Only the exact answer ``Y`` starts the crawl; any other input prints
        the exit message and returns. The crawl formats the integers 1
        through 79 into ``base_url`` and accumulates the total in
        ``self.count``.
        """
        warning = input('您马上就要爬取acwing了,看一下你的做题数,您的劳动成果将会在上面展现进去,确定要看吗?(Y/N)')
        if warning != 'Y':
            print('曾经退出,你这个弱者')
            return

        print('爬虫零碎曾经启动...正在致力抓取,请稍等....')
        print('+---------------------------------+')
        print('| name |')
        print('+---------------------------------+')
        for page in range(1, 80):
            url = self.base_url.format(page)
            for title in self.parse_html(url):
                self.count += 1
                print('| ' + title)
        print('+---------------------------------+')
        print('通过您的不懈努力,您一共做了' + str(self.count) + '道题,持续致力!!')
if __name__ == '__main__':
    # Start the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    spider = Spider()
    spider.run()