# Blog page banner (not part of the program): 1479 characters in total; estimated reading time 4 minutes.
import re
import requests
from lxml import etree
class Spider(object):
    """Scrape AcWing problem listings and print the problems marked as passed.

    The spider requests a sequence of listing URLs built from ``base_url``,
    extracts the link text of every table row flagged as passed, prints each
    name and keeps a running total in ``count``.
    """

    def __init__(self):
        # URL template; ``{}`` is filled with each value that ``run`` iterates
        # over (presumably the listing page number -- confirm against the site).
        self.base_url = 'https://www.acwing.com/problem/{}/'
        # Headers sent with every request.
        # NOTE(review): the Cookie below embeds a hard-coded csrftoken and
        # sessionid. Credentials should not live in source; consider loading
        # them from the environment or a config file.
        self.headers = {
            'Cookie': (
                'csrftoken=mixU7wxaV35yyyCDhqbXcIoW3z3Ms0NH31jbbqH; sessionid='
                '344bo4nowvp9misa9suynjiwz2i5jcof; file_2922585_readed=""; '
                'file_2302034_readed=""'
            ),
            'Referer': 'https://www.acwing.com/about/',
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                ' (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
            ),
        }
        # Number of passed problems printed so far; incremented by ``run``.
        self.count = 0

    def get_html(self, url, timeout=10):
        """Fetch *url* using the spider's headers and return the body text.

        Args:
            url: Address to request.
            timeout: Seconds handed to ``requests.get`` so that a stalled
                connection cannot block the crawl forever.

        Returns:
            The decoded response body (``response.text``).

        Exceptions raised by ``requests`` are not caught here and propagate
        to the caller.
        """
        response = requests.get(url=url, headers=self.headers, timeout=timeout)
        return response.text

    def xpath_func(self, html):
        """Return the stripped link texts of all rows marked as passed.

        Args:
            html: HTML source of one listing page.

        Returns:
            A list of strings, one per matching ``<a>`` text node; an empty
            list when *html* is empty or contains only whitespace.
        """
        # An empty page has no rows to report; short-circuit instead of
        # handing an empty document to the parser.
        if not html or not html.strip():
            return []
        # normalize-space() makes the title comparison insensitive to padding
        # spaces, so both ' 已通过这道题目 ' and '已通过这道题目' match.
        name_bds = (
            '//tbody/tr[./td/span[normalize-space(@title)="已通过这道题目"]]'
            '/td/a/text()'
        )
        base_obj = etree.HTML(html)
        return [name.strip() for name in base_obj.xpath(name_bds)]

    def re_func(self, html, re_bds):
        """Return every match of the regular expression *re_bds* in *html*.

        The pattern is applied with ``re.S`` so that ``.`` also matches
        newlines. The result is whatever ``findall`` yields for the pattern
        (strings, or tuples when the pattern has several groups).
        """
        return re.findall(re_bds, html, re.S)

    def parse_html(self, url):
        """Download *url* and return the passed-problem names found on it."""
        return self.xpath_func(self.get_html(url))

    def run(self, pages=range(1, 80)):
        """Ask for confirmation, then crawl *pages* and print the results.

        Args:
            pages: Iterable of values substituted into ``base_url``. Defaults
                to ``range(1, 80)``, i.e. 1 through 79.

        Anything other than an exact ``'Y'`` answer aborts without making any
        request. Each printed name increments ``self.count``; the total is
        printed after the last page.
        """
        warning = input('您马上就要爬取 acwing 了,看一下你的做题数,您的劳动成果将会在上面展现进去,确定要看吗?(Y/N)')
        if warning != 'Y':
            print('曾经退出,你这个弱者')
            return

        print('爬虫零碎曾经启动... 正在致力抓取,请稍等....')
        print('+---------------------------------+')
        print('| name |')
        print('+---------------------------------+')
        for page in pages:
            for problem_name in self.parse_html(self.base_url.format(page)):
                self.count += 1
                print('|' + problem_name)
        print('+---------------------------------+')
        print('通过您的不懈努力,您一共做了' + str(self.count) + '道题,持续致力!!')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider = Spider()
    spider.run()
# End of article (blog page marker, not part of the program).