First, the link, so everyone can see whether it suits their taste:
Industrial Control System Vulnerabilities (http://ics.cnvd.org.cn/)
As you can see, the page is static HTML, which makes the problem very simple:
all we need to do is fetch each page with requests.
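Because the page is static, you can sanity-check the approach in a few lines before writing the full crawler. A minimal sketch (the stripped-down User-Agent here is just a placeholder; the XPath is the same one the full script below uses):

import requests
from lxml import etree

resp = requests.get('http://ics.cnvd.org.cn/?max=20&offset=0',
                    headers={'User-Agent': 'Mozilla/5.0'})
doc = etree.HTML(resp.text)
# each row of the list table links to one vulnerability detail page
print(doc.xpath('//tbody[@id="tr"]/tr/td/a/@href')[:3])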
Without further ado, here's the full script:
import requests
from urllib.parse import urlencode
from lxml import etree
import time
import xlwt  # write .xls
import xlrd  # read the previous .xls back for incremental updates
def makeurl():
    # list pages look like: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20'
    }
    for page in range(MAX_PAGE):
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is', url)
        yield url
def get_page_urllist(url):
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def parse_urllist(content):
    html = etree.HTML(content)
    for li in html.xpath('//tbody[@id="tr"]/tr'):
        yield li.xpath('td/a/@href')[0]
def get_page(url):
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def parse_page(content, url):
    html = etree.HTML(content)
    item = {}
    item['url'] = url
    item['标题'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])
    # each field lives in the cell right after the <td> whose text is the field label
    item['CNVD_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')])
    item['公开日期'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="公开日期"]/following-sibling::*[1]//text()')])
    item['危害级别'] = ''.join([i.strip().replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '') for i in
                            html.xpath('//tbody/tr/td[text()="危害级别"]/following-sibling::*[1]//text()')])
    item['影响产品'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="影响产品"]/following-sibling::*[1]//text()')])
    try:
        item['BUGTRAQ_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="BUGTRAQ ID"]/following-sibling::*[1]//text()')])
    except:
        item['BUGTRAQ_ID'] = ''
    item['CVE_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//text()')]) + ' ' + \
                     ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//@href')])
    item['漏洞描述'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞描述"]/following-sibling::*[1]//text()')])
    item['漏洞类型'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞类型"]/following-sibling::*[1]//text()')])
    item['参考链接'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="参考链接"]/following-sibling::*[1]//text()')])
    item['漏洞解决方案'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞解决方案"]/following-sibling::*[1]//text()')])
    # patch links on the page are relative, so prefix the site root
    item['厂商补丁'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//text()')]) + 'http://www.cnvd.org.cn' + \
                   ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//@href')])
    item['验证信息'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="验证信息"]/following-sibling::*[1]//text()')])
    item['报送时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="报送时间"]/following-sibling::*[1]//text()')])
    item['收录时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="收录时间"]/following-sibling::*[1]//text()')])
    item['更新时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="更新时间"]/following-sibling::*[1]//text()')])
    item['漏洞附件'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞附件"]/following-sibling::*[1]//text()')])
    return item
def save_data(index, item, workbook):
    sheet = workbook.get_sheet(0)  # fetch the existing 'sheet1' (xlwt looks sheets up by index)
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)
    print('saved')
def excel_prepare(heads):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)  # create the sheet
    for col, value in enumerate(heads):
        sheet.write(0, col, value)  # header row
    return workbook
def urlisexist(url, urlset):
    return url in urlset
def getallurl(filename):
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)  # url column, skipping the header row
    return results
def read_old(filename):
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows
def save_old(index, olditem):
    sheet = workbook.get_sheet(0)  # same sheet; workbook and filename are globals set in __main__
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)
if __name__ == '__main__':
    # list page: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    TIMESLEEP = 0  # delay between detail-page requests, in seconds
    filename = '工程控制系统漏洞.xls'
    MAX_PAGE = 96
    heads = ['url',
             '标题',
             'CNVD_ID',
             '公开日期',
             '危害级别',
             '影响产品',
             'BUGTRAQ_ID',
             'CVE_ID',
             '漏洞描述',
             '漏洞类型',
             '参考链接',
             '漏洞解决方案',
             '厂商补丁',
             '验证信息',
             '报送时间',
             '收录时间',
             '更新时间',
             '漏洞附件']
    try:
        alloldset, length = read_old(filename)
    except:
        # first run: no old file yet; data will start at row 1, under the header
        alloldset = []
        length = 1
    workbook = excel_prepare(heads)
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)  # carry previously scraped rows over into the new workbook
    try:
        urlset = getallurl(filename)
    except:
        urlset = []
    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):  # only fetch detail pages we have not saved before
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1
    workbook.save(filename)
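Two things are worth pointing out about the script. First, the detail-page parser leans on a single XPath idiom: find the <td> whose text equals the field label, then read everything under its first following sibling. A self-contained sketch (the table fragment and the CNVD ID below are made up for illustration):

from lxml import etree

snippet = '<table><tbody><tr><td>CNVD-ID</td><td><a href="#">CNVD-2018-00001</a></td></tr></tbody></table>'
doc = etree.HTML(snippet)
print(''.join(t.strip() for t in doc.xpath(
    '//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')))  # -> CNVD-2018-00001

Second, the incremental-update dance exists because xlwt can only write fresh .xls files, not append to existing ones: each run reads the old rows back with xlrd, copies them into a new workbook, and then scrapes only URLs it has not seen before.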
If anything is unclear, ask in the comments below.