Scraping Industrial Control System Vulnerabilities with Python

First, here's the link so you can see whether it's to your taste:

Industrial control system vulnerabilities (CNVD ICS): http://ics.cnvd.org.cn/

As you can see, the page is static HTML, which makes the problem very simple:

all we need to do is fetch each page with requests and parse the result.
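
For example, a quick check (a minimal sketch, reusing the list URL and User-Agent from the script below) confirms that the vulnerability table is already present in the raw HTML response, so no JavaScript rendering is needed:

import requests

# Minimal check: fetch the first list page and confirm the table is served in the raw HTML.
url = 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
response = requests.get(url, headers=headers)
print(response.status_code)       # expect 200
print('tbody' in response.text)   # the vulnerability table is in the static HTML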

Without further ado, here's the full script:

import requests
from urllib.parse import urlencode
from lxml import etree
import pymysql
import time
import xlwt
import xlrd


def makeurl():
    # Yield list-page URLs; each page holds 20 entries, so the offset advances in steps of 20.
    # Example: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20'
    }
    for page in range(MAX_PAGE):
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is', url)
        yield url


def get_page_urllist(url):
    # Fetch one list page and return its HTML.
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_urllist(content):
    # Yield the detail-page URL found in each row of the list table.
    html = etree.HTML(content)
    for row in html.xpath('//tbody[@id="tr"]/tr'):
        yield row.xpath('td/a/@href')[0]


def get_page(url):
    # Fetch a vulnerability detail page (hosted on www.cnvd.org.cn) and return its HTML.
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_page(content, url):
    # Extract each field by locating the label cell and taking its first following sibling cell.
    html = etree.HTML(content)
    item = {}
    item['url'] = url
    item['标题'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])

    item['CNVD_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')])
    item['公开日期'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="公开日期"]/following-sibling::*[1]//text()')])
    item['危害级别'] = ''.join([i.strip().replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '') for i in
                            html.xpath('//tbody/tr/td[text()="危害级别"]/following-sibling::*[1]//text()')])
    item['影响产品'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="影响产品"]/following-sibling::*[1]//text()')])
    try:
        item['BUGTRAQ_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="BUGTRAQ ID"]/following-sibling::*[1]//text()')])
    except:
        item['BUGTRAQ_ID'] = ''
    item['CVE_ID'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//text()')]) + '' + ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//@href')])

    item['漏洞描述'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞描述"]/following-sibling::*[1]//text()')])
    item['漏洞类型'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞类型"]/following-sibling::*[1]//text()')])
    item['参考链接'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="参考链接"]/following-sibling::*[1]//text()')])
    item['漏洞解决方案'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞解决方案"]/following-sibling::*[1]//text()')])
    item['厂商补丁'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//text()')]) + 'http://www.cnvd.org.cn' + ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//@href')])
    item['验证信息'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="验证信息"]/following-sibling::*[1]//text()')])
    item['报送时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="报送时间"]/following-sibling::*[1]//text()')])
    item['收录时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="收录时间"]/following-sibling::*[1]//text()')])
    item['更新时间'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="更新时间"]/following-sibling::*[1]//text()')])
    item['漏洞附件'] = ''.join([i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞附件"]/following-sibling::*[1]//text()')])

    return item


def save_data(index, item, workbook):
    # Write one vulnerability record into row `index` and save the workbook.
    sheet = workbook.get_sheet('sheet1')
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)
    print('row saved')


def excel_prepare(heads):
    # Create a new workbook with a 'sheet1' worksheet and write the header row.
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    for col, value in enumerate(heads):
        sheet.write(0, col, value)
    return workbook


def urlisexist(url, urlset):
    if url in urlset:
        return True
    else:
        return False


def getallurl(filename):
    # Return the URL column (column 0, skipping the header row) of the existing workbook.
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)
    return results


def read_old(filename):
    # Load every row already saved, so data from previous runs is carried over.
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows


def save_old(index, olditem):
    # Re-write a previously saved row into the new workbook
    # (relies on the module-level `workbook` and `filename` defined in the main block).
    sheet = workbook.get_sheet('sheet1')
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)


if __name__ == '__main__':
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0

    # seconds to pause between detail-page requests
    TIMESLEEP = 0

    filename = '工程控制系统漏洞.xls'

    MAX_PAGE = 96

    heads = ['url',
             '标题',
             'CNVD_ID',
             '公开日期',
             '危害级别',
             '影响产品',
             'BUGTRAQ_ID',
             'CVE_ID',
             '漏洞描述',
             '漏洞类型',
             '参考链接',
             '漏洞解决方案',
             '厂商补丁',
             '验证信息',
             '报送时间',
             '收录时间',
             '更新时间',
             '漏洞附件']

    # Load previously scraped rows if the workbook already exists;
    # otherwise start writing at row 1 (row 0 holds the header).
    try:
        alloldset, length = read_old(filename)
    except:
        alloldset = []
        length = 1

    workbook = excel_prepare(heads)

    # Carry over rows from the previous run into the new workbook.
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)

    try:
        urlset = getallurl(filename)  # URLs already saved, used to skip duplicates
    except:
        urlset = []

    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1

    workbook.save(filename)
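
The detail-page parser above relies on one XPath idiom: find the td whose text is the field label, then take its first following sibling, which holds the value. A small self-contained sketch (with made-up markup in the same shape, not the real CNVD page) shows the idea:

from lxml import etree

# Toy label/value table, just to illustrate the following-sibling pattern.
snippet = '''
<table><tbody>
  <tr><td>CNVD-ID</td><td>CNVD-2018-12345</td></tr>
  <tr><td>危害级别</td><td>高</td></tr>
</tbody></table>
'''
html = etree.HTML(snippet)
value = html.xpath('//td[text()="CNVD-ID"]/following-sibling::*[1]//text()')
print(''.join(i.strip() for i in value))  # -> CNVD-2018-12345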

If anything is unclear, ask in the comments below.
