目标网站:https://myip.ms
难度10颗星
具备极强的反爬虫能力:封ip
'''------------------------------https://myip.ms/browse/web_hosting/1/countryID/ALA%5EASM------------------------------'''
import os
import csv
import time
import random

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


class LoopOver(Exception):
    """Sentinel exception raised by parse_page to signal "stop paginating"."""

    def __init__(self, *args, **kwargs):
        pass


class Spider:
    """Scraper for the web-hosting listings on https://myip.ms.

    The site has strong anti-scraping measures (IP bans, "are you a Robot"
    interstitials), so every page load goes through an endless retry loop
    that clicks the captcha button and, when the session looks wedged,
    throws the Chrome instance away and starts a fresh one.
    """

    # Characters that are illegal in Windows file names.
    _ILLEGAL_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self):
        # CSV storage locations.
        self.path = '.'
        self.inputfilename = 'country.csv'
        self.csvfilename = 'datas.csv'
        self.logfilename = 'run.log'
        options = webdriver.ChromeOptions()
        self.browser = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.browser, 20)
        # URL templates.
        self.listurl = 'https://myip.ms/browse/web_hosting/1/countryID/{}'
        self.host = 'https://myip.ms'
        # HTML report templates. NOTE: attribute names keep the original
        # 'tempalte' spelling for backward compatibility with any external user.
        self.tempalte = '''<p>        {}</p>
<table border="5">
    <thead class="tableFloatingHeaderOriginal">
        <tr valign="middle">
            <th class="nobackgroundimage" align="center" style="width: 32px;"> No </th>
            <th colfirst="ip_owners" align="center" title-orig="Hosting Company" class="header" style="width: 163px;"> Hosting Company</th>
            <th align="center" title-orig="Website/s" class="header" style="width: 114px;"> Website/s</th>
            <th align="center" title-orig="Total Websites use this company IPs" class="header headerSortUp" style="width: 92px;"> Total Websites use this company IPs</th>
            <th align="center" title-orig="TOP Websites use this company IPs" class="header" style="width: 77px;"> TOP Websites use this company IPs</th>
            <th align="center" title-orig="Diagram" class="header" style="width: 38px;"> Record Update Time</th>
        </tr>
    </thead>
    <tbody>
        {}
    </tbody>
</table>
'''
        self.tempalte_page = '''<table border="5">
    <thead class="tableFloatingHeaderOriginal">
        <tr valign="middle">
            <th class="nobackgroundimage" align="center" style="width: 44px;"><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div>No</th>
            <th colfirst="sites" align="center" title-orig="Web Site" class="header" style="width: 153px;"> Web Site</th>
            <th align="center" title-orig="Website IP Address" class="header" style="width: 144px;"> Website IP Address</th>
            <th align="center" title-orig="Web Hosting Company / IP Owner" class="header" style="width: 178px;"> Website IPV6 Address</th>
            <th align="center" title-orig="Web Hosting / Server IP Location" class="header" style="width: 134px;"> World Site Popular</th>
            <th align="center" title-orig="Web Hosting City" class="header" style="width: 105px;"> World Site Popular Rating</th>
            <th align="center" title-orig="World Site Popular Rating" class="header headerSortDown" style="width: 86px;"> DNS Records</th>
            <th align="center" title-orig="Diagram" class="header" style="width: 38px;"> Record Update Time</th>
        </tr>
    </thead>
    <tbody>
        {}
    </tbody>
</table>
'''

    def turn2filename(self, dst):
        """Return *dst* with all characters illegal in file names removed."""
        # One C-level pass instead of nine chained .replace() calls.
        return dst.translate(str.maketrans('', '', self._ILLEGAL_FILENAME_CHARS))

    def run(self):
        """Crawl the listing page for each selected country and save rows to CSV."""
        start = time.time()
        self.get_input()
        # NOTE(review): only one country (index 115) is processed — this slice
        # looks like a resume point left over from an interrupted crawl.
        for country, country_id in self.datas[115:116]:
            print('>>> ', country, self.listurl.format(country_id))
            for item in self.parse_list(self.get_list(self.listurl.format(country_id))):
                if item[1] == '- No Records Found -':
                    # Empty listings carry no country name in the table; fill it in.
                    item[0] = country
                if country in ['British Indian Ocean Territory', 'Brunei', 'Bulgaria']:
                    self.save_data(item=item, filename=self.turn2filename(country) + '.csv')
                else:
                    self.save_data(item=item, filename='data.csv')
                time.sleep(0)
        self.runtime = time.time() - start
        print('用时{}'.format(self.runtime))

    def get_input(self):
        """Load (country, countryID) rows from the input CSV, skipping blank rows."""
        with open(self.inputfilename, 'r', encoding='utf_8') as f:
            self.datas = [row for row in csv.reader(f) if row]

    def mkurl(self, kw):
        """Yield listing URLs for *kw*.

        NOTE(review): unused; listurl has a single placeholder, so the second
        format() argument is silently ignored.
        """
        for i in range(0, 1):
            yield self.listurl.format(kw, i * 10)

    def _fetch(self, url, captcha_sleep, retry_sleep, verbose=False):
        """Load *url* and return the browser once the data table is present.

        Retries forever: clicks the "a Robot" captcha button when the
        interstitial appears, and replaces the Chrome session when the
        browser ended up somewhere other than *url*.
        """
        while True:
            try:
                self.browser.get(url)
                try:
                    self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]')))
                except Exception as error:
                    if verbose:
                        print('//*[@id="sites_tbl" or @id ="web_hosting_tbl"] error', error)
                    # Selenium 4 API: find_element_by_xpath was removed.
                    if 'a Robot' in self.browser.find_element(
                            By.XPATH, '/html/body/div[2]/div/div/div/center').text:
                        self.browser.find_element(By.XPATH, '//*[@id="captcha_submit"]').click()
                        time.sleep(captcha_sleep)
                    raise
                return self.browser
            except Exception as error:
                print('error >>> ', error)
                if self.browser.current_url != url:
                    # Session is wedged (redirected/banned) — start a fresh Chrome.
                    self.browser.quit()
                    self.browser = webdriver.Chrome()
                    self.wait = WebDriverWait(self.browser, 20)
                    time.sleep(retry_sleep)

    def get_list(self, url):
        """Load a country listing page, retrying until the table renders."""
        return self._fetch(url, captcha_sleep=1, retry_sleep=1)

    def parse_list(self, response):
        """Yield one row per hosting company from a loaded listing page."""
        html = etree.HTML(response.page_source)

        def pop(attr):
            # First match, stripped of whitespace/newlines; '' when absent.
            return attr[0].strip().replace('\n', '').replace(' ', '') if attr else ''

        # Skip the hidden "expand" detail rows interleaved with the data rows.
        for tr in html.xpath('//*[@id="web_hosting_tbl"]/tbody/tr[not(contains(@class,"expand"))]'):
            No = tr.xpath('./td[1]/text()')[0].strip()
            Hosting_Company = pop(tr.xpath('./td[2]/a/text()'))
            page_url = pop(tr.xpath('./td[2]/a/@href'))
            country_name = pop(tr.xpath('./td[3]/a/text()'))
            Website = pop(tr.xpath('./td[4]/a/text()'))
            Total_Websites_use_this_company_IPs = pop(tr.xpath('./td[5]/a/text()'))
            TOP_Websites_use_this_company_IPs = pop(tr.xpath('./td[6]/a/text()'))
            record_update_time = pop(tr.xpath('./td[7]/text()'))
            yield [country_name, No, Hosting_Company, Website,
                   Total_Websites_use_this_company_IPs,
                   TOP_Websites_use_this_company_IPs,
                   record_update_time, self.host + page_url]

    def get_page(self, url):
        """Load a per-company detail page, retrying until the table renders."""
        return self._fetch(url, captcha_sleep=5, retry_sleep=100, verbose=True)

    def parse_page(self, response):
        """Yield one row per website from a loaded detail page.

        Raises LoopOver when a row is malformed or the page has fewer than 50
        rows (i.e. this is the last page of results).
        """
        text = response.page_source
        html = etree.HTML(text)

        def pop(attr):
            return attr[0].strip().replace('\n', '').replace(' ', '') if attr else ''

        row_xpath = '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]/tbody/tr[not(contains(@class,"expand"))]'
        detail_xpath = '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]/tbody/tr[contains(@class,"expand")]'
        row_count = len(html.xpath(row_xpath))
        print('len is ', row_count)
        try:
            # Each visible row tr is paired with a hidden "expand" row tre
            # that holds the IPv6/DNS/update-time details.
            for i in range(1, row_count + 1):
                tr = html.xpath(row_xpath + '[{}]'.format(i))[0]
                tre = html.xpath(detail_xpath + '[{}]'.format(i))[0]
                No = pop(tr.xpath('./td[1]/text()'))
                web_site = pop(tr.xpath('./td[2]/a/text()'))
                web_site_ip_address = pop(tr.xpath('./td[3]/a/text()'))
                web_site_ipv6_address = pop(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"IPv6")]/../following-sibling::*[1]//a/text()'))
                website_popularity = pop(tre.xpath(
                    './td[1]/div/span[@class="bold arial grey"]/text()'))
                website_popularity_rating = pop(tr.xpath('./td[7]/span/text()'))
                dns_records = '\n'.join(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"DNS")]/../following-sibling::*[1]//a/text()'))
                record_update_time = pop(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"Record Update Time")]/../following-sibling::div/text()'))
                yield [No, web_site, web_site_ip_address, web_site_ipv6_address,
                       website_popularity, website_popularity_rating,
                       dns_records, record_update_time]
        except IndexError:
            # A row without its matching detail row: page is broken/truncated.
            raise LoopOver
        if row_count < 50:
            # Short page == last page; keep a copy for debugging, then stop.
            with open('error.html', 'w', encoding='utf-8') as f:
                f.write(text)
            raise LoopOver

    def save_data(self, filename=None, path=None, item=None):
        """Append *item* as one CSV row to path/filename (defaults from __init__)."""
        if not filename:
            filename = self.csvfilename
        if not path:
            path = self.path
        with open('{}/{}'.format(path, filename), 'a', encoding='utf_8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def save_log(self, info):
        """Append *info* with a timestamp to the run log."""
        with open(self.logfilename, 'a', encoding='utf-8') as f:
            f.write(info + ' ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n')

    def save_html_list(self, country, items, filename=None, path=None):
        """Append an HTML table of listing rows for *country* to main.html.

        Column 1 (the company name) is rendered as a link to the per-company
        detail page saved by save_html_page.
        """
        tr = ''
        for item in items:
            cells = ''
            for index, value in enumerate(item):
                if index == 1:
                    # Same illegal-character stripping as turn2filename, so the
                    # link matches the file name save_html_page produces.
                    cells += '<td><a href="./data/{}-{}.html">{}</a></td>'.format(
                        country, self.turn2filename(value), value)
                else:
                    cells += '<td>{}</td>'.format(value)
            tr += '<tr>' + cells + '</tr>'
        with open('main.html', 'a', encoding='utf-8') as f:
            f.write(self.tempalte.format(country, tr))

    def save_html_page(self, country, items, filename=None, path=None, it=None):
        """Write an HTML table of detail rows to ./path/filename."""
        # makedirs(exist_ok=True) also creates missing parents, unlike mkdir.
        os.makedirs(path, exist_ok=True)
        tr = ''
        for item in items:
            cells = ''.join('<td>{}</td>'.format(value) for value in item)
            tr += '<tr>' + cells + '</tr>'
        with open('./{}/{}'.format(path, filename), 'w', encoding='utf-8') as f:
            f.write(self.tempalte_page.format(tr))

    @property
    def time(self):
        return '总共用时:{}秒'.format(self.runtime)


if __name__ == '__main__':
    spider = Spider()
    spider.run()
    print(spider.time)  # total elapsed time