# -*- coding:utf-8 -*-
"""Read a site's robots.txt, find its Sitemap URL, and list the entry links."""
import requests
from lxml import etree


def get_sitemapinfo(robots_url):
    """Fetch robots.txt and return the last Sitemap URL it declares.

    :param robots_url: full URL of the site's robots.txt
    :return: sitemap URL string, e.g.
        "https://www.qidian.com/newsitemap2/pcfixedsitemap.xml",
        or None when robots.txt declares no Sitemap.
    """
    text = requests.get(robots_url, timeout=10).text
    sitemap = None
    # Parse line by line: "Sitemap:" directives are one per line. The old
    # split("Sitemap:")[-1] approach never raised, so its except branch was
    # unreachable, and with no directive present it returned the entire
    # robots.txt body as if it were a URL.
    for line in text.splitlines():
        key, _, value = line.partition(":")
        if key.strip().lower() == "sitemap" and value.strip():
            sitemap = value.strip()  # keep the last match, like the original
    if sitemap is None:
        print("以后网站robots协定 未蕴含Sitemap")
    return sitemap


def get_links(sitemap_url, rule):
    """Fetch a sitemap document and return every node matched by *rule*.

    :param sitemap_url: URL of the sitemap (XML) document
    :param rule: XPath expression, e.g. "//url/loc/text()"
    :return: list of matched strings, e.g.
        ['https://www.qidian.com/all_pub/chanId13700/',
         'https://www.qidian.com/all_pub/chanId14100/']
    """
    response = requests.get(sitemap_url, timeout=10)
    # Hand lxml the raw bytes so it honours the document's declared encoding,
    # instead of re-encoding requests' already-decoded text.
    tree = etree.HTML(response.content)
    return tree.xpath(rule)


if __name__ == "__main__":
    # Entry point: read qidian.com's robots.txt, follow its sitemap,
    # and print every <loc> entry found there.
    url = "https://www.qidian.com/robots.txt"
    sitemap_url = get_sitemapinfo(robots_url=url)
    links = get_links(sitemap_url=sitemap_url, rule="//url/loc/text()")
    print(f"links:{links}")