关于python:Python爬虫之爬取海贼王全彩漫画图片

54次阅读

共计 7499 个字符,预计需要花费 19 分钟才能阅读完成。

制作工具模块

- 隐藏身份信息的 User-Agent 模块：随机更换请求头，使目标服务器无法识别爬虫的身份信息。

import random

# Pool of browser User-Agent header dicts; get_headers() picks one at random.
# Duplicate entries are kept on purpose so the selection weights are unchanged.
user_agent_data = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3751.400"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3765.400 QQBrowser/10.6.4153.400"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3765.400 QQBrowser/10.6.4153.400"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; ServiceUI 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"},
    # The original entry contained a literal ellipsis character ("...") that
    # had been copied from a truncated display; HTTP header values must be
    # latin-1 encodable, so the full Firefox 77 string is spelled out instead.
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"},
]


def get_headers():
    """Return a randomly chosen header dict from ``user_agent_data``.

    A fresh copy is returned on every call, so callers can add more headers
    (Referer, Accept-Encoding, ...) without modifying the shared pool.

    :return: dict with a single ``"User-Agent"`` key.
    """
    return dict(random.choice(user_agent_data))


if __name__ == '__main__':
    # Manual smoke test: show one randomly selected header dict.
    headers = get_headers()
    print(" 随机获取 UA 值:", headers)

制作一个动态的 IP 代理池，避免 IP 被封；可用的 IP 代理已存进 ippool.json。

import json

import random

def get_proxies(path="./ipfile/ippool.json"):
    """Return one randomly chosen entry from the JSON proxy-pool file.

    :param path: location of the JSON file holding the list of proxies;
        defaults to the pool file this script has always read.
    :return: one element of the list stored in the file, picked at random.
    :raises ValueError: if the file holds an empty list.
    """
    # Read the file; the context manager closes it even when parsing fails.
    with open(path, "r", encoding="utf-8") as rfile:
        proxy_lists = json.load(rfile)

    if not proxy_lists:
        raise ValueError("proxy pool %r is empty" % path)

    # Pick a random entry.
    return random.choice(proxy_lists)

if __name__ == '__main__':
    # Manual smoke test: show one randomly selected proxy entry.
    proxies = get_proxies()
    print(" 随机获取 ip 代理:", proxies)

import json
import os
import re

import requests
from lxml import etree

import proxytool
import useragenttool

# First run of digits in the page-count text of a catalogue entry.
_PAGE_COUNT_RE = re.compile(r"\d+")


class OnePieceSpider(object):
    """Scrape the catalogue page and save its entries as a JSON file.

    Each saved entry is a dict with the keys ``title``, ``postfix`` (the
    link taken from the entry's ``href``) and ``page`` (page count).
    """

    def __init__(self):
        # Catalogue page to scrape.
        self.url = "http://kanbook.net/328"
        # Decoded HTML of the catalogue page, filled by get_url_html().
        self.html_data = None
        # Parsed entries, filled by catch_html_data().
        self.one_piece_data_list = []

    def get_url_html(self):
        """Fetch ``self.url`` and store the decoded HTML in ``self.html_data``."""
        # Copy the dict so that the extra headers added below never leak
        # into whatever object the header tool handed out.
        headers = dict(useragenttool.get_headers())
        headers["Accept-Encoding"] = "deflate, sdch, br"
        headers["Content-Type"] = "text/html; charset=UTF-8"
        headers["Referer"] = "https://kanbook.net/328/3/1/1"  # referring page
        # A timeout keeps a dead proxy from blocking the script forever;
        # 30 s matches the value used by the image spider.
        response = requests.get(url=self.url,
                                headers=headers,
                                proxies=proxytool.get_proxies(),
                                timeout=30)
        self.html_data = response.content.decode("utf-8")

    def catch_html_data(self):
        """Extract title, link and page count of every catalogue entry.

        :raises ValueError: if an entry's page-count text holds no digits.
        """
        data_parse = etree.HTML(self.html_data)
        li_list = data_parse.xpath("//div[@aria-labelledby='3-tab']/ol/li")
        # Walk the list back to front, as the original did.
        for li_element in reversed(li_list):
            h_name = li_element.xpath("./a/@href")[0]
            title = li_element.xpath("./a/@title")[0]
            page_text = li_element.xpath("./a/span/text()")[0]
            # Take the first run of digits instead of a fixed [1:4] slice,
            # so counts with fewer or more than three digits also parse.
            match = _PAGE_COUNT_RE.search(page_text)
            if match is None:
                raise ValueError("no page count found in %r" % page_text)
            one_piece_item = {"title": title,
                              "postfix": h_name,
                              "page": int(match.group())}
            self.one_piece_data_list.append(one_piece_item)
        print(" 增加胜利!")

    def save_data_file(self):
        """Write the collected entries to ``./image_url/one_piece_data.json``."""
        path = "./image_url"
        os.makedirs(path, exist_ok=True)
        with open(path + "/one_piece_data.json", "w", encoding="utf-8") as file:
            json.dump(self.one_piece_data_list, file, ensure_ascii=False, indent=2)
        print(" 数据保留胜利!")

    def run(self):
        """Fetch the catalogue page, parse it and save the result."""
        self.get_url_html()
        self.catch_html_data()
        self.save_data_file()


def main():
    """Run the catalogue spider once."""
    spider = OnePieceSpider()
    spider.run()


# Entry-point guard, matching the other scripts in this article; without it
# running the file defined main() but never called it.
if __name__ == '__main__':
    main()

开始爬取海贼王全部的全彩漫画图片

- 注意点：请求头要添加 Referer 参照页，选择漫画本站的页面地址。

另外，使用循环 (while True) 是为了让全部卷的图片都能下载成功；下载成功后就跳出循环。

import requests

import useragenttool

import proxytool

import time

import random

import json

import os

import re

import urllib3

urllib3.disable_warnings()

class OnePieceImageSpider(object):
    """Download the images behind a numbered series of reader pages.

    ``set_url`` stores a URL template with one ``{}`` placeholder,
    ``get_url_list`` expands it into page URLs, and ``run`` visits each page,
    reads the first entry of its ``img_list`` JavaScript array and saves
    that image under ``./onepieceimage/<title>/``.
    """

    # Pattern that captures the JSON array assigned to ``img_list`` in the
    # page source; compiled once instead of on every page.
    _IMG_LIST_RE = re.compile(r"""var img_list=(\[.+])""")

    # Class-level default so the attribute exists even on instances that
    # were created without running __init__.
    max_retries = None

    def __init__(self, max_retries=None):
        """Create a spider with an empty URL template.

        :param max_retries: maximum number of attempts per page or image.
            ``None`` (the default) keeps retrying until the request
            succeeds, which is the original behaviour.
        """
        self.url = ""
        self.max_retries = max_retries

    def set_url(self, out_url):
        """Store the URL template; it must contain one ``{}`` placeholder."""
        self.url = out_url

    def get_url_list(self, num):
        """Return the URLs for pages ``1`` to ``num`` inclusive.

        :param num: number of pages; ``0`` or less yields an empty list.
        :return: list of URLs built by formatting the stored template.
        """
        return [self.url.format(page) for page in range(1, num + 1)]

    def get_url_html(self, inner_url):
        """Fetch ``inner_url`` and return the raw response body as bytes.

        Sleeps a random 1-6 seconds after every request to throttle the
        crawl.

        :param inner_url: address of the page or image to fetch.
        :return: undecoded response content.
        """
        # Copy the dict so that the extra headers added below never leak
        # into whatever object the header tool handed out.
        headers = dict(useragenttool.get_headers())
        headers["Accept-Encoding"] = "deflate, sdch, br"
        headers["Content-Type"] = "text/html; charset=UTF-8"
        headers["Referer"] = "https://kanbook.net/328/3/6"  # referring page
        # NOTE(review): verify=False turns off TLS certificate checking;
        # kept because the original requested it explicitly.
        response = requests.get(url=inner_url,
                                headers=headers,
                                proxies=proxytool.get_proxies(),
                                timeout=30,
                                verify=False)
        # Random pause between requests.
        time.sleep(random.randint(1, 6))
        return response.content

    def _may_retry(self, attempts):
        """Return True while another attempt is allowed after ``attempts``."""
        return self.max_retries is None or attempts < self.max_retries

    def _find_image_url(self, url):
        """Return the first image URL listed on the page, or None on give-up.

        Retries on any error (a different random proxy is used on each
        attempt) until it succeeds or ``max_retries`` is exhausted.
        """
        attempts = 0
        while True:
            attempts += 1
            try:
                data = self.get_url_html(url).decode("utf-8")
                result = self._IMG_LIST_RE.findall(data)
                # The captured text is a JSON array of image addresses.
                lists = json.loads(result[0])
                img_url = lists[0]
                print(img_url)
                return img_url
            except Exception as msg:
                print(" 错误信息:", msg)
                if not self._may_retry(attempts):
                    return None

    def __download_image(self, image_url, name, index):
        """Download one image and save it as ``<index>.jpg``.

        :param image_url: address of the image; an empty value is skipped.
        :param name: directory name under ``./onepieceimage``.
        :param index: number used for the file name.
        :return: False if every allowed attempt failed, True otherwise.
        """
        if not image_url:
            return True

        path = "./onepieceimage/%s" % name
        # makedirs also creates the missing "./onepieceimage" parent. The
        # original os.mkdir raised there, and the retry loop swallowed the
        # error and re-downloaded the image endlessly.
        os.makedirs(path, exist_ok=True)

        attempts = 0
        while True:
            attempts += 1
            try:
                content = self.get_url_html(image_url)
                with open(path + "/%d.jpg" % index, "wb") as wfile:
                    wfile.write(content)
                return True
            except Exception as msg:
                print(" 出现异常, 错误信息为 ", msg)
                if not self._may_retry(attempts):
                    return False

    def run(self, url_list, title):
        """Download the image of every page in ``url_list``.

        :param url_list: page URLs, in reading order.
        :param title: directory name the images are stored under.
        """
        # NOTE(review): numbering starts at 2, so the first page is saved
        # as "2.jpg"; kept as in the original, confirm whether 1 was meant.
        index = 2
        for url in url_list:
            img_url = self._find_image_url(url)
            if img_url is None:
                # Only reachable when max_retries is set and exhausted.
                print("giving up on page %s" % url)
            elif self.__download_image(img_url, title, index):
                print(" 第 %d 张下载 " % index)
            else:
                print("giving up on image %s" % img_url)
            index += 1
        print(" 所有图片下载胜利 ")

def main(data_path="./image_url/one_piece_data.json"):
    """Download the images for every entry listed in the JSON data file.

    :param data_path: JSON file holding a list of dicts with the keys
        ``postfix`` (link path), ``page`` (page count) and ``title``;
        defaults to the file this script has always read.
    """
    # Load the entry list; the context manager closes the file on errors too.
    with open(data_path, "r", encoding="utf-8") as read_file:
        one_piece_data = json.load(read_file)

    for element in one_piece_data:
        # Link path, page count and title of this entry.
        href_name = element["postfix"]
        number = element["page"]
        name = element["title"]

        # URL template; "{}" is replaced by the page number.
        http_url = "http://kanbook.net" + href_name + "/{}"

        onepieceimgspider = OnePieceImageSpider()
        onepieceimgspider.set_url(http_url)
        print("%s 开始下载!" % name)

        # The original called the undefined name ``onepiecespider`` here,
        # which raised NameError on the first entry.
        url_list = onepieceimgspider.get_url_list(number)

        # Fetch every page of this entry.
        onepieceimgspider.run(url_list, name)

if __name__ == '__main__':
    # Script entry point: download everything listed in the data file.
    main()

正文完
 0