import requests
import time
import os
import re
from multiprocessing import Pool
import parsel
from PIL import Image
from io import BytesIO
timeout = 10
# Directory where downloaded images are saved
DIR_PATH = r"d:\meizi\明星模特"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
# Get the address of every listing page
def get_page_num():
    r = requests.get(url=base_url + '/bz/nxxz/nxmt/', timeout=10, headers=header)
    html = parsel.Selector(r.text)
    # The page selector <select name="sldd"> holds one <option> per listing page
    page_names = html.xpath('//span[@class="ea"]/select[@name="sldd"]/option/@value').extract()
    return page_names
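# For reference, the values scraped here are relative page names such as
# 'list_57_1.html' (cf. the commented-out override in __main__ below).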
# Assemble the address of every item on each listing page
def get_page_detail(page_names):
    page_list = []
    for i in page_names:
        url = base_url + '/bz/nxxz/nxmt/' + i
        r = requests.get(url=url, timeout=10, headers=header)
        html = parsel.Selector(r.text)
        detail_links = html.xpath('//ul[@class="picbz"]/li/a[@target="_blank"]/@href').extract()
        nums = html.xpath('//ul[@class="picbz"]/li/text()').extract()
        total = []
        for d in nums:
            # Each <li> text carries the number of images in that gallery
            if re.findall(r"\d+\.?\d*", d):
                total.append(re.findall(r"\d+\.?\d*", d)[0])
        for ind, link in enumerate(detail_links):
            id = re.findall(r"\d+", link)[0]
            for i3 in range(int(total[ind])):
                if i3 != 0:
                    # Pages after the first are named <id>_2.html, <id>_3.html, ...
                    s = f'{id}_{i3 + 1}'
                    page_list.append(link.replace(id, s))
                else:
                    page_list.append(link)
    return page_list
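# Example of the expansion above (hypothetical id): a gallery at
# '/bz/nxxz/nxmt/357924.html' listing 3 images yields the pages
# 357924.html, 357924_2.html and 357924_3.html.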
# Get the full-size image URL from a detail page
def get_img_origin_url(url, index):
    r = requests.get(url=url, timeout=10, headers=header)
    r.encoding = r.apparent_encoding
    html = parsel.Selector(r.text)
    file_name = html.xpath('/html/body/div[3]/h1/span/text()').get()
    # The real image path is embedded in the first inline <script> as a quoted string
    img_url = html.xpath('/html/body/script[1]').get()
    pattern = re.compile("'(.*)'")
    img_url = pattern.findall(img_url)[0]
    img_url = 'http://pic.jj20.com' + img_url
    file_name = re.sub(r'\(.*\)', '', file_name)
    print("url", url)
    print("img_url", img_url)
    if '_' in url:
        # Follow-up pages (<id>_2.html, ...) take their suffix from the image URL
        file_name = file_name + img_url[img_url.index('-') + 1:]
    else:
        file_name = f'{file_name}-1.jpg'
    # pool.apply_async(save_img, (img_url, file_name))
    print("file_name", file_name)
    save_img(img_url, file_name)
# Save one image to disk
def save_img(img_url, file_name):
    try:
        # r = requests.get(img_url, headers=header, timeout=timeout).content
        if not os.path.exists(DIR_PATH):
            os.makedirs(DIR_PATH)
        os.chdir(DIR_PATH)
        # with open(file_name, 'wb') as f:
        #     f.write(r)
        img_header = {
            "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": "__yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623734936",
            "Host": "pic.jj20.com",
            "Pragma": "no-cache",
            "Referer": "http://cj.jj20.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
        }
        response = requests.get(img_url, headers=img_header, timeout=timeout)
        image = Image.open(BytesIO(response.content))
        image.save(file_name)
        print("done saving")
    except Exception as e:
        print(e)
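# Note: Image.open(BytesIO(...)) decodes and re-encodes the payload, so a
# truncated or non-image response raises instead of leaving a corrupt file;
# the commented-out open(..., 'wb') variant above would write the raw bytes
# unverified.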
# Walk the page list and download every image
def get_img(page_list):
    for index, i in enumerate(page_list):
        time.sleep(0.1)  # Small delay between requests
        # pool.apply_async(get_img_origin_url, (base_url + i,))
        get_img_origin_url(base_url + i, index + 1)
if __name__ == '__main__':
    base_url = 'http://www.jj20.com'
    pool = Pool(6)  # Only used by the commented-out apply_async calls above
    page_names = get_page_num()
    # page_names = ["list_57_1.html"]
    page_list = get_page_detail(page_names)
    get_img(page_list)
    print(len(page_list))
    pool.close()
    pool.join()
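# A minimal sketch of how the currently idle pool could drive the downloads
# in parallel, following the commented-out apply_async call in get_img; the
# worker count of 6 is the author's choice, and this variant is untested:
#
#     for index, i in enumerate(page_list):
#         pool.apply_async(get_img_origin_url, (base_url + i, index + 1))
#     pool.close()
#     pool.join()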