import requests
import time
import os
import re
from multiprocessing import Pool
import parsel
from PIL import Image
from io import BytesIO
timeout = 10
Path where the downloaded images are saved
DIR_PATH = r"d:\meizi\明星模特"
base_url = 'http://www.jj20.com'  # site root, shared by every request below
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
Get the URL of each listing page
def get_page_num():
    # The <select name="sldd"> element is the pagination drop-down; each
    # option value is the relative filename of one listing page
    r = requests.get(url=base_url + '/bz/nxxz/nxmt/', timeout=timeout, headers=header)
    html = parsel.Selector(r.text)
    page_names = html.xpath('//span[@class="ea"]/select[@name="sldd"]/option/@value').extract()
    return page_names
Assemble the URL of every item on each listing page
def get_page_detail(page_names):
    page_list = []
    for name in page_names:
        url = base_url + '/bz/nxxz/nxmt/' + name
        r = requests.get(url=url, timeout=timeout, headers=header)
        html = parsel.Selector(r.text)
        # Item links plus the picture count printed next to each link
        hrefs = html.xpath('//ul[@class="picbz"]/li/a[@target="_blank"]/@href').extract()
        nums = html.xpath('//ul[@class="picbz"]/li/text()').extract()
        total = []
        for d in nums:
            found = re.findall(r"\d+\.?\d*", d)
            if found:
                total.append(found[0])
        # An item with N pictures lives at id.html, id_2.html, ..., id_N.html
        for ind, href in enumerate(hrefs):
            item_id = re.findall(r"\d+", href)[0]
            for n in range(int(total[ind])):
                if n == 0:
                    page_list.append(href)
                else:
                    page_list.append(href.replace(item_id, f'{item_id}_{n + 1}'))
    return page_list
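To make the suffix logic concrete, here is a minimal, self-contained illustration; the href and the page count are hypothetical stand-ins for values scraped from the listing page:

import re

href = '/bz/nxxz/nxmt/123456.html'  # hypothetical item link
total = 3                           # hypothetical picture count
item_id = re.findall(r"\d+", href)[0]
pages = [href if n == 0 else href.replace(item_id, f'{item_id}_{n + 1}')
         for n in range(total)]
print(pages)
# ['/bz/nxxz/nxmt/123456.html', '/bz/nxxz/nxmt/123456_2.html', '/bz/nxxz/nxmt/123456_3.html']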
Get the full-size image URL
def get_img_orangin_url(url, index):
    r = requests.get(url=url, timeout=timeout, headers=header)
    r.encoding = r.apparent_encoding
    html = parsel.Selector(r.text)
    file_name = html.xpath('/html/body/div[3]/h1/span/text()').get()
    # The full-size image path is the first single-quoted string inside the
    # page's first inline <script>
    script = html.xpath('/html/body/script[1]').get()
    pattern = re.compile("'(.*)'")
    img_url = 'http://pic.jj20.com' + pattern.findall(script)[0]
    file_name = re.sub(r'\(.*\)', '', file_name)
    print("url", url)
    print("img_url", img_url)
    if '_' in url:
        # Follow-up page of an item: reuse the numbered suffix from the image URL
        file_name = file_name + img_url[img_url.index('-') + 1:]
    else:
        # First page of an item
        file_name = f'{file_name}-1.jpg'
    # pool.apply_async(save_img, (img_url, file_name))
    print("file_name", file_name)
    save_img(img_url, file_name)
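The regex step is easy to test in isolation. A sketch with a made-up script body (the real one comes from the detail page):

import re

script = "<script>var bigimg = '/up/a1/b2/123456-2.jpg';</script>"  # hypothetical
print(re.findall(r"'(.*)'", script)[0])  # -> /up/a1/b2/123456-2.jpg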
Save the image
def save_img(img_url, file_name):
    try:
        # r = requests.get(img_url, headers=header, timeout=timeout).content
        if not os.path.exists(DIR_PATH):
            os.makedirs(DIR_PATH)
        os.chdir(DIR_PATH)
        # with open(file_name, 'wb') as f:
        #     f.write(r)
        img_header = {
            "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": "__yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623734936",
            "Host": "pic.jj20.com",
            "Pragma": "no-cache",
            "Referer": "http://cj.jj20.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
        }
        response = requests.get(img_url, headers=img_header, timeout=timeout)
        # Decode through PIL so a truncated or non-image response raises here
        # instead of leaving a corrupt file on disk
        image = Image.open(BytesIO(response.content))
        image.save(file_name)
        print("save finished")
    except Exception as e:
        print(e)
def get_img(page_list):
    for index, path in enumerate(page_list):
        time.sleep(0.1)  # small delay between requests
        # pool.apply_async(get_img_orangin_url, (base_url + path,))
        get_img_orangin_url(base_url + path, index + 1)
if __name__ == '__main__':
    pool = Pool(6)
    page_names = get_page_num()
    # page_names = ["list_57_1.html"]
    page_list = get_page_detail(page_names)
    get_img(page_list)
    print(len(page_list))
    pool.close()
    pool.join()
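The pool is created here, but every apply_async call is commented out, so the run above is effectively serial. A minimal sketch of how the downloads could be dispatched to the workers instead, assuming the module as written (get_img_parallel is a hypothetical name):

def get_img_parallel(pool, page_list):
    # Queue every detail page on the pool, then wait for the workers to drain it
    for index, path in enumerate(page_list):
        pool.apply_async(get_img_orangin_url, (base_url + path, index + 1))
    pool.close()
    pool.join()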