import requests
import time
import os
from multiprocessing import Pool
import parsel
import re
from PIL import Image
from io import BytesIO
timeout = 10

Directory where downloaded images are saved

DIR_PATH = r"d:\meizi\明星模特"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}

Get the address of every list page

def get_page_num():
    # Fetch the category index and read the list-page names out of the
    # pagination <select> drop-down.
    r = requests.get(url=base_url + '/bz/nxxz/nxmt/', timeout=10, headers=header)
    html = parsel.Selector(r.text)
    page_names = html.xpath('//span[@class="ea"]/select[@name="sldd"]/option/@value').extract()
    return page_names
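
For reference, a minimal standalone sketch of the same parsel extraction run on inline HTML (the span/select markup below is a guess at the page structure implied by the XPath, not copied from the site):

import parsel

sample = '''
<span class="ea">
  <select name="sldd">
    <option value="list_57_1.html">1</option>
    <option value="list_57_2.html">2</option>
  </select>
</span>
'''
sel = parsel.Selector(sample)
print(sel.xpath('//span[@class="ea"]/select[@name="sldd"]/option/@value').extract())
# ['list_57_1.html', 'list_57_2.html']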

Assemble the address of every item on each list page

def get_page_detail(page_names):
    page_list = []
    for name in page_names:
        url = base_url + '/bz/nxxz/nxmt/' + name
        r = requests.get(url=url, timeout=10, headers=header)
        html = parsel.Selector(r.text)
        hrefs = html.xpath('//ul[@class="picbz"]/li/a[@target="_blank"]/@href').extract()
        nums = html.xpath('//ul[@class="picbz"]/li/text()').extract()
        # Picture count of each album, parsed from the text next to the link
        total = []
        for d in nums:
            if re.findall(r"\d+\.?\d*", d):
                total.append(re.findall(r"\d+\.?\d*", d)[0])
        for ind, href in enumerate(hrefs):
            album_id = re.findall(r"\d+", href)[0]
            # Page 1 is <id>.html; later pages are <id>_2.html, <id>_3.html, ...
            for n in range(int(total[ind])):
                if n != 0:
                    page_list.append(href.replace(album_id, f'{album_id}_{n+1}'))
                else:
                    page_list.append(href)
    return page_list
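
The inner loop derives per-picture page addresses by stitching _2, _3, ... into the album id. A self-contained sketch of just that expansion, with a made-up href and count:

import re

href, total = '/bz/nxxz/nxmt/123456.html', 3   # made-up example values
album_id = re.findall(r"\d+", href)[0]         # '123456'
pages = [href if n == 0 else href.replace(album_id, f'{album_id}_{n+1}')
         for n in range(total)]
print(pages)
# ['/bz/nxxz/nxmt/123456.html', '/bz/nxxz/nxmt/123456_2.html', '/bz/nxxz/nxmt/123456_3.html']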

Get the address of the full-size image

def get_img_origin_url(url, index):
    r = requests.get(url=url, timeout=10, headers=header)
    r.encoding = r.apparent_encoding
    html = parsel.Selector(r.text)
    file_name = html.xpath('/html/body/div[3]/h1/span/text()').get()
    # The full-size image path is embedded in the first inline <script>
    img_url = html.xpath('/html/body/script[1]').get()
    pattern = re.compile("'(.*)'")
    img_url = pattern.findall(img_url)[0]
    img_url = 'http://pic.jj20.com' + img_url
    file_name = re.sub(r'\(.*\)', '', file_name)
    print("url", url)
    print("img_url", img_url)
    if '_' in url:
        file_name = file_name + img_url[img_url.index('-') + 1:]
    else:
        file_name = f'{file_name}-1.jpg'
    # pool.apply_async(save_img, (img_url, file_name))
    print("file_name", file_name)
    save_img(img_url, file_name)
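
The function assumes the full-size image path sits between the first and last single quote of the page's first inline script. A standalone sketch of that regex step on made-up markup (note the greedy .* would over-match if the script held several quoted strings):

import re

script = "<script>var img = '/up/1234/abcd-1.jpg';</script>"  # made-up markup
path = re.compile("'(.*)'").findall(script)[0]
print('http://pic.jj20.com' + path)
# http://pic.jj20.com/up/1234/abcd-1.jpg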

Save the image

def save_img(img_url, file_name):
    try:
        # r = requests.get(img_url, headers=header, timeout=timeout).content
        if not os.path.exists(DIR_PATH):
            os.makedirs(DIR_PATH)
        os.chdir(DIR_PATH)
        # with open(file_name, 'wb') as f:
        #     f.write(r)
        # Separate headers for the image host; the Host/Referer pair
        # suggests pic.jj20.com filters hotlinked requests.
        img_header = {
            "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": "__yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623734936",
            "Host": "pic.jj20.com",
            "Pragma": "no-cache",
            "Referer": "http://cj.jj20.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
        }
        response = requests.get(img_url, headers=img_header, timeout=timeout)
        # Decode and re-save via PIL; a truncated download raises here
        # instead of silently writing a broken file.
        image = Image.open(BytesIO(response.content))
        image.save(file_name)
        print("save finished")
    except Exception as e:
        print(e)

def get_img(page_list):
    for index, page in enumerate(page_list):
        time.sleep(0.1)  # small delay between page requests
        # pool.apply_async(get_img_origin_url, (base_url + page,))
        get_img_origin_url(base_url + page, index + 1)
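
The commented-out apply_async line hints that the download step was meant to run on the pool. A sketch of that variant under the made-up name get_img_parallel; it assumes it is called from the __main__ block, where base_url and the pool exist, and everything the workers touch already lives at module level, which matters on Windows where workers re-import the module:

def get_img_parallel(page_list, pool):
    # base_url + page is evaluated in the parent, so the workers
    # receive complete URLs as plain arguments.
    for index, page in enumerate(page_list):
        pool.apply_async(get_img_origin_url, (base_url + page, index + 1))
    pool.close()
    pool.join()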

if __name__ == '__main__':
    base_url = 'http://www.jj20.com'
    # The pool is only useful if the apply_async variants above are
    # re-enabled; the serial run below leaves it idle.
    pool = Pool(6)
    page_names = get_page_num()
    # page_names = ["list_57_1.html"]
    page_list = get_page_detail(page_names)
    get_img(page_list)
    print(len(page_list))
    pool.close()
    pool.join()