关于 Python:使用 Python 抓取壁纸资源(http://www.jj20.com)

42次阅读

共计 3094 个字符,预计需要花费 8 分钟才能阅读完成。

import requests
import time
import os
from multiprocessing import Pool,cpu_count,current_process,Process
from lxml import etree
import parsel
import re
from PIL import Image
from io import BytesIO
# Per-request timeout, in seconds, passed to requests.get().
timeout = 10

# Directory the downloaded images are saved into.
# NOTE(review): the published listing showed typographic quotes and stray
# spaces around the folder name (most likely added by the blog formatter);
# both are removed here so the literal parses and names a usable directory.
# Confirm the folder name against the author's intent.
DIR_PATH = r"d:\meizi\明星模特"

# Browser-like headers sent with the HTML page requests in this script.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}

# 获取每一页的地址

def get_page_num():
    """Return the listing-page names offered by the category's page selector.

    Fetches the category index page under the module-level ``base_url`` and
    collects the ``value`` attribute of every ``<option>`` inside the
    ``<select name="sldd">`` drop-down.

    Returns:
        list[str]: the option values in document order. Each value is later
        appended to the category URL by ``get_page_detail``.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
    """
    r = requests.get(url=base_url + '/bz/nxxz/nxmt/', timeout=10, headers=header)
    html = parsel.Selector(r.text)
    # The published listing fused this statement onto the previous line and
    # bound the result to the builtin name ``list``; return it directly.
    return html.xpath('//span[@class="ea"]/select[@name="sldd"]/option/@value').extract()

# 组装每一页上每个 item 的地址

def get_page_detail(list):
    """Build the detail-page path of every image in every listed album.

    Each entry of *list* is a listing-page name that is appended to the
    category URL. On that page every album contributes one link and one
    text node containing its image count. An album with N images yields N
    paths: the link itself, then the link with its numeric id replaced by
    ``<id>_2`` ... ``<id>_N``.

    Args:
        list: iterable of listing-page names. The parameter keeps its
            original name (which shadows the builtin) so existing callers
            are unaffected.

    Returns:
        list[str]: detail-page paths, relative to ``base_url``.

    Raises:
        requests.RequestException: if a listing page cannot be fetched.
    """
    page_list = []
    for page_name in list:
        url = base_url + '/bz/nxxz/nxmt/' + page_name
        r = requests.get(url=url, timeout=10, headers=header)
        html = parsel.Selector(r.text)
        hrefs = html.xpath('//ul[@class="picbz"]/li/a[@target="_blank"]/@href').extract()
        texts = html.xpath('//ul[@class="picbz"]/li/text()').extract()

        # Only the integer part is wanted: the previous pattern also matched
        # "12." / "12.5", which made int() raise ValueError.
        totals = []
        for text in texts:
            match = re.search(r"\d+", text)
            if match:
                totals.append(int(match.group()))

        if len(totals) != len(hrefs):
            # Indexing totals by link position used to raise IndexError here
            # and abort the whole crawl; pair up what is available instead.
            print("count mismatch on", url, len(hrefs), len(totals))

        for href, total in zip(hrefs, totals):
            id_match = re.search(r"\d+", href)
            if id_match is None:
                # No numeric id to derive the follow-up pages from; keep the
                # link itself so the first image is still fetched.
                page_list.append(href)
                continue
            album_id = id_match.group()
            for page_no in range(1, total + 1):
                if page_no == 1:
                    page_list.append(href)
                else:
                    page_list.append(href.replace(album_id, f'{album_id}_{page_no}'))
    return page_list

# 获取原图地址

def get_img_orangin_url(url, index):
    """Resolve the full-size image behind a detail page and download it.

    The page title (``/html/body/div[3]/h1/span``) supplies the base file
    name, and the first single-quoted string inside the first ``<script>``
    of ``<body>`` supplies the image path on ``pic.jj20.com``. The image is
    then handed to ``save_img``.

    Args:
        url: absolute URL of the detail page.
        index: position of the page in the crawl; accepted for caller
            compatibility but not used.

    Raises:
        requests.RequestException: if the detail page cannot be fetched.
        ValueError: if *url* contains ``_`` but the image path has no ``-``.
    """
    r = requests.get(url=url, timeout=10, headers=header)
    r.encoding = r.apparent_encoding
    html = parsel.Selector(r.text)
    file_name = html.xpath('/html/body/div[3]/h1/span/text()').get()
    script = html.xpath('/html/body/script[1]').get()
    if file_name is None or script is None:
        # .get() returns None when the XPath matches nothing; skip the page
        # instead of crashing the crawl with a TypeError further down.
        print("skip (unexpected page layout)", url)
        return

    # The published listing had an advertising link pasted into the middle
    # of this call, which made the line a syntax error.
    quoted = re.findall("'(.*)'", script)
    if not quoted:
        print("skip (no image path in script)", url)
        return
    img_url = 'http://pic.jj20.com' + quoted[0]

    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    file_name = re.sub(r'\(.*\)', '', file_name)
    print("url", url)
    print("img_url", img_url)
    if '_' in url:
        # Follow-up pages: reuse the "<n>.<ext>" tail of the image path.
        file_name = file_name + img_url[img_url.index('-') + 1:]
    else:
        file_name = f'{file_name}-1.jpg'
    print("file_name", file_name)
    save_img(img_url, file_name)

# 保存图片

def save_img(img_url, file_name):
    """Download *img_url* and save it as *file_name* inside ``DIR_PATH``.

    The response body is decoded with PIL and re-saved, so the output format
    follows the extension of *file_name*. Failures are reported with
    ``print`` and otherwise ignored so one bad image does not stop the crawl.

    Args:
        img_url: absolute URL of the image.
        file_name: target file name (no directory part).
    """
    img_header = {
        # The published listing showed "image/,/*": the asterisks of
        # "image/*,*/*" had been eaten by the blog's markdown rendering.
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Cache-Control": "no-cache",
        # NOTE(review): hard-coded cookie copied from a browser session;
        # it may have expired.
        "Cookie": "__yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623734936",
        "Host": "pic.jj20.com",
        "Pragma": "no-cache",
        "Referer": "http://cj.jj20.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
    }
    try:
        # Write to an explicit path rather than os.chdir(DIR_PATH): changing
        # the working directory affects the whole process and breaks on the
        # second call when DIR_PATH is relative.
        os.makedirs(DIR_PATH, exist_ok=True)
        response = requests.get(img_url, headers=img_header, timeout=timeout)
        # Report HTTP errors as such instead of as an image-decoding failure.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        image.save(os.path.join(DIR_PATH, file_name))
        print("保留结束")
    except Exception as e:
        # Deliberate best effort: log and move on to the next image.
        print(e)

def get_img(page_list):
    """Download the image behind every detail page in *page_list*.

    Pages are processed one at a time with a 0.1 s pause before each
    request.

    Args:
        page_list: iterable of detail-page paths relative to ``base_url``.
    """
    for index, path in enumerate(page_list, start=1):
        time.sleep(0.1)
        get_img_orangin_url(base_url + path, index)

# Site root. The crawl functions read this as a module-level global, so it
# is defined outside the guard to keep them usable when the file is imported.
base_url = 'http://www.jj20.com'

if __name__ == '__main__':
    # Crawl order: listing-page names -> per-image detail pages -> downloads.
    # The multiprocessing Pool the listing created here was never given any
    # work (every apply_async call was commented out), so it is not started.
    page_names = get_page_num()
    # For a quick single-page run use: page_names = ["list_57_1.html"]
    page_list = get_page_detail(page_names)
    get_img(page_list)
    print(len(page_list))

正文完
 0