关于python:python-爬取桌面壁纸

54次阅读

共计 2666 个字符,预计需要花费 7 分钟才能阅读完成。

今天下午用 python 写了一个爬桌面壁纸的爬虫。
十分简单,毕竟大部分网站都没有反爬策略。

import requests
from lxml import etree
import re
import time
# Base address of the wallhaven toplist; makelink() appends '?page=N' to it.
# (The original line used typographic quotes, which Python cannot parse.)
url = 'https://wallhaven.cc/toplist'

# Request headers sent with every HTTP call in this script.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
}
def getmaxlistnum():
    """Return the number of toplist pages as a string of digits.

    Fetches page 2 of the toplist, reads the second text node of the
    section header's <h2>, and returns the last run of 1-4 digits in it.

    Returns:
        str: the page count, still as text (the caller concatenates it).

    Raises:
        requests.exceptions.RequestException: the request failed or timed out.
        ValueError: the header text or a number inside it was not found.
    """
    makelistnumlink = 'https://wallhaven.cc/toplist?page=2'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(makelistnumlink, headers=headers, timeout=10)
    tree = etree.HTML(response.content.decode('utf8'))

    list_num_xpath = '/html/body/main/div[1]/section/header/h2/text()[2]'
    header_texts = tree.xpath(list_num_xpath)
    if not header_texts:
        raise ValueError('page-count header not found at ' + makelistnumlink)

    numbers = re.findall('([0-9]{1,4})', header_texts[-1])
    if not numbers:
        raise ValueError('no page count in header text: %r' % header_texts[-1])
    return numbers[-1]

# Ask the site once at start-up how many toplist pages exist and report it.
# (The original print used typographic quotes, which Python cannot parse.)
list_num = getmaxlistnum()
print(' 目前一共有 ' + list_num + ' 页壁纸 ')

def writefilespng(endlink, imgname, num):
    """Download one image and save it as '<imgname>.png' in the working directory.

    The download is attempted up to three times, pausing 30 seconds between
    failed attempts. If every attempt fails the image is skipped and no file
    is created.

    Args:
        endlink: direct URL of the image file.
        imgname: output file name without the extension.
        num: not used by this function; kept so existing callers still work.
    """
    max_attempts = 3
    bgimg = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(endlink, headers=headers, timeout=5)
            # Treat HTTP error statuses as failures so an error page is never
            # written to disk as if it were an image.
            response.raise_for_status()
            bgimg = response.content
            break
        except requests.exceptions.RequestException:
            if attempt < max_attempts:
                time.sleep(30)

    if bgimg is None:
        # Previously this path left an empty file behind and raised NameError.
        print('下载失败,已跳过: ' + endlink)
        return

    with open(imgname + '.png', 'wb') as mh:
        mh.write(bgimg)
    print('已实现'+imgname+'.png')

def writefilesjpg(endlink, imgname, num):
    """Download one image and save it as '<imgname>.jpg' in the working directory.

    The download is attempted up to three times, pausing 30 seconds between
    failed attempts. If every attempt fails the image is skipped and no file
    is created.

    Args:
        endlink: direct URL of the image file.
        imgname: output file name without the extension.
        num: not used by this function; kept so existing callers still work.
    """
    max_attempts = 3
    bgimg = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(endlink, headers=headers, timeout=5)
            # Treat HTTP error statuses as failures so an error page is never
            # written to disk as if it were an image.
            response.raise_for_status()
            bgimg = response.content
            break
        except requests.exceptions.RequestException:
            if attempt < max_attempts:
                time.sleep(30)

    if bgimg is None:
        # Previously this path left an empty file behind and raised NameError.
        print('下载失败,已跳过: ' + endlink)
        return

    with open(imgname + '.jpg', 'wb') as mh:
        mh.write(bgimg)
    print('已实现'+imgname+'.jpg')

def makebgimg(url, num):
    """Download every wallpaper linked from one toplist page.

    Each thumbnail link on the page is opened, the full-size image URL is read
    from the element with id "wallpaper", and the image is handed to
    writefilespng() or writefilesjpg() according to its extension.

    Args:
        url: address of the toplist page to process.
        num: passed through unchanged to the writer functions.

    Raises:
        requests.exceptions.RequestException: a page request failed or timed out.
    """
    backgroundimgurl_xpath = '//*[@id="thumbs"]/section/ul/li/figure/a/@href'
    endlink_xpath = '//*[@id="wallpaper"]/@src'
    # Six-character id followed by a literal dot and a supported extension.
    image_name_pattern = re.compile(r'([a-z0-9]{6})\.(png|jpg)')

    listing = requests.get(url, headers=headers, timeout=10)
    listing_tree = etree.HTML(listing.content.decode('utf8'))
    backgroundimgurl = listing_tree.xpath(backgroundimgurl_xpath)

    # The original popped links off the end of the list, so keep that
    # last-to-first processing order.
    for everylink in reversed(backgroundimgurl):
        detail = requests.get(everylink, headers=headers, timeout=10)
        detail_tree = etree.HTML(detail.content.decode('utf8'))

        sources = detail_tree.xpath(endlink_xpath)
        if not sources:
            # No image element on this page: skip it instead of aborting the run.
            print('未找到图片,已跳过: ' + everylink)
            continue
        endlink = sources[-1]

        matches = image_name_pattern.findall(endlink)
        if not matches:
            print('不支持的图片链接,已跳过: ' + endlink)
            continue
        imgname, bgimginfo = matches[-1]

        if bgimginfo == 'png':
            writefilespng(endlink, imgname, num)
        else:
            writefilesjpg(endlink, imgname, num)

def makelink(wantget, base_url=None):
    """Build the toplist page URLs for a user-supplied page selection.

    Every run of digits in ``wantget`` is read as a page number, so '1-10',
    '1.2.3' and '5' are all accepted. One URL is produced for each page from
    the smallest to the largest number given, inclusive.

    Args:
        wantget: free-form text containing one or more page numbers.
        base_url: address that '?page=N' is appended to. Defaults to the
            module-level ``url``.

    Returns:
        list[str]: page URLs in ascending page order; empty when ``wantget``
        contains no page number of 1 or greater.
    """
    if base_url is None:
        base_url = url

    # '[0-9]+' keeps multi-digit pages whole; the original matched single
    # digits, turning '1-10' into pages 0..1.
    pages = [int(token) for token in re.findall(r'[0-9]+', wantget)]
    pages = [page for page in pages if page >= 1]
    if not pages:
        return []

    # min/max on integers also covers a single number, where the original
    # raised IndexError on its second pop().
    first_page, last_page = min(pages), max(pages)
    return [base_url + '?page=' + str(i) for i in range(first_page, last_page + 1)]

def mainbk():
    """Interactive entry point: ask for a page range and download those pages.

    Prints a short banner, reads the page selection from standard input,
    turns it into page URLs with makelink(), and runs makebgimg() on each.
    """
    print('*'*30)
    print('壁纸网站:https://wallhaven.cc/toplist')
    print('只是爬取 toplist')
    # Prompt wording fixed: the text said "output" where "input" was meant and
    # contained a literal " n " where a newline escape had been lost.
    print('注: 输入的是一个范围,如果想要单独的页码请只输入一个数字')
    wantget = input('请输入你想爬取的页数,如 1-10 代表爬取 1-10 页,1-1 代表爬取第一页\n不要用 123456 这类页码表示,推荐 1-2,1.2.3 之类的\n请输入:')

    urllist = makelink(wantget)
    if not urllist:
        print('没有可爬取的页码')
        return

    # Pages are processed from the last URL to the first, as the original
    # pop()-based loop did, while num counts up from 0.
    for num, page_url in enumerate(reversed(urllist)):
        makebgimg(page_url, num)


if __name__ == '__main__':
    mainbk()

大体思路就是上面的代码了:首先获取页码,然后通过 re 从输入中提取出一个页码列表,然后创建链接,接下来把链接传入下载图片的函数里进行操作,目前我测试应该没什么大问题。
不过可以将 sleep 去掉,或者把等待时间改短一些,不然有的时候会很急人的。

关于 python 爬取桌面壁纸就简单说到这里,爬虫基本上会写一个,其余的也就都会写了,大体思路都是相通的。
python 爬取桌面壁纸

正文完
 0