class Queue(object):

    # Initialize the queue
    def __init__(self):
        self.items = []

    # Enqueue: append an item at the tail of the queue
    def enqueue(self, item):
        self.items.append(item)

    # Dequeue: remove and return the item at the head of the queue
    def dequeue(self):
        if self.is_Empty():
            print("The queue is empty!!")
        else:
            return self.items.pop(0)

    # Check whether the queue is empty
    def is_Empty(self):
        return self.items == []

    # Number of items in the queue
    def size(self):
        return len(self.items)

    # Return the head of the queue without removing it; if the queue is empty, return None
    def front(self):
        if self.is_Empty():
            print("The queue is empty!!")
        else:
            return self.items[0]
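A quick sanity check of the Queue above; a minimal sketch, with made-up URLs purely for illustration:

q = Queue()
q.enqueue('https://example.com/a')    # hypothetical URL
q.enqueue('https://example.com/b')    # hypothetical URL
print(q.size())       # 2
print(q.front())      # https://example.com/a  (head of the queue, not removed)
print(q.dequeue())    # https://example.com/a  (FIFO: first in, first out)
print(q.dequeue())    # https://example.com/b
q.dequeue()           # prints "The queue is empty!!" and returns None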

Import the libraries

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import re
import urllib.parse
import time
import random
queueInt = Queue()    # queue of internal links to visit
queueExt = Queue()    # queue of external links found
externalLinks = []
internalLinks = []
# Request headers shared by every fetch below (pretend to be a normal browser);
# defined at module level so that deepLinks() can use them as well
headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}

Get a list of all external links on the page

def getExterLinks(bs, exterurl):

    # Find all links that start with http or www and do not contain the current URL's domain
    for link in bs.find_all('a', href=re.compile(
            '^(http|www)((?!' + urlparse(exterurl).netloc + ').)*$')):
        # According to the standard, a URL may only contain a limited set of ASCII
        # characters; other characters (such as Chinese characters) are not valid.
        # The links we collect may contain Chinese characters, so they are
        # percent-encoded here.
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                queueExt.enqueue(link.attrs['href'])
                externalLinks.append(link.attrs['href'])
                print(link.attrs['href'])

    return externalLinks
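The urllib.parse.quote call above is what keeps non-ASCII links usable; a minimal sketch of its effect, using a made-up URL:

from urllib.parse import quote

raw = 'https://example.com/搜索?q=图片&page=1'    # hypothetical URL containing Chinese characters
print(quote(raw, safe='?=&:/'))
# https://example.com/%E6%90%9C%E7%B4%A2?q=%E5%9B%BE%E7%89%87&page=1
# Only the non-ASCII characters are percent-encoded; ? = & : / are kept as-is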

Get a list of all internal links on the page

def getInterLinks(bs, interurl):

    # Reduce the current URL to its site root, i.e. scheme://host
    interurl = '{}://{}'.format(urlparse(interurl).scheme,
                                urlparse(interurl).netloc)
    # Find all internal links: hrefs that start with "/" or contain this site's domain
    for link in bs.find_all('a', href=re.compile(
            '^(/|.*' + urlparse(interurl).netloc + ')')):
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                # startswith() checks whether the string begins with the given prefix
                if link.attrs['href'].startswith('//'):
                    # Protocol-relative href: prepend only the scheme
                    full = '{}:{}'.format(urlparse(interurl).scheme, link.attrs['href'])
                    if full not in internalLinks:
                        queueInt.enqueue(full)
                        internalLinks.append(full)
                elif link.attrs['href'].startswith('/'):
                    # Root-relative href: prepend the site root
                    if interurl + link.attrs['href'] not in internalLinks:
                        queueInt.enqueue(interurl + link.attrs['href'])
                        internalLinks.append(interurl + link.attrs['href'])
                else:
                    # Already an absolute URL on this domain
                    queueInt.enqueue(link.attrs['href'])
                    internalLinks.append(link.attrs['href'])

    return internalLinks
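To make the normalization above concrete, here is a minimal sketch of the cases, using made-up URLs:

from urllib.parse import urlparse

page = 'https://example.com/some/page'      # hypothetical page URL
root = '{}://{}'.format(urlparse(page).scheme, urlparse(page).netloc)
print(root)                                  # https://example.com

# Root-relative href: prepend the site root
print(root + '/about')                       # https://example.com/about
# Protocol-relative href: prepend only the scheme
print('{}:{}'.format(urlparse(page).scheme, '//static.example.com/logo.png'))
                                             # https://static.example.com/logo.png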

def deepLinks():

    # Keep visiting internal links until the queue is exhausted
    while not queueInt.is_Empty():
        i = queueInt.dequeue()
        if i is None:
            break
        print('Internal link being visited:')
        print(i)
        print('New external links found:')
        # html = urlopen(i)
        html = requests.get(i, headers=headers_)
        time.sleep(random.random() * 3)    # pause for a random interval to mimic a human visitor
        domain1 = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
        bs = BeautifulSoup(html.content, 'html.parser')
        getExterLinks(bs, domain1)
        getInterLinks(bs, domain1)
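As written, deepLinks keeps dequeuing until the internal-link queue is exhausted, which on a large site can run for a very long time. Below is a minimal sketch of one way to bound the crawl; deepLinksLimited and its max_pages parameter are hypothetical additions, not part of the original script:

def deepLinksLimited(max_pages=50):
    # Same traversal as deepLinks(), but stop after visiting max_pages internal links
    visited = 0
    while not queueInt.is_Empty() and visited < max_pages:
        i = queueInt.dequeue()
        if i is None:
            break
        html = requests.get(i, headers=headers_)
        time.sleep(random.random() * 3)    # random pause between requests
        domain = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
        bs = BeautifulSoup(html.content, 'html.parser')
        getExterLinks(bs, domain)
        getInterLinks(bs, domain)
        visited += 1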

def getAllLinks(url):

    # html = urlopen(url)
    html = requests.get(url, headers=headers_)
    time.sleep(random.random() * 3)    # pause for a random interval to mimic human browsing
    domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
    bs = BeautifulSoup(html.content, 'html.parser')
    getInterLinks(bs, domain)
    getExterLinks(bs, domain)
    deepLinks()

getAllLinks('https://image.baidu.com/')
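Running this against https://image.baidu.com/ first collects the internal and external links on the landing page, then deepLinks walks the internal-link queue, printing each internal page it visits and every new external link it discovers along the way.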