class Queue(object):
    # Initialize the queue
    def __init__(self):
        self.items = []

    # Enqueue: append an item at the tail
    def enqueue(self, item):
        self.items.append(item)

    # Dequeue: remove and return the head item, or None if the queue is empty
    def dequeue(self):
        if self.is_Empty():
            print("The queue is currently empty!")
            return None
        else:
            return self.items.pop(0)

    # Check whether the queue is empty
    def is_Empty(self):
        return self.items == []

    # Queue length
    def size(self):
        return len(self.items)

    # Return the head element, or None if the queue is empty
    def front(self):
        if self.is_Empty():
            print("The queue is currently empty!")
            return None
        else:
            # The head of a FIFO queue is items[0], not the last element
            return self.items[0]
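A minimal sketch of the queue in action, just to confirm the FIFO behavior (the values here are illustrative only, not part of the crawler):

q = Queue()
q.enqueue('a')
q.enqueue('b')
print(q.front())    # 'a' -- the head element
print(q.dequeue())  # 'a'
print(q.size())     # 1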
Import the libraries
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import re
import urllib.parse
import time
import random
queueInt = Queue()   # queue of internal links to crawl
queueExt = Queue()   # queue of collected external links
externalLinks = []
internalLinks = []

# Browser-like request headers, shared by getAllLinks() and deepLinks()
headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}
Get the list of all external links on a page
def getExterLinks(bs, exterurl):
    # Find all links that start with "http" or "www" and do not contain
    # the current site's domain
    for link in bs.find_all('a', href=re.compile(
            '^(http|www)((?!' + urlparse(exterurl).netloc + ').)*$')):
        # By the standard, a URL may only contain a subset of ASCII characters;
        # anything else (e.g. Chinese characters) is non-conforming, and our
        # links may contain such characters, so percent-encode the href first.
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                queueExt.enqueue(link.attrs['href'])
                externalLinks.append(link.attrs['href'])
                print(link.attrs['href'])
    return externalLinks
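A quick look at what that urllib.parse.quote call does; the URL below is made up for illustration. Characters listed in safe= are kept as-is, while non-ASCII characters get percent-encoded:

from urllib.parse import quote

print(quote('https://example.com/搜索?q=链接', safe='?=&:/'))
# https://example.com/%E6%90%9C%E7%B4%A2?q=%E9%93%BE%E6%8E%A5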
Get the list of all internal links on a page
def getInterLinks(bs, interurl):
    interurl = '{}://{}'.format(urlparse(interurl).scheme,
                                urlparse(interurl).netloc)
    # Find all internal links that start with "/" or contain the site's domain
    for link in bs.find_all('a', href=re.compile(
            '^(/|.*' + urlparse(interurl).netloc + ')')):
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                # startswith() tests whether a string begins with the given prefix
                if link.attrs['href'].startswith('//'):
                    # Protocol-relative link: prepend the scheme only
                    full = '{}:{}'.format(urlparse(interurl).scheme,
                                          link.attrs['href'])
                    if full not in internalLinks:
                        queueInt.enqueue(full)
                        internalLinks.append(full)
                elif link.attrs['href'].startswith('/'):
                    # Site-relative link: prepend scheme://netloc
                    if interurl + link.attrs['href'] not in internalLinks:
                        queueInt.enqueue(interurl + link.attrs['href'])
                        internalLinks.append(interurl + link.attrs['href'])
                else:
                    # Already an absolute URL on this domain
                    queueInt.enqueue(link.attrs['href'])
                    internalLinks.append(link.attrs['href'])
    return internalLinks
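To see what each branch produces, here is a tiny self-contained check against a made-up page at https://example.com (note it appends to the global lists and queue, so run it in a scratch session):

snippet = '''
<a href="//example.com/a.html">a</a>
<a href="/news/b.html">b</a>
<a href="https://example.com/c.html">c</a>
'''
print(getInterLinks(BeautifulSoup(snippet, 'html.parser'), 'https://example.com'))
# ['https://example.com/a.html', 'https://example.com/news/b.html',
#  'https://example.com/c.html']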
def deepLinks():
    # Breadth-first: keep visiting internal links until the queue is drained
    while not queueInt.is_Empty():
        i = queueInt.dequeue()
        if i is None:
            break
        print('Visiting internal link:')
        print(i)
        print('New external links found:')
        html = requests.get(i, headers=headers_)
        time.sleep(random.random() * 3)  # random pause to mimic a human visitor
        domain1 = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
        bs = BeautifulSoup(html.content, 'html.parser')
        getExterLinks(bs, domain1)
        getInterLinks(bs, domain1)
def getAllLinks(url):
    html = requests.get(url, headers=headers_)
    time.sleep(random.random() * 3)  # random pause to mimic a human visitor
    domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
    bs = BeautifulSoup(html.content, 'html.parser')
    getInterLinks(bs, domain)
    getExterLinks(bs, domain)
    deepLinks()
getAllLinks('https://image.baidu.com/')