关于python:Python爬取网页的所有内外链

4次阅读

共计 2875 个字符,预计需要花费 8 分钟才能阅读完成。

class Queue(object):
    """A minimal FIFO queue backed by a Python list (front is index 0)."""

    def __init__(self):
        # Underlying storage; items are appended at the back, popped from index 0.
        self.items = []

    def enqueue(self, item):
        """Append *item* to the back of the queue."""
        self.items.append(item)

    def dequeue(self):
        """Remove and return the front item; print a notice and return None if empty."""
        if self.is_Empty():
            print("以后队列为空!!")
            return None
        return self.items.pop(0)

    def is_Empty(self):
        """Return True when the queue holds no items."""
        return self.items == []

    def size(self):
        """Return the number of queued items."""
        return len(self.items)

    def front(self):
        """Return (without removing) the front item; None if the queue is empty.

        Bug fix: the original returned ``self.items[len(self.items) - 1]``,
        i.e. the BACK of the queue, contradicting its own comment.
        """
        if self.is_Empty():
            print("以后队列为空!!")
            return None
        return self.items[0]

导入库

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import re
import urllib.parse
import time
import random
queueInt = Queue()  # FIFO of internal links still waiting to be crawled
queueExt = Queue()  # FIFO of discovered external links
externalLinks = []  # every external link seen so far (also used for dedup)
internalLinks = []  # every internal link seen so far (also used for dedup)

获取页面中所有外链的列表

def getExterLinks(bs, exterurl):
    """Collect every external link on the page into the global list/queue.

    bs: BeautifulSoup document of the current page.
    exterurl: URL of the current page; its netloc defines "internal".
    Returns the (global) externalLinks list.
    """
    # Match hrefs starting with http/www that do NOT contain the current domain.
    for link in bs.find_all('a', href=re.compile(
            '^(http|www)((?!' + urlparse(exterurl).netloc + ').)*$')):
        # Standards-compliant URLs allow only a subset of ASCII; hrefs may
        # contain e.g. Chinese characters, so percent-encode them while
        # keeping the structural characters intact.
        # (Bug fix: this line was accidentally merged into a comment in the
        # original and never executed.)
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            queueExt.enqueue(href)
            externalLinks.append(href)
            print(href)
    return externalLinks

获取页面中所有内链的列表

def getInterLinks(bs, interurl):
    """Collect every internal link on the page into the global list/queue.

    bs: BeautifulSoup document of the current page.
    interurl: URL of the current page; reduced to scheme://netloc so that
        relative links can be resolved against the site root.
    Returns the (global) internalLinks list.
    """
    site_root = '{}://{}'.format(urlparse(interurl).scheme,
                                 urlparse(interurl).netloc)
    # Match hrefs that start with "/" or contain the current domain.
    for link in bs.find_all('a', href=re.compile(
            '^(/|.*' + urlparse(site_root).netloc + ')')):
        # Percent-encode non-ASCII characters (e.g. Chinese) in the href.
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        href = link.attrs['href']
        if href is None or href in internalLinks:
            continue
        if href.startswith('//'):
            # Protocol-relative link: prepend only the scheme.
            # (Bug fix: the original prepended the whole site root, producing
            # invalid URLs like "https://site//other.com/path".)
            full = '{}:{}'.format(urlparse(site_root).scheme, href)
        elif href.startswith('/'):
            # Root-relative link: resolve against the site root.
            full = site_root + href
        else:
            # Already absolute (contains the domain).
            full = href
        if full not in internalLinks:
            queueInt.enqueue(full)
            internalLinks.append(full)
    return internalLinks

def deepLinks():
    """Breadth-first crawl: fetch each queued internal link and harvest the
    internal/external links found on it (which grow the queue as we go).
    """
    # Browser-like User-Agent; many sites reject requests without one.
    # (Bug fix: the original referenced ``headers_``, a local variable of
    # getAllLinks, which raised NameError here.)
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}
    # (Bug fix: the original looped on ``while num > 1`` with ``num`` captured
    # once and never updated; drain the queue until it is empty instead.)
    while not queueInt.is_Empty():
        i = queueInt.dequeue()
        if i is None:
            break
        print('拜访的内链')
        print(i)
        print('找到的新外链')
        html = requests.get(i, headers=headers_)
        time.sleep(random.random() * 3)  # random pause to mimic a human visitor
        domain1 = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
        bs = BeautifulSoup(html.content, 'html.parser')
        getExterLinks(bs, domain1)
        getInterLinks(bs, domain1)

def getAllLinks(url):
    """Entry point: fetch *url*, harvest its internal/external links, then
    crawl the discovered internal links breadth-first via deepLinks().
    """
    # Browser-like User-Agent; many sites reject requests without one.
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}
    # (Bug fix: the original fetched the page twice -- first with urlopen(url),
    # whose result was immediately overwritten -- and declared ``global num``
    # without ever using it; both removed.)
    html = requests.get(url, headers=headers_)
    time.sleep(random.random() * 3)  # random pause to mimic a human visitor
    domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
    bs = BeautifulSoup(html.content, 'html.parser')
    getInterLinks(bs, domain)
    getExterLinks(bs, domain)
    deepLinks()

getAllLinks(‘https://image.baidu.com/’)

正文完
 0