Web Crawler: Network Request Module (Part 2)
2. Using urllib.parse to build URLs (both calls sketched below)
(1) urllib.parse.urlencode(dict)
(2) urllib.parse.quote(string)
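A quick illustration of the difference between the two (a minimal sketch; the query values are arbitrary examples): urlencode() takes a dict and returns a complete query string, while quote() percent-encodes a single value.
import urllib.parse

# urlencode() joins key=value pairs with '&' and encodes the values
print(urllib.parse.urlencode({'wd': '奥运会', 'pn': 0}))
# -> wd=%E5%A5%A5%E8%BF%90%E4%BC%9A&pn=0

# quote() percent-encodes one string (the value only, no key)
print(urllib.parse.quote('奥运会'))
# -> %E5%A5%A5%E8%BF%90%E4%BC%9A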
Crawling Baidu Tieba
1. Requirements
(1) Input the name of the Tieba forum to crawl
(2) Handle pagination: specify the start page and the end page
(3) Save every crawled page locally as 1.html, 2.html, ...
2. Approach
(1) Input the Tieba topic to crawl
(2) Handle pagination by finding the pattern in the URL (see the sketch after this list)
(3) Write the crawled pages out to files
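The pagination rule used throughout the code below comes from observing Tieba's list-page URLs: page 1 has pn=0, page 2 has pn=50, and so on, so page i maps to pn = (i - 1) * 50. A minimal sketch of the URL construction (the topic 奥运会 is just an example):
import urllib.parse

topic = urllib.parse.urlencode({'kw': '奥运会'})
for page in range(1, 4):
    pn = (page - 1) * 50  # page 1 -> 0, page 2 -> 50, page 3 -> 100
    print('https://tieba.baidu.com/f?' + topic + '&pn=' + str(pn))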
Code
(1) Code: urllib_parse
import urllib.request
import urllib.parse
# %E5%A5%A5%E8%BF%90%E4%BC%9A
# wd=%E5%A5%A5%E8%BF%90%E4%BC%9A
url1 = 'https://www.baidu.com/s?&wd=%E5%A5%A5%E8%BF%90%E4%BC%9A'
url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=奥运会&oq=%25E5%25A5%25A5%25E8%25BF%2590%25E4%25BC%259A&rsv_pq=c4fdeace0001b86e&rsv_t=b9a5o4NRKcevKybzt%2FjEXz2BSzfzO8uczAqwZu1MQH0Z8K4%2FzLOixzWa2zU&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=1&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=5&rsv_sug4=701'
# url1 and url2 point to the same page; they are just written differently
response = urllib.request.urlopen(url1)
print(response)
# If the URL contains unencoded Chinese (like wd=奥运会 in url2), urllib cannot
# request it directly: urlopen(url2) raises UnicodeEncodeError. Two fixes follow.
# Fix 1: pass a dict to urlencode()
r = {'wd': '奥运会'}
result = urllib.parse.urlencode(r)
print(result, type(result))
# Inspect the resulting string and its type
com_url = 'https://www.baidu.com/s?&' + result
print(com_url)
# Fix 2: pass a single string to quote()
r = '奥运会'
result = urllib.parse.quote(r)
print(result, type(result))
com_url = 'https://www.baidu.com/s?&wd=' + result
print(com_url)
result = urllib.parse.quote(url2)
print(result)
# Do not quote() an entire URL: separators like ':' get escaped too,
# so the result is no longer a usable URL
# A small extra: unquote() reverses the percent-encoding
img_url = 'http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735072812%2F1627447940%5F84828260%5F23031%5FsProdImgNo%5F2%2Ejpg%2F200'
result = urllib.parse.unquote(img_url)
print(result)
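Related to the warning about quoting a whole URL: quote() takes a safe parameter listing characters it must not escape (the default is '/'), so an already-structured URL can be encoded in place by whitelisting the separators. A minimal sketch:
import urllib.parse

url = 'https://www.baidu.com/s?wd=奥运会'
# Whitelist the URL separators so only the unsafe characters are encoded
print(urllib.parse.quote(url, safe=':/?&='))
# -> https://www.baidu.com/s?wd=%E5%A5%A5%E8%BF%90%E4%BC%9A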
(2) Code: baidu_tieba_1
import urllib.request
import urllib.parse
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
# 1. Input the topic to crawl
title = input('Enter the topic to crawl: ')
# 2. Specify the start page and the end page
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
# 3. Build the URL
kw = {'kw': title}
result = urllib.parse.urlencode(kw)
# range() is half-open, [start, end), hence the end + 1
for i in range(start, end + 1):
    pn = (i - 1) * 50
    # print(pn)
    # https://tieba.baidu.com/f?kw=奥运会&pn=0
    com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
    # print(com_url)
    # Send the request and get the response
    # Create a Request object carrying the User-Agent header
    req = urllib.request.Request(com_url, headers=headers)
    # Send the request
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Write the data out
    filename = 'page_' + str(i) + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print('Crawling page %d' % i)
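The loop above assumes every request succeeds. As a sketch (not part of the original code), the fetch can be wrapped in urllib's error handling so one failed page does not abort the whole run; fetch() here is a hypothetical helper name:
import urllib.error
import urllib.request

def fetch(url, headers):
    # Return the page HTML, or None if the request fails
    try:
        req = urllib.request.Request(url, headers=headers)
        res = urllib.request.urlopen(req, timeout=10)
        return res.read().decode('utf-8')
    except urllib.error.HTTPError as e:   # server answered with an error status
        print('HTTP error %d for %s' % (e.code, url))
    except urllib.error.URLError as e:    # network-level failure
        print('Network error for %s: %s' % (url, e.reason))
    return None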
(3) Code: baidu_tieba_2, rewritten with functions
import urllib.request
import urllib.parse
# Fetch a page
def read_page(com_url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    # Send the request and get the response
    # Create a Request object carrying the User-Agent header
    req = urllib.request.Request(com_url, headers=headers)
    # Send the request
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html

# Save a page to a file
def save_page(filename, html):
    # Write the data out
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

# Main routine
def main():
    # 1. Input the Tieba topic to crawl
    title = input('Enter the Tieba topic to crawl: ')
    # 2. Specify the start page and the end page
    start = int(input('Enter the start page: '))
    end = int(input('Enter the end page: '))
    kw = {'kw': title}
    result = urllib.parse.urlencode(kw)
    for i in range(start, end + 1):
        pn = (i - 1) * 50
        # print(pn)
        # https://tieba.baidu.com/f?kw=奥运会&pn=0
        com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
        # print(com_url)
        html = read_page(com_url)
        filename = 'page_' + str(i) + '.html'
        save_page(filename, html)
        print('Crawling page %d' % i)

if __name__ == '__main__':
    main()
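One payoff of the function split is that the pieces can be reused without the interactive prompts; for example (a sketch, fetching a single hard-coded page; the encoded kw value is 奥运会):
html = read_page('https://tieba.baidu.com/f?kw=%E5%A5%A5%E8%BF%90%E4%BC%9A&pn=0')
save_page('page_1.html', html)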
(4) Code: baidu_tieba_3, rewritten in object-oriented style
import urllib.request
import urllib.parse
class BaiduSpider():
    def __init__(self):
        pass

    # Fetch a page
    def read_page(self, com_url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }
        # Send the request and get the response
        # Create a Request object carrying the User-Agent header
        req = urllib.request.Request(com_url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html

    # Save a page to a file
    def save_page(self, filename, html):
        # Write the data out
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        # 1. Input the Tieba topic to crawl
        title = input('Enter the Tieba topic to crawl: ')
        # 2. Specify the start page and the end page
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        # 3. Build the URL
        kw = {'kw': title}
        result = urllib.parse.urlencode(kw)
        for i in range(start, end + 1):
            pn = (i - 1) * 50
            # print(pn)
            # https://tieba.baidu.com/f?kw=奥运会&pn=0
            com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
            html = self.read_page(com_url)
            filename = 'page_' + str(i) + '.html'
            self.save_page(filename, html)
            print('Crawling page %d' % i)

if __name__ == '__main__':
    # Instantiate the spider and run it
    spider = BaiduSpider()
    spider.main()
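When crawling many pages in a row it is considerate (and less likely to get blocked) to pause between requests. A sketch using time.sleep, not part of the original tutorial (the topic and page range are arbitrary examples):
import time
import urllib.parse

spider = BaiduSpider()
result = urllib.parse.urlencode({'kw': '奥运会'})
for i in range(1, 4):
    com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str((i - 1) * 50)
    spider.save_page('page_' + str(i) + '.html', spider.read_page(com_url))
    time.sleep(1)  # pause one second between requests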