Crawler network request module (part 2)
2. Using urllib.parse: URL concatenation
(1)urllib.parse.urlencode(dict)
(2)urllib.parse.quote(string)
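The two helpers differ in what they accept: urlencode() takes a dict and produces ready-made key=value pairs, while quote() percent-encodes a single string that you splice into the URL yourself. A minimal sketch showing both on the same keyword (expected output shown in comments):

import urllib.parse

# urlencode() takes a dict and joins the encoded key=value pairs with '&'
print(urllib.parse.urlencode({'wd': '奥运会'}))
# wd=%E5%A5%A5%E8%BF%90%E4%BC%9A

# quote() percent-encodes a bare string; the key name is up to you
print(urllib.parse.quote('奥运会'))
# %E5%A5%A5%E8%BF%90%E4%BC%9A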
Scraping Baidu Tieba
1. Requirements
(1)Input the name of the tieba (forum) to scrape
(2)Handle pagination: specify a start page and an end page
(3)Save each scraped page locally, e.g. 1.html, 2.html
2. Approach
(1)Input the tieba topic to scrape
(2)Handle pagination by finding the pattern in the URL (see the sketch after this list)
(3)Write each scraped page to a file
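The URL pattern in step (2) comes from clicking through a few result pages and watching the query string: only the pn parameter changes, growing by 50 per page. A quick sketch of the mapping used throughout the code below:

# Tieba page n corresponds to pn = (n - 1) * 50:
#   page 1 -> pn = 0, page 2 -> pn = 50, page 3 -> pn = 100, ...
for page in range(1, 4):
    pn = (page - 1) * 50
    print('https://tieba.baidu.com/f?kw=%E5%A5%A5%E8%BF%90%E4%BC%9A&pn=' + str(pn))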
3. Code
(1)Code: urllib_parse
import urllib.request
import urllib.parse

# %E5%A5%A5%E8%BF%90%E4%BC%9A is the percent-encoded form of 奥运会
# wd=%E5%A5%A5%E8%BF%90%E4%BC%9A
url1 = 'https://www.baidu.com/s?&wd=%E5%A5%A5%E8%BF%90%E4%BC%9A'
url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=奥运会&oq=%25E5%25A5%25A5%25E8%25BF%2590%25E4%25BC%259A&rsv_pq=c4fdeace0001b86e&rsv_t=b9a5o4NRKcevKybzt%2FjEXz2BSzfzO8uczAqwZu1MQH0Z8K4%2FzLOixzWa2zU&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=1&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=5&rsv_sug4=701'
# url1 and url2 refer to the same page; they are just written differently.
# Note: urlopen(url2) would raise UnicodeEncodeError because url2 still
# contains the raw Chinese characters 奥运会, so request url1 instead.
response = urllib.request.urlopen(url1)
print(response)

# If a URL contains Chinese characters, urllib cannot request it directly.
# Two ways to encode it:
# Option 1: pass a dict to urlencode()
r = {'wd': '奥运会'}
result = urllib.parse.urlencode(r)
print(result, type(result))  # inspect the encoded result and its type
com_url = 'https://www.baidu.com/s?&' + result
print(com_url)

# Option 2: pass a string to quote()
r = '奥运会'
result = urllib.parse.quote(r)
print(result, type(result))
com_url = 'https://www.baidu.com/s?&wd=' + result
print(com_url)

# Do not quote() an entire URL, though: it also escapes the scheme and
# separators, producing a broken URL
result = urllib.parse.quote(url2)
print(result)

# Bonus: unquote() reverses the percent-encoding
img_url = 'http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735072812%2F1627447940%5F84828260%5F23031%5FsProdImgNo%5F2%2Ejpg%2F200'
result = urllib.parse.unquote(img_url)
print(result)
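As the quote(url2) call shows, escaping a complete URL mangles the scheme and separators. If a whole URL containing Chinese characters really must be encoded, quote() accepts a safe parameter listing characters to leave untouched (it defaults to '/'); a minimal sketch:

import urllib.parse

url = 'https://tieba.baidu.com/f?kw=奥运会&pn=0'
# Keep the scheme and query separators as-is so the URL stays valid;
# only the Chinese characters are percent-encoded
safe_url = urllib.parse.quote(url, safe=':/?&=')
print(safe_url)
# https://tieba.baidu.com/f?kw=%E5%A5%A5%E8%BF%90%E4%BC%9A&pn=0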
(2)Code: baidu_tieba_1
import urllib.request
import urllib.parse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
# 1. Read the topic to scrape
title = input('Enter the topic to scrape: ')
# 2. Read the start and end pages
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
# 3. Build the URL
kw = {'kw': title}
result = urllib.parse.urlencode(kw)
# range() is half-open on the right, so use end + 1 to include the last page
for i in range(start, end + 1):
    pn = (i - 1) * 50
    # print(pn)
    # e.g. https://tieba.baidu.com/f?kw=奥运会&pn=0
    com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
    # print(com_url)
    # Send the request, get the response
    # Create the request object and set the User-Agent
    req = urllib.request.Request(com_url, headers=headers)
    # Send the request
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Save the page to a local file
    filename = 'page_' + str(i) + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print('Scraping page %d' % i)
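The loop above assumes every request succeeds. When scraping many pages it is safer to catch urllib.error exceptions and pause between requests; a hedged sketch of that hardening (the fetch() helper is not part of the original code):

import time
import urllib.error
import urllib.request

def fetch(com_url, headers):
    # Return the page HTML, or None if the request fails
    req = urllib.request.Request(com_url, headers=headers)
    try:
        res = urllib.request.urlopen(req, timeout=10)
        return res.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        print('HTTP error %d for %s' % (e.code, com_url))
    except urllib.error.URLError as e:
        print('Network error for %s: %s' % (com_url, e.reason))
    return None

# Inside the page loop: fetch, then wait a moment before the next page
# html = fetch(com_url, headers)
# time.sleep(1)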
(3)Code: baidu_tieba_2, rewritten with functions
import urllib.request
import urllib.parse

# Fetch a page and return its HTML
def read_page(com_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    # Create the request object and set the User-Agent
    req = urllib.request.Request(com_url, headers=headers)
    # Send the request, get the response
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html

# Write the HTML to a file
def save_page(filename, html):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

# Main function
def main():
    # 1. Read the tieba topic to scrape
    title = input('Enter the tieba topic to scrape: ')
    # 2. Read the start and end pages
    start = int(input('Enter the start page: '))
    end = int(input('Enter the end page: '))
    kw = {'kw': title}
    result = urllib.parse.urlencode(kw)
    for i in range(start, end + 1):
        pn = (i - 1) * 50
        # print(pn)
        # e.g. https://tieba.baidu.com/f?kw=奥运会&pn=0
        com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
        # print(com_url)
        html = read_page(com_url)
        filename = 'page_' + str(i) + '.html'
        save_page(filename, html)
        print('Scraping page %d' % i)

if __name__ == '__main__':
    main()
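Since urlencode() accepts any number of keys, the kw and pn parameters could also be encoded in one call instead of concatenating '&pn=' by hand; a small variant (the query_string name is illustrative):

import urllib.parse

title = '奥运会'
page = 2
# urlencode() handles both the '&' joining and the percent-encoding,
# and converts the integer pn value to a string itself
query_string = urllib.parse.urlencode({'kw': title, 'pn': (page - 1) * 50})
com_url = 'https://tieba.baidu.com/f?' + query_string
print(com_url)
# https://tieba.baidu.com/f?kw=%E5%A5%A5%E8%BF%90%E4%BC%9A&pn=50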
(4)Code: baidu_tieba_3, rewritten in object-oriented style
import urllib.request
import urllib.parse

class BaiduSpider():
    def __init__(self):
        pass

    # Fetch a page and return its HTML
    def read_page(self, com_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }
        # Send the request, get the response
        # Create the request object and set the User-Agent
        req = urllib.request.Request(com_url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html

    # Write the HTML to a file
    def save_page(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        # 1. Read the tieba topic to scrape
        title = input('Enter the tieba topic to scrape: ')
        # 2. Read the start and end pages
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        # 3. Build the URL for each page
        kw = {'kw': title}
        result = urllib.parse.urlencode(kw)
        for i in range(start, end + 1):
            pn = (i - 1) * 50
            # print(pn)
            # e.g. https://tieba.baidu.com/f?kw=奥运会&pn=0
            com_url = 'https://tieba.baidu.com/f?' + result + '&pn=' + str(pn)
            html = self.read_page(com_url)
            filename = 'page_' + str(i) + '.html'
            self.save_page(filename, html)
            print('Scraping page %d' % i)

if __name__ == '__main__':
    # Instantiate the spider and run it
    spider = BaiduSpider()
    spider.main()
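A natural next step for the class version is to move the state shared by every request into __init__, so each method stays short; a sketch under that assumption (base_url and page_url are illustrative names, not part of the original):

import urllib.parse
import urllib.request

class BaiduSpider():
    def __init__(self):
        # Request-independent state lives on the instance
        self.base_url = 'https://tieba.baidu.com/f?'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }

    def page_url(self, title, page):
        # Build the URL for one page of one topic
        query = urllib.parse.urlencode({'kw': title, 'pn': (page - 1) * 50})
        return self.base_url + query

    def read_page(self, com_url):
        req = urllib.request.Request(com_url, headers=self.headers)
        res = urllib.request.urlopen(req)
        return res.read().decode('utf-8')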