共计 7456 个字符,预计需要花费 19 分钟才能阅读完成。
python 实现:爬取【应用】需求数据
import requests
import json
import pandas as pd
url = “https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&reqPageNum=1&uri=34789c86f4654624ba9e63cf1353c860&maxResults=25&locale=zh_CN”
def getUrlText(url):
headers = {“User-Agent”: “Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36”}
”’
爬取网页的通用代码框架
”’
try:
r = requests.get(url,headers=headers,timeout=30)
r.raise_for_status() # 如果 status_code 不等于 200, 就抛出异常
r.encoding = r.apparent_encoding
return r.text
except:
return “ 产生异常 ”
first_url_text = getUrlText(url)
first_data = json.loads(first_url_text)
first_levels = first_data[‘layoutData’][1][‘dataList’]
col_num = 0 # 用来记录存取数据的行数
result = pd.DataFrame(columns=[‘ 一级标签 ’,’ 二级标签 ’,’app 名称 ’]) # 用来存放结果
for i in range(len(first_levels)):
first_level = first_levels[i][‘name’] # 一级标签
# 游戏单独处理
if first_level != “ 游戏 ”:
first_detailId = first_levels[i][‘detailId’]
second_url = r”https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=”+first_detailId+”&maxResults=25&reqPageNum=1&locale=zh_CN”
second_url_text = getUrlText(second_url)
second_data = json.loads(second_url_text)
second_levels = second_data[‘layoutData’]
for j in range(len(second_levels)):
second_level = second_levels[j][‘name’] # 二级标签
second_detailId = second_levels[j][‘detailId’]
third_url = “https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=”+second_detailId+”&maxResults=25&reqPageNum=1&locale=zh_CN”
third_url_text = getUrlText(third_url)
third_data = json.loads(third_url_text)
third_levels = third_data[‘layoutData’][0][‘dataList’]
for k in range(len(third_levels)):
app_name = third_levels[k][‘name’] # 对应的 app
result = result.append(pd.DataFrame({‘ 一级标签 ’:[first_level],’ 二级标签 ’:[second_level],’app 名称 ’:[app_name]}))
col_num +=1
if col_num%100==1:
print(col_num)
result.to_excel(‘result2.xlsx’,sheet_name=’ 应用 ’,encoding=’utf-8′,index=False)
python 实现:爬取【游戏】需求数据
【游戏】部分的 app 信息获取比【应用】部分多一个层级,过程类似,直接上代码感受下:
import panda as pd
import requests
import json
# from bs4 import BeautifulSoup
url = “https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&reqPageNum=1&uri=34789c86f4654624ba9e63cf1353c860&maxResults=25&locale=zh_CN”
def getUrlText(url):
headers = {“User-Agent”: “Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36”}
”’
爬取网页的通用代码框架
”’
try:
r = requests.get(url,headers=headers,timeout=30)
r.raise_for_status() # 如果 status_code 不等于 200, 就抛出异常
r.encoding = r.apparent_encoding
return r.text
except:
return “ 产生异常 ”
first_url_text = getUrlText(url)
first_data = json.loads(first_url_text)
first_levels = first_data[‘layoutData’][1][‘dataList’]
col_num = 0 # 用来记录存取数据的行数
result = pd.DataFrame(columns=[‘ 一级标签 ’,’ 二级标签 ’,’ 三级标签 ’,’app 名称 ’]) # 用来存放结果
for i in range(len(first_levels)):
first_level = first_levels[i][‘name’] # 一级标签
# 游戏单独处理
亨达代理申请 http://www.kaifx.cn/broker/ha…
if first_level == “ 游戏 ”:
second_detailId = first_levels[i][‘detailId’]
second_url = r”https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=”+second_detailId+”&maxResults=25&reqPageNum=1&locale=zh_CN”
second_url_text = getUrlText(second_url)
second_data = json.loads(second_url_text)
second_levels = second_data[‘layoutData’][0][‘dataList’]
for j in range(len(second_levels)):
third_detailId = second_levels[j][‘detailId’]
second_level = second_levels[j][‘name’] # 二级标签
third_url = r”https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=”+third_detailId+”&maxResults=25&reqPageNum=1&locale=zh_CN”
third_url_text = getUrlText(third_url)
third_data = json.loads(third_url_text)
third_levels = third_data[‘layoutData’]
for k in range(len(third_levels)):
third_level = third_levels[k][‘dataList’][0][‘name’] # 三级标签
four_detailId = third_levels[k][‘dataList’][0][‘detailId’]
four_url = r”https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=” +four_detailId+”&maxResults=25&reqPageNum=1&locale=zh_CN”
four_url_text = getUrlText(four_url)
four_data = json.loads(four_url_text)
four_levels = four_data[‘layoutData’][0][‘dataList’]
for h in range(len(four_levels)):
app_name = four_levels[h][‘name’] # 对应的 app
# print([first_level,second_level,third_level,app_name])
result = result.append(pd.DataFrame({‘ 一级标签 ’:[first_level],’ 二级标签 ’:[second_level],
‘ 三级标签 ’:[third_level],’app 名称 ’:[app_name]}))
col_num +=1
if col_num%100==1:
print(col_num)
result.to_excel(‘result.xlsx’,sheet_name=’ 游戏 ’,encoding=’utf-8′,index=False)
【应用】信息翻页问题处理实现
对第三部分的实现进行了打包处理和部分优化。具体程序如下:
# -*- coding: utf-8 -*-
“””
Created on Sun Jun 21 09:36:17 2020
@author: Administrator
“””
import time
from xlrd import open_workbook
from xlutils.copy import copy
import requests
import json
import pandas as pd
写入 excel,xlutils 可以写入到已存在的 excel 中,xlwt 只能每次都重写
def write_xls(filename,row,first_level, second_level, app_name):
rb=open_workbook(filename)
wb=copy(rb)
ws=wb.get_sheet(0)
ws.write(row,0,first_level)
ws.write(row,1,second_level)
ws.write(row,2,app_name)
wb.save(filename)
def getUrlText(url):
headers = {“User-Agent”: “Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36”}
”’
爬取网页的通用代码框架
”’
try:
r = requests.get(url,headers=headers,timeout=30)
r.raise_for_status() # 如果 status_code 不等于 200, 就抛出异常
r.encoding = r.apparent_encoding
return r.text
except:
return “ 产生异常 ”
def getAppLabels():
time_start=time.time() # 500 个结果开始计时
flag_second = True
flag_third = True
url = “https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&reqPageNum=1&uri=34789c86f4654624ba9e63cf1353c860&maxResults=25&locale=zh_CN”
com_left_url = r”https://appgallery.cloud.huawei.com/uowap/index?method=internal.getTabDetail&serviceType=13&uri=”
com_center_url = r”&maxResults=25&reqPageNum=”
com_right_url = r”&locale=zh_CN”
first_url_text = getUrlText(url)
first_data = json.loads(first_url_text)
first_levels = first_data[‘layoutData’][1][‘dataList’]
# row = 0 # 用来记录存取数据的行数
global row
# result = pd.DataFrame(columns=[‘ 一级标签 ’,’ 二级标签 ’,’app 名称 ’]) # 用来存放结果
result = []
for i in range(len(first_levels)):
first_level = first_levels[i][‘name’] # 一级标签
# 游戏单独处理
if first_level != “ 游戏 ”:
first_detailId = first_levels[i][‘detailId’]
# 获取二级标签对应的多页数据
second_page_num = 1 # 页面初值设为 1
while flag_second:
second_url = com_left_url + first_detailId + com_center_url+str(second_page_num)+com_right_url
second_url_text = getUrlText(second_url)
second_data = json.loads(second_url_text)
second_levels = second_data[‘layoutData’]
if second_levels == []:# 无数据
flag_second = False # 结束循环
else:
second_page_num +=1 # 二级页面自加 1
for j in range(len(second_levels)):
second_level = second_levels[j][‘name’] # 二级标签
second_detailId = second_levels[j][‘detailId’]
# 获取三级对应的多页 app 信息
third_page_num = 1 # 页面初值设为 1
while flag_third:
third_url = com_left_url + second_detailId + com_center_url + str(third_page_num) + com_right_url
third_url_text = getUrlText(third_url)
third_data = json.loads(third_url_text)
third_levels = third_data[‘layoutData’]
if third_levels ==[] :# 无数据
flag_third=False
else:
third_page_num +=1 # 三级页面自加 1
third_levels = third_levels[0][‘dataList’]
for k in range(len(third_levels)):
app_name = third_levels[k][‘name’] # 对应的 app
# write_xls(‘test.xlsx’,row,first_level, second_level, app_name)
result.append([first_level, second_level, app_name])
# result = result.append(pd.DataFrame({‘ 一级标签 ’:[first_level],’ 二级标签 ’:[second_level],’app 名称 ’:[app_name]}))
row +=1
if row%500==1:
time_end=time.time() # 500 个结果结束计时
print(‘time cost:%.3f’%(time_end-time_start),’s’)
time_start = time.time()
print(row)
print(first_level,”:“,second_level)
flag_third = True # 恢复初值
flag_second = True # 恢复初值
return result
# 主程序
if __name__ ==”__main__”:
row = 0 # 用来记录存取数据的行数
result = getAppLabels()
result = pd.DataFrame(result,columns=[‘ 一级标签 ’,’ 二级标签 ’,’app 名称 ’])
result.to_excel(‘result2.xlsx’,sheet_name=’ 应用 ’,encoding=’utf-8′,index=False)