Libraries used in this crawl:

requests + json -- fetching the page data
openpyxl -- saving the data to Excel
pandas -- tabular data processing
pyecharts -- data visualization
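Assuming all of the functions below live in a single script, a minimal import block looks roughly like this (fake_useragent is only needed if you re-enable the randomized User-Agent header shown later):

# Imports assumed by the snippets below
import json
import os
import random
import time

import requests
import pandas as pd
from fake_useragent import UserAgent
from openpyxl import Workbook, load_workbook
from pyecharts import options as opts
from pyecharts.charts import Bar, Map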

1. Analyzing the page

Open the Qunar ticket site: https://piao.qunar.com/
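Searching for a city there fires a background request to a list.json interface, and that endpoint is what the crawler below calls directly. A quick probe like the following sketch (the keyword and User-Agent are placeholders, and the live response may require extra headers or change over time) shows the fields that the parsing code in section 3 relies on:

# Probe the ticket-search API the page calls in the background.
# The JSON layout (data -> sightList -> sightName, qunarPrice, ...)
# matches what get_scenic_info() below expects.
import requests

url = ('https://piao.qunar.com/ticket/list.json'
       '?keyword=北京&region=&from=mpl_search_suggest&page=1')
headers = {'user-agent': 'Mozilla/5.0'}  # placeholder UA string
resp = requests.get(url, headers=headers, timeout=10)
sights = resp.json()['data']['sightList']
print(sights[0]['sightName'], sights[0].get('qunarPrice'))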

2. Fetching each region's data: the request URL and headers

def get_city_scenic(city, page):
    ua = UserAgent(verify_ssl=False)
    # headers = {'User-Agent': ua.random}
    # A fixed User-Agent is used here instead of the random one above
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
    # url = f'https://piao.qunar.com/ticket/list.json?keyword={city}&region=&from=mpl_search_suggest&sort=pp&page={page}'
    url = f'https://piao.qunar.com/ticket/list.json?keyword={city}&region=&from=mpl_search_suggest&page={page}'
    result = requests.get(url, headers=headers, timeout=10)
    result.raise_for_status()
    return get_scenic_info(city, result.text)

3. Extracting the data from each page

def get_scenic_info(city, response):
    response_info = json.loads(response)
    sight_list = response_info['data']['sightList']
    one_city_scenic = []
    for sight in sight_list:
        scenic = []
        name = sight['sightName']                 # attraction name
        star = sight.get('star', None)            # star rating
        score = sight.get('score', 0)             # user score
        price = sight.get('qunarPrice', 0)        # ticket price
        sale = sight.get('saleCount', 0)          # ticket sales
        districts = sight.get('districts', None)  # province / city / district
        point = sight.get('point', None)          # coordinates
        intro = sight.get('intro', None)          # short introduction
        free = sight.get('free', True)            # whether admission is free
        address = sight.get('address', None)      # full address
        scenic.append(city)
        scenic.append(name)
        scenic.append(star)
        scenic.append(score)
        scenic.append(price)
        scenic.append(sale)
        scenic.append(districts)
        scenic.append(point)
        scenic.append(intro)
        scenic.append(free)
        scenic.append(address)
        one_city_scenic.append(scenic)
    return one_city_scenic

4. Looping over every page of every region

def get_city_info(city, pages):
    # for city in cities:
    one_city_info = []
    for page in range(1, pages + 1):
        try:
            print(f'Scraping {city}: attraction data, page {page}...')
            time.sleep(random.uniform(0.8, 1.5))  # throttle requests a little
            one_page_info = get_city_scenic(city, page)
        except Exception:
            continue
        if one_page_info:
            one_city_info += one_page_info
    # print(one_city_info)
    return one_city_info

5. Saving the results to Excel

def insert2excel(filepath, allinfo):
    try:
        if not os.path.exists(filepath):
            # create the workbook with a header row on first use
            tableTitle = ['城市', '名称', '星级', '评分', '价格', '销量', '省/市/区', '坐标', '简介', '是否收费', '具体地址']
            wb = Workbook()
            ws = wb.active
            ws.title = 'sheet1'
            ws.append(tableTitle)
            wb.save(filepath)
            time.sleep(3)
        wb = load_workbook(filepath)
        ws = wb.active
        ws.title = 'sheet1'
        for info in allinfo:
            ws.append(info)
        wb.save(filepath)
        return True
    except Exception:
        return False
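To tie the pieces together, a small driver along these lines can loop over a list of regions and write each one to its own workbook. The city list, page count, and output path are placeholders to adjust for your own run:

# Hypothetical driver: cities, pages and the output folder are placeholders
if __name__ == '__main__':
    cities = ['北京', '上海', '广州']          # example regions to crawl
    for city in cities:
        city_info = get_city_info(city, pages=10)
        if insert2excel(f'D:/work/loginn/task/{city}.xlsx', city_info):
            print(f'{city}: saved {len(city_info)} attractions')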

A look at the crawl in progress:


6. Data visualization

With the Excel files produced by the crawl in hand, we use pandas to walk the output folder and read them all into one table:

def get_datas():
    """Walk the task folder and read every Excel file into one DataFrame."""
    df_allinfo = pd.DataFrame()
    for root, dirs, files in os.walk('D:/work/loginn/task'):
        print(files)
        for filename in files:
            try:
                df = pd.read_excel(f'D:/work/loginn/task/{filename}')
                # DataFrame.append was removed in pandas 2.0, so concat is used here
                df_allinfo = pd.concat([df_allinfo, df], ignore_index=True)
            except Exception:
                continue
    # drop duplicate attractions by name
    df_allinfo.drop_duplicates(subset=['名称'], keep='first', inplace=True)
    return df_allinfo

6.1 Top attractions by ticket sales

Taking the top 20 attractions by ticket sales as an example:

def get_sales_bar(data):
    sort_info = data.sort_values(by='销量', ascending=True)
    c = (
        Bar()
        .add_xaxis(list(sort_info['名称'])[-20:])
        .add_yaxis('热门景点销量', sort_info['销量'].values.tolist()[-20:])
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title='热门景点销量数据'),
            yaxis_opts=opts.AxisOpts(name='景点名称'),
            xaxis_opts=opts.AxisOpts(name='销量'),
        )
        .set_series_opts(label_opts=opts.LabelOpts(position="right"))
        .render('1-热门景点数据.html')
    )

Result:

6.2 Map of holiday travel distribution

def get_sales_geo(data):
    df = data[['城市', '销量']]
    # print(df)
    # count attractions per city as a rough proxy for travel volume
    df_counts = df.groupby('城市').count()['销量']
    print(df_counts)
    c = (
        Map()
        .add('假期出行分布', [list(z) for z in zip(df_counts.index.values.tolist(), df_counts.values.tolist())], 'china')
        .set_global_opts(
            title_opts=opts.TitleOpts(title='假期出行数据地图分布'),
            visualmap_opts=opts.VisualMapOpts(max_=100, is_piecewise=True),
        )
        .render('2-假期出行数据地图分布.html')
    )

Result:

6.3 Number of 4A/5A attractions by province/city

def get_level_counts(data):
    df = data[data['星级'].isin(['4A', '5A'])]
    df_counts = df.groupby('城市').count()['星级']
    c = (
        Bar()
        .add_xaxis(df_counts.index.values.tolist())
        .add_yaxis('4A-5A景区数量', df_counts.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title='各省市4A-5A景区数量'),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_='inside')],
        )
        .render('3-各省市4A-5A景区数量.html')
    )

Result:

6.4 Map of 4A/5A attraction distribution

def get_level_geo(data):
    df = data[data['星级'].isin(['4A', '5A'])]
    df_counts = df.groupby('城市').count()['星级']
    c = (
        Map()
        .add('4A-5A景区分布', [list(z) for z in zip(df_counts.index.values.tolist(), df_counts.values.tolist())], 'china')
        .set_global_opts(
            title_opts=opts.TitleOpts(title='地图数据分布'),
            visualmap_opts=opts.VisualMapOpts(max_=50, is_piecewise=True),
        )
        .render('4-4A-5A景区数据地图分布.html')
    )

Result:
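Finally, a short driver (a sketch, assuming the functions above sit in the same script) reads the merged data once and renders all four charts:

# Render all four charts from the merged crawl data
if __name__ == '__main__':
    df = get_datas()
    get_sales_bar(df)
    get_sales_geo(df)
    get_level_counts(df)
    get_level_geo(df)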