共计 7424 个字符,预计需要花费 19 分钟才能阅读完成。
要说往年最火的综艺节目,兴妖作怪……,哦,不对;小明历险记……,也不对,哎!算了,接着看吧……
一、数据
利用爬虫抓取了百度百科和维基百科上姐姐们的公开数据信息。两个网站均为静态页面,只需要对相应的 HTML 标签进行解析即可获取相应的数据。(更简单的办法是直接把相应的表格信息复制到本地)
百度百科
PS: 仔细一看,百度百科上有些信息居然是错的,比如陈松伶的初舞台评分。
维基百科
维基百科的数据可以和百度百科互补一下,在评分细节上要详细许多。
视频弹幕
看着视频,按 F12 打开开发者工具,然后在 Network 下就会有弹幕 json 数据包不断地向你扔过来了,在 Headers 栏可以查看 json 的路径。https://bullet-ali.hitv.com/bullet/2020/06/30/224134/8766480/0.json
可以看出路径中有日期,即爬取当天的日期;其余两个数字应该是节目 id 和视频 id;x.json 应该是按照节目的时间来定的,每 60s 一个数据包。
还是采用之前【链接】使用的 scrapy 框架进行弹幕数据爬取,新建项目与爬虫文件。
scrapy startproject sister
cd sister
scrapy genspider danmu bullet-ali.hitv.com
构建数据 Item
import scrapy
class DanmuItem(scrapy.Item):
    """Scrapy item holding the fields kept for one bullet comment (danmu)."""

    # Video id
    vid_id = scrapy.Field()
    # Comment id
    danmu_id = scrapy.Field()
    # User name
    uname = scrapy.Field()
    # Comment text
    content = scrapy.Field()
    # Time value reported for the comment
    danmu_time = scrapy.Field()
    # Like (up-vote) count
    up_count = scrapy.Field()
    # Minute index of the packet the comment came from.
    # The spelling "minites" is kept on purpose: the spider writes this key.
    danmu_minites = scrapy.Field()
编写爬虫解析代码
# -*- coding: utf-8 -*-
import scrapy
import json
from datetime import datetime
from sister.items import DanmuItem
class DanmuSpider(scrapy.Spider):
    """Download the bullet-comment JSON packets of one video and emit DanmuItems.

    The packet URL has the shape
    ``/bullet/<date>/<num1>/<num2>/<page>.json``; ``page`` runs from 0 to
    ``page_count - 1``.
    """

    name = 'danmu'
    # allowed_domains = ['bullet-ws.hitv.com']
    start_urls = ['https://bullet-ali.hitv.com']

    # Today's date as "YYYY/MM/D": month zero-padded, day NOT zero-padded
    # (same format the original expression produced). A single now() call is
    # used so year/month and day can never come from different days.
    _today = datetime.now()
    date_str = _today.strftime('%Y/%m/') + str(_today.day)

    num1 = 112744  # programme parameter (latest episode as of 7/3)
    num2 = 8980904  # video parameter
    page_count = 38  # number of <page>.json packets to request

    def start_requests(self):
        """Yield one request per JSON packet, carrying the page index in meta."""
        base_url = 'https://bullet-ali.hitv.com/bullet/%s/%d/%d/%d.json'
        for page in range(self.page_count):
            # Progress output
            print('正在获取第 {} 页的信息'.format(page))
            url = base_url % (self.date_str, self.num1, self.num2, page)
            yield scrapy.Request(url=url, meta={'step': page}, callback=self.parse)

    def parse(self, response):
        """Turn every entry of ``data.items`` in one packet into a DanmuItem."""
        step = response.meta['step']
        json_data = json.loads(response.body)
        # A packet without comments may lack "data"/"items" or carry null;
        # treat all of those as "no comments" instead of raising.
        all_data = (json_data.get('data') or {}).get('items') or []
        print(len(all_data))
        for data in all_data:
            danmu_item = DanmuItem()
            danmu_item['vid_id'] = self.num2
            danmu_item['danmu_id'] = data.get('id')
            danmu_item['uname'] = data.get('uname')
            danmu_item['content'] = data.get('content')
            danmu_item['danmu_time'] = data.get('time')
            danmu_item['up_count'] = data.get('v2_up_count')
            # 1-based packet index (presumably the minute of the video,
            # assuming one packet per 60 s -- confirm against the API)
            danmu_item['danmu_minites'] = step + 1
            yield danmu_item
保存数据 pipeline
import pandas as pd
class SisterPipeline(object):
    """Item pipeline that buffers every item and dumps them to one CSV file."""

    # Destination of the tab-separated dump; override on the instance or a
    # subclass to write somewhere else.
    output_path = 'danmu_info.csv'

    def __init__(self):
        # Items collected during the crawl, in arrival order
        self.info_list = []

    def process_item(self, item, spider):
        """Remember ``item`` for the final dump and pass it on unchanged."""
        self.info_list.append(item)
        return item

    def close_spider(self, spider):
        """Write all buffered items to ``output_path`` as tab-separated UTF-8."""
        df = pd.DataFrame(self.info_list)
        df.to_csv(self.output_path, sep='\t', encoding='utf-8', index=False)
保存的数据就是这样婶儿的
二、可视化
整体
首先,看看姐姐们都来自哪里,可以看出节目组在姐姐们的选择上兼顾到了两岸三地,
而且在民族构成上,也包含了 7 位少数民族选手,比如“土家族之花”– 沈梦辰。
年龄分布情况
职业分布情况
年龄与初评得分的关系很明显了。
弹幕热度排行
弹幕词云,看来大家都 喜爱 姐姐
基于这些可视化结果,制作了一个简易的统计大屏进行展示:
每位浪姐
代码
import os
import jieba
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from pyecharts.charts import Page, Sankey, WordCloud, Radar
from pyecharts.components import Image
from pyecharts.options import ComponentTitleOpts
from collections import Counter
from pyecharts.globals import SymbolType
from pyecharts import options as opts
from pyecharts.options.global_options import ThemeType
from pyecharts import options as opts
from collections import Counter
import random
# pandas display settings, applied in order:
#   - show every column (no limit)
#   - show every row (no limit)
#   - show up to 100 characters per value (the default is 50)
for _option_key, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(_option_key, _option_value)
# Do not leave the loop helpers behind in the module namespace
del _option_key, _option_value
def get_cut_words(content_series, extra_words=None):
    """Segment comment text with jieba and drop stop words and 1-char tokens.

    Args:
        content_series: pandas Series of comment strings; all entries are
            joined with '。' and segmented in one pass.
        extra_words: optional iterable of additional words to register with
            jieba so they are kept as single tokens. When omitted, the
            module-level ``my_words_list`` is used if it exists (this is what
            the calling script relies on); otherwise no extra words are added.

    Returns:
        list[str]: tokens that are not stop words and have length >= 2, in
        the order jieba produced them.

    Raises:
        OSError: if ``data/stopwords.txt`` cannot be opened.
    """
    import jieba

    # Load the stop-word file (one word per line) into a set for O(1) lookups
    with open("data/stopwords.txt", 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    # Custom stop words
    stop_words.update(['第一期', '一堆', '三个', '真的', '哈哈哈', '哈哈哈哈', '啊啊啊'])

    # Register custom keywords so jieba does not split them
    my_words = ['杜华', '辣鸡', '导演组', '节目组', '不偏心', '黄圣依', '无杜华版']
    if extra_words is None:
        # Backward compatibility: the script publishes the nickname list as a
        # module-level global instead of passing it in.
        extra_words = globals().get('my_words_list', [])
    for word in my_words:
        jieba.add_word(word)
    for word in extra_words:
        jieba.add_word(word)

    # Segment the whole corpus at once (precise mode)
    word_num = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

    # Keep only meaningful tokens
    return [w for w in word_num if w not in stop_words and len(w) >= 2]
def show_all():
    """Render the current image, word cloud and radar chart into one HTML page.

    Reads the module-level names ``image1``, ``wc``, ``radar`` and ``name``
    (set by the calling script before each call), renders them to
    ``data/sister/<name>.html`` and then rewrites that file in place to
    position the charts, set a dark background and prepend a title block.

    Raises:
        IndexError: if the rendered page contains fewer than four ``div``s.
    """
    page = Page()
    page.add(
        image1,
        wc,
        radar
    )
    out_html = 'data/sister/%s.html' % name
    page.render(out_html)

    # Restyle the rendered file in place
    html_path = os.path.join(os.path.abspath("."), out_html)
    with open(html_path, 'r+', encoding="utf8") as html:
        html_bf = BeautifulSoup(html, "lxml")
        divs = html_bf.find_all("div")
        print(len(divs))
        divs[0]["style"] = "align=\"center\";margin:0 auto;text-align:center;"
        # Chart size, position and border
        divs[1]["style"] = "width:550px;height:350px;position:absolute;top:120px;left:700px;border-style:solid;border-color:#444444;border-width:0px;"
        divs[2]["style"] = "width:700px;height:700px;position:absolute;top:120px;left:20px;border-style:solid;border-color:#444444;border-width:0px;"
        divs[3]["style"] = "width:600px;height:400px;position:absolute;top:300px;left:1250px;border-style:solid;border-color:#444444;border-width:0px;"
        # Page background colour
        body = html_bf.find("body")
        body["style"] = "background-color:#333333;"
        # Prepend the title block
        div_title = "<div align=\"center\"style=\"width:1840px;\">\n<span style=\"font-size:32px;font face=\'黑体 \';color:#FFFFFF\"><b> </b></div>"
        body.insert(0, BeautifulSoup(div_title, "lxml").div)
        html_new = str(html_bf)
        # Overwrite the file contents; the with-block closes the handle,
        # so no explicit close() is needed.
        html.seek(0, 0)
        html.truncate()
        html.write(html_new)
# Per-contestant data (scores etc.)
df = pd.read_csv('data/sister_data.csv', encoding='utf-8', sep='\t')

# Bullet-comment data: merge the eight danmu_info_<n>.csv dumps.
# DataFrame.append was removed in pandas 2.0, so build the frame with concat.
df_all = pd.concat(
    [
        pd.read_csv('sister/sister/danmu_info_%d.csv' % i, encoding='utf-8', sep='\t')
        for i in range(1, 9)
    ],
    ignore_index=True,
)

# print(df['names'].tolist())
df.sort_values('总分', ascending=False, inplace=True)
# Nicknames, listed in row order AFTER sorting by total score; alternatives
# for one person are separated by '|'.
df['昵称'] = ['蓝盈莹 | 盈莹', '黄龄', '丹妮', '孟佳', '梦辰',
            '可唯', '平静 | 静静子 | 静姐', '霏霏', '希怡', '袁咏琳',
            '圣依 | 依依子', '金晨', '阿朵', '含韵', '白冰',
            '钟丽缇', '茜 | 茜茜子', '张萌 | 萌萌子', '婧汐', '丁当',
            '许飞', '刘芸 | 芸芸子', '吴昕 | 昕昕子 | 昕姐 | 昕昕', '伊能静', '松伶',
            '丽坤', '张雨绮 | 雨绮 | 绮绮子', '海陆', '金莎', '王智']
print(df.head(5))
print(df.columns)

# Every nickname variant as a flat list. get_cut_words() picks this up via
# the module-level name and registers each entry with jieba. It does not
# depend on the loop variable, so it is built once; entries are stripped so
# the spaces around '|' do not end up inside the words.
my_words_list = [
    w.strip()
    for w in df['昵称'].str.cat(sep='。').replace('|', '。').split('。')
]

# Number of most frequent words shown in each word cloud
choices_number = 200

# NOTE: image1, radar, wc and name are deliberately module-level names;
# show_all() reads them.
for name in df.names.tolist():
    # Portrait image
    image1 = Image()
    # HTML path quirk: presumably resolved relative to the rendered page
    img_src = "../img/%s.jpg" % name
    image1.add(
        src=img_src,
        style_opts={"width": "345px", "height": "584px", "style": "margin-top: 15px"},
    )
    image1.set_global_opts(
        title_opts=ComponentTitleOpts(
            title_style={"style": "color: white; font-size: 18px; font-weight:bold;"},
            subtitle_style={"style": "color: white;font-size: 12px;"},
        )
    )

    # Row(s) of the current contestant
    tmp = df[df.names == name]

    # Radar chart of the four score dimensions (each axis runs 0..25)
    value = tmp[["集体特质", "声乐表现力", "成团后劲", "舞台表现力"]].values[0]
    data = [{"value": [float(i) for i in value], "name": "分数"}]
    c_schema = [
        {"name": "集体特质", "max": 25, "min": 0},
        {"name": "声乐表现力", "max": 25, "min": 0},
        {"name": "成团后劲", "max": 25, "min": 0},
        {"name": "舞台表现力", "max": 25, "min": 0},
    ]
    radar = (
        Radar()
        .set_colors(["#4587E7"])
        .add_schema(
            schema=c_schema,
            shape="circle",
            center=["50%", "50%"],
            radius="80%",
            angleaxis_opts=opts.AngleAxisOpts(
                min_=0,
                max_=360,
                is_clockwise=False,
                interval=5,
                axistick_opts=opts.AxisTickOpts(is_show=False),
                axislabel_opts=opts.LabelOpts(is_show=False),
                axisline_opts=opts.AxisLineOpts(is_show=False),
                splitline_opts=opts.SplitLineOpts(is_show=False),
            ),
            radiusaxis_opts=opts.RadiusAxisOpts(
                min_=0,
                max_=25,
                interval=5,
                splitarea_opts=opts.SplitAreaOpts(
                    is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
                ),
            ),
            polar_opts=opts.PolarOpts(),
            splitarea_opt=opts.SplitAreaOpts(is_show=False),
            splitline_opt=opts.SplitLineOpts(is_show=False),
        )
        .add(
            series_name="分数",
            data=data,
            color="#f9713c",
            areastyle_opts=opts.AreaStyleOpts(opacity=0.1),
            linestyle_opts=opts.LineStyleOpts(width=1),
        )
    )

    # Word cloud of the comments that mention this contestant
    nickname = tmp['昵称'].values[0]
    print(nickname)
    # The nickname doubles as a regex alternation ("A|B|C"). Strip each
    # alternative so the spaces around '|' are not part of the pattern.
    # Nicknames are plain text here; escape them if that ever changes.
    pattern = '|'.join(part.strip() for part in nickname.split('|'))
    # na=False: rows with missing comment text simply do not match
    mentions = df_all.content[df_all.content.str.contains(pattern, na=False)]
    text1 = get_cut_words(content_series=mentions)
    wordCount_dict = Counter(text1)
    # Top words by frequency, as (word, count) pairs
    count_list = wordCount_dict.most_common(choices_number)
    wc = (
        WordCloud()
        .add(
            series_name="弹幕词云",
            data_pair=count_list,
            word_size_range=[20, 100],
            textstyle_opts=opts.TextStyleOpts(font_family="cursive"),
            shape=SymbolType.DIAMOND,
        )
        .set_global_opts(
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )

    show_all()
- 参考链接:
- python 爬取乘风破浪的姐姐弹幕 – 芒果 TV 爬虫
- 乘风破浪的姐姐?NO!是兴妖作怪的姑奶奶 Python 剖析芒果 TV9 万条弹幕,评论,谁才是真正的 C 位?
- 我剖析了《乘风破浪的姐姐》,发现了这些机密
- pyecharts 实现新冠肺炎疫情可视化 BI 数据大屏
正文完