共计 13154 个字符,预计需要花费 33 分钟才能阅读完成。
作者 |Audhi Aprilliant
编译 |VK
来源 |Towards Data Science
概述
在这个项目中,我们使用 2019 年 5 月 28-29 日通过爬虫抓取的 Twitter 原始数据。此外,数据是 CSV 格式(逗号分隔),可以在这里下载。
https://github.com/audhiapril…
它涉及两个主题,一个是包含关键字“Joko Widodo”的 Joko Widodo 的数据,另一个是带有关键字“Prabowo Subianto”的 Prabowo Subianto 的数据。其中包含几个变量和信息,以确定用户情绪。实际上,数据有 16 个变量或属性和 1000 多个观测值。表 1 列出了一些变量。
# 导入库
library(ggplot2)
library(lubridate)
# Load the Joko Widodo data: the tweet table and the sentiment table
data.jokowi.df = read.csv('data-joko-widodo.csv', sep = ',', header = TRUE)
senti.jokowi = read.csv('sentiment-joko-widodo.csv', sep = ',', header = TRUE)
# Load the Prabowo Subianto data: the tweet table and the sentiment table
data.prabowo.df = read.csv('data-prabowo-subianto.csv', sep = ',', header = TRUE)
senti.prabowo = read.csv('sentiment-prabowo-subianto.csv', sep = ',', header = TRUE)
数据可视化
数据探索旨在从 Twitter 数据中获取信息。应该指出的是,数据已经进行了文本预处理。我们对那些被认为很有趣的变量进行探索。
# Bar chart of tweets per hour - JOKO WIDODO
# Parse the 'created' column as date-times in the Asia/Jakarta time zone
data.jokowi.df$created = ymd_hms(data.jokowi.df$created,
                                 tz = 'Asia/Jakarta')
# Derive the 'date' and 'hour' variables from the parsed timestamp
data.jokowi.df$date = date(data.jokowi.df$created)
data.jokowi.df$hour = hour(data.jokowi.df$created)
# Keep the tweets dated 2019-05-29
# NOTE(review): the plot subtitle below says '28 May 2019' while this filter
# selects 2019-05-29 - confirm which date is intended.
data.jokowi.date1 = subset(x = data.jokowi.df,
                           date == '2019-05-29')
# Count the tweets for every hour of the day
data.hour.date1 = data.frame(table(data.jokowi.date1$hour))
colnames(data.hour.date1) = c('Hour','Total.Tweets')
# Mean hourly count, shared by the reference line and its label
avg.jokowi.date1 = mean(data.hour.date1$Total.Tweets)
# Data visualization
ggplot(data.hour.date1)+
  geom_bar(aes(x = Hour,
               y = Total.Tweets,
               fill = I('blue')),
           stat = 'identity',
           alpha = 0.75,
           show.legend = FALSE)+
  geom_hline(yintercept = avg.jokowi.date1,
             col = I('black'),
             size = 1)+
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 8,
           y = avg.jokowi.date1 + 20,
           label = paste('Average:',
                         ceiling(avg.jokowi.date1),
                         'Tweets per hour'),
           fontface = 'italic',
           hjust = 'left',
           size = 4)+
  labs(title = 'Total Tweets per Hours - Joko Widodo',
       subtitle = '28 May 2019',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Time of Day')+
  ylab('Total Tweets')+
  scale_fill_brewer(palette = 'Dark2')+
  theme_bw()
# Bar chart of tweets per hour - PRABOWO SUBIANTO
# Parse the 'created' column as date-times in the Asia/Jakarta time zone
data.prabowo.df$created = ymd_hms(data.prabowo.df$created,
                                  tz = 'Asia/Jakarta')
# Derive the 'date' and 'hour' variables from the parsed timestamp
data.prabowo.df$date = date(data.prabowo.df$created)
data.prabowo.df$hour = hour(data.prabowo.df$created)
# Hourly tweet counts for 2019-05-28
data.prabowo.date1 = subset(x = data.prabowo.df,
                            date == '2019-05-28')
data.hour.date1 = data.frame(table(data.prabowo.date1$hour))
colnames(data.hour.date1) = c('Hour','Total.Tweets')
# Hourly tweet counts for 2019-05-29
data.prabowo.date2 = subset(x = data.prabowo.df,
                            date == '2019-05-29')
data.hour.date2 = data.frame(table(data.prabowo.date2$hour))
colnames(data.hour.date2) = c('Hour','Total.Tweets')
# Stack both days and tag each row with its date
data.hour.date3 = rbind(data.hour.date1,data.hour.date2)
data.hour.date3$Date = rep(x = c('2019-05-28','2019-05-29'),
                           times = c(nrow(data.hour.date1),
                                     nrow(data.hour.date2)))
# One unique key per bar. The same hour can occur on both days, so the hour
# itself cannot be the x value. Keys are generated from the row count instead
# of a fixed 28-element vector, so any number of hourly rows works.
data.hour.date3$Labels = as.character(seq_len(nrow(data.hour.date3)))
# Data preprocessing: blank out every second hour so the axis labels stay
# readable, then store the column as a factor
data.hour.date3$Hour = as.character(data.hour.date3$Hour)
data.hour.date3$Hour[seq_len(nrow(data.hour.date3)) %% 2 == 0] = ''
data.hour.date3$Hour = as.factor(data.hour.date3$Hour)
# Mean hourly count over both days, shared by the reference line and its label
avg.prabowo.date3 = mean(data.hour.date3$Total.Tweets)
# Data visualization
ggplot(data.hour.date3)+
  geom_bar(aes(x = Labels,
               y = Total.Tweets,
               fill = Date),
           stat = 'identity',
           alpha = 0.75,
           show.legend = TRUE)+
  geom_hline(yintercept = avg.prabowo.date3,
             col = I('black'),
             size = 1)+
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 5,
           y = avg.prabowo.date3 + 6,
           label = paste('Average:',
                         ceiling(avg.prabowo.date3),
                         'Tweets per hour'),
           fontface = 'italic',
           hjust = 'left',
           size = 3.8)+
  # Bars are ordered by their unique key but labelled with the hour
  scale_x_discrete(limits = data.hour.date3$Labels,
                   labels = data.hour.date3$Hour)+
  labs(title = 'Total Tweets per Hours - Prabowo Subianto',
       subtitle = '28 - 29 May 2019',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Time of Day')+
  ylab('Total Tweets')+
  ylim(c(0,100))+
  theme_bw()+
  theme(legend.position = 'bottom',
        legend.title = element_blank())+
  scale_fill_brewer(palette = 'Dark2')
根据图 1,我们可以得出结论,通过数据抓取(关键字“Joko Widodo”和“Prabowo Subianto”)得到的 tweet 数量并不相似,即使在同一日期。
例如,在图 1(左)中,从视觉上看,关键字为“Joko Widodo”的推文仅在 2019 年 5 月 28 日 03:00–17:00 WIB 期间获得。而在图 1(右图)中,我们得出的结论是,关键词为“Prabowo Subianto”的推文是在 2019 年 5 月 28 日 12:00-23:59 WIB 和 2019 年 5 月 29 日 00:00-15:00 WIB 期间获得的。
# Tweets per hour on 2019-05-28
# Mean hourly count, shared by the reference line and its label
avg.prabowo.date1 = mean(data.hour.date1$Total.Tweets)
ggplot(data.hour.date1)+
  geom_bar(aes(x = Hour,
               y = Total.Tweets,
               fill = I('red')),
           stat = 'identity',
           alpha = 0.75,
           show.legend = FALSE)+
  geom_hline(yintercept = avg.prabowo.date1,
             col = I('black'),
             size = 1)+
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 6.5,
           y = avg.prabowo.date1 + 5,
           label = paste('Average:',
                         ceiling(avg.prabowo.date1),
                         'Tweets per hour'),
           fontface = 'italic',
           hjust = 'left',
           size = 4)+
  labs(title = 'Total Tweets per Hours - Prabowo Subianto',
       subtitle = '28 May 2019',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Time of Day')+
  ylab('Total Tweets')+
  ylim(c(0,100))+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')
# Tweets per hour on 2019-05-29
# Mean hourly count, shared by the reference line and its label
avg.prabowo.date2 = mean(data.hour.date2$Total.Tweets)
ggplot(data.hour.date2)+
  geom_bar(aes(x = Hour,
               y = Total.Tweets,
               fill = I('red')),
           stat = 'identity',
           alpha = 0.75,
           show.legend = FALSE)+
  geom_hline(yintercept = avg.prabowo.date2,
             col = I('black'),
             size = 1)+
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 1,
           y = avg.prabowo.date2 + 6,
           label = paste('Average:',
                         ceiling(avg.prabowo.date2),
                         'Tweets per hour'),
           fontface = 'italic',
           hjust = 'left',
           size = 4)+
  labs(title = 'Total Tweets per Hours - Prabowo Subianto',
       subtitle = '29 May 2019',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Time of Day')+
  ylab('Total Tweets')+
  ylim(c(0,100))+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')
根据图 2,我们得到了使用关键字“Joko Widodo”和“Prabowo Subianto”的用户之间的显著差异。关键词为“Joko Widodo”的 tweet 在某个特定时间(07:00–09:00 WIB)对 Joko Widodo 的讨论往往十分热烈,08:00 WIB 的 tweet 数量最多,共有 348 条推文。然而,在 2019 年 5 月 28 日至 29 日期间,关键词为“Prabowo Subianto”的推文往往持续不断地讨论 Prabowo Subianto。2019 年 5 月 28 日至 29 日,每小时上传的关键词为“Prabowo Subianto”的推文平均为 36 条。
# Density plot of sentiment scores - JOKO WIDODO
# Keep only the negative and positive tweets. %in% tests every row against
# both classes; '== c(...)' would recycle the two values along the rows and
# silently drop roughly half of the matching tweets.
df.score.1 = subset(senti.jokowi,class %in% c('Negative','Positive'))
colnames(df.score.1) = c('Score','Text','Sentiment')
# Data visualization
ggplot(df.score.1)+
  geom_density(aes(x = Score,
                   fill = Sentiment),
               alpha = 0.75)+
  xlim(c(-11,11))+
  labs(title = 'Density Plot of Sentiment Scores',
       subtitle = 'Joko Widodo',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Score')+
  ylab('Density')+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')+
  theme(legend.position = 'bottom',
        legend.title = element_blank())
# Density plot of sentiment scores - PRABOWO SUBIANTO
# Keep only the negative and positive tweets. %in% tests every row against
# both classes; '== c(...)' would recycle the two values along the rows and
# silently drop roughly half of the matching tweets.
df.score.2 = subset(senti.prabowo,class %in% c('Negative','Positive'))
colnames(df.score.2) = c('Score','Text','Sentiment')
# Data visualization
ggplot(df.score.2)+
  geom_density(aes(x = Score,
                   fill = Sentiment),
               alpha = 0.75)+
  xlim(c(-11,11))+
  labs(title = 'Density Plot of Sentiment Scores',
       subtitle = 'Prabowo Subianto',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  # The x axis carries the score and the y axis the density (the two axis
  # titles were previously swapped)
  xlab('Score')+
  ylab('Density')+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')+
  theme(legend.position = 'bottom',
        legend.title = element_blank())
图 3 是 2019 年 5 月 28 日至 29 日以“Joko Widodo”和“Prabowo Subianto”为关键词的推文数量条形图。由图 3(左) 可以得出,Twitter 用户在 19:00-23:59 WIB 讨论 Prabowo Subianto 的频率较低。这是由印尼人的休息时间造成的。然而,这些主题的推文在午夜仍然不断更新,因为有的用户居住在国外,有的用户依然活跃。之后,用户在 04:00 WIB 开始活动,在 07:00 WIB 达到高峰,然后下降,直到 12:00 WIB 再次上升。
# Bar plot of sentiment score frequencies - JOKO WIDODO
df.senti.score.1 = data.frame(table(senti.jokowi$score))
colnames(df.senti.score.1) = c('Score','Freq')
# Data preprocessing: turn the factor of scores back into numbers
df.senti.score.1$Score = as.character(df.senti.score.1$Score)
df.senti.score.1$Score = as.numeric(df.senti.score.1$Score)
# Keep the signed scores for the axis labels
Score1 = df.senti.score.1$Score
# Multiply both columns by the sign of the score: Score becomes its absolute
# value and Freq becomes negative for negative scores, so those bars point down.
# NOTE(review): a score of 0 has sign 0, so its frequency becomes 0 and the row
# is then classified as 'Positive' below - confirm this is intended.
score.sign.1 = sign(df.senti.score.1$Score)
df.senti.score.1$Score = df.senti.score.1$Score * score.sign.1
df.senti.score.1$Freq = df.senti.score.1$Freq * score.sign.1
# One letter per row as a unique x key (supports at most 26 distinct scores)
df.senti.score.1$Label = letters[seq_len(nrow(df.senti.score.1))]
df.senti.score.1$Sentiment = ifelse(df.senti.score.1$Freq < 0,
                                    'Negative','Positive')
df.senti.score.1$Score1 = Score1
# Signed mean frequency per sentiment, shared by reference lines and labels
pos.avg.1 = mean(abs(df.senti.score.1[which(df.senti.score.1$Sentiment == 'Positive'),'Freq']))
neg.avg.1 = mean(df.senti.score.1[which(df.senti.score.1$Sentiment == 'Negative'),'Freq'])
# Data visualization
ggplot(df.senti.score.1)+
  geom_bar(aes(x = Label,
               y = Freq,
               fill = Sentiment),
           stat = 'identity',
           show.legend = FALSE)+
  # Positive sentiment
  geom_hline(yintercept = pos.avg.1,
             col = I('black'),
             size = 1)+
  # annotate() draws each label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 10,
           y = pos.avg.1 + 30,
           label = paste('Average Freq:',
                         ceiling(pos.avg.1)),
           fontface = 'italic',
           hjust = 'right',
           size = 4)+
  # Negative sentiment
  geom_hline(yintercept = neg.avg.1,
             col = I('black'),
             size = 1)+
  annotate('text',
           x = 5,
           y = neg.avg.1 - 15,
           label = paste('Average Freq:',
                         ceiling(abs(neg.avg.1))),
           fontface = 'italic',
           hjust = 'left',
           size = 4)+
  labs(title = 'Barplot of Sentiments',
       subtitle = 'Joko Widodo',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Score')+
  # Bars are ordered by their letter key but labelled with the signed score
  scale_x_discrete(limits = df.senti.score.1$Label,
                   labels = df.senti.score.1$Score1)+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')
# Bar plot of sentiment score frequencies - PRABOWO SUBIANTO
df.senti.score.2 = data.frame(table(senti.prabowo$score))
colnames(df.senti.score.2) = c('Score','Freq')
# Data preprocessing: turn the factor of scores back into numbers
df.senti.score.2$Score = as.character(df.senti.score.2$Score)
df.senti.score.2$Score = as.numeric(df.senti.score.2$Score)
# Keep the signed scores for the axis labels
Score2 = df.senti.score.2$Score
# Multiply both columns by the sign of the score: Score becomes its absolute
# value and Freq becomes negative for negative scores, so those bars point down.
# NOTE(review): a score of 0 has sign 0, so its frequency becomes 0 and the row
# is then classified as 'Positive' below - confirm this is intended.
score.sign.2 = sign(df.senti.score.2$Score)
df.senti.score.2$Score = df.senti.score.2$Score * score.sign.2
df.senti.score.2$Freq = df.senti.score.2$Freq * score.sign.2
# One letter per row as a unique x key (supports at most 26 distinct scores)
df.senti.score.2$Label = letters[seq_len(nrow(df.senti.score.2))]
df.senti.score.2$Sentiment = ifelse(df.senti.score.2$Freq < 0,
                                    'Negative','Positive')
df.senti.score.2$Score1 = Score2
# Signed mean frequency per sentiment, shared by reference lines and labels
pos.avg.2 = mean(abs(df.senti.score.2[which(df.senti.score.2$Sentiment == 'Positive'),'Freq']))
neg.avg.2 = mean(df.senti.score.2[which(df.senti.score.2$Sentiment == 'Negative'),'Freq'])
# Data visualization
ggplot(df.senti.score.2)+
  geom_bar(aes(x = Label,
               y = Freq,
               fill = Sentiment),
           stat = 'identity',
           show.legend = FALSE)+
  # Positive sentiment
  geom_hline(yintercept = pos.avg.2,
             col = I('black'),
             size = 1)+
  # annotate() draws each label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the data (overplotted text)
  annotate('text',
           x = 11,
           y = pos.avg.2 + 20,
           label = paste('Average Freq:',
                         ceiling(pos.avg.2)),
           fontface = 'italic',
           hjust = 'right',
           size = 4)+
  # Negative sentiment
  geom_hline(yintercept = neg.avg.2,
             col = I('black'),
             size = 1)+
  annotate('text',
           x = 9,
           y = neg.avg.2 - 10,
           label = paste('Average Freq:',
                         ceiling(abs(neg.avg.2))),
           fontface = 'italic',
           hjust = 'left',
           size = 4)+
  labs(title = 'Barplot of Sentiments',
       subtitle = 'Prabowo Subianto',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlab('Score')+
  # Bars are ordered by their letter key but labelled with the signed score
  scale_x_discrete(limits = df.senti.score.2$Label,
                   labels = df.senti.score.2$Score1)+
  theme_bw()+
  scale_fill_brewer(palette = 'Dark2')
图 4 是包含关键字“Joko Widodo”和“Prabowo Subianto”的情感得分密度图。tweets 的得分是由组成 tweets 的词根的平均得分得到的。因此,分数是针对每个词根给出的,其值介于 -10 到 10 之间。分数越小,微博中的负面情绪就越多,反之亦然。根据图 4(左),可以得出结论,包含关键字“Joko Widodo”的推文的负面情绪得分在 -10 到 -1 之间,中间得分为 -4。这同样适用于积极情绪(当然,其得分为正)。根据图 4(左)中的密度图,我们发现积极情绪的得分具有相当小的方差。因此,我们得出结论,包含关键词“Joko Widodo”的微博的积极情绪并不是太多样化。
图 4(右)显示了包含关键字“Prabowo Subianto”的情感得分密度图。它与图 4(左)不同,因为图 4(右)上的负面情绪得分在 -8 到 -1 之间。这意味着 tweets 没有太多负面情绪(tweets 有负面情绪,但不够高)。此外,负面情绪得分的分布在 -4 和 -1 之间有两个峰值。然而,积极情绪得分从 1 到 10 不等。与图 4(左)相比,图 4(右)的积极情绪具有较高的方差,在 3 和 10 范围内有两个峰值。这表明,包含关键词“Prabowo Subianto”的微博具有很高的积极情绪。
# Pie chart of sentiment classes - JOKO WIDODO
df.senti.3 = as.data.frame(table(senti.jokowi$class))
colnames(df.senti.3) = c('Sentiment','Freq')
# Data preprocessing: share of each sentiment class
df.pie.1 = df.senti.3
df.pie.1$Prop = df.pie.1$Freq/sum(df.pie.1$Freq)
# Sort by descending factor level and put each label at the middle of its
# slice. Written in base R because the script only loads ggplot2 and
# lubridate, so %>%, arrange(), desc() and mutate() are not available.
df.pie.1 = df.pie.1[order(df.pie.1$Sentiment, decreasing = TRUE), ]
df.pie.1$lab.ypos = cumsum(df.pie.1$Prop) - 0.5*df.pie.1$Prop
# Data visualization
ggplot(df.pie.1,
       aes(x = 2,
           y = Prop,
           fill = Sentiment))+
  geom_bar(stat = 'identity',
           col = 'white',
           alpha = 0.75,
           show.legend = TRUE)+
  # Polar coordinates turn the stacked bar into a pie
  coord_polar(theta = 'y',
              start = 0)+
  # Show the share as a percentage with one decimal instead of the raw,
  # unrounded proportion
  geom_text(aes(y = lab.ypos,
                label = sprintf('%.1f%%', 100*Prop)),
            color = 'white',
            fontface = 'italic',
            size = 4)+
  labs(title = 'Piechart of Sentiments',
       subtitle = 'Joko Widodo',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlim(c(0.5,2.5))+
  theme_void()+
  scale_fill_brewer(palette = 'Dark2')+
  theme(legend.title = element_blank(),
        legend.position = 'right')
# Pie chart of sentiment classes - PRABOWO SUBIANTO
df.senti.4 = as.data.frame(table(senti.prabowo$class))
colnames(df.senti.4) = c('Sentiment','Freq')
# Data preprocessing: share of each sentiment class
df.pie.2 = df.senti.4
df.pie.2$Prop = df.pie.2$Freq/sum(df.pie.2$Freq)
# Sort by descending factor level and put each label at the middle of its
# slice. Written in base R because the script only loads ggplot2 and
# lubridate, so %>%, arrange(), desc() and mutate() are not available.
df.pie.2 = df.pie.2[order(df.pie.2$Sentiment, decreasing = TRUE), ]
df.pie.2$lab.ypos = cumsum(df.pie.2$Prop) - 0.5*df.pie.2$Prop
# Data visualization
ggplot(df.pie.2,
       aes(x = 2,
           y = Prop,
           fill = Sentiment))+
  geom_bar(stat = 'identity',
           col = 'white',
           alpha = 0.75,
           show.legend = TRUE)+
  # Polar coordinates turn the stacked bar into a pie
  coord_polar(theta = 'y',
              start = 0)+
  # Show the share as a percentage with one decimal instead of the raw,
  # unrounded proportion
  geom_text(aes(y = lab.ypos,
                label = sprintf('%.1f%%', 100*Prop)),
            color = 'white',
            fontface = 'italic',
            size = 4)+
  labs(title = 'Piechart of Sentiments',
       subtitle = 'Prabowo Subianto',
       caption = 'Twitter Crawling 28 - 29 May 2019')+
  xlim(c(0.5,2.5))+
  theme_void()+
  scale_fill_brewer(palette = 'Dark2')+
  theme(legend.title = element_blank(),
        legend.position = 'right')
图 5 是推特的情绪得分汇总,这些微博被分为负面情绪、中性情绪和积极情绪。消极情绪是指得分低于零的情绪,中性是指分数等于零的情绪,积极情绪得分大于零。从图 5 可以看出,关键字为“Joko Widodo”的微博的负面情绪百分比低于关键字为“Prabowo Subianto”的 tweet,相差 6.3%。研究还发现,与关键词为“Prabowo Subianto”的微博相比,包含关键词“Joko Widodo”的微博具有更高的中性情绪和积极情绪。通过饼图的研究发现,与关键字为“Prabowo Subianto”的 tweet 相比,带有关键字“Joko Widodo”的 tweet 倾向于拥有更高比例的积极情绪。然而通过密度图发现,积极和消极情绪得分的分布表明,与“Joko Widodo”相比,包含关键字“Prabowo Subianto”的微博往往具有更高的情绪得分。这还需要进行进一步的分析。
图 6 显示了用户在 2019 年 5 月 28-29 日经常上传的 tweet(关键词“Joko Widodo”和“Prabowo Subianto”)中的术语或单词。通过这个 WordCloud 可视化,可以找到围绕关键词讨论的热门话题。对于包含关键词“Joko Widodo”的 tweet,我们发现术语“tuang”、“petisi”、“negara”、“aman”和“nusantara”是出现次数最多的前五个词。然而,对于包含关键词“Prabowo Subianto”的 tweet,“Prabowo”、“Subianto”、“kriminalisasi”、“selamat”和“dubai”是出现次数最多的前五个词。这间接地显示了以关键字“Prabowo Subianto”上传的 tweet 的模式,即:几乎可以肯定的是,每个上传的 tweet 都直接包含“Prabowo Subianto”的名称,而不是通过提及(@)。这是因为,在文本预处理中,提及(@)已被删除。
可以前往我的 GitHub repo 查找代码:https://github.com/audhiapril…
参考引用
[1] K. Borau, C. Ullrich, J. Feng, R. Shen. Microblogging for Language Learning: Using Twitter to Train Communicative and Cultural Competence (2009), Advances in Web-Based Learning — ICWL 2009, 8th International Conference, Aachen, Germany, August 19–21, 2009.
原文链接:https://towardsdatascience.co…
欢迎关注磐创 AI 博客站:
http://panchuang.net/
sklearn 机器学习中文官方文档:
http://sklearn123.com/
欢迎关注磐创博客资源汇总站:
http://docs.panchuang.net/