原文链接:http://tecdat.cn/?p=22751
对于庞大的公交地铁路线信息的数据挖掘,一般软件遇到的问题主要有两点:1. 对于文本信息的挖掘,特别是中文词汇的挖掘,缺乏成熟的工具或者软件包;2. 对于大数据量,一般软件的读取和处理会遇到问题。即使只是一个月的部分地区路线信息也会达到几百 MB 以上,因此,对于这类数据,无论从算法运行还是数据读取来说,一般的 SQL 语言或者 MATLAB 软件处理起来都力不从心。对于这类数据,我们一般用 R 软件可以轻松实现读取、数据挖掘以及可视化的过程。
例如对于下面这样的车站数据:
以及近 600 MB 的进出站信息数据,如果要实现每隔一段时间对应路线的进出站人数整理以及可视化的过程,我们可以按以下步骤进行分析:
首先我们进行数据的读取和预处理:
# Install dplyr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr") # arrange() is used for the date/time sort below

### Helpers ----

# Count exits and entries that fall into one 10-minute slot.
#
# records: data frame whose column 2 is the transaction time and whose
#          column 4 is the previous-station code. A previous-station code of
#          0 is counted as an entry, anything else as an exit.
#          NOTE(review): assumes the time column was read as an HHMMSS
#          number (e.g. 63015 for 06:30:15) -- confirm against the raw file.
# hour:    hour of day.
# slot:    1..6, the 10-minute slot inside that hour.
#
# Every slot is the half-open interval (start, end]. The last slot of an
# hour ends at the next hour's :00:00 so that a tap at exactly HH:00:00 is
# counted like every other slot boundary instead of being dropped.
#
# Returns an integer vector c(exits = ..., entries = ...).
count_slot <- function(records, hour, slot) {
  lower <- hour * 10000 + (slot - 1) * 1000
  upper <- if (slot == 6) (hour + 1) * 10000 else hour * 10000 + slot * 1000
  in_slot <- which(records[, 2] > lower & records[, 2] <= upper)
  previous_station <- records[in_slot, 4]
  c(
    exits = length(which(previous_station != 0)),
    entries = length(which(previous_station == 0))
  )
}

# Append a header row plus one row per 10-minute slot (06:00-22:00) to `path`.
# The counts of all data frames passed through `...` are summed per slot, so
# the same helper serves a single line and the combined line 1 + 2 table.
write_slot_table <- function(path, ...) {
  tables <- list(...)
  cat(
    file = path, append = TRUE,
    "点", "分", "出站人数", "", " 进站人数 ", "\n"
  )
  for (hour in 6:21) {
    for (slot in 1:6) {
      totals <- Reduce(
        `+`,
        lapply(tables, count_slot, hour = hour, slot = slot)
      )
      cat(
        file = path, append = TRUE,
        hour, "", slot - 1, "0 ", " ",
        totals[["exits"]], " ", " ", totals[["entries"]], "\n"
      )
    }
  }
}

### Read the raw data ----
data <- read.table("E:\\201501 一卡通进出站.txt", stringsAsFactors = FALSE)

## Name the columns
colnames(data) <- c(
  "逻辑卡号",
  "交易日期",
  "交易工夫",
  "票种",
  "交易代码",
  "交易车站",
  "上次交易车站"
)

### Per-day, per-station preprocessing ----
for (ii in 20150101:20150131) {
  # Rows of this calendar day only
  data1 <- data[which(data[, 2] == ii), ]
  # Keep transaction date, transaction time, station and previous station
  data2 <- data1[, c(2, 3, 6, 7)]
  data2 <- data2[order(data2[["交易车站"]]), ]

  # One pass per distinct station. seq_along() skips the loop body on a day
  # without any rows, where 1:length(bus) would have iterated over 1 and 0.
  bus <- unique(data2[, 3])
  for (busi in seq_along(bus)) {
    data3 <- data2[which(data2[, 3] == bus[busi]), ]
    # Sort by date first, then by time of day
    data4 <- arrange(data3, 交易日期, 交易工夫)

    ### Split into 10-minute slots
    slot_file <- paste(
      "E:\\", bus[busi], "车站", ii, "日一卡通进出站工夫.txt"
    )
    for (hour in 6:21) {
      for (slot in 1:6) {
        counts <- count_slot(data4, hour, slot)
        if (slot != 6) {
          cat(
            file = slot_file, append = TRUE,
            ii, "日", hour, "点", slot - 1, "0 分到", slot,
            "0 分的出站人数为", counts[["exits"]], "",
            " 进站人数为 ", counts[["entries"]], "\n"
          )
        } else {
          # The last slot is labelled with the following full hour
          cat(
            file = slot_file, append = TRUE,
            ii, "日", hour, "点", slot - 1, "0 分到", hour + 1,
            "点 0 分的出站人数为", counts[["exits"]], "",
            " 进站人数为 ", counts[["entries"]], "\n"
          )
        }
      }
    }

    # Exits have a non-zero previous station, entries have 0
    dataout <- data3[which(data3[, 4] != 0), ]
    datain <- data3[which(data3[, 4] == 0), ]

    ### Write the sorted station records to disk
    write.table(
      data4,
      paste("E:\\", ii, "日", bus[busi], "车站一卡通进出站整顿.txt")
    )
  }
}

### Line 1 and line 2 ----
# NOTE(review): this section runs once after the day loop, so it summarises
# the last day that was processed (data2 is overwritten on every iteration).
# The original had an unmatched closing brace at the very end, which may mean
# the author intended it to run once per day -- confirm the intent.
data2 <- data2[order(data2[["交易车站"]]), ]
# The first character of the station code identifies the metro line
line1 <- data2[substr(data2[["交易车站"]], 1, 1) == "1", ]
line2 <- data2[substr(data2[["交易车站"]], 1, 1) == "2", ]

## Line 1
data4 <- arrange(line1, 交易日期, 交易工夫)
write_slot_table("E:\\1 号线一卡通进出站工夫.txt", data4)
# Totals are taken from the line-1 records; the original filtered the stale
# per-station table data3 here.
dataout <- data4[which(data4[, 4] != 0), ]
datain <- data4[which(data4[, 4] == 0), ]
numout <- nrow(dataout) # total exits
numin <- nrow(datain) # total entries
write.table(data4, "E:\\1 号线一卡通进出站整顿.txt")

## Line 2
data4 <- arrange(line2, 交易日期, 交易工夫)
write_slot_table("E:\\2 号线一卡通进出站工夫.txt", data4)
dataout <- data4[which(data4[, 4] != 0), ]
datain <- data4[which(data4[, 4] == 0), ]
write.table(data4, "E:\\2 号线一卡通进出站整顿.txt")

## Lines 1 and 2 combined
data4 <- arrange(line1, 交易日期, 交易工夫)
data44 <- arrange(line2, 交易日期, 交易工夫)
write_slot_table("E:\\1,2 号线一卡通进出站工夫.txt", data4, data44)
通过以上过程,我们可以将整理后的数据输出到对应的文件中:
接下来是交通路线的可视化过程。
对于交通路线的网络图来说,R 中的 igraph 包确实是实现的利器:
library(igraph)

# Read the raw route listings, one character string per text line.
# NOTE(review): the paths contain a space after "E:/" exactly as published;
# confirm that this matches the real file names.
ljhdat1 <- readLines("E:/ shanghai_1.txt")
ljhdat2 <- readLines("E:/ shanghai_2.txt")
ljhdat3 <- readLines("E:/ shanghai_3.txt")
ljhdat4 <- readLines("E:/ shanghai_4.txt")
ljhdat5 <- readLines("E:/ shanghai_5.txt")

# Return the lines that sit directly above a blank line; the script treats
# each of those as one bus route record.
# A blank first line has no predecessor and contributes nothing.
lines_before_blank <- function(lines) {
  blank <- which(lines == "")
  lines[blank[blank > 1] - 1]
}

# Bus route records collected from all five listings, in file order
bus <- as.character(unlist(
  lapply(
    list(ljhdat1, ljhdat2, ljhdat3, ljhdat4, ljhdat5),
    lines_before_blank
  ),
  use.names = FALSE
))
print(bus)

route <- list(0) # per-route stop sequences

####################### Split one route into its stops ######################
# Split the first record on spaces and drop its first token.
stops <- unlist(strsplit(bus[1], split = " "))[-1]
# Drop the "#" tokens. A logical mask keeps every stop when there is no "#",
# whereas x[-which(x == "#")] would return an empty vector in that case.
route[[1]] <- stops[stops != "#"]
n <- length(route[[1]])
if (n < 2) {
  stop("The first route needs at least two stops to form an edge.",
       call. = FALSE)
}

# Edge list: every stop is linked to the stop that follows it
d <- data.frame(
  from = head(route[[1]], -1),
  to = tail(route[[1]], -1)
)
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame()
g <- graph_from_data_frame(d, directed = TRUE)
plot(g)

###################### Split all routes into their stops ####################
route1 <- character(0)
对于最后生成的网络图,因为路线众多,在查看的过程中可以通过设置可视化参数来进一步优化。
最受欢迎的见解
1. 数据类岗位需要的数据面
2. 探析大数据期刊文章研究热点
3. 机器学习助推快时尚精准销售预测
4. 用机器学习识别不断变化的股市状况—隐马尔科夫模型 (HMM) 的应用
5. 数据凝听人民网留言板的那些“网事”
6. 在 R 语言中使用 GAM(广义相加模型)进行电力负荷时间序列分析
7. 用数据解读体育决策:挖掘体育赛事新价值
8. 把握出租车行驶的数据脉搏
9. 智能门锁“剁手”数据攻略