R語言公交地鐵路線網絡圖實現數據挖掘實戰
原文鏈接
對于龐大的公交地鐵路線信息的數據挖掘,一般軟件遇到的問題主要有兩點:1.對于文本信息的挖掘,特別是中文詞匯的挖掘,缺乏成熟的工具或者軟件包,2.對于大數據量,一般軟件的讀取和處理會遇到問題。即使一個月的部分區域路線信息也會達到幾百M以上,因此,對于這類數據,無論從算法運行還是數據讀取來說普通的SQL語言或者matlab軟件處理起來都乏善可陳。對于這類數據,我們一般用r軟件可以輕松實現讀取,數據挖掘以及可視化的過程。
例如對于下面這樣的車站數據:
和近600M的進出站信息的數據, 如果要實現每隔一段時間的對應路線的進出站人數整理以及可視化的過程,我們可以按以下的步驟進行分析:


首先我們進行數據的讀取和預處理
# Load dplyr; install it only when it is missing instead of re-installing
# on every run of the script.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")

### Read the raw card transactions (one row per entry/exit record).
# NOTE(review): the file name and the column names below had pinyin
# annotations (e.g. "(jìn)") injected by a text-extraction step; they are
# restored here to plain characters — confirm the path matches the file on
# disk.
data <- read.table("E:\\201501一卡通進出站.txt", stringsAsFactors = FALSE)

## Name the seven columns: card id, transaction date, transaction time,
## ticket type, transaction code, station, previous-transaction station.
colnames(data) <- c(
  "邏輯卡號",
  "交易日期",
  "交易時間",
  "票種",
  "交易代碼",
  "交易車站",
  "上次交易車站"
)
### Pre-process the data: per-day / per-station exit and entry counts in
### ten-minute bins from 06:00 to 22:00, followed by line 1 / line 2 totals.
#
# Columns are addressed by position, so this section does not depend on the
# spelling of the column names. After the selection `c(2, 3, 6, 7)` the
# working frames have: 1 = transaction date, 2 = transaction time,
# 3 = station, 4 = previous-transaction station.
#
# Fixes relative to the previous version:
#   * bins are half-open [lower, upper); with the old `> lower & <= upper`
#     test a transaction at exactly HH:00:00 fell into no bin at all;
#   * the identifiers/strings broken by injected pinyin annotations (a parse
#     error inside arrange()) and the unmatched trailing `}` are gone;
#   * the line summaries no longer read the stale per-station `data3`
#     (it only fed unused variables);
#   * a day without rows no longer produces junk output files.

# Sort trips by date, then time (columns 1 and 2), and reset the row names
# so the written tables are numbered 1..n.
sort_by_date_time <- function(trips) {
  sorted <- trips[order(trips[[1]], trips[[2]]), , drop = FALSE]
  rownames(sorted) <- NULL
  sorted
}

# Count exits and entries per ten-minute bin.
#   trips: data frame laid out as described above. A non-zero value in
#          column 4 is counted as an exit, a zero as an entry.
#          Assumes column 2 is a numeric HHMMSS value — TODO confirm
#          against the input file.
#   hours: hours to cover; each hour is split into six bins.
# Returns a data frame with one row per bin, ordered by hour then slot
# (1-6), with columns `slot`, `hour`, `outnum`, `innum`.
count_ten_minute_bins <- function(trips, hours = 6:21) {
  tx_time <- trips[[2]]
  is_exit <- trips[[4]] != 0
  bins <- expand.grid(slot = 1:6, hour = hours)
  lower <- bins$hour * 10000 + (bins$slot - 1) * 1000
  count_flagged <- function(flag) {
    vapply(
      lower,
      function(lo) sum(tx_time >= lo & tx_time < lo + 1000 & flag, na.rm = TRUE),
      integer(1)
    )
  }
  bins$outnum <- count_flagged(is_exit)
  bins$innum <- count_flagged(!is_exit)
  bins
}

# Append a header row and one row per bin (hour, start minute, exits,
# entries) to the line-level summary file at `path`.
append_line_summary <- function(path, counts) {
  cat(file = path, append = TRUE, " 點", " 分", " 出站人數", " ", "進站人數 ", "\n")
  for (k in seq_len(nrow(counts))) {
    cat(
      file = path, append = TRUE,
      counts$hour[k], " ", counts$slot[k] - 1, "0 ", " ",
      counts$outnum[k], " ", " ", counts$innum[k], "\n"
    )
  }
  invisible(path)
}

for (ii in 20150101:20150131) {  # one pass per day
  data1 <- data[which(data[, 2] == ii), ]  # rows of this day
  data2 <- data1[, c(2, 3, 6, 7)]          # date, time, station, previous station
  data2 <- data2[order(data2[[3]]), ]      # ordered by station

  for (station in unique(data2[[3]])) {    # one pass per station
    data3 <- data2[which(data2[[3]] == station), ]
    data4 <- sort_by_date_time(data3)
    counts <- count_ten_minute_bins(data4)

    # paste() joins with single spaces, exactly as the original file names.
    time_file <- paste("E:\\", station, "車站", ii, "日一卡通進出站時間.txt")
    for (k in seq_len(nrow(counts))) {
      hour <- counts$hour[k]
      slot <- counts$slot[k]
      if (slot != 6) {
        cat(
          file = time_file, append = TRUE,
          ii, "日", hour, "點", slot - 1, "0分到", slot, "0分的出站人數為",
          counts$outnum[k], " ", "進站人數為", counts$innum[k], "\n"
        )
      } else {
        # The last bin of an hour ends at the next full hour.
        cat(
          file = time_file, append = TRUE,
          ii, "日", hour, "點", slot - 1, "0分到", hour + 1, "點0分的出站人數為",
          counts$outnum[k], " ", "進站人數為", counts$innum[k], "\n"
        )
      }
    }

    # Write the sorted trips of this station and day.
    write.table(data4, paste("E:\\", ii, "日 ", station, "車站一卡通進出站整理.txt"))
  }
}

################ Lines 1 and 2 ################
# NOTE(review): as in the original brace structure (the day loop closes
# before this section; the unmatched trailing `}` was removed), `data2`
# here holds only the last day processed by the loop — confirm whether
# per-day line summaries were intended.
# A station belongs to a line when its code starts with that line's digit.
line1 <- data2[which(substr(data2[[3]], 1, 1) == "1"), ]  # line 1
line2 <- data2[which(substr(data2[[3]], 1, 1) == "2"), ]  # line 2

data4 <- sort_by_date_time(line1)
data44 <- sort_by_date_time(line2)
counts1 <- count_ten_minute_bins(data4)
counts2 <- count_ten_minute_bins(data44)

# Line 1
append_line_summary("E:\\1號線一卡通進出站時間.txt", counts1)
write.table(data4, "E:\\1號線一卡通進出站整理.txt")

# Line 2
append_line_summary("E:\\2號線一卡通進出站時間.txt", counts2)
write.table(data44, "E:\\2號線一卡通進出站整理.txt")

# Lines 1 and 2 combined: add the per-bin counts of both lines.
counts12 <- counts1
counts12$outnum <- counts1$outnum + counts2$outnum
counts12$innum <- counts1$innum + counts2$innum
append_line_summary("E:\\1,2號線一卡通進出站時間.txt", counts12)
通過以上過程,我們可以將整理后的數據輸出到對應的文件中:


以及交通路線的可視化過程;
對于交通路線的網絡圖來說,r中igraph包的確是實現利器:
# Build and plot a directed graph linking consecutive stops of the first
# bus route found in the route listings.
library(igraph)

# Read the raw route listings.
# NOTE(review): the paths contain a space after "E:/"; kept as in the
# original — confirm they match the files on disk.
ljhdat1 <- readLines("E:/ shanghai_1.txt")
ljhdat2 <- readLines("E:/ shanghai_2.txt")
ljhdat3 <- readLines("E:/ shanghai_3.txt")
ljhdat4 <- readLines("E:/ shanghai_4.txt")
ljhdat5 <- readLines("E:/ shanghai_5.txt")

# In each listing, the line immediately before an empty line is kept as one
# bus's route record. Vectorised, so an empty file yields no records
# instead of failing, and nothing is grown element by element.
extract_route_records <- function(lines) {
  blank <- which(lines == "")
  lines[blank - 1]  # an index of 0 (blank first line) selects nothing
}

# Bus route records from all five listings, in file order.
bus <- c(
  extract_route_records(ljhdat1),
  extract_route_records(ljhdat2),
  extract_route_records(ljhdat3),
  extract_route_records(ljhdat4),
  extract_route_records(ljhdat5)
)
bus

route <- list(0)  # per-route stop sequences
####################### Split a route record into its stops #######################
# Split the first record on spaces and drop the first token.
route[[1]] <- unlist(strsplit(bus[1], split = " "))[-1]
# Drop the "#" tokens. A logical mask is used because `x[-which(x == "#")]`
# returns an empty vector when no "#" is present.
route[[1]] <- route[[1]][route[[1]] != "#"]
n <- length(route[[1]])

# Edge list: every stop paired with the stop that follows it. head()/tail()
# replace `1:n-1` (which parses as `(1:n) - 1`) and `2:n` (which counts
# backwards when n < 2).
d <- data.frame(
  from = head(route[[1]], -1),
  to = tail(route[[1]], -1)
)
# graph_from_data_frame() replaces the deprecated graph.data.frame().
g <- graph_from_data_frame(d, directed = TRUE)
plot(g)

################################ Split all routes into their stops ###########################
route1 <- character(0)

對于最后生成的網絡圖由于路線眾多,在查看的過程中可以通過設置可視化參數來進一步優化。
?