WGCNA極速入門

setwd("E:/demo") #設(shè)定工作目錄,路徑需修改
options(digits=2)
###############原始數(shù)據(jù)下載base on R3.5.3###############
#GEOquery包,此處以數(shù)據(jù)集GSE3910為例,用getGEOSUppFiles()函數(shù)獲取原始數(shù)據(jù)
library(GEOquery)
rawdata = getGEOSuppFiles("GSE42568")
#因網(wǎng)速問題,本人手動下載了該數(shù)據(jù)集的前6個樣本數(shù)據(jù)并用于后面的分析,正常情況下使用上述函數(shù)下載數(shù)據(jù)后需要解壓成單個CEL格式文件,并存放在同一個文件夾中,以備后面使用
###############原始數(shù)據(jù)讀取----ReadAffy()函數(shù)
#使用choose.dir()函數(shù)選擇文件夾
dir <- choose.dir(caption = "Select folder")
#列出CEL文件,保存到變量
cel.files <- list.files(path = dir, pattern = ".+\\.cel", ignore.case = TRUE,
????????????full.names = TRUE, recursive = TRUE)
#查看文件名
basename(cel.files)
library(affy)
#讀入數(shù)據(jù)
data.raw <- ReadAffy(filenames = cel.files)
#讀入芯片的默認(rèn)樣品名稱是文件名,用sampleNames函數(shù)查看并修改:
sampleNames(data.raw)
###############樣本重命名--使用stri_sub函數(shù)刪除CEL等,僅保留GSM號
library(stringi)
#8或9或10
sampleNames(data.raw)<-stri_sub(sampleNames(data.raw),1,10)
sampleNames(data.raw)
###############下載樣本信息,即每個gsm的臨床信息
library(dplyr);library(rvest);library(stringr)
getmultiplegsmclin=function(a){
?baseurl="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
?link=paste(baseurl,a,sep ='')
?con = url(link)
?htmlCode = readLines(con)
?clin=read_html(htmlCode[302]) %>%
??html_nodes("td")
?clin=str_sub(clin, end=-11)
??clin=stri_replace_all(clin,replacement=",",regex="<br>")
?print(paste("writing clin data for",a))
?write.table(clin, "clin.csv", sep = ",", row.names = a,col.names=F, append = T)}
lapply(sampleNames(data.raw),getmultiplegsmclin)
###############構(gòu)建樣本分組信息
group_file=pData(data.raw)
#group_file$sample=rownames(group_file)
#根據(jù)實際情況修改
group_file$disease=c(rep(0,time=17),rep(1,time=104))
#查看分組信息
group_file
###############基因芯片數(shù)據(jù)預(yù)處理---質(zhì)量控制--查找并刪除芯片質(zhì)量差的芯片optional###############
#質(zhì)量控制包括:質(zhì)量分析報告、RLE箱線圖、NUSE箱線圖、RNA降解圖,通過上述分析,進而剔除質(zhì)量不合格的樣本
library(hgu133plus2cdf)
#R>3.5使用BiocManager::install("hgu133plus2.db")
#source("http://bioconductor.org/biocLite.R")
#biocLite("hgu133plus2.db")
library(hgu133plus2.db)
T##3.2.10 使用 arrayQualityMetrics 包進行質(zhì)量控制(一鍵生成所有圖),消耗大量內(nèi)存
library(arrayQualityMetrics)
arrayQualityMetrics(expressionset = data.raw,
??????????outdir = "fig",
??????????force = TRUE,
??????????do.logtransform = T)
dev.off()
###############基因芯片數(shù)據(jù)預(yù)處理:背景校正,標(biāo)準(zhǔn)化,匯總###############
#option1,如果樣本多會消耗大量內(nèi)存,三合一函數(shù)
#eset.mas<-expresso(afbatch = data.raw,bgcorrect.method = "mas",
#??????????normalize.method = "constant",pmcorrect.method ="mas",
#??????????summary.method = "mas")
#option2
#eset.mas<-justrma(data.raw)
eset.mas <- mas5(data.raw)
exprmat=exprs(eset.mas)
str(exprmat)
#刪除內(nèi)參探針
del <- grep("AFFX",rownames(exprmat)) #獲取affx的基因名
exprmat <- exprmat[-del,]
dim(exprmat)
#對數(shù)據(jù)做了歸一化處理
exprmat.quantiles = preprocessCore::normalize.quantiles(exprmat)
dimnames(exprmat.quantiles)=dimnames(exprmat)
#求探針表達均值并過濾低表達基因
expmean=apply(exprmat.quantiles,1,mean)
sd=apply(exprmat.quantiles,1,sd)
cv=sd/expmean
exprmat.quantiles<- exprmat.quantiles[expmean>quantile(expmean,0.25) & cv>quantile(cv,0.25),]
dim(exprmat.quantiles)
exprmat.quantiles<- log2(exprmat.quantiles)
#保存表達矩陣
#save(dataexp,files="GSE**_exp.RData")
write.csv(exprmat.quantiles,file="GS42568.csv",quote=F)
#limma進行差異基因分析
library(limma)
Exp <- factor(group_file$disease)
design <- model.matrix(~ 0+Exp)
rownames(design)=colnames(exprmat.quantiles)
#比較應(yīng)根據(jù)實際修改,實驗組-對照組
cont_matrix <- makeContrasts(bc = Exp1-Exp0,levels=design)
fit <- lmFit(exprmat.quantiles, design)
fit_contrast <- contrasts.fit(fit, cont_matrix)
fit_contrast <- eBayes(fit_contrast, trend=TRUE, robust=TRUE)
results <- decideTests(fit_contrast,p.value = 0.05,lfc = 1.3) #設(shè)置p和變化倍數(shù)閾值
summary(results)
volcanoplot(fit_contrast,coef=1,highlight=10,names=rownames(exprmat.quantiles))
top_genes <- topTable(fit_contrast, number = 100, adjust = "BH")
heatmap(exprmat.quantiles[rownames(top_genes),],labRow=F,labCol=F,ColSideColors=c(rep("#FF0000FF",17),rep("#BDFF00FF",104)),distfun=dist,?
????hclustfun = hclust,main = "blood from HIV resistant and negative women")?#熱圖樣本bar需要根據(jù)實際情況設(shè)置ColSideColors,需根據(jù)矩陣樣本順序設(shè)定顏色
DE_genes <- topTable(fit_contrast, number =summary(results)[1]+summary(results)[3], adjust = "BH")
write.csv(DE_genes,"DE_genes.csv")
write.csv(exprmat.quantiles[rownames(DE_genes),],"exprSet.significant.csv") #將差異基因保存輸出,下面分析可直接讀取該文件;
#exprSet.significant=read.csv("exprSet.significant.csv",header=T)?#下次直接讀取,進行后面的分析,差異基因一般要求幾百或者幾千個
#####WgcNA:讀取準(zhǔn)備數(shù)據(jù)#####
library(WgcNA);
enableWgcNAThreads(2)?
#memory.limit(6000) #限制占用最大內(nèi)存數(shù)
options(stringsAsFactors = FALSE);
dat0=read.csv("exprSet.significant.csv",header=TRUE)??
datSummary=dat0[,1];
dim(dat0) #查看數(shù)據(jù)形狀
datExpr = t(dat0[,2: ncol(dat0)]);?#轉(zhuǎn)置表達矩陣
dim(datExpr)
ArrayName= names(data.frame(dat0[,-1])) #列名提取為樣本名
GeneName= dat0[,1]?#首列提取為基因名
names(datExpr)=dat0[,1]?#指定datExpr列名
####有trait數(shù)據(jù)或運行demo數(shù)據(jù)可以去掉注釋#####
#datTraits=read.csv("trait.csv") #按照NCBI GEO數(shù)據(jù)集的樣本信息構(gòu)建csv,1列為樣本名稱,其他為性狀及其值,對照可為0,實驗組為1
#table(dimnames(datExpr)[[1]]==datTraits$sample)?
y=design[,2] #只有1種性狀時,有2個的要設(shè)置z
#z=datTraits$age
#sizeGrWindow(9, 5)
#pdf("ClusterTreeSamples.pdf")
#plotClusterTreeSamples(datExpr=datExpr, y=y)
#dev.off()
#rm(dat0);
#gc()
#####WgcNA:進行Power選擇#####
powers=c(seq(1,10,by=1),seq(12,16,by=2));
sft=pickSoftThreshold(datExpr, powerVector=powers,networkType = "signed")?
sizeGrWindow(9, 5); #打開畫布
pdf('choosing power.pdf');#生成pdf文件
par(mfrow = c(1,1)); #畫布布局,1行1列
cex1 = 0.9; #power閾值,可減為0.8
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
???xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence")); #畫圖
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red"); #添加文本
abline(h=0.90,col="red"); #畫高位0.9的水平紅色直線
dev.off() #關(guān)閉
# 同上畫Mean connectivity對power的函數(shù)
sizeGrWindow(9, 5);
pdf('mean connectivity.pdf');
plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red");
dev.off()
#####進行WgcNA#####
softPower =14?#參數(shù)可改,在上面那power圖里曲線平滑時的第一個數(shù),一般不大于12
Connectivity=softConnectivity(datExpr,corFnc = "cor", corOptions = "use ='p'",power=softPower,type = "signed")
pdf("scale-free.pdf");
scaleFreePlot(Connectivity,nBreaks = 10,truncated = FALSE,removeFirst = FALSE, main = "");
dev.off()
adjacency = adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'",type = "signed", power = softPower)
TOM = TOMsimilarity(adjacency,TOMType="signed");
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM), method = "average")
minModuleSize =30; #模塊最小基因數(shù),參數(shù)可改,模塊太少要改小
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 4, pamRespectsDendro = FALSE,minClusterSize = minModuleSize,cutHeight=0.99); #deepSplit = 0參數(shù)可改,0-4越大越敏感,模塊數(shù)越多,5000個基因一般10-40個模塊
table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)?#模塊編號轉(zhuǎn)換為顏色
table(dynamicColors)
MEList = moduleEigengenes(datExpr, colors = dynamicMods) #colors = dynamicColors
MEs = MEList$eigengenes
MEDiss = 1-cor(MEs);
METree = hclust(as.dist(MEDiss), method = "average");#
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
MEDissThres = 0.2
abline(h=MEDissThres, col = "red")
merge = mergeCloseModules(datExpr, dynamicMods, cutHeight = MEDissThres, verbose = 3); #合并相似的模塊為1個
mergedColors = merge$colors;
table(mergedColors)
mergedMEs = merge$newMEs;
sizeGrWindow(12, 9)
pdf("DendroAndColors.pdf")
plotDendroAndColors(geneTree, cbind(dynamicMods, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
dev.off()
moduleColors = mergedColors
MEs = mergedMEs;
MEDiss = 1-cor(MEs);
METree = hclust(as.dist(MEDiss), method = "average");#
pdf("METree.pdf")
plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
dev.off()
nSamples=nrow(datExpr)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p")); #計算基因?qū)儆谀硞€模塊的程度
MMPvalue = cbind.data.frame(datSummary,corPvalueStudent(as.matrix(geneModuleMembership), nSamples)); #計算其P值
write.table(data.frame(ArrayName,MEs),"MEs.csv",row.name=F) #輸出模塊基因表達信息
kMEdat=data.frame(geneModuleMembership,MMPvalue)
write.table(data.frame(datSummary,kMEdat),"kME-MMPvalue.csv",row.names=FALSE)
k.in=intramodularConnectivity(adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'", type = "signed", power = softPower), moduleColors,scaleByMax = FALSE)
datout=data.frame(datSummary, colorNEW=moduleColors, k.in)?
write.table(datout, file="OutputCancerNetwork.csv", sep=",", row.names=F) #輸出連接度及模塊信息
hubs??= chooseTopHubInEachModule(datExpr, moduleColors)
write.csv(data.frame(module=names(hubs),moduleColor=labels2colors(names(hubs)),hub=hubs),"num2color.csv",row.names=F)
#####模塊基因功能富集分析#####
gene=read.csv("OutputCancerNetwork.csv",header=T)
library(gProfileR)
for (i in unique(gene$colorNEW)[unique(gene$colorNEW)>0]){
?genes=subset(gene$datSummary,gene$colorNEW==i)
?go=gprofiler(as.vector(genes),?
????????organism = "hsapiens",numeric_ns="ENTREZGENE_ACC")[,-14]
?write.table(go,"module_enrichment.csv",append =T,row.names=rep(i,nrow(go)),sep=",")}
#####畫TOM及熱圖:可不做#####
pdf("TOM.pdf")
TOMplot(dissTOM , geneTree, moduleColors)
dev.off()
#畫MDS圖:可不做
pdf("MDS.pdf")
cmd1=cmdscale(as.dist(dissTOM),2)
par(mfrow=c(1,1))
plot(cmd1, col=as.character(moduleColors),?main="MDS plot",xlab="Scaling Dimension 1",ylab="Scaling Dimension 2")
dev.off()
#####畫module heatmap and the eigengene#####
pdf("modheatmap.pdf")
for (i in unique(moduleColors)[unique(moduleColors)!=0]){
ME=MEs[, paste("ME",i, sep="")] #ME是模塊的基因表達方向
par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
plotMat(t(scale(datExpr[,moduleColors==i ])),nrgcols=30,rlabels=F,rcols=labels2colors(i),main=labels2colors(i), cex.main=2)}
dev.off()
pdf("eigenebar.pdf")
for (i in unique(moduleColors)[unique(moduleColors)!=0]){
par(mar=c(5, 4.2, 0, 0.7))
barplot(ME, col=i, main="", cex.main=2,ylab="Eigengene expression",xlab=labels2colors(i))}
dev.off()
#####有性狀數(shù)據(jù)時做,模塊基因表達與trait進行關(guān)聯(lián)#####
signif(cor(y,MEs, use="p"),2)
p.values = corPvalueStudent(cor(y,MEs, use="p"), nSamples = nSamples) #trait和模塊相關(guān)分析的P值
#Measure of module significance as average gene significance
GS1=as.numeric(cor(y,datExpr, use="p")) #trait和基因表達的相關(guān)系數(shù)
GeneSignificance=abs(GS1)
# Next, module significance is defined as average gene significance.
ModuleSignificance=tapply(GeneSignificance, moduleColors, mean, na.rm=T) #求模塊內(nèi)GS1的均值
sizeGrWindow(8,7)
par(mfrow = c(1,1))
plotModuleSignificance(GeneSignificance[moduleColors!=0],moduleColors[moduleColors!=0])
#plot Gene significance (y-axis) vs. intramodular connectivity (x-axis) 畫圖gene sig和連接度散點
colorlevels=unique(moduleColors)
colorlevels=colorlevels[colorlevels!=0]
sizeGrWindow(9,6)
par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
par(mar = c(4,5,3,1))
for (i in c(1:length(colorlevels)))
{
?whichmodule=colorlevels[[i]];
?restrict1 = (moduleColors==whichmodule);
?verboseScatterplot(k.in$kWithin[restrict1],
???????????GeneSignificance[restrict1], col=moduleColors[restrict1],
???????????main=whichmodule,
???????????xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
}
#Generalizing intramodular connectivity for all genes on the array
datKME=signedKME(datExpr, MEs, outputColumnName="MM.")
# Display the first few rows of the data frame
head(datKME)
#####有性狀時做,篩選和性狀相關(guān)的基因#####
NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)
#write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv",row.names=F,sep=",")
MEsy = data.frame(y, MEs)
eigengeneSignificance = cor(MEsy, y); #計算模塊和性狀的相關(guān)系數(shù)
eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2
eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y)) #計算p值
namesME=names(MEsy)
# Form a summary data frame
out1=data.frame(t(data.frame(eigengeneSignificance,eigengeneSignificance.pvalue, namesME, t(MEsy))))
# Set appropriate row names
dimnames(out1)[[1]][1]="EigengeneSignificance"
dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"
dimnames(out1)[[1]][3]="ModuleEigengeneName"
dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]
# Write the data frame into a file
write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")
GeneName=dimnames(datExpr)[[2]]
GeneSummary=data.frame(GeneName, moduleColors, NS1)
write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")
datTraits=data.frame(ArrayName, MEsy)
dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",dimnames(datTraits)[[2]][2:length(namesME)],sep=".")
write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")
#Relationships among the top 30 most significant genes,correlation heatmaps for signed network:
sizeGrWindow(7,7)
NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
topList=rank(NS1$p.Weighted,ties.method="first")<=30
#gene.names= names(datExpr)[topList]
gene.names=GeneName[topList]
colnames(datExpr)=GeneName
# The following shows the correlations between the top genes
plotNetworkHeatmap(datExpr, plotGenes = gene.names,networkType="signed", useTOM=TRUE,power=softPower, main="signed correlations")
#####自動進行所有模塊的輸出automatic finish the Cytoscape mods,必須做shiny用#####
probes = dat0[,1]
n=length(unique(moduleColors)[unique(moduleColors)!=0])
pb <- txtProgressBar(min = 0, max = n, style = 3)?#添加進度條
for (p in 1:n) { modules=unique(moduleColors)[unique(moduleColors)!=0][p]
inModule = is.finite(match(moduleColors,modules));
modProbes = probes[inModule];
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
cyt = exportNetworkToCytoscape(modTOM,
????????????????edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
????????????????nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
????????????????weighted = TRUE,threshold = quantile(abs(modTOM),probs=0.8,nodeNames = modProbes ,nodeAttr = moduleColors[inModule]))??
#threshold can be replaced by quantile(abs(modTOM),probs=0.8)
setTxtProgressBar(pb, p)}
close(pb) #關(guān)閉進度條
#####有性狀數(shù)據(jù)時做,多個trait和module聯(lián)系起來#####
allTraits = read.csv("trait.csv",row.names = 1);
femaleSamples=rownames(datExpr);
#traitRows = match(femaleSamples, allTraits$arrayname);
traitRows=match(femaleSamples, names(allTraits))
datTraits = allTraits[,1:6];
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
moduleTraitCor = WgcNA::cor(MEs[names(MEs)!="ME0"], datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
??????????signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,#moduleTraitCor
????????# xLabels = names(datTraits),
????????xLabels =names(datTraits), #如果只有1個性狀,圖比較難看,顯示性狀與模塊的關(guān)系
????????yLabels = names(MEs[names(MEs)!="ME0"]),
????????ySymbols = names(MEs[names(MEs)!="ME0"]),
????????colorLabels = FALSE,
????????colors = greenWhiteRed(50),
????????textMatrix =textMatrix,#textMatrix
????????setStdMargins = FALSE,
????????cex.text = 1,
????????zlim = c(-1,1),
????????main = paste("Module-trait relationships"))
#可以輸出熱圖中的數(shù)據(jù),textMatrix可以刪去
out2=data.frame(data.frame(moduleTraitCor,moduleTraitPvalue))
write.table(out2, file="trait-module relationship.csv", row.names=TRUE, col.names = TRUE, sep=",")
#cor函數(shù)可以有use和method參數(shù)的設(shè)置 : Missing values present in input data. Consider using use = 'pairwise.complete.obs'.
#注意:大小寫是不同的,特別是文件名等!
###############suvival analysis(optional)##############
library(tidyverse);library(tidytidbits);library(survivalanalysis);library(survival);library(survminer);library(pROC)
#####Fit survival data using the Kaplan-Meier method
data=cbind(MEs[names(MEs)!="ME0"],allTraits[,7:10])
surv_object <- Surv(time = allTraits$overall.survival.time_days, event = allTraits$overall.survival.event)
pdf("overall_sruvival.pdf")
for (a in 1:6){
?group=as.numeric(MEs[,a]>median(unlist(MEs[,a]))) #change a in combine_exp[a,]
?fit2 <- survfit(surv_object ~ group, data = data)
?print(ggsurvplot(fit2, data =data, pval = TRUE,risk.table=TRUE, xlab="Time (days)",title=names(data)[a]))?#use print
}
dev.off()