最美情侣中文字幕电影,在线麻豆精品传媒,在线网站高清黄,久久黄色视频

歡迎光臨散文網(wǎng) 會員登陸 & 注冊

WGCNA極速入門

2021-12-29 16:12 作者:Biogeek_Lau  | 我要投稿

setwd("E:/demo") #設(shè)定工作目錄,路徑需修改

options(digits=2)

###############原始數(shù)據(jù)下載base on R3.5.3###############

#GEOquery包,此處以數(shù)據(jù)集GSE3910為例,用getGEOSUppFiles()函數(shù)獲取原始數(shù)據(jù)

library(GEOquery)

rawdata = getGEOSuppFiles("GSE42568")

#因網(wǎng)速問題,本人手動下載了該數(shù)據(jù)集的前6個樣本數(shù)據(jù)并用于后面的分析,正常情況下使用上述函數(shù)下載數(shù)據(jù)后需要解壓成單個CEL格式文件,并存放在同一個文件夾中,以備后面使用

###############原始數(shù)據(jù)讀取----ReadAffy()函數(shù)

#使用choose.dir()函數(shù)選擇文件夾

dir <- choose.dir(caption = "Select folder")

#列出CEL文件,保存到變量

cel.files <- list.files(path = dir, pattern = ".+\\.cel", ignore.case = TRUE,

????????????full.names = TRUE, recursive = TRUE)

#查看文件名

basename(cel.files)

library(affy)

#讀入數(shù)據(jù)

data.raw <- ReadAffy(filenames = cel.files)

#讀入芯片的默認(rèn)樣品名稱是文件名,用sampleNames函數(shù)查看并修改:

sampleNames(data.raw)

###############樣本重命名--使用stri_sub函數(shù)刪除CEL等,僅保留GSM號

library(stringi)

#8或9或10

sampleNames(data.raw)<-stri_sub(sampleNames(data.raw),1,10)

sampleNames(data.raw)

###############下載樣本信息,即每個gsm的臨床信息

library(dplyr);library(rvest);library(stringr)

getmultiplegsmclin=function(a){

?baseurl="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="

?link=paste(baseurl,a,sep ='')

?con = url(link)

?htmlCode = readLines(con)

?clin=read_html(htmlCode[302]) %>%

??html_nodes("td")

?clin=str_sub(clin, end=-11)

??clin=stri_replace_all(clin,replacement=",",regex="<br>")

?print(paste("writing clin data for",a))

?write.table(clin, "clin.csv", sep = ",", row.names = a,col.names=F, append = T)}

lapply(sampleNames(data.raw),getmultiplegsmclin)

###############構(gòu)建樣本分組信息

group_file=pData(data.raw)

#group_file$sample=rownames(group_file)

#根據(jù)實際情況修改

group_file$disease=c(rep(0,time=17),rep(1,time=104))

#查看分組信息

group_file

###############基因芯片數(shù)據(jù)預(yù)處理---質(zhì)量控制--查找并刪除芯片質(zhì)量差的芯片optional###############

#質(zhì)量控制包括:質(zhì)量分析報告、RLE箱線圖、NUSE箱線圖、RNA降解圖,通過上述分析,進而剔除質(zhì)量不合格的樣本

library(hgu133plus2cdf)

#R>3.5使用BiocManager::install("hgu133plus2.db")

#source("http://bioconductor.org/biocLite.R")

#biocLite("hgu133plus2.db")

library(hgu133plus2.db)

T##3.2.10 使用 arrayQualityMetrics 包進行質(zhì)量控制(一鍵生成所有圖),消耗大量內(nèi)存

library(arrayQualityMetrics)

arrayQualityMetrics(expressionset = data.raw,

??????????outdir = "fig",

??????????force = TRUE,

??????????do.logtransform = T)

dev.off()


###############基因芯片數(shù)據(jù)預(yù)處理:背景校正,標(biāo)準(zhǔn)化,匯總###############

#option1,如果樣本多會消耗大量內(nèi)存,三合一函數(shù)

#eset.mas<-expresso(afbatch = data.raw,bgcorrect.method = "mas",

#??????????normalize.method = "constant",pmcorrect.method ="mas",

#??????????summary.method = "mas")

#option2

#eset.mas<-justrma(data.raw)

eset.mas <- mas5(data.raw)

exprmat=exprs(eset.mas)

str(exprmat)

#刪除內(nèi)參探針

del <- grep("AFFX",rownames(exprmat)) #獲取affx的基因名

exprmat <- exprmat[-del,]

dim(exprmat)

#對數(shù)據(jù)做了歸一化處理

exprmat.quantiles = preprocessCore::normalize.quantiles(exprmat)

dimnames(exprmat.quantiles)=dimnames(exprmat)

#求探針表達均值并過濾低表達基因

expmean=apply(exprmat.quantiles,1,mean)

sd=apply(exprmat.quantiles,1,sd)

cv=sd/expmean

exprmat.quantiles<- exprmat.quantiles[expmean>quantile(expmean,0.25) & cv>quantile(cv,0.25),]

dim(exprmat.quantiles)

exprmat.quantiles<- log2(exprmat.quantiles)

#保存表達矩陣

#save(dataexp,files="GSE**_exp.RData")

write.csv(exprmat.quantiles,file="GS42568.csv",quote=F)

#limma進行差異基因分析

library(limma)

Exp <- factor(group_file$disease)

design <- model.matrix(~ 0+Exp)

rownames(design)=colnames(exprmat.quantiles)

#比較應(yīng)根據(jù)實際修改,實驗組-對照組

cont_matrix <- makeContrasts(bc = Exp1-Exp0,levels=design)

fit <- lmFit(exprmat.quantiles, design)

fit_contrast <- contrasts.fit(fit, cont_matrix)

fit_contrast <- eBayes(fit_contrast, trend=TRUE, robust=TRUE)

results <- decideTests(fit_contrast,p.value = 0.05,lfc = 1.3) #設(shè)置p和變化倍數(shù)閾值

summary(results)

volcanoplot(fit_contrast,coef=1,highlight=10,names=rownames(exprmat.quantiles))

top_genes <- topTable(fit_contrast, number = 100, adjust = "BH")

heatmap(exprmat.quantiles[rownames(top_genes),],labRow=F,labCol=F,ColSideColors=c(rep("#FF0000FF",17),rep("#BDFF00FF",104)),distfun=dist,?

????hclustfun = hclust,main = "blood from HIV resistant and negative women")?#熱圖樣本bar需要根據(jù)實際情況設(shè)置ColSideColors,需根據(jù)矩陣樣本順序設(shè)定顏色

DE_genes <- topTable(fit_contrast, number =summary(results)[1]+summary(results)[3], adjust = "BH")

write.csv(DE_genes,"DE_genes.csv")

write.csv(exprmat.quantiles[rownames(DE_genes),],"exprSet.significant.csv") #將差異基因保存輸出,下面分析可直接讀取該文件;

#exprSet.significant=read.csv("exprSet.significant.csv",header=T)?#下次直接讀取,進行后面的分析,差異基因一般要求幾百或者幾千個


#####WgcNA:讀取準(zhǔn)備數(shù)據(jù)#####

library(WgcNA);

enableWgcNAThreads(2)?

#memory.limit(6000) #限制占用最大內(nèi)存數(shù)

options(stringsAsFactors = FALSE);

dat0=read.csv("exprSet.significant.csv",header=TRUE)??

datSummary=dat0[,1];

dim(dat0) #查看數(shù)據(jù)形狀

datExpr = t(dat0[,2: ncol(dat0)]);?#轉(zhuǎn)置表達矩陣

dim(datExpr)

ArrayName= names(data.frame(dat0[,-1])) #列名提取為樣本名

GeneName= dat0[,1]?#首列提取為基因名

names(datExpr)=dat0[,1]?#指定datExpr列名


####有trait數(shù)據(jù)或運行demo數(shù)據(jù)可以去掉注釋#####

#datTraits=read.csv("trait.csv") #按照NCBI GEO數(shù)據(jù)集的樣本信息構(gòu)建csv,1列為樣本名稱,其他為性狀及其值,對照可為0,實驗組為1

#table(dimnames(datExpr)[[1]]==datTraits$sample)?

y=design[,2] #只有1種性狀時,有2個的要設(shè)置z

#z=datTraits$age

#sizeGrWindow(9, 5)

#pdf("ClusterTreeSamples.pdf")

#plotClusterTreeSamples(datExpr=datExpr, y=y)

#dev.off()

#rm(dat0);

#gc()


#####WgcNA:進行Power選擇#####

powers=c(seq(1,10,by=1),seq(12,16,by=2));

sft=pickSoftThreshold(datExpr, powerVector=powers,networkType = "signed")?

sizeGrWindow(9, 5); #打開畫布

pdf('choosing power.pdf');#生成pdf文件

par(mfrow = c(1,1)); #畫布布局,1行1列

cex1 = 0.9; #power閾值,可減為0.8

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],

???xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence")); #畫圖

text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red"); #添加文本

abline(h=0.90,col="red"); #畫高位0.9的水平紅色直線

dev.off() #關(guān)閉

# 同上畫Mean connectivity對power的函數(shù)

sizeGrWindow(9, 5);

pdf('mean connectivity.pdf');

plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))

text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red");

dev.off()


#####進行WgcNA#####

softPower =14?#參數(shù)可改,在上面那power圖里曲線平滑時的第一個數(shù),一般不大于12

Connectivity=softConnectivity(datExpr,corFnc = "cor", corOptions = "use ='p'",power=softPower,type = "signed")

pdf("scale-free.pdf");

scaleFreePlot(Connectivity,nBreaks = 10,truncated = FALSE,removeFirst = FALSE, main = "");

dev.off()

adjacency = adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'",type = "signed", power = softPower)

TOM = TOMsimilarity(adjacency,TOMType="signed");

dissTOM = 1-TOM

geneTree = hclust(as.dist(dissTOM), method = "average")

minModuleSize =30; #模塊最小基因數(shù),參數(shù)可改,模塊太少要改小

dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 4, pamRespectsDendro = FALSE,minClusterSize = minModuleSize,cutHeight=0.99); #deepSplit = 0參數(shù)可改,0-4越大越敏感,模塊數(shù)越多,5000個基因一般10-40個模塊

table(dynamicMods)

dynamicColors = labels2colors(dynamicMods)?#模塊編號轉(zhuǎn)換為顏色

table(dynamicColors)

MEList = moduleEigengenes(datExpr, colors = dynamicMods) #colors = dynamicColors

MEs = MEList$eigengenes

MEDiss = 1-cor(MEs);

METree = hclust(as.dist(MEDiss), method = "average");#

sizeGrWindow(7, 6)

plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")

MEDissThres = 0.2

abline(h=MEDissThres, col = "red")

merge = mergeCloseModules(datExpr, dynamicMods, cutHeight = MEDissThres, verbose = 3); #合并相似的模塊為1個

mergedColors = merge$colors;

table(mergedColors)

mergedMEs = merge$newMEs;

sizeGrWindow(12, 9)

pdf("DendroAndColors.pdf")

plotDendroAndColors(geneTree, cbind(dynamicMods, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)

dev.off()

moduleColors = mergedColors

MEs = mergedMEs;

MEDiss = 1-cor(MEs);

METree = hclust(as.dist(MEDiss), method = "average");#

pdf("METree.pdf")

plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")

dev.off()

nSamples=nrow(datExpr)

geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p")); #計算基因?qū)儆谀硞€模塊的程度

MMPvalue = cbind.data.frame(datSummary,corPvalueStudent(as.matrix(geneModuleMembership), nSamples)); #計算其P值

write.table(data.frame(ArrayName,MEs),"MEs.csv",row.name=F) #輸出模塊基因表達信息

kMEdat=data.frame(geneModuleMembership,MMPvalue)

write.table(data.frame(datSummary,kMEdat),"kME-MMPvalue.csv",row.names=FALSE)

k.in=intramodularConnectivity(adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'", type = "signed", power = softPower), moduleColors,scaleByMax = FALSE)

datout=data.frame(datSummary, colorNEW=moduleColors, k.in)?

write.table(datout, file="OutputCancerNetwork.csv", sep=",", row.names=F) #輸出連接度及模塊信息

hubs??= chooseTopHubInEachModule(datExpr, moduleColors)

write.csv(data.frame(module=names(hubs),moduleColor=labels2colors(names(hubs)),hub=hubs),"num2color.csv",row.names=F)

#####模塊基因功能富集分析#####

gene=read.csv("OutputCancerNetwork.csv",header=T)

library(gProfileR)

for (i in unique(gene$colorNEW)[unique(gene$colorNEW)>0]){

?genes=subset(gene$datSummary,gene$colorNEW==i)

?go=gprofiler(as.vector(genes),?

????????organism = "hsapiens",numeric_ns="ENTREZGENE_ACC")[,-14]

?write.table(go,"module_enrichment.csv",append =T,row.names=rep(i,nrow(go)),sep=",")}


#####畫TOM及熱圖:可不做#####

pdf("TOM.pdf")

TOMplot(dissTOM , geneTree, moduleColors)

dev.off()

#畫MDS圖:可不做

pdf("MDS.pdf")

cmd1=cmdscale(as.dist(dissTOM),2)

par(mfrow=c(1,1))

plot(cmd1, col=as.character(moduleColors),?main="MDS plot",xlab="Scaling Dimension 1",ylab="Scaling Dimension 2")

dev.off()


#####畫module heatmap and the eigengene#####

pdf("modheatmap.pdf")

for (i in unique(moduleColors)[unique(moduleColors)!=0]){

ME=MEs[, paste("ME",i, sep="")] #ME是模塊的基因表達方向

par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))

plotMat(t(scale(datExpr[,moduleColors==i ])),nrgcols=30,rlabels=F,rcols=labels2colors(i),main=labels2colors(i), cex.main=2)}

dev.off()

pdf("eigenebar.pdf")

for (i in unique(moduleColors)[unique(moduleColors)!=0]){

par(mar=c(5, 4.2, 0, 0.7))

barplot(ME, col=i, main="", cex.main=2,ylab="Eigengene expression",xlab=labels2colors(i))}

dev.off()


#####有性狀數(shù)據(jù)時做,模塊基因表達與trait進行關(guān)聯(lián)#####

signif(cor(y,MEs, use="p"),2)

p.values = corPvalueStudent(cor(y,MEs, use="p"), nSamples = nSamples) #trait和模塊相關(guān)分析的P值

#Measure of module significance as average gene significance

GS1=as.numeric(cor(y,datExpr, use="p")) #trait和基因表達的相關(guān)系數(shù)

GeneSignificance=abs(GS1)

# Next, module significance is defined as average gene significance.

ModuleSignificance=tapply(GeneSignificance, moduleColors, mean, na.rm=T) #求模塊內(nèi)GS1的均值

sizeGrWindow(8,7)

par(mfrow = c(1,1))

plotModuleSignificance(GeneSignificance[moduleColors!=0],moduleColors[moduleColors!=0])

#plot Gene significance (y-axis) vs. intramodular connectivity (x-axis) 畫圖gene sig和連接度散點

colorlevels=unique(moduleColors)

colorlevels=colorlevels[colorlevels!=0]

sizeGrWindow(9,6)

par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))

par(mar = c(4,5,3,1))

for (i in c(1:length(colorlevels)))

{

?whichmodule=colorlevels[[i]];

?restrict1 = (moduleColors==whichmodule);

?verboseScatterplot(k.in$kWithin[restrict1],

???????????GeneSignificance[restrict1], col=moduleColors[restrict1],

???????????main=whichmodule,

???????????xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)

}

#Generalizing intramodular connectivity for all genes on the array

datKME=signedKME(datExpr, MEs, outputColumnName="MM.")

# Display the first few rows of the data frame

head(datKME)


#####有性狀時做,篩選和性狀相關(guān)的基因#####

NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)

GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)

#write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv",row.names=F,sep=",")

MEsy = data.frame(y, MEs)

eigengeneSignificance = cor(MEsy, y); #計算模塊和性狀的相關(guān)系數(shù)

eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2

eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y)) #計算p值

namesME=names(MEsy)

# Form a summary data frame

out1=data.frame(t(data.frame(eigengeneSignificance,eigengeneSignificance.pvalue, namesME, t(MEsy))))

# Set appropriate row names

dimnames(out1)[[1]][1]="EigengeneSignificance"

dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"

dimnames(out1)[[1]][3]="ModuleEigengeneName"

dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]

# Write the data frame into a file

write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")

GeneName=dimnames(datExpr)[[2]]

GeneSummary=data.frame(GeneName, moduleColors, NS1)

write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")

datTraits=data.frame(ArrayName, MEsy)

dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",dimnames(datTraits)[[2]][2:length(namesME)],sep=".")

write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")

#Relationships among the top 30 most significant genes,correlation heatmaps for signed network:

sizeGrWindow(7,7)

NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)

topList=rank(NS1$p.Weighted,ties.method="first")<=30

#gene.names= names(datExpr)[topList]

gene.names=GeneName[topList]

colnames(datExpr)=GeneName

# The following shows the correlations between the top genes

plotNetworkHeatmap(datExpr, plotGenes = gene.names,networkType="signed", useTOM=TRUE,power=softPower, main="signed correlations")


#####自動進行所有模塊的輸出automatic finish the Cytoscape mods,必須做shiny用#####

probes = dat0[,1]

n=length(unique(moduleColors)[unique(moduleColors)!=0])

pb <- txtProgressBar(min = 0, max = n, style = 3)?#添加進度條

for (p in 1:n) { modules=unique(moduleColors)[unique(moduleColors)!=0][p]

inModule = is.finite(match(moduleColors,modules));

modProbes = probes[inModule];

modTOM = TOM[inModule, inModule];

dimnames(modTOM) = list(modProbes, modProbes)

cyt = exportNetworkToCytoscape(modTOM,

????????????????edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),

????????????????nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),

????????????????weighted = TRUE,threshold = quantile(abs(modTOM),probs=0.8,nodeNames = modProbes ,nodeAttr = moduleColors[inModule]))??

#threshold can be replaced by quantile(abs(modTOM),probs=0.8)

setTxtProgressBar(pb, p)}

close(pb) #關(guān)閉進度條


#####有性狀數(shù)據(jù)時做,多個trait和module聯(lián)系起來#####

allTraits = read.csv("trait.csv",row.names = 1);

femaleSamples=rownames(datExpr);

#traitRows = match(femaleSamples, allTraits$arrayname);

traitRows=match(femaleSamples, names(allTraits))

datTraits = allTraits[,1:6];

nGenes = ncol(datExpr);

nSamples = nrow(datExpr);

moduleTraitCor = WgcNA::cor(MEs[names(MEs)!="ME0"], datTraits, use = "p");

moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

sizeGrWindow(10,6)

# Will display correlations and their p-values

textMatrix = paste(signif(moduleTraitCor, 2), "\n(",

??????????signif(moduleTraitPvalue, 1), ")", sep = "");

dim(textMatrix) = dim(moduleTraitCor)

par(mar = c(6, 8.5, 3, 3));

# Display the correlation values within a heatmap plot

labeledHeatmap(Matrix = moduleTraitCor,#moduleTraitCor

????????# xLabels = names(datTraits),

????????xLabels =names(datTraits), #如果只有1個性狀,圖比較難看,顯示性狀與模塊的關(guān)系

????????yLabels = names(MEs[names(MEs)!="ME0"]),

????????ySymbols = names(MEs[names(MEs)!="ME0"]),

????????colorLabels = FALSE,

????????colors = greenWhiteRed(50),

????????textMatrix =textMatrix,#textMatrix

????????setStdMargins = FALSE,

????????cex.text = 1,

????????zlim = c(-1,1),

????????main = paste("Module-trait relationships"))

#可以輸出熱圖中的數(shù)據(jù),textMatrix可以刪去

out2=data.frame(data.frame(moduleTraitCor,moduleTraitPvalue))

write.table(out2, file="trait-module relationship.csv", row.names=TRUE, col.names = TRUE, sep=",")

#cor函數(shù)可以有use和method參數(shù)的設(shè)置 : Missing values present in input data. Consider using use = 'pairwise.complete.obs'.

#注意:大小寫是不同的,特別是文件名等!


###############suvival analysis(optional)##############

library(tidyverse);library(tidytidbits);library(survivalanalysis);library(survival);library(survminer);library(pROC)

#####Fit survival data using the Kaplan-Meier method

data=cbind(MEs[names(MEs)!="ME0"],allTraits[,7:10])

surv_object <- Surv(time = allTraits$overall.survival.time_days, event = allTraits$overall.survival.event)

pdf("overall_sruvival.pdf")

for (a in 1:6){

?group=as.numeric(MEs[,a]>median(unlist(MEs[,a]))) #change a in combine_exp[a,]

?fit2 <- survfit(surv_object ~ group, data = data)

?print(ggsurvplot(fit2, data =data, pval = TRUE,risk.table=TRUE, xlab="Time (days)",title=names(data)[a]))?#use print

}

dev.off()


WGCNA極速入門的評論 (共 條)

分享到微博請遵守國家法律
英德市| 鄯善县| 金平| 乐山市| 涞水县| 海丰县| 阿瓦提县| 乌兰浩特市| 仲巴县| 阿拉善左旗| 广元市| 九江县| 顺义区| 鞍山市| 贵定县| 苏尼特右旗| 财经| 西贡区| 阳曲县| 宁阳县| 邯郸市| 梧州市| 江门市| 赣州市| 南开区| 宜州市| 新乡市| 革吉县| 西乡县| 寿阳县| 咸阳市| 扶风县| 珠海市| 拜泉县| 济源市| 东乌| 通城县| 海门市| 津市市| 常德市| 平江县|