「生信技能樹」三陰性乳腺癌表達(dá)矩陣探索

操作數(shù)據(jù)與視頻案例不一樣,已根據(jù)實(shí)際情況改動,請注意契合度。
library("GEOquery")
load("GSE55235.Rdata")
a=gset[[1]]
dat=exprs(a)
pd=pData(a)
#這里跟案例操作不一樣,因?yàn)槲矣玫腉SE里有三種類型數(shù)據(jù),所以我選取了其中兩種
RA=rownames(pd[grepl('disease state: rheumatoid arthritis',as.character(pd$characteristics_ch1)),])
NOR=rownames(pd[grepl('disease state: healthy control',as.character(pd$characteristics_ch1)),])
dat=dat[,c(RA,NOR)]
group_list=c(rep("RA",times=10),rep("NOR",times=10))
save(dat,group_list,file="expression and pd")
rm(list = ls())
#PCA
load(file = "expression and pd")
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list)??
library("ggplot2")
library("factoextra")
library("FactoMineR")
#the the variable group_list (index =22284)is removed?
dat.pca <- PCA(dat[,-22284],graph=FALSE)
fviz_pca_ind(dat.pca,
???????geom.ind = "point",
???????col.ind = dat$group_list,
???????palette = c("#00AFBB","#E7B800"),
???????addEllipses = TRUE,
???????legend.title = "Groups"
)
rm(list = ls())
# heatmap
load(file = "expression and pd")
cg=names(tail(sort(apply(dat,1,sd)),20))
library(pheatmap)
n=t(scale(t(dat[cg,])))
#讓圖區(qū)別明顯
n[n>2]=2
n[n<-2]=-2
pheatmap(n,show_rownames = F, show_colnames = F)
#加分組信息
ac=data.frame(g=group_list)
row.names(ac)=colnames(n)
pheatmap(n,annotation_col = ac)
#boxplot
rm(list = ls())
load(file = "expression and pd")
library("ggplot2")
library("ggpubr")
bp=function(g){library("ggplot2")
?library("ggpubr")
?df=data.frame(gene=g,stage=group_list)
?p <- ggboxplot(df,x="stage",y="gene",
????????color="stage",palette="jco",
????????add="jitter")
?#add p value
?p + stat_compare_means()
}
bp(dat[1,])
bp(dat[2,])
rm(list = ls())
#差異分析
load(file = "expression and pd")
library("limma")
design=model.matrix(~factor(group_list))
fit=lmFit(dat,design)
fit=eBayes(fit)
options(digits = 4)
topTable(fit,coef=2,adjust="BH")
deg=topTable(fit,coef=2,adjust="BH",number = Inf)
save(deg,file = "P value?")
rm(list = ls())
#基因ID轉(zhuǎn)換
load(file = "P value")
deg$probe_id=rownames(deg)
library("hgu133plus2.db")
ids=toTable(hgu133plus2SYMBOL)
deg=merge(deg,ids,by="probe_id")
save(deg,file = "symbol")
#繪制火山圖
if(T){
?nrDEG=deg
?head(nrDEG)
?attach(nrDEG)
?plot(logFC,-log10(P.Value))
?library("ggplot2")
?library("ggpubr")
?df=nrDEG
?df$v=-log10(P.Value)
?ggscatter(df,x="logFC",y="v",size=0.5)
??
?df$g=ifelse(df$P.Value>0.01,"stable",
???????ifelse(df$logFC>2,"up",
???????????ifelse(df$logFC<-2,"down","stable"))
?)
?table(df$g)
?df$name=rownames(df)
?ggscatter(df,x="logFC",y="v",size=0.5,color="g")
?ggscatter(df,x="logFC",y="v",color="g",size=0.5,
??????label="symbol",repel=T,
??????label.select=c("MMP3","MT1X"),
??????palette=c("#00AFBB","#E78800","#FC4E07"))
??
?ggscatter(df,x="AveExpr",y="logFC",size=0.2)
?df$p_c=ifelse(df$P.Value<0.001,"p<0.001",
????????ifelse(df$P.Value<0.01,"0.001<p<0.01","p>0.01"))
?table(df$p_c)
?ggscatter(df,x="AveExpr",y="logFC",color="p_c",size=0.2,
??????palette=c("green","red","black"))
??
}
#繪制熱圖
if(T){
?#熱圖需要矩陣數(shù)據(jù)
?load(file = "expression and pd")
?load(file = "P value")
?dat[1:4,1:4]
?#取探針名
?x=deg$logFC
?deg$probe_id=rownames(deg)
?names(x)=deg$probe_id
?#根據(jù)logFC大小來取探針名;head前多少;tail后多少
?cg=c(names(head(sort(x),100)),
????names(tail(sort(x),100)))
?library(pheatmap)
?pheatmap(dat[cg,],show_colnames=F,show_rownames=F)
?#歸一化后畫圖
?n=t(scale(t(dat[cg,])))
?n[n>2]=2
?n[n<-2]=-2
?n[1:4,1:4]
?pheatmap(n,show_colnames=F,show_rownames=F)
?#加回group_list
?ac=data.frame(g=group_list)
?rownames(ac)=colnames(n)
?pheatmap(n,show_colnames=F,
??????show_rownames=F,
??????#不排序就是F,默認(rèn)排序T
??????cluster_cols=F,
??????annotation_col=ac)
}
rm(list = ls())
#注釋
load(file = "symbol")
head(deg)
##這里出大問題,我的logFC運(yùn)行代碼后全部變成閾值了,待解決;然后我繞了一下路采用輔助列恢復(fù)原值
logFC_t=648.43
logFC_true=deg$logFC
deg=cbind(deg,logFC_true)
deg$g=ifelse(deg$P.Value>0.01,"stable",
???????ifelse( deg$logFC >logFC_t,"up",
??????????ifelse( deg$logFC<-logFC_t,"down","stable"))
)
table(deg$g)
deg$logFC=deg$logFC_true
deg=deg[,-9]
#有些包不需要加載,同時關(guān)系到下面的一些函數(shù),發(fā)現(xiàn)函數(shù)找不到時嘗試重新加載包
library("ggplot2")
library("clusterProfiler")
library("AnnotationDbi")
library("stats4")
library("BiocGenerics")
library("org.Hs.eg.db")
df <- bitr(unique(deg$symbol),fromType="SYMBOL",
??????toType=c("ENTREZID"),
??????OrgDb=org.Hs.eg.db)
head(df)
head(deg)
deg$SYMBOL=rownames(deg)
deg=merge(deg,df,by.y="SYMBOL",by.x="symbol")
head(deg)
gene_up=deg[deg$g=="up","ENTREZID"]
gene_down=deg[deg$g=="down","ENTREZID"]
gene_diff=c(gene_up,gene_down)
#
gene_all=as.character(deg[,"ENTREZID"])
data(geneList,package = "DOSE")
head(geneList)
#這里兩個圖還沒有理解
boxplot(geneList)
boxplot(deg$logFC)
geneList=deg$logFC
names(geneList)=deg$ENTREZID
geneList=sort(geneList,decreasing = T)
## KEGG pathway analysis
if(T){
?### over=representation test
?kk.up <- enrichKEGG(gene = gene_up,
???????????organism = "hsa",
???????????universe = gene_all,
???????????pvalueCutoff = 0.9,
???????????qvalueCutoff = 0.9)
?head(kk.up)[,1:6]
?kk.down <- enrichKEGG(gene = gene_down,
????????????organism = "hsa",
????????????universe = gene_all,
????????????pvalueCutoff = 0.05)
?head(kk.down)[,1:6]
?kk.diff <- enrichKEGG(gene = gene_diff,
????????????organism = "hsa",
????????????pvalueCutoff = 0.05)
?head(kk.diff)[,1:6]
??
?kegg_diff_dt <- as.data.frame(kk.diff)
?kegg_down_dt <- as.data.frame(kk.down)
?kegg_up_dt <- as.data.frame(kk.up)
?down_kegg <- kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
?up_kegg <- kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
?#這里的自定義函數(shù)指路https://www.jianshu.com/p/0dd8f42a92ba
?
?source("functions.R")
?g_kegg=kegg_plot(up_kegg,down_kegg)
?print(g_kegg)
??
?ggsave(g_kegg,filename="kegg_up_down.png")
??
?#GSEA
?#這里報(bào)錯不用管
?kk_gse <- gseKEGG(geneList = geneList,
??????????organism = "hsa",
??????????verbose = FALSE)
?head(kk_gse)[,1:6]
?gseaplot(kk_gse,geneSetID = rownames(kk_gse[1,]))
??
?down_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];down_kegg$group
?up_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];up_kegg$group
??
?source("functions.R")
?#這里有錯,待解決:Error in `$<-.data.frame`(`*tmp*`, "pvalue", value = numeric(0)) : 替換數(shù)據(jù)里有0行,但數(shù)據(jù)有4
?g_kegg=kegg_plot(up_kegg,down_kegg)
?print(g_kegg)
?ggsave(g_kegg,filename = "kegg_up_down_gsea.png")
??
}
#GO databases analysis
{
g_list=list(gene_up=gene_up,
??????gene_down=gene_down,
??????gene_diff=gene_diff)
if(F){
?go_enrich_results <- lapply(g_list,function(gene){
??lapply(c("BP","MF","CC"),function(ont){
???cat(paste("Now process",ont))
???ego <- enrichGO(gene = gene,
???????????universe = gene_all,
???????????OrgDb = org.Hs.eg.db,
???????????ont = ont,
???????????pAdjustMethod = "BH",
???????????pvalueCutoff = 0.99,
???????????qvalueCutoff = 0.99,
???????????readable = TRUE)
???print(head(ego))
???return(ego)
??})
?})
}
}
#根據(jù)需要畫圖
rm(list = ls())
options(stringsAsFactors = F)
load(file = "expression and pd")
dat[1:4,1:4]
library("hgu133plus2.db")
p2s=toTable(hgu133plus2SYMBOL)
a=p2s$symbol %in% c()
np=p2s[a,1]
ng=p2s[a,2]
dat[np,1:4]
x=dat[np,]
rownames(x)=paste(ng,np,sep = ":")
library('pheatmap')
tmp=data.frame(group=group_list)
rownames(tmp)=colnames(x)
pheatmap(x,annotation_col=tmp)