TCGA-19-臨床數(shù)據(jù)整理及人群分組

代碼解釋:
# 07 miRNA差異表達
rm(list = ls())
# miRNA數(shù)據(jù)下載及整理
library(TCGAbiolinks)
library(limma)
# 查看有哪些分類數(shù)據(jù)?project="TCGA-CHOL"
getProjectSummary("TCGA-CHOL")
query <- GDCquery(project = 'TCGA-CHOL',?
???????????data.category = "Transcriptome Profiling",?
???????????data.type = "miRNA Expression Quantification",?
???????????experimental.strategy = "miRNA-Seq",
???????????workflow.type = "BCGSC miRNA Profiling")
GDCdownload(query)
expData<- GDCprepare(query = query)?
#讀取下載的query,用GDCprepare函數(shù)以SummarizedExperiment形式讀入rstudio
expData[1:4,1:4]?# counts第二列,rpm第三列
#這個代碼的用處就是提取前4行和前4列展示出來,沒用
dim(expData)
#查看維度,沒用的代碼
expr <- expData[,seq(2,136,3)]
#看OneNote解釋
dim(expr)
colnames(expr) <- substring(colnames(expr),12)??
# substr()需要指定stop值,substring()不需要
#提取列標題的12位及以后的字符串作為標題
expr <- data.frame(expData$miRNA_ID,expr)
#expData中提取出來的miRNA_ID列與expr數(shù)據(jù)框合并,重新命名為expr
names(expr)[1] <- 'id'
#把exper列名命名為id
#這樣就得到未處理的counts矩陣
# 篩選
expr=as.matrix(expr)
#強制轉(zhuǎn)為矩陣
rownames(expr)=expr[,1]
#第一列命名為行名
expr.matrix=expr[,2:ncol(expr)]
#提取除第一列以外的所有列,命名為expr.matrix
dimnames=list(rownames(expr.matrix),colnames(expr.matrix))
#定義一個列表 名為dimnames
data=matrix(as.numeric(as.matrix(expr.matrix)),nrow=nrow(expr.matrix),dimnames=dimnames)
#定義一個矩陣,名為data,強制轉(zhuǎn)為數(shù)值型矩陣
data=avereps(data)
#avereps()函數(shù):去重復:對于行上出現(xiàn)的相同基因,取平均
data=data[rowMeans(data)>0,]
#去除低表達基因,同一行上數(shù)據(jù)的平均值大于零的留下
dim(data)
data1 <- rbind(colnames(data),data)
#把列名作為第一行,命名為data1
write.table(data1, file="CHOL-miRNA-matrix.txt", sep="\t", quote=F, col.names=F)
#導出data1位txt文件,名稱為CHOL-miRNA-matrix
rm(list = ls())
# 導入數(shù)據(jù)
rt <- read.table("CHOL-miRNA-matrix.txt",sep="\t",header=T,check.names=F)
?#讀取文件數(shù)據(jù)入R,check.names=TRUE檢查變量名在R中是否有效
#數(shù)據(jù)處理
rownames(rt) <- rt[,1]
?#把第一列作為行名
rt <- rt[,-1]
?#刪除第一列
dim(rt)
?#返回維度信息
# 用substr函數(shù)在TCGA數(shù)據(jù)中提取樣本信息
group <- factor(ifelse(as.integer(substr(colnames(rt),14,15))<10,'tumor','normal'),
????????levels = c('normal','tumor'))
??#提取第14和15位,用來判斷是正常組還是腫瘤組
table(group)
??#table()函數(shù)——向量內(nèi)元素頻數(shù)統(tǒng)計,本例題是提取group中的頻數(shù)統(tǒng)計
# 差異性分析
library(edgeR)
logFC_cutoff=1
padj=0.05
#定義兩個變量:logFC_cutoff=1判斷差異有顯著差異的依據(jù), padj是調(diào)整后的p值,
dge <- DGEList(counts=rt,group=group)??#將數(shù)據(jù)打包為edgeR包識別的數(shù)據(jù)?group=group表示分類的因子變量是剛剛定義的分組變量
dge$samples$lib.size <- colSums(dge$counts) #colsums()函數(shù)——計算列求和??lib.size一個向量,表示每個樣本的總計數(shù)。
dge <- calcNormFactors(dge)??#標準化,消除量綱的影響
design <- model.matrix(~0+factor(group))?#看onenote解釋
colnames(design) <- c('normal','tumor')
rownames(design)<-colnames(dge)
colnames(design)<-levels(group)
#這一段代碼的目的看onenote
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit1 <- glmLRT(fit, contrast=c(-1,1))?
DEG=topTags(fit1, n=nrow(rt))
DEG=as.data.frame(DEG)
# 輸出有顯著差異的結果
significant = DEG[(
?(DEG$FDR < padj) & (abs(DEG$logFC) > logFC_cutoff)
),]?# 篩選顯著差異的基因
write.table(significant, file="significant.xls",sep="\t",quote=F)
# 輸出表達上調(diào)的結果
upregulation = DEG[(DEG$FDR < padj & (DEG$logFC>logFC_cutoff)),]
write.table(upregulation, file="upregulation.xls",sep="\t",quote=F)
# 輸出表達下調(diào)的結果
downregulation = DEG[(DEG$FDR < padj & (DEG$logFC<(-logFC_cutoff))),]
write.table(downregulation, file="downregulation.xls",sep="\t",quote=F)
# 添加顯著性標簽
DEG$change <- factor(ifelse(DEG$PValue < padj & abs(DEG$logFC) > logFC_cutoff,
??????????????ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),
??????????????'NOT'))
#這個代碼的目的是再DEG數(shù)據(jù)框中添加名稱位change的列,用于顯示基因表達是上調(diào)了下降了還是不變
# 輸出結果
write.csv(DEG,file="miRNA-edgeR.csv")
# 結果可視化:火山圖 and 熱圖
# ggplot2繪制火山圖 volcano
library(ggplot2)
g <- ggplot(DEG,aes(logFC,-log10(as.numeric(FDR)),color=change))
g + geom_point()
g + geom_point(size = 1) +?
?labs(x = 'log2 Fold Change', y = '-log10 adjust p-value', title = 'volcano_plot') +?
?theme(plot.title = element_text(hjust = 0.5, size = 20,color = 'firebrick'),?
????panel.grid = element_blank(),?
????panel.background = element_rect(color = 'black', fill = 'transparent'),?
????legend.key = element_rect(fill = 'transparent')) +?
?geom_vline(xintercept = c(-logFC_cutoff, logFC_cutoff), lty = 2, color = 'black') +?#添加閾值線
?geom_hline(yintercept = -log10(padj), lty = 2, color = 'black')?
library(ggrepel)# ggrepel包給點添加點標簽的包
significant = subset(DEG,abs(logFC)>3 & -log10(as.numeric(FDR)) > 5)
#生成significant變量,里面的值決定了到底給哪些點添加標簽
#geom_text_repel函數(shù)才是給點添加標簽的函數(shù)
g + geom_point(size = 1) +?
?labs(x = 'log2 Fold Change', y = '-log10 adjust p-value', title = 'volcano_plot') +?
?theme(plot.title = element_text(hjust = 0.5, size = 20,color = 'firebrick'),?
????panel.grid = element_blank(),?
????panel.background = element_rect(color = 'black', fill = 'transparent'),?
????legend.key = element_rect(fill = 'transparent')) +?
?geom_vline(xintercept = c(-logFC_cutoff, logFC_cutoff), lty = 2, color = 'black') +?#添加閾值線
?geom_hline(yintercept = -log10(padj), lty = 2, color = 'black') +
?geom_text_repel(data = significant,aes(label = rownames(significant)),
?????????max.overlaps = 10000, # 最大覆蓋率,當點很多時,有些標記會被覆蓋,調(diào)大該值則不被覆蓋
?????????size=3,col = 'black')
#視頻解釋:【TCGA-09-miRNA差異表達】 【精準空降到 11:07】 https://www.bilibili.com/video/BV1Zv4y1J7D7/?share_source=copy_web&vd_source=3d598f1156bea5867cef7412e80ded78&t=667
# 繪制熱圖heatmap
# df <- tibble::column_to_rownames(df,var = 'id')
pheatmap::pheatmap(rt,?
??????????#annotation_row=dfGene, # (可選)指定行分組文件
??????????#annotation_col=dfSample, # (可選)指定列分組文件
??????????show_colnames = TRUE, # 是否顯示列名
??????????show_rownames=TRUE,?# 是否顯示行名
??????????fontsize=1, # 字體大小
??????????color = colorRampPalette(c('#0000ff','#ffffff','#ff0000'))(50), # 指定熱圖的顏色
??????????#annotation_legend=TRUE, # 是否顯示圖例
??????????border_color=NA,?# 邊框顏色 NA表示沒有
??????????scale="row",?# 指定歸一化的方式。"row"按行歸一化,"column"按列歸一化,"none"不處理
??????????cluster_rows = TRUE, # 是否對行聚類
??????????cluster_cols = TRUE # 是否對列聚類
)
rm(list = ls())