三分鐘繪制雙曲線新型火山圖
雙曲線火山圖
最近在很多的文章中出現(xiàn)了下圖類似的新型火山圖,其實原理就是,分別以原來的的閾值線為基準,畫兩個原始函數(shù)為y=1/x的的曲線。這樣對基因進行分類,有利于篩選差異更顯著的基因。因此圖形也變的美觀。
安裝相應的包并加載
#相關R包載入: library(dplyr) library(ggplot2) library(ggrepel) library(patchwork) #本地測試數(shù)據(jù)讀入,具體數(shù)據(jù)見附件 df <- read.csv('./test.xls',header = T,sep = "\t") head(df) ##篩選閾值確定,一般閾值是p<0.05,|log2FC|>1 pvalue = 0.05 log2FC = 1 #根據(jù)閾值添加上下調(diào)分組標簽: df$group <- case_when( ??df$log2FC > log2FC & df$Pvalue < pvalue ~ "up", ??df$log2FC < -log2FC & df$Pvalue < pvalue ~ "down", ??TRUE ~ 'none' ) head(df) #轉(zhuǎn)換為因子指定繪圖順序; df$'-log10(pvalue)' <- -log10(df$Pvalue) #新增-log10p列 df$group <- factor(df$group, levels = c("up","down","none")) #ggplot2建立映射: p <- ggplot(data = df, ????????????aes(x = log2FC, y = `-log10(pvalue)`, color = group)) + #建立映射 ??geom_point(size = 2.2) #繪制散點
#根據(jù)自己的需求可以調(diào)整下列參數(shù),進行繪圖。 p1 <- p + ??scale_x_continuous(limits = c(-7, 7), breaks = seq(-7, 7, by = 2))+ ??scale_y_continuous(expand = expansion(add = c(0.5, 0)), ?????????????????????limits = c(0, 10), breaks = seq(0, 10, by = 5))
#自定義顏色與主題: mycol <- c("#EB4232","#2DB2EB","#d8d8d8") mytheme <- theme_classic() + ??theme( ????axis.title = element_text(size = 15), ????axis.text = element_text(size = 14), ????legend.text = element_text(size = 14) ??) p2 <- p1 + ??scale_colour_manual(name = "", values = alpha(mycol, 0.7)) + ??mytheme p2
#自定義顏色與主題: mycol <- c("#EB4232","#2DB2EB","#d8d8d8") mytheme <- theme_classic() + ??theme( ????axis.title = element_text(size = 15), ????axis.text = element_text(size = 14), ????legend.text = element_text(size = 14) ??) p2 <- p1 + ??scale_colour_manual(name = "", values = alpha(mycol, 0.7)) + ??mytheme p2#添加閾值輔助線 p3 <- p2 + ??geom_hline(yintercept = c(-log10(pvalue)),size = 0.7,color = "black",lty = "dashed") + #水平閾值線 ??geom_vline(xintercept = c(-log2FC, log2FC),size = 0.7,color = "black",lty = "dashed") #垂直閾值線 p3
添加雙曲線
#根據(jù)反比例函數(shù) y = 1/X和自己設定的閾值自定義雙曲線函數(shù) f <- function(x){ ??inputx <- seq(0.0001, x, by = 0.0001) ??y <- 1/(inputx) + (-log10(pvalue)) ??dff <- rbind(data.frame(x = inputx + log2FC, y = y), ???????????????data.frame(x = -(inputx + log2FC), y = y)) ??return(dff) } #根據(jù)函數(shù)生成所需的曲線數(shù)組坐標: dff_curve <- f(2) head(dff_curve) #需要新增曲線數(shù)值列: ##每列l(wèi)og2FoldChange值在曲線上對應的y軸坐標; df$curve_y <- case_when( ??df$log2FC > 0 ~ 1/(df$log2FC-log2FC) + (-log10(pvalue)), ??df$log2FC <= 0 ~ 1/(-df$log2FC-log2FC) + (-log10(pvalue)) ) #跟前文講述的一樣根據(jù)曲線新增上下調(diào)分組標簽: df$group2 <- case_when( ??df$`-log10(pvalue)` > df$curve_y & df$log2FC >= log2FC ~ 'up', ??df$`-log10(pvalue)` > df$curve_y & df$log2FC <= -log2FC ~ 'down', ??TRUE ~ 'none' ) #繪制新雙曲線閾值火山圖: df$group2 <- factor(df$group2, levels = c("up","down","none")) #指定順序 mycol2 <- c("#F8B606","#4A1985","#d8d8d8") p4 <- ggplot(data = df, ?????????????aes(x = log2FC, y = `-log10(pvalue)`, color = group2)) + #使用新的分組 ??geom_point(size = 2.2) + ??scale_x_continuous(limits = c(-7, 7), breaks = seq(-7, 7, by = 2)) + ??scale_y_continuous(expand = expansion(add = c(0.5, 0)), ?????????????????????limits = c(0, 10), breaks = seq(0, 10, by = 5)) + ??scale_colour_manual(name = "", values = alpha(mycol2, 0.7)) + ??geom_line(data = dff_curve, ????????????aes(x = x, y = y), #曲線坐標 ????????????color = "black",lty = "dashed", size = 0.7) + ??mytheme p4 ? ?
添加標簽
#篩選顯著性top10標簽: top10 <- filter(df, group2 != "none") %>% ??distinct(GeneName, .keep_all = T) %>% ??top_n(10, abs(log2FC)) p5 <- p4 + ??geom_text_repel(data = top10, ??????????????????aes(x = log2FC, y = `-log10(pvalue)`, label = GeneName), ??????????????????force = 80, color = 'black', size = 3.2, ??????????????????point.padding = 0.5, hjust = 0.5, ??????????????????arrow = arrow(length = unit(0.02, "npc"), ????????????????????????????????type = "open", ends = "last"), ??????????????????segment.color="black", ??????????????????segment.size = 0.3, ??????????????????nudge_x = 0, ??????????????????nudge_y = 1) p5
補充關于火山圖差異篩選的統(tǒng)計學意義:
1、什么是fold change?
差異倍數(shù),簡單來說就是基因在一組樣品中的表達值的均值除以其在另一組樣品中的表達值的均值。所以火山圖只適合展示兩組樣品之間的比較。
2、為什么要做Log 2轉(zhuǎn)換?
繪制到圖中時,上調(diào)占的空間多,下調(diào)占的空間少,展示起來不方便。所以一般會做Log 2轉(zhuǎn)換。默認我們都會用兩倍差異 (fold change == 2 | 0.5)做為一個篩選標準。
3、P-value都比較熟悉
,統(tǒng)計檢驗獲得的是否統(tǒng)計差異顯著的一個衡量值,約定成俗的P-value<0.05為統(tǒng)計檢驗顯著的常規(guī)標準。
4、什么是adjusted P-value?
做差異基因檢測時,要對成千上萬的基因分別做差異統(tǒng)計檢驗。統(tǒng)計學家認為做這么多次的檢驗,本身就會引入假陽性結(jié)果,需要做一個多重假設檢驗校正。
5、為什么做-Log 10轉(zhuǎn)換呢?
因為FDR值是0-1之間,數(shù)值越小越是統(tǒng)計顯著,也越是我們關注的。-Log 10 (adjusted P-value)轉(zhuǎn)換后正好是反了多來,數(shù)值越大越顯著,而且以10為底很容易換算回去。 ?
好了今天的火山圖就講到這里,歡迎大家有問題與小云一起討論哦~