R语⾔:GO富集和KEGG富集、可视化教程,附代码
R语⾔:GO富集和KEGG富集、可视化教程,附代码
⼩⽩⼀枚,博客仅⽤于记录⾃⼰的学习历程,参考了很多代码,感觉有些代码太复杂了,根据⾃⼰的喜欢进⾏了部分改动。
1.⽂件准备
导⼊准备好的差异基因列表,或者是某个你需要进⾏富集的模块的基因列表,只要有基因的名字就⾏,此处diff是我导⼊的基因列表的命名,SYMBOL是对应的基因的名字(也对应了后⾯我⽤到的SYMBOL类型的ID转换,就不⽤了再改动了。)
diff<-read.csv(file="C:/Urs/27487/Desktop/three/group/diffgen.csv")
这张图是我导⼊的模块的基因列表图,真正重要的是第⼆列,第⼀列是我导⼊时⾃动⽣成的不⽤理会。
这张图是我导⼊的差异基因列表图,真正重要的是第⼀列。
2.基因ID转换
#1、加载包
library(AnnotationDbi)
library(db)#基因注释包
library(clusterProfiler)#富集包
library(dplyr)
library(ggplot2)#画图包
#2、基因id转换,kegg和go富集⽤的ID类型是ENTREZID)
gene.df <- bitr(diff$SYMBOL,fromType="SYMBOL",toType="ENTREZID", OrgDb = db)#TCGA数据框如果没有进⾏基因注释,那么fromType应该是Enmbl,各种ID之间可以互相转换,toType可以是⼀个字符串,也可以是⼀个向量,看⾃⼰需求
gene <- gene.df$ENTREZID
这张图是gene.df的界⾯图,我们需要⽤到第⼆列
3.GO富集及可视化
#3、GO富集
##CC表⽰细胞组分,MF表⽰分⼦功能,BP表⽰⽣物学过程,ALL表⽰同时富集三种过程,选⾃⼰需要的,我⼀般是做BP,MF,CC这3组再合并成⼀个数据框,⽅便后续摘取部分通路绘图。
ego_ALL <- enrichGO(gene = gene,#我们上⾯定义了
OrgDb=db,
keyType ="ENTREZID",
ont ="ALL",#富集的GO类型
pAdjustMethod ="BH",#这个不⽤管,⼀般都⽤的BH
minGSSize =1,
pvalueCutoff =0.01,#P值可以取0.05
qvalueCutoff =0.05,
readable =TRUE)
ego_CC <- enrichGO(gene = gene,
OrgDb=db,
keyType ="ENTREZID",
ont ="CC",
pAdjustMethod ="BH",
minGSSize =1,
pvalueCutoff =0.01,
qvalueCutoff =0.05,
readable =TRUE)
ego_BP <- enrichGO(gene = gene,
OrgDb=db,
心若在梦就在
微波炉如何使用keyType ="ENTREZID",
ont ="BP",
pAdjustMethod ="BH",
minGSSize =1,
pvalueCutoff =0.01,
qvalueCutoff =0.05,
readable =TRUE)
ego_MF <- enrichGO(gene = gene,
OrgDb=db,
keyType ="ENTREZID",
ont ="MF",
pAdjustMethod ="BH",
minGSSize =1,
pvalueCutoff =0.01,
qvalueCutoff =0.05,
readable =TRUE)
#4、将结果保存到当前路径
ego_ALL <- as.data.frame(ego_ALL)
ego_result_BP <- as.data.frame(ego_BP)
ego_result_CC <- as.data.frame(ego_CC)
ego_result_MF <- as.data.frame(ego_MF)
ego <- rbind(ego_result_BP,ego_result_CC,ego_result_MF)#或者这样也能得到ego_ALL⼀样的结果
write.csv(ego_ALL,file ="ego_ALL.csv",row.names = T)
write.csv(ego_result_BP,file ="ego_result_BP.csv",row.names = T)
write.csv(ego_result_CC,file ="ego_result_CC.csv",row.names = T)
write.csv(ego_result_MF,file ="ego_result_MF.csv",row.names = T)
write.csv(ego,file ="ego.csv",row.names = T)
#5、但是有时候我们富集到的通路太多了,不⽅便绘制图⽚,可以选择部分绘制,这时候跳过第4步,来到第5步
display_number = c(22,22,22)#这三个数字分别代表选取的BP、CC、MF的通路条数,这个⾃⼰设置就⾏了
ego_result_BP <- as.data.frame(ego_BP)[1:display_number[1],]
ego_result_CC <- as.data.frame(ego_CC)[1:display_number[2],]
ego_result_MF <- as.data.frame(ego_MF)[1:display_number[3],]
##将以上我们摘取的部分通路重新组合成数据框
go_enrich_df <- data.frame(
ID=c(ego_result_BP$ID, ego_result_CC$ID, ego_result_MF$ID), Description=c(ego_result_BP$Description,ego_result_CC$Description,ego
创新与发展ID=c(ego_result_BP$ID, ego_result_CC$ID, ego_result_MF$ID), Description=c(ego_result_BP$Description,ego_result_CC$Description,ego _result_MF$Description),
GeneNumber=c(ego_result_BP$Count, ego_result_CC$Count, ego_result_MF$Count),
type=factor(c(rep("biological process", display_number[1]),
rep("cellular component", display_number[2]),
rep("molecular function", display_number[3])),
levels=c("biological process","cellular component","molecular function")))
##通路的名字太长了,我选取了通路的前五个单词作为通路的名字
for(i in1:nrow(go_enrich_df)){
description_splite=strsplit(go_enrich_df$Description[i],split =" ")
description_collap=paste(description_splite[[1]][1:5],collap =" ")#这⾥的5就是指5个单词的意思,可以⾃⼰更改
go_enrich_df$Description[i]=description_collap
go_enrich_df$Description=gsub(pattern ="NA","",go_enrich_df$Description)
}
##开始绘制GO柱状图
###横着的柱状图
go_enrich_df$type_order=factor(rev(as.integer(rownames(go_enrich_df))),labels=rev(go_enrich_df$Description))#这⼀步是必须的,为了让柱⼦按顺序显⽰,不⾄于很乱
COLS <- c("#66C3A5","#8DA1CB","#FD8D62")#设定颜⾊
盲人英语
ggplot(data=go_enrich_df, aes(x=type_order,y=GeneNumber, fill=type))+#横纵轴取值
geom_bar(stat="identity", width=0.8)+#柱状图的宽度,可以⾃⼰设置
scale_fill_manual(values = COLS)+###颜⾊
别故乡
coord_flip()+##这⼀步是让柱状图横过来,不加的话柱状图是竖着的
xlab("GO term")+
ylab("Gene_Number")+
labs(title ="The Most Enriched GO Terms")+
theme_bw()
###竖着的柱状图
go_enrich_df$type_order=factor(rev(as.integer(rownames(go_enrich_df))),labels=rev(go_enrich_df$Description))
COLS <- c("#66C3A5","#8DA1CB","#FD8D62")
体育教学论文
ggplot(data=go_enrich_df, aes(x=type_order,y=GeneNumber, fill=type))+
geom_bar(stat="identity", width=0.8)+
scale_fill_manual(values = COLS)+
theme_bw()+
xlab("GO term")+
ylab("Num of Genes")+
labs(title ="The Most Enriched GO Terms")+
=element_text(face ="bold", color="gray50",angle =70,vjust =1, hjust =1))#angle是坐标轴字体倾斜的⾓度,可以⾃⼰设置
排序默认是按照调整后P值从⼩到⼤排列的,如果喜欢的话,可以⾃⼰按照富集的基因个数多少进⾏排列
这张图是横着的GO柱状图,每种类型我画了22条
这张图是竖着的柱状图
4.KEGG富集和可视化
#1、KEGG富集
kk <- enrichKEGG(gene = gene,keyType ="kegg",organism="human", qvalueCutoff =0.05, pvalueCutoff=0.05) #2、可视化
###柱状图
hh <- as.data.frame(kk)#⾃⼰记得保存结果哈!
rownames(hh)<-1:nrow(hh)
hh$order=factor(rev(as.integer(rownames(hh))),labels = rev(hh$Description))
ggplot(hh,aes(y=order,x=Count,fill=p.adjust))+
geom_bar(stat ="identity",width=0.7)+####柱⼦宽度
#coord_flip()+##颠倒横纵轴
scale_fill_gradient(low ="red",high ="blue")+#颜⾊⾃⼰可以换
labs(title ="KEGG Pathways Enrichment",
x ="Gene numbers",
致谢老师y ="Pathways")+
theme(axis.title.x = element_text(face ="bold",size =16),
axis.title.y = element_text(face ="bold",size =16),
legend.title = element_text(face ="bold",size =16))+
theme_bw()群体关系
###⽓泡图
hh <- as.data.frame(kk)
rownames(hh)<-1:nrow(hh)
hh$order=factor(rev(as.integer(rownames(hh))),labels = rev(hh$Description))
ggplot(hh,aes(y=order,x=Count))+
geom_point(aes(size=Count,color=-1*p.adjust))+# 修改点的⼤⼩
scale_color_gradient(low="green",high ="red")+
labs(color=expression(p.adjust,size="Count"),
x="Gene Number",y="Pathways",title="KEGG Pathway Enrichment")+
theme_bw()