新手遇到的问题都是类似的,比如批量ID转换
虽然我写过大量的教程:ID转换大全 不过都需要R基础,因为是大批量转换啊!
但热心肠的植物生物信息学教学大佬还是友善的给出了解决方案
我也狗尾续貂制作了一个网页工具教程:
简单的3个步骤,不会代码也可以很容易把ID批量转换啦!
不过有趣的是我搜索电脑资料,看到了2年前写的拟南芥教程。
不过我为什么会花时间写拟南芥教程呢?
1 首先加载必要的包
library(ggplot2)
library(stringr)
#source('/biocLite.R')
#biocLite('clusterProfiler')
#biocLite('org.At.tair.db')
library(clusterProfiler)
library(org.At.tair.db)
2 然后导入数据
deg1=read.table('/jmzeng1314/my-R/master/DEG_scripts/tair/DESeq2_DEG.Day1-Day0.txt')
deg1=na.omit(deg1)
head(deg1)
##baseMeanlog2FoldChangelfcSEstatpvalue
##AT3G0143022.90892918.9899902.1482618.8397049.597263e-19
##AT1G4740520.70955120.9588062.4345748.6088207.381677e-18
##AT2G338301938.159722-2.5606090.312663-8.1896782.619266e-16
##AT5G286278.118376-21.1313182.875691-7.3482572.008078e-13
##AT2G337509.789740-19.9893012.847513-7.0199152.23e-12
##AT3G545002238.3146522.7204300.3863057.0421821.892517e-12
##padj
##AT3G014301.858318e-14
##AT1G474057.146571e-14
##AT2G338301.690562e-12
##AT5G286279.720602e-10
##AT2G337507.164418e-09
##AT3G545007.164418e-09
prefix='Day1-Day0'
3 然后判断显著差异基因
很明显,这个时候用padj来挑选差异基因即可,不需要看foldchange了。
table(deg1$padj<0.05)
##
##FALSETRUE
##19166197
table(deg1$padj<0.01)
##
##FALSETRUE
##1930360
diff_geneList=rownames(deg1[deg1$padj<0.05,])
up_geneList=rownames(deg1[deg1$padj<0.05°1$log2FoldChange>0,])
down_geneList=rownames(deg1[deg1$padj<0.05°1$log2FoldChange<0,])
length(diff_geneList)
##[1]197
length(up_geneList)
##[1]89
length(down_geneList)
##[1]108
3.1 画个火山图看看挑选的差异基因合理与否
colnames(deg1)
##[1]'baseMean''log2FoldChange''lfcSE''stat'
##[5]'pvalue''padj'
log2FoldChange_Cutof=0
##这里我不准备用log2FoldChange来挑选差异基因,仅仅是用padj即可
deg1$change=as.factor(ifelse(deg1$padj<0.05&
abs(deg1$log2FoldChange)>log2FoldChange_Cutof,
ifelse(deg1$log2FoldChange>log2FoldChange_Cutof,'UP','DOWN'),'NOT'))
this_tile<-paste0('Cutoffforlog2FoldChangeis',round(log2FoldChange_Cutof,3),
'\nThenumberofupgeneis',nrow(deg1[deg1$change=='UP',]),
'\nThenumberofdowngeneis',nrow(deg1[deg1$change=='DOWN',])
)
g_volcano=ggplot(data=deg1,aes(x=log2FoldChange,y=-log10(padj),color=change))+
geom_point(alpha=0.4,size=1.75)+
theme_set(theme_set(theme_bw(base_size=20)))+
xlab('log2foldchange')+ylab('-log10p-value')+
ggtitle(this_tile)+
theme(plot.title=element_text(size=15,hjust=0.5))+
scale_colour_manual(values=c('blue','black','red'))##correspondingtothelevels(res$change)
print(g_volcano)
4 GO/KEGG注释
4.1 首先进行KEGG注释
diff.kk<-enrichKEGG(gene=diff_geneList,
organism='ath',
pvalueCutoff=0.99,
qvalueCutoff=0.99
)
kegg_diff_dt<-as.data.frame(setReadable(diff.kk,org.At.tair.db,keytype='TAIR'))
up.kk<-enrichKEGG(gene=up_geneList,
organism='ath',
pvalueCutoff=0.99,
qvalueCutoff=0.99
)
kegg_up_dt<-as.data.frame(setReadable(up.kk,org.At.tair.db,keytype='TAIR'))
down.kk<-enrichKEGG(gene=down_geneList,
organism='ath',
pvalueCutoff=0.99,
qvalueCutoff=0.99
)
kegg_down_dt<-as.data.frame(setReadable(down.kk,org.At.tair.db,keytype='TAIR'))
4.2 可视化看看KEGG注释结果
##KEGGpatheayvisulization:
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
dat=rbind(up_kegg,down_kegg)
colnames(dat)
##[1]'ID''Description''GeneRatio''BgRatio''pvalue'
##[6]'p.adjust''qvalue''geneID''Count''group'
dat$pvalue=-log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing=F),]
g_kegg<-ggplot(dat,aes(x=reorder(Description,order(pvalue,decreasing=F)),y=pvalue,fill=group))+
geom_bar(stat='identity')+
scale_fill_gradient(low='blue',high='red',guide=FALSE)+
scale_x_discrete(name='Pathwaynames')+
scale_y_continuous(name='-log10P-value')+
coord_flip()+
ggtitle('PathwayEnrichment')
print(g_kegg)
4.3 接着进行GO注释
for(ontoinc('CC','BP','MF')){
ego<-enrichGO(gene=diff_geneList,
OrgDb=org.At.tair.db,
keyType='TAIR',
ont=onto,
pAdjustMethod='BH',
pvalueCutoff=0.2,
qvalueCutoff=0.9)
ego<-setReadable(ego,org.At.tair.db,keytype='TAIR')
write.csv(as.data.frame(ego),paste0(prefix,'_',onto,'.csv'))
#ego2<-gofilter(ego,4)
ego2=ego
pdf(paste0(prefix,'_',onto,'_barplot.pdf'))
p=barplot(ego2,showCategory=12)+scale_x_discrete(labels=function(x)str_wrap(x,width=20))
print(p)
dev.off()
}
ggsave(filename=paste0(prefix,'_volcano_plot.pdf'),g_volcano)
##Saving7x5inimage
ggsave(filename=paste0(prefix,'_kegg_plot.pdf'),g_kegg)
##Saving7x5inimage
write.csv(x=kegg_diff_dt,file=paste0(prefix,'_kegg_diff.csv'))
write.csv(x=kegg_up_dt,file=paste0(prefix,'_kegg_up.csv'))
write.csv(x=kegg_down_dt,file=paste0(prefix,'_kegg_down.csv'))
5 基因ID注释
library(org.At.tair.db)
ls('package:org.At.tair.db')
##[1]'org.At.tair''org.At.tair.db'
##[3]'org.At.tairARACYC''org.At.tairARACYCENZYME'
##[5]'org.At.tairCHR''org.At.tairCHRLENGTHS'
##[7]'org.At.tairCHRLOC''org.At.tairCHRLOCEND'
##[9]'org.At.tairENTREZID''org.At.tairENZYME'
##[11]'org.At.tairENZYME2TAIR''org.At.tairGENENAME'
##[13]'org.At.tairGO''org.At.tairGO2ALLTAIRS'
##[15]'org.At.tairGO2TAIR''org.At.tairMAPCOUNTS'
##[17]'org.At.tairORGANISM''org.At.tairPATH'
##[19]'org.At.tairPATH2TAIR''org.At.tairPMID'
##[21]'org.At.tairPMID2TAIR''org.At.tairREFSEQ'
##[23]'org.At.tairREFSEQ2TAIR''org.At.tairSYMBOL'
##[25]'org.At.tair_dbInfo''org.At.tair_dbconn'
##[27]'org.At.tair_dbfile''org.At.tair_dbschema'
##ThendrawGO/keggfigures:
deg1$gene_id=rownames(deg1)
id2symbol=toTable(org.At.tairSYMBOL)
deg1=merge(deg1,id2symbol,by='gene_id')
##可以看到有一些ID是没有genesymbol的,这样的基因,意义可能不大,也有可能是大部分人漏掉了
head(deg1)
##gene_idbaseMeanlog2FoldChangelfcSEstatpvalue
##1AT1G0101058.256571.131053350.80002741.41376830.1574300
##2AT1G0101058.256571.131053350.80002741.41376830.1574300
##3AT1G01020542.64394-0.057455540.3721650-0.15438190.8773086
##4AT1G0103048.77442-1.098452631.2875862-0.85311000.3935983
##5AT1G010401708.689490.324217340.27775301.16728650.2430947
##6AT1G010401708.689490.324217340.27775301.16728650.2430947
##padjchangesymbol
##10.6008903NOTANAC001
##20.6008903NOTNAC001
##30.9805661NOTARV1
##40.8144858NOTNGA3
##50.6992279NOTDCL1
##60.6992279NOTEMB60
可以看到基因的ID和symbol的对应关系就出来了,根使用网页工具是类似的,感兴趣的朋友可以试试看网页工具和R代码的ID批量转换差别有多大。
■■ ■
如果觉得《拟南芥的基因ID批量转换?差异基因 GO/KEGG数据库注释(转录组直接送你全套流程)》对你有帮助,请点赞、收藏,并留下你的观点哦!