GEO获取差异基因进行 GO KEGG PPI

参考文章《GEO免疫细胞浸润》获得基因表达矩阵GSE124226.normalize.txt(exp9),若矩阵数值偏小(<20),且小数点位数较多,则选择normalize数据差异分析,否则选择原始count差异分析

  • 准备三个文件:GSE124226.normalize.txt、sample1.txt(对照组列名)、sample2.txt(实验组列名)

一.normalize数据差异分析

01  fdr过滤

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")

#install.packages("pheatmap")

setwd("D:\immune cell infiltration\06")   
library(limma)
library(pheatmap)

inputFile="GSE124226.normalize.txt"
pFilter=0.05            
logFCfilter=1                   
conFile="sample1.txt"        
treatFile="sample2.txt"         

#读取基因表达矩阵
rt <- read.table(inputFile,sep="	",header=T,check.names=F)
rt <- as.matrix(rt)
data <- rt[rowMeans(rt)>0.1,]#若一个基因在所有样本中表达均很低则剔除该基因
class(data)

#读取样品信息
sample1 <- read.table(conFile,sep="	",header=F,check.names=F)
sample2 <- read.table(treatFile,sep="	",header=F,check.names=F)
conData=data[,as.vector(sample1[,1])]
treatData=data[,as.vector(sample2[,1])]
data=cbind(conData,treatData)
conNum=ncol(conData)
treatNum=ncol(treatData)
type=c(rep(1,conNum),rep(2,treatNum))

#差异分析
outTab=data.frame()
for(i in row.names(data)){
	if(sd(data[i,1:conNum])==0){
		data[i,1]=0.001
	}
	if(sd(data[i,(conNum+1):(conNum+treatNum)])==0){
		data[i,(conNum+1)]=0.001
	}
	geneName=i
	rt=data.frame(expression=data[i,],type=type)
	wilcoxTest<-wilcox.test(expression ~ type, data=rt)#将表达数据与样本类型进行比较
	conGeneMeans=mean(data[i,1:conNum])#基因在正常组中的表达均值
	treatGeneMeans=mean(data[i,(conNum+1):ncol(data)])#在疾病组中的表达均值
	
	logFC=log2(treatGeneMeans)-log2(conGeneMeans)
	pvalue=wilcoxTest$p.value
	conMed=median(data[i,1:conNum])
	treatMed=median(data[i,(conNum+1):ncol(data)])
	diffMed=treatMed-conMed
	if( ((logFC>0) & (diffMed>0)) | ((logFC<0) & (diffMed<0)) ){  
		outTab=rbind(outTab,cbind(gene=i,logFC=logFC,conMean=conGeneMeans,treatMean=treatGeneMeans,pValue=pvalue))
	}
}

#输出所有基因的差异情况
write.table(outTab,file="all.xls",sep="	",row.names=F,quote=F)

#输出差异表格
outDiff=outTab[( abs(as.numeric(as.vector(outTab$logFC)))>logFCfilter & as.numeric(as.vector(outTab$pValue))<pFilter),]
write.table(outDiff,file="diff.xls",sep="	",row.names=F,quote=F)
write.table(outDiff,file="diff.txt",sep="	",row.names=F,quote=F)

#差异基因的表达文件
heatmap=rbind(ID=colnames(data[as.vector(outDiff[,1]),]),data[as.vector(outDiff[,1]),])
write.table(heatmap,file="diffGeneExp.txt",sep="	",col.names=F,quote=F)

#绘制差异基因热图
geneNum=50
diffSig=outDiff[order(as.numeric(as.vector(outDiff$logFC))),]#按照logFC进行排序
diffGeneName=as.vector(diffSig[,1])
diffLength=length(diffGeneName)
hmGene=c()
if(diffLength>(geneNum*2) ){
    hmGene=diffGeneName[c(1:geneNum,(diffLength-geneNum+1):diffLength)]
}else{#若差异基因大于100,则选择上调的50和下调的59进行绘制
    hmGene=diffGeneName
}
hmExp=data[hmGene,]
Type=c(rep("Con",conNum),rep("Treat",treatNum))
names(Type)=colnames(data)
Type=as.data.frame(Type)
pdf(file="heatmap.pdf",height=7,width=8)
pheatmap(hmExp, 
         annotation=Type, 
         color = colorRampPalette(c("blue", "white", "red"))(50),
         cluster_cols =F,
         show_colnames = F,
         scale="row",
         fontsize = 8,
         fontsize_row=6,
         fontsize_col=8)
dev.off()

02 pvalue 过滤(若fdr过滤得到的差异基因较少则考虑pvalue过滤)

  • 代码仅有一行不同
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")

#install.packages("pheatmap")

setwd("D:\immune cell infiltration\06")   
library(limma)
library(pheatmap)

inputFile="GSE124226.normalize.txt"
pFilter=0.05            
logFCfilter=1                   
conFile="sample1.txt"        
treatFile="sample2.txt"         

#读取基因表达矩阵
rt <- read.table(inputFile,sep="	",header=T,check.names=F)
rt <- as.matrix(rt)
data <- rt[rowMeans(rt)>0.1,]#若一个基因在所有样本中表达均很低则剔除该基因
class(data)

#读取样品信息
sample1 <- read.table(conFile,sep="	",header=F,check.names=F)
sample2 <- read.table(treatFile,sep="	",header=F,check.names=F)
conData=data[,as.vector(sample1[,1])]
treatData=data[,as.vector(sample2[,1])]
data=cbind(conData,treatData)
conNum=ncol(conData)
treatNum=ncol(treatData)
type=c(rep(1,conNum),rep(2,treatNum))

#差异分析
outTab=data.frame()
for(i in row.names(data)){
	if(sd(data[i,1:conNum])==0){
		data[i,1]=0.001
	}
	if(sd(data[i,(conNum+1):(conNum+treatNum)])==0){
		data[i,(conNum+1)]=0.001
	}
	geneName=i
	rt=data.frame(expression=data[i,],type=type)
	wilcoxTest<-wilcox.test(expression ~ type, data=rt)#将表达数据与样本类型进行比较
	conGeneMeans=mean(data[i,1:conNum])#基因在正常组中的表达均值
	treatGeneMeans=mean(data[i,(conNum+1):ncol(data)])#在疾病组中的表达均值
	
	logFC=log2(treatGeneMeans)-log2(conGeneMeans)
	pvalue=wilcoxTest$p.value
	conMed=median(data[i,1:conNum])
	treatMed=median(data[i,(conNum+1):ncol(data)])
	diffMed=treatMed-conMed
	if( ((logFC>0) & (diffMed>0)) | ((logFC<0) & (diffMed<0)) ){  
		outTab=rbind(outTab,cbind(gene=i,logFC=logFC,conMean=conGeneMeans,treatMean=treatGeneMeans,pValue))
	}
}

#输出所有基因的差异情况
write.table(outTab,file="all.xls",sep="	",row.names=F,quote=F)

#输出差异表格
outDiff=outTab[( abs(as.numeric(as.vector(outTab$logFC)))>logFCfilter & as.numeric(as.vector(outTab$pValue))<pFilter),]
write.table(outDiff,file="diff.xls",sep="	",row.names=F,quote=F)
write.table(outDiff,file="diff.txt",sep="	",row.names=F,quote=F)

#差异基因的表达文件
heatmap=rbind(ID=colnames(data[as.vector(outDiff[,1]),]),data[as.vector(outDiff[,1]),])
write.table(heatmap,file="diffGeneExp.txt",sep="	",col.names=F,quote=F)

#绘制差异基因热图
geneNum=50
diffSig=outDiff[order(as.numeric(as.vector(outDiff$logFC))),]#按照logFC进行排序
diffGeneName=as.vector(diffSig[,1])
diffLength=length(diffGeneName)
hmGene=c()
if(diffLength>(geneNum*2) ){
    hmGene=diffGeneName[c(1:geneNum,(diffLength-geneNum+1):diffLength)]
}else{#若差异基因大于100,则选择上调的50和下调的50进行绘制
    hmGene=diffGeneName
}
hmExp=data[hmGene,]
Type=c(rep("Con",conNum),rep("Treat",treatNum))
names(Type)=colnames(data)
Type=as.data.frame(Type)
pdf(file="heatmap.pdf",height=7,width=8)
pheatmap(hmExp, 
         annotation=Type, 
         color = colorRampPalette(c("blue", "white", "red"))(50),
         cluster_cols =F,
         show_colnames = F,
         scale="row",
         fontsize = 8,
         fontsize_row=6,
         fontsize_col=8)
dev.off()

二.将基因名称转换为entrezID

  • 准备文件:diff.txt(来自一)
setwd("")

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("org.Hs.eg.db")

library("org.Hs.eg.db")

inputFile="diff.txt"        

rt=read.table(inputFile,sep="	",check.names=F,header=T)    
rt=rt[,c(1,2)]#提取基因名称和logFC
genes=as.vector(rt[,1])
entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA) #找出基因对应的id,找到不则返回NA
entrezIDs <- as.character(entrezIDs)

#输出基因id的结果文件
out=cbind(rt,entrezID=entrezIDs)
write.table(out,file="id.txt",sep="	",quote=F,row.names=F)

三.GO富集分析

  • 准备文件id.txt(来自二)
setwd("") 

#install.packages("colorspace")
#install.packages("stringi")
#install.packages("ggplot2")

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("DOSE")
#BiocManager::install("clusterProfiler")
#BiocManager::install("enrichplot")


library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")

pvalueFilter=0.05         
qvalueFilter=0.05  #矫正后的p值过滤条件(若结果太少,将qvalueFilter值改为1再筛选)     

        
rt=read.table("id.txt",sep="	",header=T,check.names=F)      
rt=rt[is.na(rt[,"entrezID"])==F,]   #去除基因id为NA的基因                       
gene=rt$entrezID
geneFC=rt$logFC
#geneFC=2^geneFC
names(geneFC)=gene

#定义颜色类型
colorSel="qvalue"
if(qvalueFilter>0.05){
	colorSel="pvalue"
}

#GO富集分析
kk=enrichGO(gene = gene,OrgDb = org.Hs.eg.db, pvalueCutoff =1, qvalueCutoff = 1, ont="all", readable =T)
GO=as.data.frame(kk)
GO=GO[(GO$pvalue<pvalueFilter & GO$qvalue<qvalueFilter),]
#保存富集结果
write.table(GO,file="GO.txt",sep="	",quote=F,row.names = F)

#定义显示Term数目
showNum=10 #bp、cc、mf各显示前十个  
if(nrow(GO)<30){   #小于30个则全部显示
	showNum=nrow(GO)
}

#柱状图
pdf(file="barplot.pdf",width = 9,height =7)
bar=barplot(kk, drop = TRUE, showCategory =showNum,split="ONTOLOGY",color = colorSel) + facet_grid(ONTOLOGY~., scale='free')
print(bar)
dev.off()
		
#气泡图
pdf(file="bubble.pdf",width = 9,height =7)
bub=dotplot(kk,showCategory = showNum, orderBy = "GeneRatio",split="ONTOLOGY", color = colorSel) + facet_grid(ONTOLOGY~., scale='free')
print(bub)
dev.off()
		
#圈图
pdf(file="circos.pdf",width = 9,height = 6.5) #修改高度和宽度将图形变为圆形
cnet=cnetplot(kk, foldChange=geneFC, showCategory = 5, circular = TRUE, colorEdge = TRUE)
print(cnet)
dev.off()