参考文章《GEO免疫细胞浸润》获得基因表达矩阵GSE124226.normalize.txt(exp9),若矩阵数值偏小(<20),且小数点位数较多,则选择normalize数据差异分析,否则选择原始count差异分析
- 准备三个文件:GSE124226.normalize.txt、sample1.txt(对照组列名)、sample2.txt(实验组列名)
一.normalize数据差异分析
01 fdr过滤
#if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("limma") #install.packages("pheatmap") setwd("D:\immune cell infiltration\06") library(limma) library(pheatmap) inputFile="GSE124226.normalize.txt" pFilter=0.05 logFCfilter=1 conFile="sample1.txt" treatFile="sample2.txt" #读取基因表达矩阵 rt <- read.table(inputFile,sep=" ",header=T,check.names=F) rt <- as.matrix(rt) data <- rt[rowMeans(rt)>0.1,]#若一个基因在所有样本中表达均很低则剔除该基因 class(data) #读取样品信息 sample1 <- read.table(conFile,sep=" ",header=F,check.names=F) sample2 <- read.table(treatFile,sep=" ",header=F,check.names=F) conData=data[,as.vector(sample1[,1])] treatData=data[,as.vector(sample2[,1])] data=cbind(conData,treatData) conNum=ncol(conData) treatNum=ncol(treatData) type=c(rep(1,conNum),rep(2,treatNum)) #差异分析 outTab=data.frame() for(i in row.names(data)){ if(sd(data[i,1:conNum])==0){ data[i,1]=0.001 } if(sd(data[i,(conNum+1):(conNum+treatNum)])==0){ data[i,(conNum+1)]=0.001 } geneName=i rt=data.frame(expression=data[i,],type=type) wilcoxTest<-wilcox.test(expression ~ type, data=rt)#将表达数据与样本类型进行比较 conGeneMeans=mean(data[i,1:conNum])#基因在正常组中的表达均值 treatGeneMeans=mean(data[i,(conNum+1):ncol(data)])#在疾病组中的表达均值 logFC=log2(treatGeneMeans)-log2(conGeneMeans) pvalue=wilcoxTest$p.value conMed=median(data[i,1:conNum]) treatMed=median(data[i,(conNum+1):ncol(data)]) diffMed=treatMed-conMed if( ((logFC>0) & (diffMed>0)) | ((logFC<0) & (diffMed<0)) ){ outTab=rbind(outTab,cbind(gene=i,logFC=logFC,conMean=conGeneMeans,treatMean=treatGeneMeans,pValue=pvalue)) } } #输出所有基因的差异情况 write.table(outTab,file="all.xls",sep=" ",row.names=F,quote=F) #输出差异表格 outDiff=outTab[( abs(as.numeric(as.vector(outTab$logFC)))>logFCfilter & as.numeric(as.vector(outTab$pValue))<pFilter),] write.table(outDiff,file="diff.xls",sep=" ",row.names=F,quote=F) write.table(outDiff,file="diff.txt",sep=" ",row.names=F,quote=F) #差异基因的表达文件 heatmap=rbind(ID=colnames(data[as.vector(outDiff[,1]),]),data[as.vector(outDiff[,1]),]) write.table(heatmap,file="diffGeneExp.txt",sep=" ",col.names=F,quote=F) #绘制差异基因热图 geneNum=50 diffSig=outDiff[order(as.numeric(as.vector(outDiff$logFC))),]#按照logFC进行排序 diffGeneName=as.vector(diffSig[,1]) diffLength=length(diffGeneName) hmGene=c() if(diffLength>(geneNum*2) ){ hmGene=diffGeneName[c(1:geneNum,(diffLength-geneNum+1):diffLength)] }else{#若差异基因大于100,则选择上调的50和下调的59进行绘制 hmGene=diffGeneName } hmExp=data[hmGene,] Type=c(rep("Con",conNum),rep("Treat",treatNum)) names(Type)=colnames(data) Type=as.data.frame(Type) pdf(file="heatmap.pdf",height=7,width=8) pheatmap(hmExp, annotation=Type, color = colorRampPalette(c("blue", "white", "red"))(50), cluster_cols =F, show_colnames = F, scale="row", fontsize = 8, fontsize_row=6, fontsize_col=8) dev.off()
02 pvalue 过滤(若fdr过滤得到的差异基因较少则考虑pvalue过滤)
- 代码仅有一行不同
#if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("limma") #install.packages("pheatmap") setwd("D:\immune cell infiltration\06") library(limma) library(pheatmap) inputFile="GSE124226.normalize.txt" pFilter=0.05 logFCfilter=1 conFile="sample1.txt" treatFile="sample2.txt" #读取基因表达矩阵 rt <- read.table(inputFile,sep=" ",header=T,check.names=F) rt <- as.matrix(rt) data <- rt[rowMeans(rt)>0.1,]#若一个基因在所有样本中表达均很低则剔除该基因 class(data) #读取样品信息 sample1 <- read.table(conFile,sep=" ",header=F,check.names=F) sample2 <- read.table(treatFile,sep=" ",header=F,check.names=F) conData=data[,as.vector(sample1[,1])] treatData=data[,as.vector(sample2[,1])] data=cbind(conData,treatData) conNum=ncol(conData) treatNum=ncol(treatData) type=c(rep(1,conNum),rep(2,treatNum)) #差异分析 outTab=data.frame() for(i in row.names(data)){ if(sd(data[i,1:conNum])==0){ data[i,1]=0.001 } if(sd(data[i,(conNum+1):(conNum+treatNum)])==0){ data[i,(conNum+1)]=0.001 } geneName=i rt=data.frame(expression=data[i,],type=type) wilcoxTest<-wilcox.test(expression ~ type, data=rt)#将表达数据与样本类型进行比较 conGeneMeans=mean(data[i,1:conNum])#基因在正常组中的表达均值 treatGeneMeans=mean(data[i,(conNum+1):ncol(data)])#在疾病组中的表达均值 logFC=log2(treatGeneMeans)-log2(conGeneMeans) pvalue=wilcoxTest$p.value conMed=median(data[i,1:conNum]) treatMed=median(data[i,(conNum+1):ncol(data)]) diffMed=treatMed-conMed if( ((logFC>0) & (diffMed>0)) | ((logFC<0) & (diffMed<0)) ){ outTab=rbind(outTab,cbind(gene=i,logFC=logFC,conMean=conGeneMeans,treatMean=treatGeneMeans,pValue)) } } #输出所有基因的差异情况 write.table(outTab,file="all.xls",sep=" ",row.names=F,quote=F) #输出差异表格 outDiff=outTab[( abs(as.numeric(as.vector(outTab$logFC)))>logFCfilter & as.numeric(as.vector(outTab$pValue))<pFilter),] write.table(outDiff,file="diff.xls",sep=" ",row.names=F,quote=F) write.table(outDiff,file="diff.txt",sep=" ",row.names=F,quote=F) #差异基因的表达文件 heatmap=rbind(ID=colnames(data[as.vector(outDiff[,1]),]),data[as.vector(outDiff[,1]),]) write.table(heatmap,file="diffGeneExp.txt",sep=" ",col.names=F,quote=F) #绘制差异基因热图 geneNum=50 diffSig=outDiff[order(as.numeric(as.vector(outDiff$logFC))),]#按照logFC进行排序 diffGeneName=as.vector(diffSig[,1]) diffLength=length(diffGeneName) hmGene=c() if(diffLength>(geneNum*2) ){ hmGene=diffGeneName[c(1:geneNum,(diffLength-geneNum+1):diffLength)] }else{#若差异基因大于100,则选择上调的50和下调的50进行绘制 hmGene=diffGeneName } hmExp=data[hmGene,] Type=c(rep("Con",conNum),rep("Treat",treatNum)) names(Type)=colnames(data) Type=as.data.frame(Type) pdf(file="heatmap.pdf",height=7,width=8) pheatmap(hmExp, annotation=Type, color = colorRampPalette(c("blue", "white", "red"))(50), cluster_cols =F, show_colnames = F, scale="row", fontsize = 8, fontsize_row=6, fontsize_col=8) dev.off()
二.将基因名称转换为entrezID
- 准备文件:diff.txt(来自一)
setwd("") #if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("org.Hs.eg.db") library("org.Hs.eg.db") inputFile="diff.txt" rt=read.table(inputFile,sep=" ",check.names=F,header=T) rt=rt[,c(1,2)]#提取基因名称和logFC genes=as.vector(rt[,1]) entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA) #找出基因对应的id,找到不则返回NA entrezIDs <- as.character(entrezIDs) #输出基因id的结果文件 out=cbind(rt,entrezID=entrezIDs) write.table(out,file="id.txt",sep=" ",quote=F,row.names=F)
三.GO富集分析
- 准备文件id.txt(来自二)
setwd("") #install.packages("colorspace") #install.packages("stringi") #install.packages("ggplot2") #if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("DOSE") #BiocManager::install("clusterProfiler") #BiocManager::install("enrichplot") library("clusterProfiler") library("org.Hs.eg.db") library("enrichplot") library("ggplot2") pvalueFilter=0.05 qvalueFilter=0.05 #矫正后的p值过滤条件(若结果太少,将qvalueFilter值改为1再筛选) rt=read.table("id.txt",sep=" ",header=T,check.names=F) rt=rt[is.na(rt[,"entrezID"])==F,] #去除基因id为NA的基因 gene=rt$entrezID geneFC=rt$logFC #geneFC=2^geneFC names(geneFC)=gene #定义颜色类型 colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } #GO富集分析 kk=enrichGO(gene = gene,OrgDb = org.Hs.eg.db, pvalueCutoff =1, qvalueCutoff = 1, ont="all", readable =T) GO=as.data.frame(kk) GO=GO[(GO$pvalue<pvalueFilter & GO$qvalue<qvalueFilter),] #保存富集结果 write.table(GO,file="GO.txt",sep=" ",quote=F,row.names = F) #定义显示Term数目 showNum=10 #bp、cc、mf各显示前十个 if(nrow(GO)<30){ #小于30个则全部显示 showNum=nrow(GO) } #柱状图 pdf(file="barplot.pdf",width = 9,height =7) bar=barplot(kk, drop = TRUE, showCategory =showNum,split="ONTOLOGY",color = colorSel) + facet_grid(ONTOLOGY~., scale='free') print(bar) dev.off() #气泡图 pdf(file="bubble.pdf",width = 9,height =7) bub=dotplot(kk,showCategory = showNum, orderBy = "GeneRatio",split="ONTOLOGY", color = colorSel) + facet_grid(ONTOLOGY~., scale='free') print(bub) dev.off() #圈图 pdf(file="circos.pdf",width = 9,height = 6.5) #修改高度和宽度将图形变为圆形 cnet=cnetplot(kk, foldChange=geneFC, showCategory = 5, circular = TRUE, colorEdge = TRUE) print(cnet) dev.off()