R语言批量计算一个基因的表达相关基因

代码不一定是最优化的，根据实际情况修改使用

rm(list = ls())
library(dplyr)
##读取基因ID，Ensembl ID注释文件
idmap <- read.delim("G:\\GTeX TCGA/gencode.v23.annotation.gene.probemap",as.is=T)

#读取表达矩阵
rt <- data.table::fread("G:/GTeX TCGA/tcga_RSEM_gene_tpm/split/LUAD.csv",data.table = F)
rt <- as.matrix(rt) #变为matrix
rownames(rt) <- rt[,1]
exp <-as.data.frame( rt[,2:ncol(rt)]) 
exp <- apply(exp, 2, as.numeric)
rownames(exp) <- rownames(rt) 
exp <- exp[,substr(colnames(exp),14,15)<10]
#将表达为0的样本赋值为NA
exp[exp== -9.9658]<-NA
# 求相关性
target <- "FBXO22"
#ID转换
gene <- idmap$id[which(idmap$gene==target)]
outTab=data.frame()
for (i in rownames(exp)){
  x=exp[gene,]
  y=exp[i,]
  cordata<- data.frame(x=as.numeric(x),y=as.numeric(y)) %>% na.omit()
  if (sd(cordata$y)==0|nrow(cordata)<10){
    next
  }
  corT=cor.test(cordata$x,cordata$y)
  cor=corT$estimate
  cor=round(cor,3)
  pvalue=as.numeric(corT$p.value)
  if(pvalue<0.001){
    pvalue=format(pvalue,scientific = TRUE)
  }else{
    pvalue=round(pvalue,3)
  }

    outTab=rbind(outTab,cbind(lncRNA=gene,mRNA=i,cor,pvalue))
}
##合并 需要，换一下列名
colnames(outTab)[2] <- "Ensembl"
outTab$Ensembl <- substr(outTab$Ensembl,1,15)
##另一个整理好的注释文件，包含RNAleixing
biomart_table <- readRDS(paste0("E:/apps/9. IDconvert/app_data/biomart_table_","homo",".Rds"))
outTab <- as.data.frame(outTab)
#ID转化
converted<-biomart_table %>%
  filter_all(., any_vars(. %in% outTab[,2])) %>%
  distinct(Ensembl, .keep_all = TRUE) %>%
  arrange(Ensembl)
conv.data<-merge(outTab,biomart_table,by="Ensembl")[-1]
##保存数据
write.csv(conv.data,paste0(choose.files(),"/",target,"_cor.csv"))

将上述代码包装成函数：

corr_tcga <- function(tcga="LUAD",target="TP53"){
  rt <- data.table::fread(paste0("G:/GTeX TCGA/tcga_RSEM_gene_tpm/split/",tcga,".csv"),data.table = F)
  rt <- as.matrix(rt) #变为matrix
  rownames(rt) <- rt[,1]
  exp <-as.data.frame( rt[,2:ncol(rt)]) 
  exp <- apply(exp, 2, as.numeric)
  rownames(exp) <- rownames(rt) 
  exp <- exp[,substr(colnames(exp),14,15)<10]
  exp[exp== -9.9658]<-NA
  # 求相关性
  gene <- idmap$id[which(idmap$gene==target)]
  outTab=data.frame()
  for (i in rownames(exp)){
    x=exp[gene,]
    y=exp[i,]
    cordata<- data.frame(x=as.numeric(x),y=as.numeric(y)) %>% na.omit()
    if (sd(cordata$y)==0|nrow(cordata)<10){
      next
    }
    corT=cor.test(cordata$x,cordata$y)
    cor=corT$estimate
    cor=round(cor,3)
    pvalue=as.numeric(corT$p.value)
    if(pvalue<0.001){
      pvalue=format(pvalue,scientific = TRUE)
    }else{
      pvalue=round(pvalue,3)
    }
    
    outTab=rbind(outTab,cbind(Target=target,Ensembl=i,cor,pvalue))
  }
  outTab$Ensembl <- substr(outTab$Ensembl,1,15)
  biomart_table <- readRDS(paste0("E:/apps/9. IDconvert/app_data/biomart_table_","homo",".Rds"))
  outTab <- as.data.frame(outTab)
  converted<-biomart_table %>%
    filter_all(., any_vars(. %in% outTab[,2])) %>%
    distinct(Ensembl, .keep_all = TRUE) %>%
    arrange(Ensembl)
  conv.data<-merge(outTab,biomart_table,by="Ensembl")[-1]
}
conv.data <- corr_tcga("LUSC","FBXO22")
write.csv(conv.data,choose.files())

打赏赞(4)

6 Replies to “R语言批量计算一个基因的表达相关基因”

请问如何在一个聚类分析后的rds文件中增加ENSEMBL_ID信息？

进哥哥说道：

2022-10-06 10:22

您的意思是要需要一列gene symbol转换后的ENSEMbl id?使用GTF文件或使用biomart包进行ID转换

回复

您好，请问第44行的“biomart_table <- readRDS(paste0("E:/apps/9. IDconvert/app_data/biomart_table_","homo",".Rds"))”中提到的文件是什么？
如果方便，还请您进行解释，谢谢。

进哥哥说道：

2022-08-08 11:20

您好，这个文件是用来进行Ensembl ID–> Gene ID 转化的，你也可以使用其他方式进行转化，我这个文件是从从biomart包整理导出的，需要的话我发你。
代码有些地方不是最合适的，你自己根据实际情况修改

回复
1. 空巷上的孤魂说道：
  
  2022-08-08 11:24
  
  麻烦您发一份给我吧，谢谢您。代码已经非常的完善了，需要修改的地方非常的少了。感谢您的分享，让我受益匪浅，特别的实用。
  
  回复
  1. 进哥哥说道：
    
    2022-08-08 11:37
    
    已发邮箱，有问题继续讨论
    
    回复

6 Replies to “R语言批量计算一个基因的表达相关基因”

发表评论 取消回复

发表评论取消回复