生存期分析在疾病研究中十分重要这自然不必说。目前有很多工具支持survival analysis,比如R中的survival包,Python的lifelines。我对Python更加熟悉,所以更倾向于使用lifelines,lifelines提供了Cox regression和kaplan-meier plot,十分方便。但是R的survival也是相当常用的,网上可以找到很多这种例子,以下代码实例来自生物技能树论坛
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | rm(list=ls()) setwd('./TCGA') ###读取自己的TCGA数据 dat=read.csv('3expforRsuoxiao.csv', header =TRUE, row.names= 1, sep = ",") rt=read.csv('3osforR.csv', header =TRUE, row.names= 1, sep = ",") os=rt[,2] ##(时间) STATUS=rt[,1] ###(生存的event) log_rank_p <- apply(dat, 1, function(values1){ ## for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns. group=ifelse(values1>median(values1),'high','low') ##分高低组 ifelse(test, yes, no)test, an object which can be coerced to logical mode. survival_dat <- data.frame(group=group,os=os,STATUS=STATUS,stringsAsFactors = F) ##行,列的生存时间, 即 基因表达值分成了高低组,和相应 library(survival)###表格可以任意添加参数,如性别,年龄等。 my.surv <- Surv(survival_dat$os,survival_dat$STATUS) ###my.surv <- Surv(OS_MONTHS,OS_STATUS=='LIVING') ##这个Surv函数第一个参数必须是数值型的时间,第二个参数是逻辑向量,1,0表示死亡与否 kmfit2 <- survfit(my.surv~survival_dat$group) ##### 生存数据对象~表达高低 创建KM生存曲线 $绝对引用? data.survdiff=survdiff(my.surv~group,data = survival_dat) ##data = survival_dat 单个的表达高低与生存数据对象 ##用于不同组的统计检验 Two-sample test survdiff(Surv(futime, fustat) ~ rx,data=ovarian)#生存数据对象~表达高低 p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1) }) names(log_rank_p[log_rank_p<0.05]) |
第二个以TCGA中的Pancreatic Adenocarcinoma样本为例的surival分析
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | ####teach code#### library(survival) #read expression data and modify its class and colnames/rownames #PAAD_gene_expression.csv数据已经经过Z_转换的数据。 z_rna <- read.csv(file="PAAD_gene_expression.csv",header = T,row.names = 1) #clincal是所有样本的临床信息 clinical <- t(read.table("PAAD.merged_only_clinical_clin_format.txt", header=T, row.names=1, sep="\t", stringsAsFactors = F)) clinical <- as.data.frame(clinical) #transform rna data's colnames and rownames z_rna <- as.matrix(z_rna) colnames(z_rna) <- gsub("\\.","-",colnames(z_rna)) rownames(z_rna) <- sapply(rownames(z_rna), function(x) unlist(strsplit(x,"\\|"))[[1]]) #check the type of cancer data #table(substr(colnames(z_rna),14,15)) #grep patient ID in clinical data clinical$IDs <- toupper(clinical$patient.bcr_patient_barcode) sum(clinical$IDs %in% colnames(z_rna))# we have 178 patients that we could use #grep information for survival ind_keep <- grep("days_to_death",colnames(clinical)) death <- as.matrix(clinical[,ind_keep]) death_collapsed <- apply(death,1,FUN = function(x)max(x,na.rm = T)) #提取最大值 death_collapsed<- as.numeric(death_collapsed) #last follow_up information ind_keep <- grep("days_to_last_followup",colnames(clinical)) fl <- as.matrix(clinical[,ind_keep]) fl_collapsed <- apply(fl,1,FUN = function(x)max(x,na.rm = T)) fl_collapsed <- as.numeric(fl_collapsed) #construct a new dataframe containing all information all_clin <- data.frame(death_collapsed,fl_collapsed) colnames(all_clin) <- c("death_days", "followUp_days") all_clin$death_event <- ifelse(clinical$patient.vital_status == "alive", 0,1) all_clin$new_death <- apply(all_clin,1,function(x)ifelse(x[3]==0,x[2],x[1])) table(clinical$patient.vital_status) rownames(all_clin) <- clinical$IDs ##connect to RNA-seq # create event vector for RNASeq data event_rna <- t(apply(z_rna, 1, function(x) ifelse(abs(x) > 1.96,1,0))) in_clin_tum <- match(colnames(z_rna),rownames(all_clin)) tumor_clinical <- all_clin[in_clin_tum,] #example gene ind_gene <- which(rownames(z_rna) == "TP53") table(event_rna[ind_gene,]) ###survival analysis### #over_all survival su_OS <- Surv(as.numeric(tumor_clinical$new_death), as.numeric(tumor_clinical$death_event)) fit_OS <- survfit(su_OS~event_rna[ind_gene,]) p <- survdiff(su_OS~event_rna[ind_gene,]) pv <- ifelse(is.na(p),next,(round(1 - pchisq(p$chisq, length(p$n) - 1),3)))[[1]] x1 <- ifelse(is.na(as.numeric(summary(fit_OS)$table[,'median'][1])), "NA", as.numeric(summary(fit_OS)$table[,'median'][1])) x2 <- as.numeric(summary(fit_OS)$table[,'median'][2]) #get necessary information summary(fit_OS) summary(fit_OS)$table[,'median'] summary(fit_OS,times = 365) #plot survival curve plot(fit_OS,conf.int=FALSE,col=c("black","red"), xlab="Days",ylab = "Proportion Survival") legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4) legend(max(as.numeric(as.character(all_clin$death_days)[in_clin_tum]), na.rm = T)*0.7,0.94, legend=c(paste("NotAltered=",x1),paste("Altered=",x2)), bty="n", cex=1.3, lwd=3, col=c("black", "red")) |