Space of a muser

Insight

Enjoy life, relish the moment!


Survival analysis using R and Python

生存期分析在疾病研究中十分重要这自然不必说。目前有很多工具支持survival analysis,比如R中的survival包,Python的lifelines。我对Python更加熟悉,所以更倾向于使用lifelines,lifelines提供了Cox regression和kaplan-meier plot,十分方便。但是R的survival也是相当常用的,网上可以找到很多这种例子,以下代码实例来自生物技能树论坛

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
rm(list=ls())
setwd('./TCGA')
###读取自己的TCGA数据
dat=read.csv('3expforRsuoxiao.csv', header =TRUE, row.names= 1, sep = ",")
rt=read.csv('3osforR.csv', header =TRUE, row.names= 1, sep = ",")
os=rt[,2##(时间)
STATUS=rt[,1] ###(生存的event)
log_rank_p <- apply(dat,  1, function(values1){  
## for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns.
  group=ifelse(values1>median(values1),'high','low'##分高低组  ifelse(test, yes, no)test, an object which can be coerced to logical mode.
  survival_dat <- data.frame(group=group,os=os,STATUS=STATUS,stringsAsFactors = F##行,列的生存时间, 即 基因表达值分成了高低组,和相应
  library(survival)###表格可以任意添加参数,如性别,年龄等。
  my.surv <- Surv(survival_dat$os,survival_dat$STATUS)  ###my.surv <- Surv(OS_MONTHS,OS_STATUS=='LIVING') 
  ##这个Surv函数第一个参数必须是数值型的时间,第二个参数是逻辑向量,1,0表示死亡与否
  kmfit2 <- survfit(my.surv~survival_dat$group)  ##### 生存数据对象~表达高低 创建KM生存曲线 $绝对引用?
  data.survdiff=survdiff(my.surv~group,data = survival_dat) ##data = survival_dat 单个的表达高低与生存数据对象
  ##用于不同组的统计检验 Two-sample test  survdiff(Surv(futime, fustat) ~ rx,data=ovarian)#生存数据对象~表达高低
  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
})
names(log_rank_p[log_rank_p<0.05])

Here

第二个以TCGA中的Pancreatic Adenocarcinoma样本为例的surival分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
####teach code####
library(survival)
#read expression data and modify its class and colnames/rownames

#PAAD_gene_expression.csv数据已经经过Z_转换的数据。

z_rna <- read.csv(file="PAAD_gene_expression.csv",header = T,row.names = 1)

#clincal是所有样本的临床信息
clinical <- t(read.table("PAAD.merged_only_clinical_clin_format.txt",
 header=T, row.names=1, sep="\t", stringsAsFactors = F))

clinical <- as.data.frame(clinical)
#transform rna data's colnames and rownames
z_rna <- as.matrix(z_rna)
colnames(z_rna) <- gsub("\\.","-",colnames(z_rna))
rownames(z_rna) <- sapply(rownames(z_rna), 
 function(x) unlist(strsplit(x,"\\|"))[[1]])


#check the type of cancer data
#table(substr(colnames(z_rna),14,15))
#grep patient ID in clinical data
clinical$IDs <- toupper(clinical$patient.bcr_patient_barcode)
sum(clinical$IDs %in% colnames(z_rna))# we have 178 patients that we could use
#grep information for survival

ind_keep <- grep("days_to_death",colnames(clinical))
death <- as.matrix(clinical[,ind_keep])
death_collapsed <- apply(death,1,FUN = function(x)max(x,na.rm = T)) #提取最大值
death_collapsed<- as.numeric(death_collapsed)

#last follow_up information
ind_keep <- grep("days_to_last_followup",colnames(clinical))
fl <- as.matrix(clinical[,ind_keep])
fl_collapsed <- apply(fl,1,FUN = function(x)max(x,na.rm = T))
fl_collapsed <- as.numeric(fl_collapsed)


#construct a new dataframe containing all information
all_clin <- data.frame(death_collapsed,fl_collapsed)
colnames(all_clin) <- c("death_days", "followUp_days")
all_clin$death_event <- ifelse(clinical$patient.vital_status == "alive", 0,1)
all_clin$new_death <- apply(all_clin,1,function(x)ifelse(x[3]==0,x[2],x[1]))
table(clinical$patient.vital_status)
rownames(all_clin) <- clinical$IDs
##connect to RNA-seq
# create event vector for RNASeq data
event_rna <- t(apply(z_rna, 1, function(x) ifelse(abs(x) > 1.96,1,0)))
in_clin_tum <- match(colnames(z_rna),rownames(all_clin))
tumor_clinical <- all_clin[in_clin_tum,]
#example gene
ind_gene <- which(rownames(z_rna) == "TP53")
table(event_rna[ind_gene,])


###survival analysis###


#over_all survival
su_OS <- Surv(as.numeric(tumor_clinical$new_death),
 as.numeric(tumor_clinical$death_event))
fit_OS <- survfit(su_OS~event_rna[ind_gene,])
p <- survdiff(su_OS~event_rna[ind_gene,])
pv <- ifelse(is.na(p),next,(round(1 - pchisq(p$chisq, length(p$n) - 1),3)))[[1]]
x1 <- ifelse(is.na(as.numeric(summary(fit_OS)$table[,'median'][1])),
 "NA", as.numeric(summary(fit_OS)$table[,'median'][1]))
x2 <- as.numeric(summary(fit_OS)$table[,'median'][2])

#get necessary information
summary(fit_OS)
summary(fit_OS)$table[,'median']
summary(fit_OS,times = 365)
#plot survival curve

plot(fit_OS,conf.int=FALSE,col=c("black","red"),
 xlab="Days",ylab = "Proportion Survival")
legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)
legend(max(as.numeric(as.character(all_clin$death_days)[in_clin_tum]),
 na.rm = T)*0.7,0.94,
legend=c(paste("NotAltered=",x1),paste("Altered=",x2)),
 bty="n", cex=1.3, lwd=3, col=c("black", "red"))
最近的文章

Math equation test

于  Math
comments powered by Disqus