A previous post showed how random train/test splitting can be used to bump up the AUC of the ROC curve.
The gain is tiny, though, and it can take thousands or even tens of thousands of random splits to land on a decent value, so it treats the symptom rather than the cause.
To genuinely improve a model, you have to improve it at the source, which means working on three fronts: the data, the genes, and the modeling algorithm.
The data and the gene list are usually the product of painstaking screening and should not be thrown away lightly, so the modeling algorithm is the more practical place to start.
The test data in this post was dug out of an old folder on my computer (it may not even be the same disease); the main goal is to demonstrate how to validate a model on an external dataset.
👇 Let's first look at how the model performs 👇
Compared with the two conventional prognostic models, the improvement is far from marginal, which goes a long way toward easing the "swap the data, swap the genes" anxiety.
Enough talk, let's get to it 👇👇👇
1. Read the train and test data
library(survival)   #survival analysis: Surv(), coxph()
library(glmnet)     #lasso Cox regression
train=read.table("train.txt", header=T, sep="\t", check.names=F, row.names=1)
train$futime=train$futime/365   #convert survival time from days to years
test=read.table("test.txt", header=T, sep="\t", check.names=F, row.names=1)
test$futime=test$futime/365
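Both files are assumed to share the same layout: sample IDs as row names, survival time (futime) and status (fustat) in the first two columns, followed by the covariate columns. The sanity check below is optional and not part of the original workflow, but it catches mismatched columns before any modeling happens:
#optional sanity check: the test set must contain every column used for training
stopifnot(c("futime","fustat") %in% colnames(train))
missing_cols=setdiff(colnames(train), colnames(test))
if(length(missing_cols) > 0){
  stop("test data is missing columns: ", paste(missing_cols, collapse=", "))
}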
2. Univariate Cox analysis and forest plot
uniTab=data.frame()
sigclinical=c("futime","fustat")   #start with the survival columns; significant covariates get appended
for(i in colnames(train[,3:ncol(train)])){
  cox <- coxph(Surv(futime, fustat) ~ train[,i], data = train)
  coxSummary = summary(cox)
  coxP = coxSummary$coefficients[,"Pr(>|z|)"]
  if(coxP < 0.05){   #keep covariates that are significant in the univariate model
    sigclinical=c(sigclinical, i)
    uniTab=rbind(uniTab,
                 cbind(id=i,
                       HR=coxSummary$conf.int[,"exp(coef)"],
                       HR.95L=coxSummary$conf.int[,"lower .95"],
                       HR.95H=coxSummary$conf.int[,"upper .95"],
                       pvalue=coxSummary$coefficients[,"Pr(>|z|)"]))
  }
}
uniTab=na.omit(uniTab)   #drop rows with NA
write.table(uniTab,file="单因素Cox.txt",sep="\t",row.names=F,quote=F)
uniSigExp=train[,sigclinical]
uniSigExpID=cbind(id=row.names(train),uniSigExp)
write.table(uniSigExpID,file="单因素Cox筛选.txt",sep="\t",row.names=F,quote=F)
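Before plotting, it is worth confirming that the screen actually kept something: if no covariate passes p < 0.05, uniSigExp contains only futime and fustat and the multivariate fit in step 3 will fail. A small guard, not in the original code:
#guard: make sure at least one covariate survived the univariate screen
if(length(sigclinical) <= 2){
  stop("no covariate passed the univariate Cox screen (p < 0.05); consider relaxing the cutoff")
}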
coxFile="单因素Cox.txt"#单因素结果文件
forestFile="单因素Cox森林图.pdf"#单因素输出的图片文件名
forestCol="green"#设置森林图颜色
rt <- read.table(coxFile,header=T,sep="\t",row.names=1,check.names=F)
gene <- rownames(rt)
hr <- sprintf("%.3f",rt$"HR")
hrLow <- sprintf("%.3f",rt$"HR.95L")
hrHigh <- sprintf("%.3f",rt$"HR.95H")
Hazard.ratio <- paste0(hr,"(",hrLow,"-",hrHigh,")")
pVal <- ifelse(rt$pvalue<0.001, "<0.001", sprintf("%.3f", rt$pvalue))
#output the figure
pdf(file=forestFile)
n <- nrow(rt)
nRow <- n+1
ylim <- c(1,nRow)
layout(matrix(c(1,2), ncol=2), widths=c(3,2.5))
#left panel: variable names, p values and hazard ratios
xlim = c(0,3)
par(mar=c(4,2.5,2,1))
plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,xlab="",ylab="")
text.cex=0.8
text(0,n:1,gene,adj=0,cex=text.cex)
text(1.5-0.5*0.2, n:1, pVal, adj=1, cex=text.cex)
text(1.5-0.5*0.2, n+1, 'pvalue', cex=text.cex, font=2, adj=1)
text(3, n:1, Hazard.ratio, adj=1, cex=text.cex)
text(3, n+1, 'Hazard ratio', cex=text.cex, font=2, adj=1)
#right panel: the forest plot itself
par(mar=c(4,1,2,1),mgp=c(2,0.5,0))
xlim = c(0,max(as.numeric(hrLow),as.numeric(hrHigh)))
plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,ylab="",xaxs="i",xlab="Hazard ratio")
arrows(as.numeric(hrLow),n:1,as.numeric(hrHigh),n:1,angle=90,code=3,length=0.05,col="darkblue",lwd=2.5)
abline(v=1,col="black",lty=2,lwd=2)
boxcolor = forestCol   #single color for all points
points(as.numeric(hr), n:1, pch = 15, col = boxcolor, cex=1.3)
axis(1)
dev.off()
3. Multivariate Cox regression and forest plot
multiCox=coxph(Surv(futime, fustat) ~ ., data = uniSigExp)
multiCoxSum=summary(multiCox)
multiTab=cbind(
         HR=multiCoxSum$conf.int[,"exp(coef)"],
         HR.95L=multiCoxSum$conf.int[,"lower .95"],
         HR.95H=multiCoxSum$conf.int[,"upper .95"],
         pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])
multiTab=na.omit(multiTab)   #drop rows with NA
multiTabID=cbind(id=row.names(multiTab),multiTab)
write.table(multiTabID,file="多因素Cox.txt",sep="\t",row.names=F,quote=F)
coxFile="多因素Cox.txt"#多因素结果文件
forestFile="多因素Cox森林图.pdf"#单因素输出的图片文件名
forestCol="red"#设置森林图颜色
rt <- read.table(coxFile,header=T,sep="\t",row.names=1,check.names=F)
gene <- rownames(rt)
hr <- sprintf("%.3f",rt$"HR")
hrLow <- sprintf("%.3f",rt$"HR.95L")
hrHigh <- sprintf("%.3f",rt$"HR.95H")
Hazard.ratio <- paste0(hr,"(",hrLow,"-",hrHigh,")")
pVal <- ifelse(rt$pvalue<0.001, "<0.001", sprintf("%.3f", rt$pvalue))
#output the figure
pdf(file=forestFile)
n <- nrow(rt)
nRow <- n+1
ylim <- c(1,nRow)
layout(matrix(c(1,2), ncol=2), widths=c(3,2.5))
#left panel: variable names, p values and hazard ratios
xlim = c(0,3)
par(mar=c(4,2.5,2,1))
plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,xlab="",ylab="")
text.cex=0.8
text(0,n:1,gene,adj=0,cex=text.cex)
text(1.5-0.5*0.2, n:1, pVal, adj=1, cex=text.cex)
text(1.5-0.5*0.2, n+1, 'pvalue', cex=text.cex, font=2, adj=1)
text(3, n:1, Hazard.ratio, adj=1, cex=text.cex)
text(3, n+1, 'Hazard ratio', cex=text.cex, font=2, adj=1)
#right panel: the forest plot itself
par(mar=c(4,1,2,1),mgp=c(2,0.5,0))
xlim = c(0,max(as.numeric(hrLow),as.numeric(hrHigh)))
plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,ylab="",xaxs="i",xlab="Hazard ratio")
arrows(as.numeric(hrLow),n:1,as.numeric(hrHigh),n:1,angle=90,code=3,length=0.05,col="darkblue",lwd=2.5)
abline(v=1,col="black",lty=2,lwd=2)
boxcolor = forestCol   #single color for all points
points(as.numeric(hr), n:1, pch = 15, col = boxcolor, cex=1.3)
axis(1)
dev.off()
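The multivariate model relies on the proportional-hazards assumption, which survival::cox.zph can check in one line. This diagnostic is an optional addition, not part of the original post:
#optional diagnostic: test the proportional-hazards assumption of the multivariate model
ph_test=cox.zph(multiCox)
print(ph_test)   #a small p value flags a covariate that violates the assumption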
4. Compute risk scores on the train and test sets
mulCoxscore_train=predict(multiCox, newdata = train, type="risk")   #risk score on the training set
mulCoxscore_test=predict(multiCox, newdata = test, type="risk")     #risk score on the test set
mulCoxtrain_data=data.frame(uniSigExp,riskscore=mulCoxscore_train)
write.table(mulCoxtrain_data,file = "Cox_train_risk.txt",quote = F,sep = "\t")
uniSigExp_test=test[,sigclinical]
mulCoxtest_data=data.frame(uniSigExp_test,riskscore=mulCoxscore_test)
write.table(mulCoxtest_data,file = "Cox_test_risk.txt",quote = F,sep = "\t")
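A quick way to quantify how well the Cox risk score separates outcomes on both cohorts is the concordance index. This check is an addition, not part of the original code; it uses only the concordance that coxph itself reports:
#C-index of the Cox risk score on the training and test cohorts (optional check)
c_train=summary(coxph(Surv(futime, fustat) ~ riskscore, data = mulCoxtrain_data))$concordance
c_test=summary(coxph(Surv(futime, fustat) ~ riskscore, data = mulCoxtest_data))$concordance
print(c_train)   #concordance and its standard error on the train set
print(c_test)    #concordance and its standard error on the test set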
5. Build the lasso Cox model
x=as.matrix(uniSigExp[,c(3:ncol(uniSigExp))])   #covariate matrix: everything after futime and fustat
y=data.matrix(Surv(uniSigExp$futime,uniSigExp$fustat))
fit=glmnet(x, y, family = "cox", maxit = 1000)
pdf("lasso_lambda.pdf")
plot(fit, xvar = "lambda", label = TRUE)
dev.off()
cvfit=cv.glmnet(x, y, family="cox", maxit = 1000)
pdf(file="lasso_cvfit.pdf",width=6,height=5.5)
plot(cvfit)
dev.off()
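Step 6 below extracts coefficients at lambda.min; cv.glmnet also reports lambda.1se, a more conservative choice that usually keeps fewer genes. Swapping it in is a one-line change and the rest of step 6 works unchanged:
#alternative: the more parsimonious lambda.1se instead of lambda.min
coef_1se=coef(cvfit, s = "lambda.1se")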
6. Compute lasso risk scores on the train and test sets
coef=coef(fit, s = cvfit$lambda.min)   #coefficients at the cross-validated lambda.min
index=which(coef != 0)
actCoef=coef[index]
lasso_clinical=row.names(coef)[index]
sames=intersect(lasso_clinical,colnames(train))
lassoout0=train[,sames,drop=F]
lassoout1=test[,sames,drop=F]
lassoout=data.frame(futime=uniSigExp$futime,fustat=uniSigExp$fustat,lassoout0)
myFun=function(x){crossprod(as.numeric(x),actCoef)}
lassoScore=apply(lassoout0, 1, myFun)        #risk score on the training set
lassoScore_test=apply(lassoout1, 1, myFun)   #risk score on the test set
outCol=c("futime","fustat",lasso_clinical)
outTab=cbind(uniSigExp[,outCol],riskScore=as.vector(lassoScore))
outTab_test=cbind(test[,outCol],riskScore=as.vector(lassoScore_test))
write.table(outTab,file="lass_Risk_train.txt",sep="\t",quote=F,row.names=F)
write.table(outTab_test,file = "lasso_Risk_test.txt",sep = "\t",quote = F,row.names = F)
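The same concordance check used for the Cox score in step 4 applies to the lasso score, which makes it easy to compare the two models on the external test cohort. A short sketch (the riskScore column name matches the table written just above):
#compare the lasso risk score's discrimination on the external test data
c_lasso_test=summary(coxph(Surv(futime, fustat) ~ riskScore, data = outTab_test))$concordance
print(c_lasso_test)   #higher concordance means better discrimination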
7. Build a random forest model
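What follows is only a minimal sketch of how a survival random forest could be fitted on the same screened covariates, assuming the randomForestSRC package; it is an illustration, not the post's own code:
#minimal sketch, assuming randomForestSRC; not taken from the original post
library(randomForestSRC)
rf_fit=rfsrc(Surv(futime, fustat) ~ ., data = uniSigExp, ntree = 1000)
rfScore_train=rf_fit$predicted.oob                                      #out-of-bag ensemble mortality on the train set
rfScore_test=predict(rf_fit, newdata = test[,sigclinical])$predicted    #risk score on the test set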