机器学习与预后模型会碰撞出怎样的火花

叉叉滴同学 叉叉滴同学的生信笔记 2023-05-31 07:01 发表于广西

之前介绍了利用随机分组提高ROC曲线的AUC值

👉随机分组组内验证

👉预后模型组内验证

👉高ROC真的是天选之子吗?

但其提高的效果甚是微小,有时候得几千甚至上万次才能随机出一个理想的数值,因此这也是个治标不治本的方法

因此,想要切实提高模型的质量,就应该从根本上提高

即应从——数据、基因、模型算法,这三个方面入手去解决问题

然而,数据和基因基本上是经过同学们千筛万选得到的,万万不可轻易舍弃。

因此,从模型算法入手便成了一个更为理想的解决方案

本推文的test数据是从电脑里临时翻出来的(可能疾病都不一样)主要目的是教会同学们利用外部数据进行验证的方法

👇我们先来看看建模的效果👇

图片

图片

图片

图片


可见,相比于两个传统的预后模型,这个模型的效果提升不是一点半点,很大程度上解决了“换数据”“换基因”的焦虑

废话不多说,整👇👇👇

1.读取train和test数据

train=read.table("train.txt", header=T, sep="\t", check.names=F,row.names = 1)train$futime=train$futime/365
test=read.table("test.txt",header = T,sep = "\t",check.names = F,row.names = 1)test$futime=test$futime/365

图片

图片


2.单因素Cox分析并绘制森林图

uniTab=data.frame()sigclinical=c("futime","fustat")for(i in colnames(train[,3:ncol(train)])){   cox <- coxph(Surv(futime, fustat )~ train[,i], data = train)   coxSummary = summary(cox)  coxP=coxSummary$coefficients[,"Pr(>|z|)"]  if(coxP<0.05){ #筛选具有统计学意义的单因素结果    sigclinical=c(sigclinical,i)    uniTab=rbind(uniTab,                 cbind(id=i,                       HR=coxSummary$conf.int[,"exp(coef)"],                       HR.95L=coxSummary$conf.int[,"lower .95"],                       HR.95H=coxSummary$conf.int[,"upper .95"],                       pvalue=coxSummary$coefficients[,"Pr(>|z|)"])    )  }}uniTab=na.omit(uniTab)#去除NA数据write.table(uniTab,file="单因素Cox.txt",sep="\t",row.names=F,quote=F)uniSigExp=train[,sigclinical]uniSigExpID=cbind(id=row.names(train),uniSigExp)write.table(uniSigExpID,file="单因素Cox筛选.txt",sep="\t",row.names=F,quote=F)coxFile="单因素Cox.txt"#单因素结果文件forestFile="单因素Cox森林图.pdf"#单因素输出的图片文件名forestCol="green"#设置森林图颜色rt <- read.table(coxFile,header=T,sep="\t",row.names=1,check.names=F)gene <- rownames(rt)hr <- sprintf("%.3f",rt$"HR")hrLow  <- sprintf("%.3f",rt$"HR.95L")hrHigh <- sprintf("%.3f",rt$"HR.95H")Hazard.ratio <- paste0(hr,"(",hrLow,"-",hrHigh,")")pVal <- ifelse(rt$pvalue<0.001, "<0.001", sprintf("%.3f", rt$pvalue))#输出图形pdf(file=forestFile)n <- nrow(rt)nRow <- n+1ylim <- c(1,nRow)layout(matrix(c(1,2),nc=2),width=c(3,2.5))#绘制森林图左边的临床信息xlim = c(0,3)par(mar=c(4,2.5,2,1))plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,xlab="",ylab="")text.cex=0.8text(0,n:1,gene,adj=0,cex=text.cex)text(1.5-0.5*0.2,n:1,pVal,adj=1,cex=text.cex);text(1.5-0.5*0.2,n+1,'pvalue',cex=text.cex,font=2,adj=1)text(3,n:1,Hazard.ratio,adj=1,cex=text.cex);text(3,n+1,'Hazard ratio',cex=text.cex,font=2,adj=1,)#绘制森林图par(mar=c(4,1,2,1),mgp=c(2,0.5,0))xlim = c(0,max(as.numeric(hrLow),as.numeric(hrHigh)))plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,ylab="",xaxs="i",xlab="Hazard ratio")arrows(as.numeric(hrLow),n:1,as.numeric(hrHigh),n:1,angle=90,code=3,length=0.05,col="darkblue",lwd=2.5)abline(v=1,col="black",lty=2,lwd=2)boxcolor = ifelse(as.numeric(hr) > 1, forestCol, forestCol)points(as.numeric(hr), n:1, pch = 15, col = boxcolor, cex=1.3)axis(1)dev.off()

图片


3.构建多因素Cox回归模型并绘制森林图

multiCox=coxph(Surv(futime, fustat )~ ., data = uniSigExp)multiCoxSum=summary(multiCox)multiTab=data.frame()multiTab=cbind(  HR=multiCoxSum$conf.int[,"exp(coef)"],  HR.95L=multiCoxSum$conf.int[,"lower .95"],  HR.95H=multiCoxSum$conf.int[,"upper .95"],  pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])multiTab=na.omit(multiTab)#删除NA值multiTabID=cbind(id=row.names(multiTab),multiTab)write.table(multiTabID,file="多因素Cox.txt",sep="\t",row.names=F,quote=F)coxFile="多因素Cox.txt"#多因素结果文件forestFile="多因素Cox森林图.pdf"#单因素输出的图片文件名forestCol="red"#设置森林图颜色rt <- read.table(coxFile,header=T,sep="\t",row.names=1,check.names=F)gene <- rownames(rt)hr <- sprintf("%.3f",rt$"HR")hrLow  <- sprintf("%.3f",rt$"HR.95L")hrHigh <- sprintf("%.3f",rt$"HR.95H")Hazard.ratio <- paste0(hr,"(",hrLow,"-",hrHigh,")")pVal <- ifelse(rt$pvalue<0.001, "<0.001", sprintf("%.3f", rt$pvalue))#输出图形pdf(file=forestFile)n <- nrow(rt)nRow <- n+1ylim <- c(1,nRow)layout(matrix(c(1,2),nc=2),width=c(3,2.5))#绘制森林图左边的临床信息xlim = c(0,3)par(mar=c(4,2.5,2,1))plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,xlab="",ylab="")text.cex=0.8text(0,n:1,gene,adj=0,cex=text.cex)text(1.5-0.5*0.2,n:1,pVal,adj=1,cex=text.cex);text(1.5-0.5*0.2,n+1,'pvalue',cex=text.cex,font=2,adj=1)text(3,n:1,Hazard.ratio,adj=1,cex=text.cex);text(3,n+1,'Hazard ratio',cex=text.cex,font=2,adj=1,)#绘制森林图par(mar=c(4,1,2,1),mgp=c(2,0.5,0))xlim = c(0,max(as.numeric(hrLow),as.numeric(hrHigh)))plot(1,xlim=xlim,ylim=ylim,type="n",axes=F,ylab="",xaxs="i",xlab="Hazard ratio")arrows(as.numeric(hrLow),n:1,as.numeric(hrHigh),n:1,angle=90,code=3,length=0.05,col="darkblue",lwd=2.5)abline(v=1,col="black",lty=2,lwd=2)boxcolor = ifelse(as.numeric(hr) > 1, forestCol, forestCol)points(as.numeric(hr), n:1, pch = 15, col = boxcolor, cex=1.3)axis(1)dev.off()

图片


4.分别计算train和test的风险评分

mulCoxscore_train=predict(multiCox, newdata = train,type="risk")#计算train风险评分mulCoxscore_test=predict(multiCox, newdata = test,type="risk")#计算test风险评分mulCoxtrain_data=data.frame(uniSigExp,riskscore=mulCoxscore_train)write.table(mulCoxtrain_data,file = "Cox_train_risk.txt",quote = F,sep = "\t")uniSigExp_test=test[,sigclinical]mulCoxtest_data=data.frame(uniSigExp_test,riskscore=mulCoxscore_test)write.table(mulCoxtest_data,file = "Cox_test_risk.txt",quote = F,sep = "\t")

图片

图片


5.构建Lasso回归模型

x=as.matrix(uniSigExp[,c(3:ncol(uniSigExp))])y=data.matrix(Surv(uniSigExp$futime,uniSigExp$fustat))fit=glmnet(x, y, family = "cox", maxit = 1000)pdf("lasso_lambda.pdf")plot(fit, xvar = "lambda", label = TRUE)dev.off()cvfit=cv.glmnet(x, y, family="cox", maxit = 1000)pdf(file="lasso_cvfit.pdf",width=6,height=5.5)plot(cvfit)dev.off()

图片

图片


6.分别计算train和test的风险评分

coef=coef(fit, s = cvfit$lambda.min)index=which(coef != 0)actCoef=coef[index]lasso_clinical=row.names(coef)[index]sames=intersect(lasso_clinical,colnames(train))lassoout0=train[,sames,drop=F]lassoout1=test[,sames,drop=F]lassoout=data.frame(futime=uniSigExp$futime,fustat=uniSigExp$fustat,lassoout0)myFun=function(x){crossprod(as.numeric(x),actCoef)}lassoScore=apply(lassoout0,1,myFun)#计算train风险评分lassoScore_test=apply(lassoout1,1,myFun)#计算test风险评分outCol=c("futime","fustat",lasso_clinical)outTab=cbind(uniSigExp[,outCol],riskScore=as.vector(lassoScore))outTab_test=cbind(test[,outCol],riskScore=as.vector(lassoScore_test))write.table(outTab,file="lass_Risk_train.txt",sep="\t",quote=F,row.names=F)write.table(outTab_test,file = "lasso_Risk_test.txt",sep = "\t",quote = F,row.names = F)

图片

图片


7.构建随机森林模型

1元 = 10微信豆

可试读47%

1元 = 10微信豆

微信扫一扫
关注该公众号