In a previous post we walked through feature selection, model building, and model evaluation with several machine learning algorithms for survival data. Today we look at the same workflow for a binary outcome: feature selection with several machine learning algorithms, 10-fold cross-validation, evaluation of predictive performance, variable-importance assessment, and how to pick out the key variables for downstream analysis. This post covers:
1. One-hot encoding of the categorical variables in the dataset;
2. Random forest for the classification task: model fitting, 10-fold cross-validation, model evaluation, variable-importance assessment, and ggplot2 visualization of the relationship between the number of trees and the OOB error;
3. LASSO regression for the classification task: model fitting, 10-fold cross-validation, model evaluation, and variable-importance assessment; more importantly, using ggplot2 to visualize the coefficient-shrinkage path, for a more customizable figure;
4. XGBoost for the classification task: model fitting, 10-fold cross-validation, model evaluation, and variable-importance assessment; more importantly, building on XGBoost with SHAP visualization to turn the "black-box" model into an interpretable one;
5. Taking the intersection of the variables selected by the three algorithms, for use in downstream modeling and analysis.
The figures produced in this post are shown below.
Random forest: feature selection, test-set evaluation after 10-fold cross-validation (simulated data; not representative of real-world performance), and visualization of the relationship between the number of trees and the OOB error
ggplot2 visualization of the coefficient-shrinkage path of the LASSO regression, for a more polished figure
SHAP visualization based on XGBoost, so the machine learning model is no longer a black box
Dependencies and data loading

Random forest for feature selection in the binary classification task

library(tidyverse)     # data manipulation
library(ggplot2)       # plotting
library(ggvenn)        # Venn diagrams
library(pROC)          # ROC curves
library(RColorBrewer)  # color palettes
library(caret)         # data splitting and cross-validation
library(randomForest)  # random forest
library(glmnet)        # LASSO
library(xgboost)       # XGBoost

# Load the data
load(file = "MLinputdata.Rda")
data$Event <- as.factor(data$Event)  # outcome event as a factor

# One-hot encode the two categorical variables
encoded_vars <- model.matrix(~ Age + BMI - 1, data = data)
# Add the encoded variables back to the original data frame
encoded_df <- cbind(data[, !names(data) %in% c("Age", "BMI")], encoded_vars)

# Split the data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(encoded_df$Event, p = .7, list = FALSE, times = 1)
trainData <- encoded_df[trainIndex, ]
testData  <- encoded_df[-trainIndex, ]

# Fit the model
fit.rf <- randomForest(Event ~ ., data = trainData, importance = TRUE, ntree = 500)

# 10-fold cross-validation
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
rf_model_cv <- train(Event ~ ., data = trainData, method = "rf", trControl = ctrl)

# Model evaluation on the test set
rf_pred <- predict(rf_model_cv, newdata = testData, type = "prob")[, 2]
rf_roc <- roc(testData$Event, rf_pred)
auc_value <- auc(rf_roc)
p1 <- ggroc(rf_roc, legacy.axes = TRUE, size = 1, color = "#69b3a2") +
  ggtitle("ROC curve of Randomforest") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  theme_bw() +
  annotate(geom = "text", label = paste0("AUC in test set:", round(auc_value, 3)),
           x = 0.7, y = 0.05)
ggsave("RFroc.png", plot = p1, width = 4, height = 4, units = 'in', dpi = 600)
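If model.matrix is new to you, here is a tiny toy illustration (the factor f below is made up for demonstration and is not part of our dataset) of how the ~ f - 1 formula expands a factor into one 0/1 dummy column per level:

# Toy example only: one dummy column per factor level, no intercept
toy <- data.frame(f = factor(c("low", "high", "low")))
model.matrix(~ f - 1, data = toy)
#   fhigh flow
# 1     0    1
# 2     1    0
# 3     0    1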
Let's look at the variable-importance plot that randomForest provides out of the box:

varImpPlot(fit.rf)
This ranks the features by two importance measures (permutation-based mean decrease in accuracy and mean decrease in Gini impurity); below we polish it further.

rf.imp <- importance(fit.rf) %>% as.data.frame()
rf.imp %>%
  mutate(Variable = rownames(rf.imp)) %>%
  arrange(desc(MeanDecreaseAccuracy)) -> rf.imp

# Top 15 features
top_vars <- rf.imp[1:15, ]
p2 <- ggplot(top_vars, aes(y = reorder(Variable, -MeanDecreaseAccuracy),
                           x = MeanDecreaseAccuracy)) +
  geom_bar(stat = "identity", fill = "#69b3a2", alpha = 0.8, width = 0.6) +
  labs(title = "Variable Importance Plot of RF", x = "Importance Score", y = "") +
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.9))
ggsave("RFvip.png", plot = p2, width = 6, height = 4, units = 'in', dpi = 600)
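Since step 5 will intersect the variables picked by the three algorithms, it is handy to store the top-15 names now; the vector name rf_vars is our own choice for illustration:

# Keep the top-15 RF variable names for the Venn intersection later
rf_vars <- top_vars$Variable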
Next, let's look at the relationship between the number of trees and the OOB error:

plot(fit.rf)
The default plot is rather plain, so we extract the underlying data and plot it ourselves.

# Extract the tree-number vs. error-rate values
err_rates <- fit.rf[["err.rate"]]
err_rates %>%
  as.data.frame() %>%
  mutate(Tree = 1:500) -> err_rates
err_rates %>%
  pivot_longer(cols = 1:3, names_to = "OOB", values_to = "value") -> err_rates
p3 <- ggplot(err_rates, aes(x = Tree, y = value, color = OOB)) +
  geom_line(size = 1) +
  labs(title = "The relationship between tree number and OOB",
       x = "Number of Trees", y = "Error Rate") +
  scale_color_brewer(palette = "Set2", name = "Error rate",
                     labels = c("Alive", "Death", "OOB")) +
  theme_bw()
ggsave("RFoob.png", plot = p3, width = 6, height = 4, units = 'in', dpi = 600)
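As a small optional add-on, the same err.rate matrix can be read programmatically to find where the OOB error bottoms out (this assumes randomForest's default "OOB" column name):

# Number of trees at which the OOB error is lowest
oob <- fit.rf$err.rate[, "OOB"]
which.min(oob)  # tree count with the minimum OOB error
min(oob)        # the minimum OOB error rate itself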
That completes the random forest part of feature selection for the binary classification task; next, LASSO regression and XGBoost.

LASSO regression for feature selection in the binary classification task

set.seed(123)
x <- model.matrix(Event ~ ., trainData)[, -1]
y <- as.numeric(trainData$Event) - 1
fit.lasso <- glmnet(x, y, alpha = 1, family = "binomial")
cv_lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial",
                      type.measure = "class", nfolds = 10)
# The variable-shrinkage path
plot(fit.lasso, xvar = "lambda")
# Extract the data behind plot(fit.lasso)
plot.data <- data.frame(as.matrix(fit.lasso$lambda), as.matrix(t(fit.lasso$beta)))
plot.data %>%
  rename(lambda = 'as.matrix.fit.lasso.lambda.') %>%
  pivot_longer(cols = 2:ncol(plot.data), names_to = "variable",
               values_to = "coef") -> plot.data

# Draw the figure
library(viridis)  # viridis color palette
colors <- viridis(40)
p4 <- ggplot(plot.data, aes(x = lambda, y = coef, color = variable)) +
  geom_line(size = 1, alpha = 0.8) +
  scale_color_manual(values = colors) +
  labs(title = "LASSO Regression Path", x = "lambda", y = "Coefficient") +
  theme_bw() +
  theme(legend.text = element_text(size = 6))
ggsave("lasso1.png", plot = p4, width = 8, height = 6, units = 'in', dpi = 600)
plot(cv_lasso)
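The CV curve marks the two usual candidate penalties, lambda.min and lambda.1se. Here is a short sketch (the names lasso_coefs and lasso_vars are introduced purely for illustration) of how to pull out the variables that survive the penalty at lambda.min, for use in the later intersection:

# Variables with non-zero coefficients at lambda.min
lasso_coefs <- as.matrix(coef(cv_lasso, s = cv_lasso$lambda.min))
lasso_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0]
lasso_vars <- setdiff(lasso_vars, "(Intercept)")
lasso_vars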
# Model evaluation with a ROC curve
x_test <- model.matrix(Event ~ ., testData)[, -1]
y_test <- as.numeric(testData$Event) - 1
y_pred <- predict(cv_lasso, s = cv_lasso$lambda.min, newx = x_test, type = "class")
y_prob <- predict(cv_lasso, s = cv_lasso$lambda.min, newx = x_test, type = "response")
roc_lasso <- roc(y_test, y_prob[, 1])
auc_value <- auc(roc_lasso)
p5 <- ggroc(roc_lasso, legacy.axes = TRUE, size = 1, color = "#69b3a2") +
  ggtitle("ROC curve of LASSO") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  theme_bw() +
  annotate(geom = "text", label = paste0("AUC in test set:", round(auc_value, 3)),
           x = 0.7, y = 0.05)
ggsave("lassoroc.png", plot = p5, width = 4, height = 4, units = 'in', dpi = 600)
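For the XGBoost step outlined in item 4, the following is only a minimal sketch, not the post's full code: it reuses x, y, x_test, and y_test from the LASSO step, the hyperparameter values are untuned placeholders, and the SHAP plots come from xgboost's built-in xgb.plot.shap rather than a dedicated SHAP package:

# Sketch only: XGBoost with 10-fold CV, test-set ROC, importance, and SHAP
dtrain <- xgb.DMatrix(data = x, label = y)
dtest  <- xgb.DMatrix(data = x_test, label = y_test)
params <- list(objective = "binary:logistic", eval_metric = "auc",
               max_depth = 3, eta = 0.1)  # placeholder values, not tuned

# 10-fold cross-validation to choose the number of boosting rounds
set.seed(123)
cv_xgb <- xgb.cv(params = params, data = dtrain, nrounds = 200,
                 nfold = 10, early_stopping_rounds = 20, verbose = FALSE)
fit.xgb <- xgb.train(params = params, data = dtrain,
                     nrounds = cv_xgb$best_iteration)

# Test-set ROC/AUC
xgb_prob <- predict(fit.xgb, dtest)
roc_xgb <- roc(y_test, xgb_prob)
auc(roc_xgb)

# Feature importance; keep the top-15 names for the intersection below
imp <- xgb.importance(model = fit.xgb)
xgb.plot.importance(imp, top_n = 15)
xgb_vars <- imp$Feature[1:15]

# SHAP dependence plots for the top features, straight from xgboost
xgb.plot.shap(data = x, model = fit.xgb, top_n = 4, n_col = 2)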
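Finally, for the intersection in item 5, a minimal sketch assuming the rf_vars, lasso_vars, and xgb_vars vectors built in the snippets above:

# Intersect the variables selected by the three algorithms
venn_list <- list(RF = rf_vars, LASSO = lasso_vars, XGBoost = xgb_vars)
ggvenn(venn_list)
key_vars <- Reduce(intersect, venn_list)
key_vars  # candidate key variables for downstream modeling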