前几期几乎都是以文献分享为主,这一期直接一点,跟大家分享一下同时跑多个变量和多个结局的代码,拿来就能用的那种~
# if (!require("R.utils", quietly = TRUE)) install.packages("R.utils")
#
# devtools::install_github("jrs95/hyprcoloc", build_opts = c("--resave-data", "--no-manual"), build_vignettes = TRUE)
# browseVignettes("hyprcoloc")
# install.packages("gwasvcf")
# devtools::install_github("mrcieu/gwasglue") #needed for getting coloc data from opengwas
# devtools::install_github("jrs95/gassocplot")
# devtools::install_github('bar-woolf/MRSamePopTest')
# devtools::install_github("bar-woolf/TwoStepCisMR")
# devtools::install_github("slowkow/ggrepel")
rm(list = ls())
# 加载包 ---------------------------------------------------------------------
if (T) {
library(data.table)
library(coloc)
library(TwoSampleMR)
library(ieugwasr)
library(ggplot2)
library(ggrepel)
# library(gwasglue)
# library(gassocplot)
# library(MRSamePopTest)
# library(TwoStepCisMR)
library(TwoSampleMR)
library(dplyr)
library(stringr)
library(tidyverse)
}
这里我用了一篇文章的补充材料提供的暴露作为示例Phenome-wide Mendelian randomisation analysis of 378,142 cases reveals risk factors for eight common cancers | Nature Communications
“https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-024-46927-z/MediaObjects/41467_2024_46927_MOESM5_ESM.xlsx
source("step1_lib.R")
这里包括了将近4000个trait,所以我用了循环
options(ieugwasr_api = 'gwas-api.mrcieu.ac.uk/')
# 如果出现以下报错:
# (Error in r$status_code : $ operator is invalid for atomic vectors)策略:
# 删除R包
# remove.packages("ieugwasr")
# 重新安装一下
# remotes::install_github("mrcieu/ieugwasr", force = TRUE)
library(ieugwasr)
library(doParallel)
library(foreach)
Phenome <- readxl::read_xlsx("./risk factors for eight common cancers.xlsx",sheet = "Ovarian")
exp <- Phenome$`Trait ID`
result_list <- list() # Create an empty list to store the results
# 定义每次循环处理的子集大小
subset_size <- 100
# 计算需要的循环次数
num_iterations <- ceiling(length(exp) / subset_size)
# 遍历每个子集
for (i in 1:num_iterations) {
# i=38
start_index <- (i - 1) * subset_size + 1 # 子集的起始索引
end_index <- min(i * subset_size, length(exp)) # 子集的结束索引
subset <- exp[start_index:end_index] # 提取子集进行处理
# 在此处进行子集处理
for (j in subset) {
print(j)
exp_gwas <- tryCatch(
TwoSampleMR::extract_instruments(outcomes = j, p1 = 5e-08, clump = TRUE,
p2 = 5e-08, r2 = 0.01, kb = 10000,
access_token = ieugwasr::check_access_token(),
force_server = FALSE),
error = function(e) {
message(paste("Error occurred for GWAS:", j))
NULL # Return NULL when an error occurs
}
)
if (!is.null(exp_gwas)) {
result_list[[j]] <- exp_gwas
}
}
}
# 打印结果列表
print(names(result_list))
save(result_list,file = "phegwas_result_list.Rdata")
这样就获取了多个变量的IVs并保存为一个list,后续也可以用这个数据去和多种结局一一尝试。
这一步是针对gwas catalog的数据,当然也可以根据自己的数据来修改
source("step1_lib.R")
dir <- "./tmp"
dat_38 <- list.files(dir,pattern = "38.tsv.gz")
newfile <- str_split(dat_38,"_",simplify = T)[,1]
# hg19
# [1] "chromosome" "variant_id"
# [3] "base_pair_location" "effect_allele"
# [5] "other_allele" "N"
# [7] "effect_allele_frequency" "T"
# [9] "SE_T" "P_noSPA"
# [11] "beta" "standard_error"
# [13] "p_value" "CONVERGE"
# hg38
# [1] "Name" "chromosome"
# [3] "base_pair_location" "other_allele"
# [5] "effect_allele" "Trait"
# [7] "Cohort" "Model"
# [9] "odds_ratio" "ci_lower"
# [11] "ci_upper" "p_value"
# [13] "effect_allele_frequency" "standard_error"
dir.create("./tmp/dat_38/")
for (i in dat_38) {
f= file.path(dir,i)
print(f)
gwas <- fread(f,data.table = F)
gwas[1:4,1:4]
gwas$Effect <- log(gwas$odds_ratio)
setnames(gwas,old = c("chromosome","base_pair_location","other_allele","effect_allele","effect_allele_frequency"),new = c("CHR","BP","A1","A2","EAF"))
newfile <- str_split(i,"_",simplify = T)[,1]
# 判断数据框中A1或A2列中是否包含指定字符串
if (any(apply(gwas[, c("A1", "A2")], 2, function(col) any(col %in% c("A", "G", "C", "T"))))) {
# 如果包含指定字符串,则执行相应的操作
reformatted2 <- MungeSumstats::format_sumstats(gwas,
ref_genome = "GRCh38",
convert_ref_genome = "GRCh37",
nThread = 2,
save_path =paste0("./tmp/dat_38/",newfile,".tsv.gz")) # 在这里你可以执行其他的操作
} else {
# 如果不包含指定字符串,则打印"条件不符合"
print(paste("uncorrect A1&A2:",i))
}
}
这一步的目的是根据CHR+POS+A1|A2获取RSID,如果你的数据已经有了rsid,就跳过这一步。
不同的数据对应不用的列名,这里因为我用MungeSumstats包以后,列名发生了变化;如果你也使用了这个包,那么列名下面的应该是对应的。如果是其他数据,在运行下面代码之前需要用colnames(dat)检查一下自己的列名,对应上去就好。
outcome <- read_outcome_data("你的文件夹",
sep = "\t",
snps = exposure$SNP,
snp_col = "SNP",
effect_allele_col = "A2",
other_allele_col = "A1",
pval_col = "P",
beta_col = "Effect",
se_col = "SE",
eaf_col = "FRQ",
chr_col = "CHR")
source("step1_lib.R")
load("phegwas_result_list.Rdata")
dir.create("./mr_outputs")
for (i in 1:length(result_list)) {
exposure <- result_list[[i]]
##outcome data
dir <- "./tmp/dat_38/"
out <- list.files(dir,pattern = "38.tsv.gz")
for (j in 1:length(out)) {
outdat <- out[j]
print(outdat)
expname <- unique(exposure$id.exposure)
outname <- trimws(substr(outdat,1,12))
outcome<-tryCatch(
outcome <- read_outcome_data(file.path(dir,outdat),
sep = "\t",
snps = exposure$SNP,
snp_col = "variant_id",
effect_allele_col = "effect_allele",
other_allele_col = "other_allele",
pval_col = "p_value",
beta_col = "beta",
se_col = "standard_error",
eaf_col = "effect_allele_frequency",
chr_col = "chromosome"),
error = function(e) {
message(paste("Error occurred for outcome:", outname))
NULL # Return NULL when an error occurs
}
)
if (!is.null(outcome)) {
dat<-harmonise_data(exposure,outcome)
mr.methods=c("mr_wald_ratio", "mr_ivw", "mr_ivw_fe", "mr_weighted_median", "mr_weighted_mode", "mr_egger_regression")
mr.res <- mr(dat, method_list=mr.methods)
threshold <- 0.05 ##定义p值范围
target_column <- "pval"
if (any(mr.res[[target_column]] < threshold)) {
# 如果存在小于阈值的元素,则执行相应的操作
print(paste("At least one value in column", target_column, "is less than", threshold))
# 在这里你可以执行其他的操作
ors <- generate_odds_ratios(mr.res)
write.csv(ors, paste0("./mr_outputs/",expname,"_",outname,"_mr_results.csv"))
}
}
}
}
这一步,不仅运行了mr,并且将任意一种mr方法的p值小于0.05的结果写为csv文件。
这里用了两个for循环,目的是分析多个暴露和多个结局的相关性。如果是一对多,那就把i对应的循环拿掉;如果是多对一,那就把j对应的循环拿掉即可。遇到问题欢迎后台留言~
ps:后续可以根据阳性结果再进行敏感性分析和MVMR等等更深入的分析
微信扫一扫
关注该公众号