## ## Diego Garrido Martín ## 16/05/2017 ## Generate null set of SNPs with matched maf and distance to TSS with respect to sQTLs ## ## 0. Load libraries and parse arguments library(optparse) library(parallel) suppressPackageStartupMessages(library(data.table)) option_list = list( make_option(c("-s", "--seed"), type="numeric", default=1, help="Seed [default %default]", metavar="numeric"), make_option(c("-Q", "--sqtls"), type="character", help="sQTL file generated by assessEnrich.sh", metavar="character"), make_option(c("-N", "--nonsqtls"), type="character", help="non-sQTL file generated by assessEnrich.sh", metavar="character"), make_option(c("-p", "--permutations"), type="numeric", default=1000, help="Number of resamplings [default %default]", metavar="numeric"), make_option(c("-m", "--maf"), type="numeric", default=0.02, help="Maximum MAF deviation [default %default]", metavar="numeric"), make_option(c("-t", "--tss"), type="numeric", default=1000, help="Maximum TSS distance deviation [default %default]", metavar="numeric"), make_option(c("-c", "--cores"), type="numeric", default=1, help="Number of cores [default %default]", metavar="numeric"), make_option(c("-o", "--output"), type="character", help="Output file", metavar="character"), make_option(c("-f", "--funct"), type="character", help="Function to use: maf, tss_maf, bins_maf [default %default]", metavar="character", default = "bins_maf"), make_option(c("-w", "--wd"), type="character",default="cwd", help="Working directory [default %default]", metavar="character") ) opt_parser <- OptionParser(option_list=option_list) opt <- parse_args(opt_parser) if (is.null(opt$out)){ print_help(opt_parser) stop("Some arguments must be supplied (output file, sQTL-file, non-sQTL file)\n", call.=FALSE) } set.seed(opt$seed) sqtls_file <- opt$sqtls nonsqtls_file <- opt$nonsqtls wd <- opt$wd maf_delta <- opt$maf tss_delta <- opt$tss no_cores <- opt$cores nb.resamplings <- opt$permutations output_file <- opt$output Fun <- opt$funct ## 1. Load input files and set WD if(wd == "cwd"){setwd(getwd())} else{setwd(wd)} sqtls <- as.data.frame(fread(sqtls_file, header = FALSE)) null <- as.data.frame(fread(nonsqtls_file, header = FALSE)) colnames(sqtls) <- colnames(null) <- c("snpId", "geneId", "maf", "distance") # distance would be location in f.bins_maf ## 2. Define functions f.tss_maf <- function(sqtl, maf.delta = maf_delta, tss.delta = tss_delta){ sub.sqtls <- subset(sqtls, snpId == sqtl, select = c(maf, distance)) maf.th <- sub.sqtls$maf tss.th <- sub.sqtls$distance snps.null <- null[with(null, which( maf <= (maf.th + maf.delta) & maf >= (maf.th - maf.delta) & distance <= (tss.th + tss.delta) & distance >= (tss.th - tss.delta))), "snpId"] } f.maf <- function(sqtl, maf.delta = maf_delta){ maf.th <- subset(sqtls, snpId == sqtl)$maf snps.null <- null[with(null, which( maf <= (maf.th + maf.delta) & maf >= (maf.th - maf.delta))), "snpId"] } f.bins_maf <- function(sqtl, maf.delta = maf_delta){ sub.sqtls <- subset(sqtls, snpId == sqtl, select = c(maf, distance)) maf.th <- sub.sqtls$maf bin.th <- sub.sqtls$distance snps.null <- null[with(null, which( maf <= (maf.th + maf.delta) & maf >= (maf.th - maf.delta) & distance == bin.th)), "snpId"] } if (Fun == "tss_maf"){ f <- f.tss_maf } else if(Fun == "maf"){ f <- f.maf } else if(Fun == "bins_maf"){ f <- f.bins_maf } else { stop(sprintf("Invalid function type \"%s\"", Fun)) } ## 3. Run cl <- makeCluster(no_cores) # Initialize cluster clusterSetRNGStream(cl = cl, opt$seed) clusterExport(cl, c("sqtls", "null", "maf_delta", "tss_delta")) # Export variables set <- parLapply(cl, sqtls$snpId, f) # Run stopCluster(cl) # Finish ## 4. Results set.seed(opt$seed) null_table <- matrix(NA, nrow = length(set), ncol = nb.resamplings) for (i in 1:nb.resamplings){ null_table[, i] <- unlist(lapply(set, sample, size = 1)) } null_table <- cbind(sqtls$snpId, null_table) ## 5. Write output write.table(null_table, file = output_file, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)