## ## GWAS traits clustering based on word similarity ## Diego Garrido Martín (modified Tamara Perteghella - 29/01/2024) ## # Command line args: stringency of the method library(optparse) option_list = list(make_option(c("-t", "--threshold_selected"), type="numeric", default=-1, help="Stringency of the method", metavar="numeric")) opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); if (opt$threshold_selected==-1){cat("\n WARNING. Default mode: minimum stringency selected\n")} # Load data cat(" Loading data\n") library(data.table) gc = fread("gwas.catalog.strip.bed7") colnames(gc) = c("chr","start","end","rsid","score","strand","trait") gc$strand = rep("+", nrow(gc)) traits = gc$trait # Remove characters that might hinder the word analysis from the traits cat(" Parsing traits\n") traits.u <- traits.o <- unique(traits) traits.u <- gsub("\\-"," ",traits.u) traits.u <- gsub("\\/"," ",traits.u) traits.u <- gsub("\\'"," ",traits.u) traits.u <- gsub("\\,"," ",traits.u) # Get unique words that compose the traits (lowercase) words <- unique(tolower(unlist(strsplit(traits.u," ")))) # Make blacklist (remove words that might hinder the analyses) # Top 50 most common english words top50 <- as.character(read.table("top50.en.txt")[,1]) # words[which(words%in%top50)] words <- words[!words%in%top50] # Numbers 1:10^6 numbers<-1:10^6 words <- words[!words%in%numbers] # 1 letter words count <- sapply(words,nchar) # names(count[count==1]) words <- words[!words%in%names(count[count==1])] # Common words in our dataset that might hinder the analyses (manually selected) M <- matrix(0, nrow=length(words), ncol=length(traits.u)) for( i in 1:length(words) ){ M[i,] <- as.numeric(grepl(sprintf("\\<%s\\>", words[i]), traits.u, ignore.case=T)) } colnames(M)<-traits.u rownames(M)<-words print("first") pre.bl<-sort(rowSums(M),decreasing=T) pre.bl<-pre.bl[pre.bl >= 10] blacklist <- c("levels","response","disorder","traits", "related","treatment","syndrome","type","function","therapy", "major", "color","phenotypes","") words <- words[!words%in%blacklist] # Compute count matrix words x traits cat(" Computing count matrix\n") M <- matrix(0,nrow=length(words),ncol=length(traits.u)) for( i in 1:length(words)){ M[i,]<-as.numeric(grepl(sprintf("\\<%s\\>",words[i]),traits.u,ignore.case=T)) } colnames(M)<-traits.o rownames(M)<-words # Compute distance matrix traits x traits cat(" Computing distances\n") d <- matrix(NA,nrow=length(traits.u),ncol=length(traits.u)) for (i in 1:(dim(M)[2]-1)){ for (j in (i+1):dim(M)[2]){ sentence1=M[,i] sentence2=M[,j] d[i,j]<-crossprod(sentence1, sentence2)/sqrt(crossprod(sentence1) * crossprod(sentence2)) } } d[lower.tri(d)] <- t(d)[lower.tri(d)] colnames(d) <- rownames(d) <- traits.u diag(d)<-0 d[is.nan(d)]<-0 # Tuning distance threshold suppressPackageStartupMessages(library(igraph)) thres <-c(NULL) x<-c(NULL) y<-c(NULL) for (th in round(seq(0,1,0.005),digits=3)){ adj<-d adj[adj>=th]<-1 adj[adjx[length(x)]*0.05]) # Jump > 5% if (opt$threshold_selected==-1) {selth = minth} else{selth = opt$threshold_selected} # Diagnosis plots (min threshold) cat(" Generating plots\n") pdf("summary.default.pdf") par(mfrow=c(2,2)) plot(x,y,xlab="nb of clusters",ylab="max cluster size",type="b",pch=20) plot(thres,y,xlab="threshold",ylab="max cluster size",type="b",pch=20) abline(h=y[thres==minth],col="red") abline(v=minth,col="red") plot(thres,x,xlab="threshold",ylab="nb of clusters",type="b",pch=20) abline(h=x[thres==minth],col="red") abline(v=minth,col="red") plot(thres,1-x/x[length(x)],xlab="threshold",ylab="% reduction",type="b",pch=20) abline(h=(1-x/x[length(x)])[thres==minth],col="red") abline(v=minth,col="red") null<-dev.off() # Diagnosis plots (selected threshold) if (selth != minth){ pdf(sprintf("summary.%s.pdf",selth)) par(mfrow=c(2,2)) plot(x,y,xlab="nb of clusters",ylab="max cluster size",type="b",pch=20) plot(thres,y,xlab="threshold",ylab="max cluster size",type="b",pch=20) abline(h=y[thres==selth],col="blue") abline(v=selth,col="blue") plot(thres,x,xlab="threshold",ylab="nb of clusters",type="b",pch=20) abline(h=x[thres==selth],col="blue") abline(v=selth,col="blue") plot(thres,1-x/x[length(x)],xlab="threshold",ylab="% reduction",type="b",pch=20) abline(h=(1-x/x[length(x)])[thres==selth],col="blue") abline(v=selth,col="blue") null<-dev.off() } # Selected threshold th = selth adj<-d adj[adj>=th]<-1 adj[adj