## ## GWAS traits clustering based on word similarity ## Diego Garrido MartÃn (modified Tamara Perteghella - 29/01/2024) ## # Command line args: stringency of the method library(optparse) option_list = list(make_option(c("-t", "--threshold_selected"), type="numeric", default=-1, help="Stringency of the method", metavar="numeric")) opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); if (opt$threshold_selected==-1){cat("\n WARNING. Default mode: minimum stringency selected\n")} # Load data cat(" Loading data\n") library(data.table) gc = fread("gwas.catalog.strip.bed7") colnames(gc) = c("chr","start","end","rsid","score","strand","trait") gc$strand = rep("+", nrow(gc)) traits = gc$trait # Remove characters that might hinder the word analysis from the traits cat(" Parsing traits\n") traits.u <- traits.o <- unique(traits) traits.u <- gsub("\\-"," ",traits.u) traits.u <- gsub("\\/"," ",traits.u) traits.u <- gsub("\\'"," ",traits.u) traits.u <- gsub("\\,"," ",traits.u) # Get unique words that compose the traits (lowercase) words <- unique(tolower(unlist(strsplit(traits.u," ")))) # Make blacklist (remove words that might hinder the analyses) # Top 50 most common english words top50 <- as.character(read.table("top50.en.txt")[,1]) # words[which(words%in%top50)] words <- words[!words%in%top50] # Numbers 1:10^6 numbers<-1:10^6 words <- words[!words%in%numbers] # 1 letter words count <- sapply(words,nchar) # names(count[count==1]) words <- words[!words%in%names(count[count==1])] # Common words in our dataset that might hinder the analyses (manually selected) M <- matrix(0, nrow=length(words), ncol=length(traits.u)) for( i in 1:length(words) ){ M[i,] <- as.numeric(grepl(sprintf("\\<%s\\>", words[i]), traits.u, ignore.case=T)) } colnames(M)<-traits.u rownames(M)<-words print("first") pre.bl<-sort(rowSums(M),decreasing=T) pre.bl<-pre.bl[pre.bl >= 10] blacklist <- c("levels","response","disorder","traits", "related","treatment","syndrome","type","function","therapy", "major", "color","phenotypes","") words <- words[!words%in%blacklist] # Compute count matrix words x traits cat(" Computing count matrix\n") M <- matrix(0,nrow=length(words),ncol=length(traits.u)) for( i in 1:length(words)){ M[i,]<-as.numeric(grepl(sprintf("\\<%s\\>",words[i]),traits.u,ignore.case=T)) } colnames(M)<-traits.o rownames(M)<-words # Compute distance matrix traits x traits cat(" Computing distances\n") d <- matrix(NA,nrow=length(traits.u),ncol=length(traits.u)) for (i in 1:(dim(M)[2]-1)){ for (j in (i+1):dim(M)[2]){ sentence1=M[,i] sentence2=M[,j] d[i,j]<-crossprod(sentence1, sentence2)/sqrt(crossprod(sentence1) * crossprod(sentence2)) } } d[lower.tri(d)] <- t(d)[lower.tri(d)] colnames(d) <- rownames(d) <- traits.u diag(d)<-0 d[is.nan(d)]<-0 # Tuning distance threshold suppressPackageStartupMessages(library(igraph)) thres <-c(NULL) x<-c(NULL) y<-c(NULL) for (th in round(seq(0,1,0.005),digits=3)){ adj<-d adj[adj>=th]<-1 adj[adj