##
## GWAS traits clustering based on word similarity
## Diego Garrido Martín (modified Tamara Perteghella - 29/01/2024)
##

# Command line args: stringency of the method

 library(optparse)
 
 option_list = list(make_option(c("-t", "--threshold_selected"), type="numeric", default=-1, help="Stringency of the method", metavar="numeric"))
 opt_parser = OptionParser(option_list=option_list);
 opt = parse_args(opt_parser);
 
 if (opt$threshold_selected==-1){cat("\n WARNING. Default mode: minimum stringency selected\n")}
 
 # Load data
 cat(" Loading data\n")
 library(data.table)
 gc = fread("gwas.catalog.strip.bed7")
 colnames(gc) = c("chr","start","end","rsid","score","strand","trait")
 gc$strand = rep("+", nrow(gc))
 traits = gc$trait
 

# Remove characters that might hinder the word analysis from the traits
 cat(" Parsing traits\n")
 traits.u <- traits.o <- unique(traits)
 traits.u <- gsub("\\-"," ",traits.u)
 traits.u <- gsub("\\/"," ",traits.u)
 traits.u <- gsub("\\'"," ",traits.u)
 traits.u <- gsub("\\,"," ",traits.u)
 
# Get unique words that compose the traits (lowercase)
 words <- unique(tolower(unlist(strsplit(traits.u," "))))
 
# Make blacklist (remove words that might hinder the analyses)
# Top 50 most common english words
top50 <- as.character(read.table("top50.en.txt")[,1])
# words[which(words%in%top50)]
words <- words[!words%in%top50]
 
 # Numbers 1:10^6
 numbers<-1:10^6
 words <- words[!words%in%numbers]
 
 # 1 letter words
 count <- sapply(words,nchar)
 # names(count[count==1])
 words <- words[!words%in%names(count[count==1])]

 # Common words in our dataset that might hinder the analyses (manually selected)
 M <- matrix(0, nrow=length(words), ncol=length(traits.u))
 for( i in 1:length(words) ){
   M[i,] <- as.numeric(grepl(sprintf("\\<%s\\>", words[i]), traits.u, ignore.case=T))
 }
 colnames(M)<-traits.u
 rownames(M)<-words
print("first")
 pre.bl<-sort(rowSums(M),decreasing=T)
 pre.bl<-pre.bl[pre.bl >= 10]
 blacklist <- c("levels","response","disorder","traits",
                "related","treatment","syndrome","type","function","therapy",
                "major", "color","phenotypes","")
 words <- words[!words%in%blacklist]
 
 # Compute count matrix words x traits
 cat(" Computing count matrix\n")
 M <- matrix(0,nrow=length(words),ncol=length(traits.u))
 for( i in 1:length(words)){
   M[i,]<-as.numeric(grepl(sprintf("\\<%s\\>",words[i]),traits.u,ignore.case=T))
 }
 colnames(M)<-traits.o
 rownames(M)<-words
 
 # Compute distance matrix traits x traits
 cat(" Computing distances\n")
 d <- matrix(NA,nrow=length(traits.u),ncol=length(traits.u))
 for (i in 1:(dim(M)[2]-1)){
  for (j in (i+1):dim(M)[2]){
  sentence1=M[,i]
  sentence2=M[,j]
  d[i,j]<-crossprod(sentence1, sentence2)/sqrt(crossprod(sentence1) * crossprod(sentence2))
  }
 }
 d[lower.tri(d)] <- t(d)[lower.tri(d)]
 colnames(d) <- rownames(d) <- traits.u 
 diag(d)<-0
 d[is.nan(d)]<-0
 

 # Tuning distance threshold
 suppressPackageStartupMessages(library(igraph))
 thres <-c(NULL)
 x<-c(NULL)
 y<-c(NULL)
 for (th in round(seq(0,1,0.005),digits=3)){  
   adj<-d
   adj[adj>=th]<-1
   adj[adj<th]<-0
   g<-graph_from_adjacency_matrix(adj)
   c<-components(g)
   x<-c(x,c$no)
   y<-c(y,max(c$csize))
   thres<-c(thres,th)
 }
 
 # Select minimum threshold to avoid very high cluster sizes
 p<-c(NULL)
 for (i in 1:(length(y)-1)){
   p<-c(p,y[i]-y[i+1]) # Compute "jumps" in cluster size as we increase the threshold
 }
 minth = max(thres[-1][p>x[length(x)]*0.05]) # Jump > 5%
 
 if (opt$threshold_selected==-1) {selth = minth} else{selth = opt$threshold_selected}
 
 # Diagnosis plots (min threshold)
 cat(" Generating plots\n")
 
 pdf("summary.default.pdf")
 par(mfrow=c(2,2))
 plot(x,y,xlab="nb of clusters",ylab="max cluster size",type="b",pch=20)
 plot(thres,y,xlab="threshold",ylab="max cluster size",type="b",pch=20)
 abline(h=y[thres==minth],col="red")
 abline(v=minth,col="red")
 plot(thres,x,xlab="threshold",ylab="nb of clusters",type="b",pch=20)
 abline(h=x[thres==minth],col="red")
 abline(v=minth,col="red")
 plot(thres,1-x/x[length(x)],xlab="threshold",ylab="% reduction",type="b",pch=20)
 abline(h=(1-x/x[length(x)])[thres==minth],col="red")
 abline(v=minth,col="red")
 null<-dev.off()

 
 # Diagnosis plots (selected threshold)
 if (selth != minth){
 pdf(sprintf("summary.%s.pdf",selth))
 par(mfrow=c(2,2))
 plot(x,y,xlab="nb of clusters",ylab="max cluster size",type="b",pch=20)
 plot(thres,y,xlab="threshold",ylab="max cluster size",type="b",pch=20)
 abline(h=y[thres==selth],col="blue")
 abline(v=selth,col="blue")
 plot(thres,x,xlab="threshold",ylab="nb of clusters",type="b",pch=20)
 abline(h=x[thres==selth],col="blue")
 abline(v=selth,col="blue")
 plot(thres,1-x/x[length(x)],xlab="threshold",ylab="% reduction",type="b",pch=20)
 abline(h=(1-x/x[length(x)])[thres==selth],col="blue")
 abline(v=selth,col="blue")
 null<-dev.off()
 }
 
 # Selected threshold
 
 th = selth
 adj<-d
 adj[adj>=th]<-1
 adj[adj<th]<-0
 g<-graph_from_adjacency_matrix(adj)
 c<-components(g)

 # Generate keywords
 cat(" Generating keywords\n")
 cluster.no <- 1:c$no
 keywords <- c(NULL)
 for (i in 1:c$no){
    words <- tolower(unlist(strsplit(names(c$membership[c$membership==i])," ")))
    words <- words[!words%in%top50]
    words <- words[!words%in%numbers]
    words <- words[!words%in%names(count[count==1])]
    words <- words[!words%in%blacklist]
    keywords = c(keywords, paste0(names(head(sort(table(words),decreasing=T),n=5)),collapse=","))
 }
 
 V8 <- V9 <- c(NULL)
 traits <- gc$trait
 traits <- gsub("\\-"," ",traits)
 traits <- gsub("\\/"," ",traits)
 traits <- gsub("\\'"," ",traits)
 traits <- gsub("\\,"," ",traits)

 for (i in traits){
   V8 <- c(V8, as.numeric(c$membership[i]))
   V9 <- c(V9, keywords[c$membership[i]])
 }
 gc$V8 <- V8
 gc$V9 <- V9
 
 cat(" Writing output\n")
 
 write.table(gc,file=sprintf("gwas.catalog.semantic_clustering.th%s.bed7",selth),col.names=F,row.names=F,quote=F,sep="\t")
 
 cat(" DONE!\n\n")