## Get and parse GWAS catalog
## Tamara Perteghella
## 24/10/2023

# Download the GWAS Catalog.
# WARNING: New data is added to the GWAS Catalog and diagram on a weekly basis, with new data
# generally going public every Tuesday. Data releases include all downloadable spreadsheets.
wget --output-document gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv https://www.ebi.ac.uk/gwas/api/search/downloads/alternative

# File name conventions:
# v1.0 indicates the columns originally proposed by the NCBI
# v1.0.1 indicates the addition of ontology annotations, GWAS Catalog study accession numbers and other new columns (discontinued from 1 May 2018)
# v1.0.2 indicates the addition of ontology annotations, GWAS Catalog study accession numbers and other new columns such as genotyping technology
# eXX indicates the Ensembl release version that the data is mapped to
# rYYYY-MM-DD indicates the date on which the GWAS Catalog was released

# As of 2023-10-11, the GWAS Catalog contains 6586 publications, 555899 top associations and 65846 full summary statistics.
# GWAS Catalog data is currently mapped to Genome Assembly GRCh38.p14 and dbSNP Build 154.

# Parsing

# 1. Some studies have ;-separated lists of SNPs on the same line, corresponding to haplotypes; these are split into one line per SNP.

# Diagnose: how many lines carry ;-separated SNPs, and which fields contain ;-separated values
cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk -F"\t" '$22~/;/' | cut -f22 | wc -l
awk -F"\t" '{for (i = 1; i <= NF; ++i) {if($i ~ ";") print i}}' <(cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk -F"\t" '$22~";"') | sort | uniq -c
# Fields with ;-separated values: 12,13,15,21,22,25

# Solve: for every record, split each of those fields on ";" and print one line per haplotype element
cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk 'BEGIN{OFS=FS="\t"; colsI="12,13,15,21,22,25"; split(colsI, cols, ",")} \
{n=split($22, a, ";"); for(col in cols){d[cols[col]]=$(cols[col])}; for(i=1;i<=n;i++){for(col in cols){split(d[cols[col]], b, ";"); gsub(/ /, "", b[i]); $(cols[col])=b[i]} print }}' > gwas.catalog.extHap.tsv

# 2. Substitute empty fields with NAs (has to be done after step 1).
# sed is run twice because consecutive empty fields produce overlapping matches that a single pass cannot all replace.
sed 's/\t\t/\tNA\t/g' gwas.catalog.extHap.tsv | sed 's/\t\t/\tNA\t/g' > gwas.catalog.extHap.NAs.tsv

# 3. Remove SNP interaction studies
cat gwas.catalog.extHap.NAs.tsv | awk -F"\t" '$22!~/rs.+xrs.+/' > gwas.catalog.extHap.NAs.noInt.tsv

# 4. Make a BED file and:
# 4.1. Remove duplicated SNP-trait pairs
# 4.2. Remove SNP IDs without associated positions
# 4.3. Consider the latest rsID. Field 23 "MERGED" denotes whether the SNP has been merged into a subsequent rs record (0 = no; 1 = yes), in which case field 24 holds the current rsID.
cat gwas.catalog.extHap.NAs.noInt.tsv | \
awk 'BEGIN{OFS=FS="\t"} NR>1 && $13!="NA" {if($23==1 && $24!="NA"){print "chr"$12, $13-1, $13, "rs"$24, "0", "+", $8, $(NF-2)} else {print "chr"$12, $13-1, $13, $22, "0", "+", $8, $(NF-2)}}' | \
sort -Vk1,1 | uniq > gwas.catalog.efo.bed7

# Download the EFO ontology in obo format
wget http://www.ebi.ac.uk/efo/efo.obo

# Download the trait / EFO mapping
wget https://www.ebi.ac.uk/gwas/api/search/downloads/trait_mappings

# 5. Merge similar traits

# 5.1. Strip text in parentheses and square brackets and remove punctuation from the trait names
sed 's/(.*)//g' gwas.catalog.efo.bed7 > gwas.catalog.strip.bed7
sed 's/\[.*\]//g' gwas.catalog.strip.bed7 > tmp; mv tmp gwas.catalog.strip.bed7
tr -d '[:punct:]' < gwas.catalog.strip.bed7 > tmp; mv tmp gwas.catalog.strip.bed7

# 5.2. Remove duplicates (a quick check of the effect is sketched below)
sort gwas.catalog.strip.bed7 | uniq > tmp; mv tmp gwas.catalog.strip.bed7
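# Optional sanity check (not part of the original pipeline): count the distinct trait strings before and
# after the normalisation to confirm that similar traits now collapse onto the same string. This assumes
# the trait name is the 7th column of both BED files, as produced in step 4.
cut -f7 gwas.catalog.efo.bed7 | sort -u | wc -l
cut -f7 gwas.catalog.strip.bed7 | sort -u | wc -l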
# 5.3. Cluster together similar traits
module load Singularity/3.8.3
cp ../2016-11-15/top50.en.txt ./
singularity exec ~/docker_images/r4_gencode_phase3_latest.sif Rscript semantic_clustering.R -t 0.8
# The script selects the unique words that compose the traits, removes the ones that might hinder the
# grouping, and builds a words x traits matrix according to the occurrence of each word in each trait.
# It computes the cosine distance between columns (traits).
# It sets a threshold automatically; you can also provide your own with -t. For this catalog version, the
# minimum threshold that should be selected is 0.58 (the default).
# It generates a non-weighted adjacency matrix and a graph, and retrieves all groups of neighbours.
# It outputs a BED file (gwas.catalog.semantic_clustering.bed7) with 2 extra columns: the cluster to which
# the trait belongs and the keywords defining the cluster.
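# The contents of semantic_clustering.R are not kept in this log. Below is a minimal sketch of the steps
# described above, written only as an illustration. Assumptions not taken from the original: igraph is
# available in the container, the trait name is the 7th column of gwas.catalog.strip.bed7, top50.en.txt
# holds whitespace-separated stop words, -t is treated as a cosine-similarity cut-off (hard-coded to the
# 0.8 used above rather than chosen automatically), and cluster keywords are not derived. This is not the
# actual semantic_clustering.R.
cat > semantic_clustering_sketch.R <<'EOF'
library(igraph)

# Traits from the stripped BED file (column 7 assumed to hold the trait name)
bed <- read.delim("gwas.catalog.strip.bed7", header = FALSE, stringsAsFactors = FALSE)
traits <- unique(tolower(bed$V7))

# Words that might hinder the grouping (assumed to be listed in top50.en.txt)
stop_words <- scan("top50.en.txt", what = character(), quiet = TRUE)

# Words x traits occurrence matrix (rows = words, columns = traits)
tokens <- lapply(strsplit(traits, "\\s+"), setdiff, y = stop_words)
vocab  <- sort(unique(unlist(tokens)))
m      <- sapply(tokens, function(w) as.integer(vocab %in% w))

# Cosine similarity between columns (traits)
norms  <- pmax(sqrt(colSums(m^2)), 1e-9)
cosine <- crossprod(m) / outer(norms, norms)

# Threshold -> non-weighted adjacency matrix -> graph -> groups of neighbours
adj <- (cosine >= 0.8) * 1
diag(adj) <- 0
g <- graph_from_adjacency_matrix(adj, mode = "undirected")
clusters <- components(g)$membership

# One cluster id per trait; the real script also derives keywords per cluster and
# appends both as two extra columns of the BED file
head(data.frame(trait = traits, cluster = clusters))
EOF
# If desired, the sketch could be run through the same container, e.g.:
# singularity exec ~/docker_images/r4_gencode_phase3_latest.sif Rscript semantic_clustering_sketch.R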