## Get and parse GWAS catalog
## Tamara Perteghella
## 24/10/2023

# Download the GWAS Catalog.
# WARNING: New data is added to the GWAS Catalog and diagram on a weekly basis, with new data
# generally going public every Tuesday. Data releases include all downloadable spreadsheets.
wget --output-document gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv https://www.ebi.ac.uk/gwas/api/search/downloads/alternative

# File name conventions:
# v1.0 indicates the columns originally proposed by the NCBI
# v1.0.1 indicates the addition of ontology annotations, GWAS Catalog study accession numbers and other new columns (discontinued from 1 May 2018)
# v1.0.2 indicates the addition of ontology annotations, GWAS Catalog study accession numbers and other new columns such as genotyping technology
# eXX indicates the Ensembl release version that the data is mapped to
# rYYYY-MM-DD indicates the date on which the GWAS Catalog was released

# As of 2023-10-11, the GWAS Catalog contains 6586 publications, 555899 top associations and 65846 full summary statistics.
# GWAS Catalog data is currently mapped to Genome Assembly GRCh38.p14 and dbSNP Build 154.

# Parsing

# 1. Some studies have ;-separated lists of SNPs on the same line, corresponding to haplotypes; these are split into one line per SNP.

# Diagnose: how many lines carry ;-separated SNPs, and which fields contain ;-separated values
cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk -F"\t" '$22~/;/' | cut -f22 | wc -l
awk -F"\t" '{for (i = 1; i <= NF; ++i) {if($i ~ ";") print i}}' <(cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk -F"\t" '$22~";"') | sort | uniq -c
# Fields with ;-separated values: 12,13,15,21,22,25

# Solve: for every record, split each of those fields on ";" and print one line per haplotype element
cat gwas_catalog_v1.0.2-associations_e100_r2023-10-11.tsv | awk 'BEGIN{OFS=FS="\t"; colsI="12,13,15,21,22,25"; split(colsI, cols, ",")} \
{n=split($22, a, ";"); for(col in cols){d[cols[col]]=$(cols[col])}; for(i=1;i<=n;i++){for(col in cols){split(d[cols[col]], b, ";"); gsub(/ /, "", b[i]); $(cols[col])=b[i]} print }}' > gwas.catalog.extHap.tsv

# 2. Substitute empty fields with NAs (has to be done after step 1).
# sed is run twice because consecutive empty fields produce overlapping matches that a single pass cannot all replace.
sed 's/\t\t/\tNA\t/g' gwas.catalog.extHap.tsv | sed 's/\t\t/\tNA\t/g' > gwas.catalog.extHap.NAs.tsv

# 3. Remove SNP interaction studies
cat gwas.catalog.extHap.NAs.tsv | awk -F"\t" '$22!~/rs.+xrs.+/' > gwas.catalog.extHap.NAs.noInt.tsv

# 4. Make a BED file and:
# 4.1. Remove duplicated SNP-trait pairs
# 4.2. Remove SNP IDs without associated positions
# 4.3. Consider the latest rsID. Field 23 "MERGED" denotes whether the SNP has been merged into a subsequent rs record (0 = no; 1 = yes), in which case field 24 holds the current rsID.
cat gwas.catalog.extHap.NAs.noInt.tsv | \
awk 'BEGIN{OFS=FS="\t"} NR>1 && $13!="NA" {if($23==1 && $24!="NA"){print "chr"$12, $13-1, $13, "rs"$24, "0", "+", $8, $(NF-2)} else {print "chr"$12, $13-1, $13, $22, "0", "+", $8, $(NF-2)}}' | \
sort -Vk1,1 | uniq > gwas.catalog.efo.bed7

# Download the EFO ontology in obo format
wget http://www.ebi.ac.uk/efo/efo.obo

# Download the trait / EFO mapping
wget https://www.ebi.ac.uk/gwas/api/search/downloads/trait_mappings

# 5. Merge similar traits

# 5.1. Strip text in parentheses and square brackets and remove punctuation from the trait names
sed 's/(.*)//g' gwas.catalog.efo.bed7 > gwas.catalog.strip.bed7
sed 's/\[.*\]//g' gwas.catalog.strip.bed7 > tmp; mv tmp gwas.catalog.strip.bed7
tr -d '[:punct:]' < gwas.catalog.strip.bed7 > tmp; mv tmp gwas.catalog.strip.bed7

# 5.2. Remove duplicates (a quick check of the effect is sketched below)
sort gwas.catalog.strip.bed7 | uniq > tmp; mv tmp gwas.catalog.strip.bed7
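# Optional sanity check (not part of the original pipeline): count the distinct trait strings before and
# after the normalisation to confirm that similar traits now collapse onto the same string. This assumes
# the trait name is the 7th column of both BED files, as produced in step 4.
cut -f7 gwas.catalog.efo.bed7 | sort -u | wc -l
cut -f7 gwas.catalog.strip.bed7 | sort -u | wc -l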
# 5.3. Cluster together similar traits
module load Singularity/3.8.3
cp ../2016-11-15/top50.en.txt ./
singularity exec ~/docker_images/r4_gencode_phase3_latest.sif Rscript semantic_clustering.R -t 0.8
# The script selects the unique words that compose the traits, removes the ones that might hinder the
# grouping, and builds a words x traits matrix according to the occurrence of each word in each trait.
# It computes the cosine distance between columns (traits).
# It sets a threshold automatically; you can also provide your own with -t. For this catalog version, the
# minimum threshold that should be selected is 0.58 (the default).
# It generates a non-weighted adjacency matrix and a graph, and retrieves all groups of neighbours.
# It outputs a BED file (gwas.catalog.semantic_clustering.bed7) with 2 extra columns: the cluster to which
# the trait belongs and the keywords defining the cluster.
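# The contents of semantic_clustering.R are not kept in this log. Below is a minimal sketch of the steps
# described above, written only as an illustration. Assumptions not taken from the original: igraph is
# available in the container, the trait name is the 7th column of gwas.catalog.strip.bed7, top50.en.txt
# holds whitespace-separated stop words, -t is treated as a cosine-similarity cut-off (hard-coded to the
# 0.8 used above rather than chosen automatically), and cluster keywords are not derived. This is not the
# actual semantic_clustering.R.
cat > semantic_clustering_sketch.R <<'EOF'
library(igraph)

# Traits from the stripped BED file (column 7 assumed to hold the trait name)
bed <- read.delim("gwas.catalog.strip.bed7", header = FALSE, stringsAsFactors = FALSE)
traits <- unique(tolower(bed$V7))

# Words that might hinder the grouping (assumed to be listed in top50.en.txt)
stop_words <- scan("top50.en.txt", what = character(), quiet = TRUE)

# Words x traits occurrence matrix (rows = words, columns = traits)
tokens <- lapply(strsplit(traits, "\\s+"), setdiff, y = stop_words)
vocab  <- sort(unique(unlist(tokens)))
m      <- sapply(tokens, function(w) as.integer(vocab %in% w))

# Cosine similarity between columns (traits)
norms  <- pmax(sqrt(colSums(m^2)), 1e-9)
cosine <- crossprod(m) / outer(norms, norms)

# Threshold -> non-weighted adjacency matrix -> graph -> groups of neighbours
adj <- (cosine >= 0.8) * 1
diag(adj) <- 0
g <- graph_from_adjacency_matrix(adj, mode = "undirected")
clusters <- components(g)$membership

# One cluster id per trait; the real script also derives keywords per cluster and
# appends both as two extra columns of the BED file
head(data.frame(trait = traits, cluster = clusters))
EOF
# If desired, the sketch could be run through the same container, e.g.:
# singularity exec ~/docker_images/r4_gencode_phase3_latest.sif Rscript semantic_clustering_sketch.R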