# 1. extend the gencode not low (removes 11,000) confidence tss (from all tr biotypes) by 50bp on each side ############################################################################################################ genctss=/users/rg/sdjebali/public_html/Gencode/version19/gencode.v19.annotation_capped_sites_nr_with_confidence.gff.gz zcat $genctss | awk -v ext=50 '$16!="low"{$4=$4-ext; $5=$5+ext; $3="TSSExt"ext; print $0}' | awk '{if($4<1){$4=1}print $0}' | gff2gff > gencodetss_notlow_ext50eachside.gff # chr10 Gencode TSSExt50 100011730 100011830 . + . gene_id ENSG00000230928.1 trlist ENST00000433374.1, trbiotlist antisense, confidence not_low gene_biotype antisense # 168280 (18 fields) # 2. merge them in a stranded way ################################# awk 'BEGIN{OFS="\t"}{print $1, $4-1, $5, ".", ".", $7}' gencodetss_notlow_ext50eachside.gff | mergeBed -i stdin -s -n | awk 'BEGIN{OFS="\t"}{$6=$5; $5="."; print}' > gencodetss_notlow_ext50eachside_merged.bed # chr1 11818 11924 3 . + # 117195 (6 fields) # 3. see how many gencode tss clusters are overlapped by merged cage clusters in a stranded way ################################################################################################ # this is done by simple overlap between the cage clusters and the tss clusters ############################################################################### f5cageclus=/users/rg/sdjebali/FANTOM5/Clusters/Timo/TSS_human_strict.bed.gz zcat $f5cageclus | intersectBed -a gencodetss_notlow_ext50eachside_merged.bed -b stdin -s -c > gencodetss_notlow_ext50eachside_merged_over_TSS_human_strict.bed # chr1 11818 11924 3 . + 0 # 117195 (7 fields) # how many gencode tss are overlapped by cage clusters? ####################################################### awk -v fld=7 -f /users/rg/sdjebali/Awk/compute_prop.awk gencodetss_notlow_ext50eachside_merged_over_TSS_human_strict.bed # 117195 36932 31.5133 # 4. Actually merge the gencode tss clusters with the cage clusters (be careful format is not the same) ####################################################################################################### f5cageclus=~/FANTOM5/Clusters/Timo/TSS_human_strict.bed.gz zcat $f5cageclus > tmp cat tmp gencodetss_notlow_ext50eachside_merged.bed | awk 'BEGIN{OFS="\t"}{print $1, $2, $3, $4, $5, $6}' | mergeBed -i stdin -s -n | awk 'BEGIN{OFS="\t"}{$7=$6; $6=$5; $5="."; print}' > TSS_human_strict_gencodetss_notlow_ext50eachside_merged.bed # chr1 11818 11924 1 . + # 247680 (6 fields) # 5. and then I need to add two types of info ############################################## # 1. annotation class ##################### # Gencode_Only # Cage_Only # Gencode_Cage # 2. gencode tss and genes each big cluster comes from ###################################################### # 1. Annotation class ##################### intersectBed -a TSS_human_strict_gencodetss_notlow_ext50eachside_merged.bed -b gencodetss_notlow_ext50eachside_merged.bed -s -wa -c > TSS_human_strict_gencodetss_notlow_ext50eachside_merged_overgencodenb.bed # chr1 11818 11924 1 . + 1 # 247680 (7 fields) f5cageclus=~/FANTOM5/Clusters/Timo/TSS_human_strict.bed.gz zcat $f5cageclus | intersectBed -a TSS_human_strict_gencodetss_notlow_ext50eachside_merged_overgencodenb.bed -b stdin -s -wa -c > TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.bed # chr1 11818 11924 1 . + 1 0 # 247680 (8 fields) # number of gencode tss clusters in a big cluster and same for the cage tss clusters #################################################################################### stats.sh TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.bed 7 # Read 247680 items # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.0000 0.0000 0.0000 0.4732 1.0000 3.0000 stats.sh TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.bed 8 # Read 247680 items # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.0000 0.0000 1.0000 0.8784 1.0000 13.0000 # Make one big gff file out of them with annotation class inside ################################################################ awk '{print $1, "rikcrg", "tss", $2+1, $3, ".", $6, ".", "class:", (($7!=0) ? (($8!=0) ? "GencCAGE" : "GencOnly") : "CAGEOnly")}' TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.bed | gff2gff > TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.gff # chr1 rikcrg tss 11819 11924 . + . class: GencOnly # 247680 (10 fields) awk '{print $NF}' TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.gff | sort | uniq -c # 130776 CAGEOnly # 36711 GencCAGE # 80193 GencOnly # 2. Add the info of genctss and gene to all big clusters ######################################################### # Overlap to get tss coord ########################## overlap TSS_human_strict_gencodetss_notlow_ext50eachside_merged_over_gencodetssclusters_nb_over_cagetssclusters_nb.gff gencodetss_notlow_ext50eachside.gff -st 1 -m -1 -i 2 -f genctssclus -v | awk '{$(NF-3)=""; $(NF-2)=""; print}' | gff2gff > TSS_human_strict_with_gencodetss_notlow_ext50eachside_merged_withgenctsscoord.gff # chr1 rikcrg tss 11819 11924 . + . class: GencOnly list_genctssclus: chr1_11819_11919_+,chr1_11822_11922_+,chr1_11824_11924_+, # 247680 (12 fields) # Overlap to get genes of tss ############################# overlap TSS_human_strict_with_gencodetss_notlow_ext50eachside_merged_withgenctsscoord.gff gencodetss_notlow_ext50eachside.gff -st 1 -m 10 -i 2 -f genctssclus -nr -v | awk '{$(NF-3)=""; $(NF-2)=""; print}' | gff2gff > TSS_human_strict_with_gencodetss_notlow_ext50eachside_merged_withgenctsscoord_andgnlist.gff # chr1 rikcrg tss 11819 11924 . + . class: GencOnly list_genctssclus: chr1_11819_11919_+,chr1_11822_11922_+,chr1_11824_11924_+, list_genctssclus: ENSG00000223972.4, # 247680 (14 fields) # Clean ####### rm TSS_human_strict_with_gencodetss_notlow_ext50eachside_merged_withgenctsscoord.gff