#!/usr/bin/python __author__="BU" """How many splice juntion of a given gene/transcript have been supported by RNA-Seq""" # Usage python get_sj.py -i input.gtf -o sjfile.txt from os import sys import getopt from optparse import OptionParser # =============== ## OPTION PARSING # =============== parser = OptionParser() parser.add_option("-i", help="Tab separated file, contatining all transcript ids and splice junctions for given gtf.No stdin") parser.add_option("-r", help="File containing supported splice junctions") parser.add_option('-a',help="Annotation (gtf file). Preferebly containing only exon records. It is recommended to use the same file, as for geting inputfile") parser.add_option('-t', help='Output file for support on transcript level') parser.add_option('-g',help='Output file for support on gene level') options, args = parser.parse_args() # Store input and output file infile='' outtrans='' outgenes='' reffile='' mode='' annot='' # Read command line args myopts, args = getopt.getopt(sys.argv[1:],"i:t:g:a:m:r:") # Loop over arguments and assign input to infile and output to outfile # o == option # a == argument passed to the o if myopts == []: print("Usage: %s -i input -o output" % sys.argv[0]) for o, a in myopts: if o == '-i': infile=a if o == '-t': outtrans=a if o == '-g': outgenes=a if o == '-a': annot=a if o == '-r': reffile=a print ("Input file : %s \nOutput file for trans: %s \nOutput file for genes: %s \nAnnotation: %s \nValidated sj file: %s" % (infile,outtrans, outgenes, annot, reffile)) def get_support(): trans_sj={} supported=[] validated={} checkout={} gene_trans={} gene_val={} supgenes={} # -------------------------------------------- # Transcript level # -------------------------------------------- input_file=open(infile, 'r') for line in input_file: line_split=line.split('\t') #print line_split trans_id=line_split[0] sj=line_split[1].split('\n')[0] #print 'sj', sj if trans_id not in trans_sj: trans_sj[trans_id]=[sj] else: trans_sj[trans_id].append(sj) #print trans_id, trans_sj[trans_id] supp_sj=open(reffile, 'r') for lines in supp_sj: #print "SUP", lines.split('\n')[0] supported.append(lines.split('\n')[0]) for key in trans_sj: #print trans_sj[key] for value in trans_sj[key]: #print 'VALUE', value if value in supported: if key not in validated: validated[key]=[value] else: validated[key].append(value) for kyz in trans_sj: if kyz in validated: if len(validated[kyz])==len(trans_sj[kyz]): checkout[kyz]='VALIDATED' else: checkout[kyz]='NOT_VALIDATED' else: checkout[kyz]='NOT_VALIDATED' #print kyz, checkout[kyz] fil=open(outtrans, 'w') for x in checkout: fil.write('%s\t' % x) fil.write('%s\n' % checkout[x]) fil.close() # ----------------------------------------------- # GENE LEVEL # ----------------------------------------------- annotation=open(annot, 'r') for record in annotation: record_split=record.split('\t') cigar=record_split[8].split(';') #print cigar for elem in cigar: if elem.startswith('gene_id'): gene_id=elem.split('"')[1].split(".")[0] #print gene_id if elem.startswith(' transcript_id'): trans_annot=elem.split('"')[1].split(".")[0] #print trans_id if gene_id not in gene_trans: gene_trans[gene_id]=[trans_annot] else: gene_trans[gene_id].append(trans_annot) for keys in gene_trans: #print set(gene_trans[keys]) for item in set(gene_trans[keys]): #print item if item in checkout: if checkout[item] == 'VALIDATED': if keys not in gene_val: gene_val[keys]=['VALIDATED'] else: gene_val[keys].append('VALIDATED') for k in gene_trans: if k in gene_val: if len(set(gene_trans[k])) == len(gene_val[k]): supgenes[k]='VALIDATED' else: supgenes[k]='NOT_VALIDATED' else: supgenes[k]='NOT_VALIDATED' #print k, supgenes[k] f=open(outgenes, 'w') for z in supgenes: f.write('%s\t' % z) f.write('%s\n' % supgenes[z]) f.close() get_support()