#!/usr/bin/python __author__="BU" """This script contains only one fucntion calles get_sj, which selects all possible splice junctions from given GENCODE annotation file and corresponding ENSEMBL transcript ID""" # Usage python get_sj.py -i input.gtf -o sjfile.txt from optparse import OptionParser import gzip from os import sys import getopt # =============== ## OPTION PARSING # =============== parser = OptionParser() parser.add_option("-i", help="The input format is sorted gtf file. No stdin") parser.add_option('-o', help='Output tab-sep two column file with tx and sj') options, args = parser.parse_args() # Store input and output file infile='' outfile='' # Read command line args myopts, args = getopt.getopt(sys.argv[1:],"i:o:") # Loop over arguments and assign input to infile and output to outfile # o == option # a == argument passed to the o if myopts == []: print("Usage: %s -i input -o output" % sys.argv[0]) for o, a in myopts: if o == '-i': infile=a if o == '-o': outfile=a print ("Input file : %s \nOutput file: %s" % (infile,outfile) ) # Body of GET_SJ function def get_sj(): exons_trans={} sj={} starts=[] stops=[] gtf=open(infile) for line in gtf: line_split = line.strip().split('\t') #print line_split record_type = line_split[2] if record_type.startswith('exon'): chrom=line_split[0] #print 'CHROM', chrom start=line_split[3] #print 'START', start stop=line_split[4] #print 'STOP', stop strand=line_split[6] #print 'STR', strand #print "element_set", line_split[9] element_set = line_split[8].strip().split(";") for element in element_set: if element.strip().split(" ")[0].startswith('transcript_id'): trans_id=element.strip().split('"')[1] #print 'TRANS ID', trans_id if trans_id not in exons_trans: exons_trans[trans_id]=[chrom+'_'+start+'_'+stop+'_'+strand] else: exons_trans[trans_id].append(chrom+'_'+start+'_'+stop+'_'+strand) #print 'RESULTS', trans_id, exons_trans[trans_id], '\n' #print len(exons_trans.keys()) for key in exons_trans: #print key exon_list=exons_trans[key] if len(exon_list) >= 2: #print key #print 'LEN', len(exon_list) for sth in exons_trans[key]: #print "STH", sth chr_tmp=sth.split("_")[0] start_tmp=int(sth.split("_")[1]) stop_tmp=int(sth.split("_")[2]) strand_tmp=sth.split("_")[3] starts.append(start_tmp) stops.append(stop_tmp) #print 'EXON LIST', exon_list, '\n' for i in range(0,(len(exon_list)-1)): #print "i", i chr_nb=chr_tmp stop1=str(sorted(stops)[i]) start2=str(sorted(starts)[i+1]) sj_strand=strand_tmp if key not in sj: sj[key]=[chr_nb+'_'+stop1+'_'+start2+'_'+sj_strand] else: sj[key].append(chr_nb+'_'+stop1+'_'+start2+'_'+sj_strand) #print 'FINAL', '\n', key, sj[key] #print len(sj.keys()) starts=[] stops=[] f=open(outfile, 'w') for k in sj: for values in sj[k]: f.write('%s\t' % k) f.write('%s\n' % values) f.close() get_sj()