ó 0Ç]c @ s2dZddlmZmZddlZddlZddlZddlZddlm Z ddl m Z ddl m Z mZejdZejdZejd Zejd Ze eƒZeeƒZe eƒZed kròejZn eeƒZejƒZx\eD]TZejƒZejd ƒZe eƒedd kredZ!e"ed ƒdZ#e"ed ƒdZ$edZ%e#dZ&e$dZ'e%dkrïee!e#d e&d !j(j)ƒZ*ee!e$de'd !j(j)ƒZ+nle%d krNee!e$d e'd !j,j-j(j)ƒZ*ee!e#d e&d!j,j-j(j)ƒZ+n ej.dƒe e*ƒe e+ƒe*d d!Z/e+dd!Z0e e/de0ƒe/dkrDe0dkrDe1e*ƒdkrDe1e+ƒdkrDe*d Z2e*dZ3e+d Z4e+dZ5e e2de3ƒe e4de5ƒee2e3ƒZ6ee4e5ƒZ7n dZ7dZ6e e7de6ƒqqWde8fd„ƒYZ de8fd„ƒYZd„Z9dZ:ej;e:ƒZ<d&d'd(d)d*d+d,d-d.g Z=ge=D]\Z>Z?d e?e>d^qÛZ@geAe1e@ƒƒD]ZBeCe@eB ƒ^qZDdS(/s ScoreSpliceCandidate.py Given the context of a GT or AG in a string of DNA bases and precomputed coefficients files, return a score that indicates how likely it is to be a splice donor or acceptor, respectively, using the maximum entropy method [1]. Usage: create a DonorPredictor or AcceptorPredictor class instance using a coefficients file, and then invoke one or more times supplying 3 bases before the GT and 4 bases after for donor predictions or 18 bases before the AG and 3 bases after for acceptor predictions. Example: from ScoreSpliceCandidate import DonorPredictor, AcceptorPredictor donorPredictor = DonorPredictor('Hsap.donor.mecoef') print(donorPredictor('CAG', 'GAGC')) # Score for CAG-GT-GAGC # Prints 10.173901761046155 acceptorPredictor = AcceptorPredictor('Hsap.acceptor.mecoef') print(acceptorPredictor('TGTGTGCCTTTCACTTTC', 'GCT')) # Score for TGTGTGCCTTTCACTTTC-AG-GCT # Prints 10.791433170214455 See comment at the bottom of this file for format of coefficient files. [1] Yeo, G., & Burge, C. B. (2004). Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals. Journal of Computational Biology : a Journal of Computational Molecular Cell Biology, 11(2-3), 377-394. doi:10.1089/1066527041410418 i˙˙˙˙(tdivisiontprint_functionN(tSeqIO(tFasta(tDonorPredictortAcceptorPredictoriiiit-s tintroniit+iiit tGTtAGi itNARcB seZd„Zd„ZRS(cC sjttjjtjj|ƒƒƒ|_|jjdtjƒ|jjƒt dksft d|ƒ‚dS(Nii@s*%s is not a valid donor coefficients file.( topentostpathtabspatht expandusertfiletseektSEEK_ENDttellt RecordSizetAssertionError(tselftdonorCoefFileName((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyt__init__ks'cC s˘t|ƒdks$tt|ƒƒ‚t|ƒdksHtt|ƒƒ‚t||ƒ}|jj|tƒtjt|jj tƒƒd}t j d|dƒS(Niiiga`0’PM0@i( tlenRt_bases_to_numberRRRtstructtunpackt RecordFormattreadtmathtlog(Rt prev3basest next4basestindextcoeff((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyt__call__qs $$"(t__name__t __module__RR'(((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyRjs RcB seZd„Zd„ZRS(cC sjttjjtjj|ƒƒƒ|_|jjdtjƒ|jjƒt dksft d|ƒ‚dS(Nii€Bs-%s is not a valid acceptor coefficients file.( R RRRRRRRRRR(RtacceptorCoefFileName((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyR{s'c C st|ƒdks$tt|ƒƒ‚t|ƒdksHtt|ƒƒ‚||}d}x˜ttƒD]Š\}\}}t|||d!ƒ}|jjt||tƒt j t |jj tƒƒd} |dkrċ|| 9}qe|| }qeWt jd|dƒS(NiiiiigĴâÌ#Y0@i(RRt enumeratetAcceptorStartEndsRRRtAcceptorArrayLengthSumsRRRRR R!R"( Rt prev18basest next3basestbasestcoeffsCombinationtiitstarttendR%R&((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyR's$$ "  (R(R)RR'(((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyRzs cC sOidd6dd6dd6dd6}d}x |D]}d |||}q/W|S( s1Convert a string of DNA bases to a base-4 number.itAitCitGitTi((R0tBaseMaptresulttb((sA/nfs/users2/rg/jlagarde/sync/julien_utils/ScoreSpliceCandidate.pyR‘s " s*s|0                   "% (+     <     +,