#!/usr/bin/env python # Modified from: <<< # Copyright 2019 Irwin Jungreis # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Modified from >>> # Uses pyfaidx to extract FASTA subsequences (Shirley MD, Ma Z, Pedersen B, Wheelan S. Efficient "pythonic" access to FASTA files using pyfaidx. PeerJ PrePrints 3:e1196. 2015.) """ ScoreSpliceCandidate.py Given the context of a GT or AG in a string of DNA bases and precomputed coefficients files, return a score that indicates how likely it is to be a splice donor or acceptor, respectively, using the maximum entropy method [1]. Usage: create a DonorPredictor or AcceptorPredictor class instance using a coefficients file, and then invoke one or more times supplying 3 bases before the GT and 4 bases after for donor predictions or 18 bases before the AG and 3 bases after for acceptor predictions. Example: from ScoreSpliceCandidate import DonorPredictor, AcceptorPredictor donorPredictor = DonorPredictor('Hsap.donor.mecoef') print(donorPredictor('CAG', 'GAGC')) # Score for CAG-GT-GAGC # Prints 10.173901761046155 acceptorPredictor = AcceptorPredictor('Hsap.acceptor.mecoef') print(acceptorPredictor('TGTGTGCCTTTCACTTTC', 'GCT')) # Score for TGTGTGCCTTTCACTTTC-AG-GCT # Prints 10.791433170214455 See comment at the bottom of this file for format of coefficient files. [1] Yeo, G., & Burge, C. B. (2004). Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals. Journal of Computational Biology : a Journal of Computational Molecular Cell Biology, 11(2-3), 377-394. doi:10.1089/1066527041410418 """ from __future__ import division, print_function import os, struct, math, sys from Bio import SeqIO from pyfaidx import Fasta assert sys.version_info[0] == (2) class DonorPredictor(object) : def __init__(self, donorCoefFileName) : self.file = open(os.path.abspath(os.path.expanduser(donorCoefFileName))) self.file.seek(0, os.SEEK_END) assert self.file.tell() / RecordSize == 16384, \ '%s is not a valid donor coefficients file.' % donorCoefFileName def __call__(self, prev3bases, next4bases) : # Score potential GT splice donor site given 3 prev bases and 4 next ones. assert len(prev3bases) == 3, len(prev3bases) assert len(next4bases) == 4, len(next4bases) index = _bases_to_number(prev3bases + next4bases) self.file.seek(index * RecordSize) coeff = struct.unpack(RecordFormat, self.file.read(RecordSize))[0] return math.log(16.302010666666664 * coeff, 2) class AcceptorPredictor(object) : def __init__(self, acceptorCoefFileName) : self.file = open(os.path.abspath(os.path.expanduser(acceptorCoefFileName))) self.file.seek(0, os.SEEK_END) assert self.file.tell() / RecordSize == 82560, \ '%s is not a valid acceptor coefficients file.' % acceptorCoefFileName def __call__(self, prev18bases, next3bases) : # Score potential AG splice acceptor site given 18 prev bases and 3 next ones. assert len(prev18bases) == 18, len(prev18bases) assert len(next3bases) == 3, len(next3bases) bases = prev18bases + next3bases coeffsCombination = 1 for ii, (start, end) in enumerate(AcceptorStartEnds) : index = _bases_to_number(bases[start : end + 1]) self.file.seek((AcceptorArrayLengthSums[ii] + index) * RecordSize) coeff = struct.unpack(RecordFormat, self.file.read(RecordSize))[0] if ii < 5 : coeffsCombination *= coeff else : coeffsCombination /= coeff return math.log(16.3482025 * coeffsCombination, 2) def _bases_to_number(bases) : "Convert a string of DNA bases to a base-4 number." BaseMap = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3} result = 0 for b in bases : result = 4 * result + BaseMap[b] return result RecordFormat = '