ASR_added

39b93af9 · Mikael Boden · ea1b47e1 · 39b93af9 · 39b93af9 · 39b93af9
Commit 39b93af9 authored Aug 17, 2022 by Mikael Boden
Hide whitespace changes
Inline Side-by-side

Showing with 418 additions and 2 deletions

asr.py asr.py +329 -0

prob.py prob.py +26 -1

sequence.py sequence.py +63 -1

No files found.
--- a/asr.py
+++ b/asr.py
+from sym import *
+import itertools
+import sympy
+import math
+
+"""
+Classes for basic ASR based on an underlying Bayesian network. Does not perform inference of tree topology. Currently 
+only supports small number of sequences (nodes) as ancestral character inference is not heuristic.
+
+Please be aware that this code is mostly bare bones and errors are not handled comprehensively.
+ """
+
+class SubstModel:
+    """ Time-dependent substitution model using CT Markov chains. """
+    def __init__(self, alpha, F, IRM):
+        self.alpha = alpha  # Allowable states for characters at nodes, order corresponding to parameters in F and IRM
+        self.F = F   # Equilibrium frequencies of characters
+        self.IRM = IRM   # Instantaneous rate matrix
+
+        # Set up Q matrix for this model
+        Q = fillLower(F, IRM)
+        Q = makeValid(Q)
+        Q = normalise(Q)
+        self.Q = Q
+
+    def getTransitionMatrix(self, t):
+        """ Compute the transition probability matrix for a branch length, t
+         NOTE: Multiplication of matrices is matrix multiplication (not element-wise multiplication) """
+
+        # Diagonalise Q matrix in the form Q=UDU^-1
+        U, D = self.Q.diagonalize()   # D contains eigenvalues of Q
+
+        # E(t) = exp(Dt)
+        E = sympy.zeros(D.rows)
+        for i in range(D.rows):
+            E[i,i] = math.exp(D[i,i] * t)
+
+        # P(t) = U*E(t)*U^-1
+        P = U*E*U**-1
+
+        return P
+
+    def priorProb(self, x):
+        """Return the prior probability of a given character. Throws error if x is not in the model's alphabet."""
+        try:
+            idx = self.alpha.index(x)
+        except ValueError:
+            raise RuntimeError(f"Character {x} is not in the substitution model's alphabet.")
+
+        return self.F[idx]
+
+    def condProb(self, parent, child, t):
+        """Return the probability that a parent state transitions to a child state in time t."""
+        try:
+            parent_idx = self.alpha.index(parent)
+            child_idx = self.alpha.index(child)
+        except ValueError:
+            raise RuntimeError(f"Either {parent} or {child} is not in the substitution model's alphabet.")
+
+        return self.getTransitionMatrix(t)[parent_idx, child_idx]
+
+class PhyloBNet:
+    """ Discrete Bayesian network. Implementation here is specifically used for representation of discrete characters
+    (such as nt/AA sequence) in phylogenetic trees. Most methods currently support only networks of bifurcating nodes.
+     """
+    def __init__(self, root):
+        self.root = root
+        self.alpha = root.alpha
+        self.nodes = [root]
+        self.model = root.model
+
+    def addNode(self, node):
+        """ Adds a node to the network's list. """
+        self.nodes.append(node)
+        node.network = self
+
+    def addNodes(self, nodes):
+        """ Add a list of nodes to the network. """
+        for node in nodes:
+            self.addNode(node)
+
+    def getNode(self, label):
+        """ Return a node with the given label. Returns None if node is not found."""
+        for node in self.nodes:
+            if node.label == label:
+                return node
+        return None
+
+    def getNodes(self):
+        """ Return a list of all nodes under the root. """
+        return self.nodes
+
+    def getChildrenOf(self, node):
+        """ Return the immediate children of a given node. Input can be the node instance or label of parent
+        Returns None if query node is not found. Returns empty list if node is a leaf."""
+        if isinstance(node, str):  # If label is provided, find corresponding node
+            found = False
+            for node1 in self.nodes:
+                if node == node1.label:
+                    node = node1
+                    found = True
+            if not found:
+                return None
+        else:
+            if not isinstance(node, PhyloBNode):
+                raise RuntimeError('Query node must be a label or PhyloBNode object.')
+
+        children = []
+        for node1 in self.nodes:
+            if node1.isRoot():
+                continue
+            elif node1.parent.label == node.label:
+                children.append(node1)
+
+        return children
+
+    def getLL(self, annots):
+        """ Calculate and return the log-likelihood for a combination of annotations to unknown nodes, given known
+        annotations and the specified substitution model. annots should be supplied as a dictionary {node:annot}.
+        Unknown nodes for which annotations are not provided will be marginalised. """
+
+        # Ensure tempAnnots for all nodes in network are initially cleared
+        for node in self.getNodes():
+            node.clearAnnot(temp=True)
+
+        unknown = [node for node in self.getNodes() if node.getAnnot() is None]  # List of nodes w/out known annotation
+        for node in annots:  # Temporarily annotate unknown nodes in network
+            if node not in unknown:
+                raise RuntimeError('Node: ' + node.label + ' is either already annotated or is not in network.')
+            node.annotate(annots[node], temp=True)
+
+        # Marginalisation to be implemented in future versions
+        # Check for nodes in network w/out either annot or tempAnnot (these need to be marginalised)
+        # sumOver = [node for node in self.getNodes() if not (node.getAnnot(temp=True) or node.getAnnot(temp=False))]
+
+        # Make likelihood calculation for network by determining all relevant conditional probabilities
+        LL = 0
+        for node in self.getNodes():
+            if node.annot is None:
+                annot = node.tempAnnot
+            else:
+                annot = node.annot
+            if node.isRoot():
+                LL += math.log(self.model.priorProb(annot))
+            else:
+                parentAnnot = node.parent.tempAnnot
+                t = node.distance
+                LL += math.log(self.model.condProb(parentAnnot, annot, t))
+
+        return LL
+
+    def getMLJoint(self, reduced=True):
+        """ Return the maximum likelihood set of joint annotations to all unknown nodes. Implementation is currently
+         by brute force (checks all potential combinations - no heuristics). If reduced, a minimal alphabet is used
+         comprising only of characters observed on at least one known node.
+         Returns a dictionary of node labels mapped to inferred annotations, and the overall LL of the ML network.
+         Note: this method does NOT subsequently annotate unknown nodes with the inferred maximally likely characters.
+         """
+        if reduced:
+            symbols = set([node.annot for node in self.getNodes() if node.annot is not None])
+        else:
+            symbols = self.alpha
+
+        # Determine the set of joint annotations to unknown nodes for testing
+        unknown = [node for node in self.getNodes() if node.annot is None]
+        combos = itertools.product(symbols, repeat=len(unknown))
+
+        # Brute force max-likelihood annotation to unknown nodes
+        maxLL = -math.inf   # Store best LL
+        bestCombo = None   # Store combination of annotation to unknown nodes with current best LL
+        for combo in combos:  # For each possible combination of annotations, get the LL
+            LL = self.getLL({unknown[i] : combo[i] for i in range(len(unknown))})
+            if LL > maxLL:
+                maxLL = LL
+                bestCombo = combo
+
+        # Match node label to inferred character at said node
+        combo_labels = {unknown[i].label : bestCombo[i] for i in range(len(unknown))}
+
+        return combo_labels, maxLL
+
+class PhyloBNode:
+    """ Discrete-valued node in a Bayesian network. Can be used to represent a phylogenetic internal or leaf node.
+    Implementation supports at most one parent node.
+    """
+
+    def __init__(self, model, parent=None, distance=None, network=None, alpha=Protein_Alphabet, annot=None, label=None):
+        self.model = model
+        self.parent = parent
+        self.distance = distance   # 'Distance' from parent - akin to time in phylogenetic tree
+        self.network = network   # Updated when added to a Bayesian network
+        self.alpha = alpha   # Allowable alphabet for annotations (e.g. sequence) to node
+        self.annot = annot   # Known annotation (e.g. sequence) at this node
+        self.label = label   # Name of node
+        self.tempAnnot = None  # For use at unknown nodes during likelihood calculations
+
+        if self.annot and self.annot not in self.alpha:
+            raise RuntimeError('Illegal annotation to node: ' + label)
+
+    def isRoot(self):
+        """ Does this appear to be a root node? """
+        if not self.parent:
+            return True
+        else:
+            return False
+
+    def getDescendants(self, transitive=False):
+        """ Return descendants of this node. If transitive, recursively finds ALL nodes under this in network. Else
+        just returns children of this node. Returns empty list if this node has no children."""
+        descendants = [child for child in self.network.getChildren(self.label)]
+        if transitive:
+            for child in self.chidren:
+                descendants.extend(child.getDescendants(transitive=True))
+            return descendants
+        else:
+            return descendants
+
+    def getAnnot(self, temp=False):
+        if temp:
+            return self.tempAnnot
+        else:
+            return self.annot
+
+    def annotate(self, annot, temp=False):
+        """ Update annotation of this node. If temp, annotation is temporary, such as for use in likelihood calculation
+        for unknown nodes in a network. """
+        if annot not in self.alpha:
+            raise RuntimeError('Illegal annotation to node: ' + self.label + ': ' + annot)
+        if temp:
+            self.tempAnnot = annot
+        else:
+            self.annot = annot
+
+    def clearAnnot(self, temp=False):
+        if temp:
+            self.tempAnnot = None
+        else:
+            self.annot = None
+
+
+########################################################################################################################
+# Operations for matrices in time-reversible substitution models. All matrices/vectors are sympy.Matrix objects
+########################################################################################################################
+
+def fillLower(F, IRM):
+    """ If the provided IRM is upper-triangular, need to complete with entries in lower triangle such that IRM*F is a
+    zero vector. """
+
+    if not len(F) == IRM.rows:
+        raise RuntimeError('Number of rows in IRM must be equal to the length of the equilibrium vector.')
+
+    Q = IRM
+    # Populate matrix below diagonal - assume time-reversibility so Q[j,i]=(F[i]/F[j])*Q[i,j]
+    for i in range(Q.rows):
+        for j in range(i+1, Q.cols):
+            Q[j,i] = (F[i]/F[j]) * Q[i,j]
+
+    return Q
+
+def makeValid(Q):
+    """ Ensure row sums of Q are 0 by equating entries in the diagonal to the negative sum of all other elements in
+    the corresponding row. """
+
+    for i in range(Q.rows):
+        Q[i,i] = 0   # Ensure diagonal entry is initially 0
+        rowSum = 0
+        for j in range(Q.cols):
+            rowSum += Q[i,j]
+        Q[i,i] = -rowSum
+
+    return Q
+
+def normalise(Q):
+    """ Normalise Q such that it represents rates of transition per one unit time. """
+
+    diag_sum = 0  # Sum of diagonal should be 1
+    for i in range(Q.rows):
+        diag_sum += Q[i,i]
+
+    for i in range(Q.rows):
+        for j in range(Q.cols):
+            Q[i,j] = Q[i,j] / -diag_sum
+
+    return Q
+
+########################################################################################################################
+# Data for pre-defined substitution models (only JTT is currently implemented)
+# Note: Ordering of F and IRM must correspond to alphabet order for a given model
+########################################################################################################################
+
+# Model alphabets
+ALPHA = {'JTT' : ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']}
+
+# Equilibrium frequency vectors
+F = {'JTT' :
+         sympy.Matrix([[0.076862, 0.051057, 0.042546, 0.051269, 0.020279, 0.041061, 0.061820, 0.074714, 0.022983, 0.052569, 0.091111, 0.059498, 0.023414, 0.040530, 0.050532, 0.068225, 0.058518, 0.014336, 0.032303, 0.066374]])
+}
+
+# Instantaneous rate matrices - Only JTT available currently
+# Operations on IRMs here assume the matrix is upper-triangular.
+IRM = {'JTT' : sympy.Matrix([
+       # A         R         N         D         C   	   Q         E         G         H         I  	     L         K         M         F         P         S         T         W         Y         V
+    [0.000000, 0.531678, 0.557967, 0.827445, 0.574478, 0.556725, 1.066681, 1.740159, 0.219970, 0.361684, 0.310007, 0.369437, 0.469395, 0.138293, 1.959599, 3.887095, 4.582565, 0.084329, 0.139492, 2.924161],
+    [0.000000, 0.000000, 0.451095, 0.154899, 1.019843, 3.021995, 0.318483, 1.359652, 3.210671, 0.239195, 0.372261, 6.529255, 0.431045, 0.065314, 0.710489, 1.001551, 0.650282, 1.257961, 0.235601, 0.171995],
+    [0.000000, 0.000000, 0.000000, 5.549530, 0.313311, 0.768834, 0.578115, 0.773313, 4.025778, 0.491003, 0.137289, 2.529517, 0.330720, 0.073481, 0.121804, 5.057964, 2.351311, 0.027700, 0.700693, 0.164525],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.105625, 0.521646, 7.766557, 1.272434, 1.032342, 0.115968, 0.061486, 0.282466, 0.190001, 0.032522, 0.127164, 0.589268, 0.425159, 0.057466, 0.453952, 0.315261],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.091304, 0.053907, 0.546389, 0.724998, 0.150559, 0.164593, 0.049009, 0.409202, 0.678335, 0.123653, 2.155331, 0.469823, 1.104181, 2.114852, 0.621323],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 3.417706, 0.231294, 5.684080, 0.078270, 0.709004, 2.966732, 0.456901, 0.045683, 1.608126, 0.548807, 0.523825, 0.172206, 0.254745, 0.179771],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.115632, 0.243768, 0.111773, 0.097485, 1.731684, 0.175084, 0.043829, 0.191994, 0.312449, 0.331584, 0.114381, 0.063452, 0.465271],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.201696, 0.053769, 0.069492, 0.269840, 0.130379, 0.050212, 0.208081, 1.874296, 0.316862, 0.544180, 0.052500, 0.470140],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.181788, 0.540571, 0.525096, 0.329660, 0.453428, 1.141961, 0.743458, 0.477355, 0.128193, 5.848400, 0.121827],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 2.335139, 0.202562, 4.831666, 0.777090, 0.098580, 0.405119, 2.553806, 0.134510, 0.303445, 9.533943],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.146481, 3.856906, 2.500294, 1.060504, 0.592511, 0.272514, 0.530324, 0.241094, 1.761439],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.624581, 0.024521, 0.216345, 0.474478, 0.965641, 0.089134, 0.087904, 0.124066],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.436181, 0.164215, 0.285564, 2.114728, 0.201334, 0.189870, 3.038533],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.148483, 0.943971, 0.138904, 0.537922, 5.484236, 0.593478],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 2.788406, 1.176961, 0.069965, 0.113850, 0.211561],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.777647, 0.310927, 0.628608, 0.408532],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.080556, 0.201094, 1.143980],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.747889, 0.239697],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.165473],
+    [0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000]
+    ])
+}
+
+# Instantiate pre-defined models
+MODELS = {
+    'JTT' : SubstModel(ALPHA['JTT'], F['JTT'], IRM['JTT'])
+}
--- a/prob.py
+++ b/prob.py
@@ -203,9 +203,19 @@ class Distrib():
                maxprob = self[sym]
        return maxsym

+    def getProb(self):
+        """ Return a list of (sym, prob) tuples, in order of their alphabet"""
+        return [(sym, self.prob(sym)) for sym in self.alpha]
+
+    def getBits(self):
+        """ Return a list of (sym, bits) tuples, in order of their alphabet"""
+        H = sum([-f * (math.log2(f) if f > 0 else 999) for f in self.prob()])
+        I = math.log2(len(self.alpha)) - H
+        return [(s, self.prob(s) * I) for s in self.alpha]
+
    def getsort(self):
        """ Return the list of symbols, in order of their probability. """
-        symlist = [sym for (sym, _) in self.getProbsort()]
+        symlist = [s for (s, _) in self.getProbsort()]
        return symlist

    def getProbsort(self):
@@ -1089,3 +1099,18 @@ def lgamma(x):
        y += 1
        ser += (cof[j] / y)
    return (-tmp + math.log(2.5066282746310005 * ser / x))
+
+
+import sequence as seq
+if __name__ == '__main__':
+    myseqs = [seq.Sequence('TCCTAGCCCC'),
+              seq.Sequence('GCCGCCCCCA'),
+              seq.Sequence('ATCCGCCCGG'),
+              seq.Sequence('CCCCCGCCTT')]
+    mymc = MarkovChain(seq.DNA_Alphabet)
+    for myseq in myseqs:
+        print(myseq, len(myseq))
+        mymc.observe(myseq)
+    for t in mymc.transit:
+        print(t, mymc.transit[t])
+
--- a/sequence.py
+++ b/sequence.py
@@ -1344,6 +1344,61 @@ class PWM(object):
                maxindex = i
        return (maxscore, maxindex)

+def readPWMs(filename, format='MEME'):
+    fh = open(filename, 'rt')
+    VERSION = None
+    ALPHABET = None
+    STRANDS = None
+    BACKGROUND = None
+    MOTIF = None
+    URL = None
+    FOREGROUND = []
+    NSITES = 1
+    EXPECT = None
+    COLLECTION = {}
+    data = fh.read()
+    lines = data.splitlines()
+    for line in lines:
+        myline = line.strip()
+        words = myline.split();
+        if EXPECT == None:
+            try:
+                if myline.startswith('MEME') and len(words) > 1:
+                    VERSION = words[len(words) - 1]
+                elif myline.startswith('ALPHABET') and len(words) > 1:
+                    ALPHABET = Alphabet(words[len(words) - 1])
+                elif myline.startswith('MOTIF') and len(words) > 1:
+                    if MOTIF != None: # we have one motif that needs to be stored first
+                        COLLECTION[MOTIF] = (FOREGROUND, BACKGROUND)
+                    FOREGROUND = []
+                    MOTIF = words[1]
+                elif myline.startswith('URL') and len(words) > 1:
+                    URL = words[1]
+                elif len(words) == 0:
+                    EXPECT = None
+                elif not myline.startswith('#'):
+                    EXPECT = myline
+                    if EXPECT.startswith('letter-probability matrix'):
+                        match = re.compile(r""".*nsites=\s*(?P<name>[0-9]*?)\s.*""", re.VERBOSE).match(EXPECT)
+                        NSITES = int(match.group('name'))
+            except:
+                print('Error in format: ' + line)
+                return None
+        elif len(words) == 0:
+            EXPECT = None
+        elif EXPECT.startswith('Background'):
+            BACKGROUND = Distrib(ALPHABET)
+            for z in zip(words[::2], words[1::2]):
+                BACKGROUND.observe(z[0], float(z[1]))
+        elif EXPECT.startswith('letter-probability matrix'):
+            d = Distrib(ALPHABET)
+            for z in zip(ALPHABET.symbols, words):
+                d.observe(z[0], float(z[1]) * NSITES)
+            FOREGROUND.append(d)
+    # save last motif here
+    COLLECTION[MOTIF] = (FOREGROUND, BACKGROUND)
+    return COLLECTION
+
 # Web Service Functions -------------------

 def getSequence(id, database = 'uniprotkb', start=None, end=None):
@@ -1438,7 +1493,7 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
            ids.append(id.split(':')[1])
    return ids

-if __name__ == '__main__':
+if __name__ == '__main__1':
    aln = readClustalFile('/Users/mikael/simhome/ASR/gappy.aln', Protein_Alphabet)
    x, g, i = aln.outliers()
    for s in range(len(aln)):
@@ -1449,3 +1504,10 @@ if __name__ == '__main__':
        if idx >= 0:
            print('\t', aln[s].sequence[idx:])
    print(('Read', len(aln), 'sequences'))
+
+if __name__ == '__main__':
+    motifs = readPWMs('/Users/mikael/meme-5.4.1/motif_databases/PROKARYOTE/collectf.meme')
+    for name in motifs:
+        print(name)
+        for fg in motifs[name][0]:
+            print('\t', fg)
\ No newline at end of file