phylo.py 29.4 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1 2
'''
Module with methods and classes for phylogeny.
3
Extended to handle n-ary trees (Jan 2019).
Mikael Boden's avatar
Mikael Boden committed
4 5
@author: mikael
'''
Mikael Boden's avatar
Mikael Boden committed
6
import sequence
Mikael Boden's avatar
Mikael Boden committed
7 8 9
from collections import defaultdict
import annotations

Mikael Boden's avatar
Mikael Boden committed
10
class PhyloTree:
11
    """ Rooted, n-ary tree for representing phylogenetic relationships.
Mikael Boden's avatar
Mikael Boden committed
12 13
        Functionality includes labelling and traversing nodes; reading and writing to Newick format;
        association with sequence alignment; maximum parsimony inference of ancestral sequence;
14 15
        generation of rooted tree by UPGMA.
        Known issues: Parsimony does not handle gaps in alignment.
Mikael Boden's avatar
Mikael Boden committed
16 17 18 19 20 21 22 23 24 25 26 27
        Programmers should note that almost all functionality is implemented through recursion. """

    def __init__(self, root):
        """ Create a tree from a node that is "root" in the tree."""
        self.root = root

    def putAlignment(self, aln):
        """ Associate the tree with a set of sequences/alignment.
            Involves assigning the sequence to the leaf nodes. """
        self.aln = aln
        self.root._assignAlignment(aln)

Mikael Boden's avatar
Mikael Boden committed
28 29 30 31 32 33 34 35
    def putAnnotations(self, nexus_annotations):
        self.nexus_annotations = nexus_annotations
        # Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
        for node in self.getNodes():
            if node.label in self.nexus_annotations.leaf_annotations:
                self.nexus_annotations.leaf_annotations[node] = self.nexus_annotations.leaf_annotations[node.label]
                self.nexus_annotations.leaf_annotations.pop(node.label)

Mikael Boden's avatar
Mikael Boden committed
36 37 38 39
    def __str__(self):
        """ Produce a printable representation of the tree, specifically the root of the tree. """
        return str(self.root)

Mikael Boden's avatar
Mikael Boden committed
40
    def strSequences(self, start=None, end=None):
Mikael Boden's avatar
Mikael Boden committed
41 42 43 44 45 46 47 48 49 50 51 52 53
        """ Produce a sequence representation of the tree, specifically the root of the tree.
            Specify the start and end positions in the alignment for the sequence to be printed
            (if None the min and max positions will be used). """
        if self.aln != None:
            my_start = start or 0
            my_end = end or self.aln.alignlen
            return self.root._printSequences(my_start, my_end)

    def findLabel(self, label):
        """ Retrieve/return the node with the specified label.
            Returns None if not found."""
        return self.root._findLabel(label)

Mikael Boden's avatar
Mikael Boden committed
54 55 56 57 58 59 60 61
    def getNodes(self, strategy = 'DEPTH-FIRST'):
        """ Returns all nodes as a list """
        nodes = []
        queue = [self.root]
        while len(queue) > 0:
            node = queue.pop()
            nodes.append(node)
            # if strategy.upper().startswith('DEPTH'):
62 63
            if not node.isLeaf():
                queue.extend(node.children)
Mikael Boden's avatar
Mikael Boden committed
64 65
        return nodes

66 67 68 69 70 71 72 73
    def getLeaves(self):
        all = self.getNodes()
        leaves = []
        for n in all:
            if n.isLeaf():
                leaves.append(n)
        return leaves

Mikael Boden's avatar
Mikael Boden committed
74
    def getDescendantsOf(self, node, transitive=False):
Mikael Boden's avatar
Mikael Boden committed
75 76 77 78 79 80 81
        """ Retrieve and return the (list of) descendants (children) of a specified node.
            Node can be the label or the instance.
            transitive indicates if only the direct descendants (False) or if all descendants
            should be returned.
            If node does not exist, None is returned.
            If node has no descendants, an empty list will be returned."""
        if not isinstance(node, PhyloNode):
Mikael Boden's avatar
Mikael Boden committed
82
            node = self.findLabel(node)
Mikael Boden's avatar
Mikael Boden committed
83 84 85 86
        if node:
            return node.getDescendants(transitive)
        return None

Mikael Boden's avatar
Mikael Boden committed
87
    def getAncestorsOf(self, node, transitive=False):
Mikael Boden's avatar
Mikael Boden committed
88 89 90 91 92 93
        """ Retrieve and return the ancestor (transitive=False) or
            ancestors (transitive=True) of a specified node.
            Node can be the label or the instance.
            If node does not exist, None is returned.
            If node is the root of the tree, None is returned."""
        if not isinstance(node, PhyloNode):
Mikael Boden's avatar
Mikael Boden committed
94
            node = self.findLabel(node)
Mikael Boden's avatar
Mikael Boden committed
95
        if node:
96
            return node.getAncestors(transitive)
Mikael Boden's avatar
Mikael Boden committed
97 98 99 100 101 102

    def parsimony(self):
        """ Solve the "small parsimony problem",
            i.e. find the sequences on each of the internal nodes.
            See Jones and Pevzner, p. 368 and onwards, for details. """
        self.root._forwardParsimony(self.aln)  # setup and compute scores for all nodes
Mikael Boden's avatar
Mikael Boden committed
103 104
        self.root._backwardParsimony(self.aln)  # use scores to determine sequences
        return self.root.getSequence()  # return the sequence found at the root
Mikael Boden's avatar
Mikael Boden committed
105

Mikael Boden's avatar
Mikael Boden committed
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
    def swap_annotations(self, annotation_key):
        try:
            for node in self.getNodes():
                if node.isLeaf():
                    node.label = self.nexus_annotations.leaf_annotations[node][annotation_key]
        except:
            return

    def write_to_nexus(self, out_path, write_annotations=True, nexus_annotations=None, exclude_annotations=[], use_symbols=False):
        """
        Writes out the tree to NEXUS format, with any annotations stored in nexus_annotations added to the file
        :param out_path: The path to write the NEXUS file to
        :param nexus_annotations: The NexusAnnotations containing the annotations
        """
        if write_annotations and not nexus_annotations:
            if not self.nexus_annotations:
                raise RuntimeError("This tree file has no associated annotation file. Either associate or supply one as a parameter.")
            nexus_annotations = self.nexus_annotations
        if nexus_annotations:
            for node in self.getNodes():
                if node in self.nexus_annotations.node_annotations:
                    node.annotate_node(self.nexus_annotations.node_annotations, self.nexus_annotations.annotation_symbols, exclude_annotations, use_symbols)
            tree_annotation = str(self) + ";"
            self.swap_annotations("Original")
            for node in self.getNodes():
                if node in self.nexus_annotations.leaf_annotations:
                    node.annotate_node(self.nexus_annotations.leaf_annotations, exclude_annotations)
                leaves = []
                for node in self.getNodes():
                    if node.isLeaf():
                        leaves.append(node.label)
                leaf_annotation = ""
                for leaf in leaves:
                    leaf_annotation += "\n\t%s" % (leaf)
            with open(out_path, "w+") as file:
                file.write(
                    "#NEXUS\nbegin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels%s\n;\nend;\n\nbegin trees;\n\ttree tree_1 = "
                    "[&R] %s\nend;" % (len(leaves), leaf_annotation, tree_annotation))

Mikael Boden's avatar
Mikael Boden committed
145
class PhyloNode:
146 147
    """ A class for a node in a rooted, n-ary tree.
        Contains pointers to multiple descendants/daughters,
Mikael Boden's avatar
Mikael Boden committed
148 149 150 151 152
        optional fields include data, label, sequence and dist.
        If parsimony is used scores and traceback pointers are available.
        A number of methods are named with a _ prefix. These can be, but
        are not intended to be used from outside the class. """

Mikael Boden's avatar
Mikael Boden committed
153 154
    _verbose = True

155 156 157
    def __init__(self, parent = None, label=''):
        """ Initialise a node.
            Set its parent (another PhyloNode), parent can be None.
Mikael Boden's avatar
Mikael Boden committed
158 159 160 161
            Set label to name it.
            Use field data for any type of information associated with node.
            Use dist to indicate the distance to its parent (if any).
            Other fields are used internally, including sequence for associated alignment,
162 163 164
            seqscores, back for maximum parsimony. """
        self.parent = parent
        self.children = None
Mikael Boden's avatar
Mikael Boden committed
165 166 167
        self.data = None
        self.label = label
        self.dist = None
Mikael Boden's avatar
Mikael Boden committed
168
        self.sequence = None  # The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
169 170
        self.seqscores = None # The scores propagated from leaves via children
        self.backptr = None   # Pointers back to children: what symbol rendered current/parent symbols
Mikael Boden's avatar
Mikael Boden committed
171 172

    def isLeaf(self):
173 174 175 176 177 178 179
        return self.nChildren() == 0

    def nChildren(self):
        if self.children == None:
            return 0
        else:
            return len(self.children)
Mikael Boden's avatar
Mikael Boden committed
180 181 182

    def __str__(self):
        """ Returns string with node (incl descendants) in a Newick style. """
183 184 185 186
        stubs = ['' for _ in range(self.nChildren())]
        label = dist = ''
        for i in range(self.nChildren()):
            stubs[i] = str(self.children[i])
Mikael Boden's avatar
Mikael Boden committed
187
        if self.dist or self.dist == 0.0:
Mikael Boden's avatar
Mikael Boden committed
188 189
            if self.dist == 0.0: dist = ''
            else: dist = ':' + '%5.3f' % self.dist
Mikael Boden's avatar
Mikael Boden committed
190 191
        if self.label != None:
            label = str(self.label)
192 193 194 195 196 197 198
        if self.nChildren() == 0:
            return label + dist
        else:
            stubstr = '('
            for i in range(len(stubs) - 1):
                stubstr += stubs[i] + ','
            return stubstr + stubs[-1] + ')' + label + dist
Mikael Boden's avatar
Mikael Boden committed
199

Mikael Boden's avatar
Mikael Boden committed
200 201
    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style. """
202 203 204
        stubs = ['' for _ in range(self.nChildren())]
        label = dist = ''
        for i in range(self.nChildren()):
205
            stubs[i] = self.children[i]._printSequences(start, end)
206
        if self.dist or self.dist == 0.0:
Mikael Boden's avatar
Mikael Boden committed
207
            dist = ':' + str(self.dist)
208 209 210
        if self.sequence != None:
            label = "".join(self.sequence[start:end]) + ""
        else:  # there is no sequence, just use label
211 212 213 214 215 216 217 218
            label = str(self.label)
        if self.nChildren() == 0:
            return label + dist
        else:
            stubstr = '('
            for i in range(len(stubs) - 1):
                stubstr += stubs[i] + ','
            return stubstr + stubs[-1] + ')' + label + dist
Mikael Boden's avatar
Mikael Boden committed
219 220 221 222 223 224

    def _findLabel(self, label):
        """ Find a node by label at this node or in any descendants (recursively). """
        if self.label == label:
            return self
        else:
225 226 227 228
            for i in range(self.nChildren()):
                found = self.children[i]._findLabel(label)
                if found:
                    return found
Mikael Boden's avatar
Mikael Boden committed
229 230 231 232 233
            return None

    def _propagateDistance(self, parent_dist):
        """ Convert absolute distances to relative.
            The only parameter is the absolute distance to the parent of this node. """
Mikael Boden's avatar
Mikael Boden committed
234 235
        travelled = self.dist  # absolute distance to this node
        self.dist = parent_dist - self.dist  # relative distance to this node
236 237
        for i in range(self.nChildren()):
            self.children[i]._propagateDistance(travelled)  # pass absolute distance to this node
Mikael Boden's avatar
Mikael Boden committed
238 239 240 241 242

    def _assignAlignment(self, aln):
        """ Assign an alignment to the node, which implies assigning a sequence to it if one is
            available in the alignment. """
        self.sequence = None
243 244
        for i in range(self.nChildren()):
            self.children[i]._assignAlignment(aln)
Mikael Boden's avatar
Mikael Boden committed
245 246 247 248 249 250 251 252 253
        for seq in aln.seqs:
            if seq.name == self.label:
                self.sequence = seq
                break

    def _forwardParsimony(self, aln):
        """ Internal function that operates recursively to first initialise each node (forward),
            stopping only once a sequence has been assigned to the node,
            then to propagate scores from sequence assigned nodes to root (backward). """
Mikael Boden's avatar
Mikael Boden committed
254
        if self.sequence == None:  # no sequence has been assigned
255
            if self.nChildren() == 0:  # no children, so terminal, cannot propagate scores
Mikael Boden's avatar
Mikael Boden committed
256
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
257 258 259
            scores = [None for _ in range(self.nChildren())]
            for i in range(self.nChildren()):
                scores[i] = self.children[i]._forwardParsimony(aln)
Mikael Boden's avatar
Mikael Boden committed
260 261 262 263
            # for each position in the alignment,
            # introduce (initially zero) score for each symbol in alphabet
            self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
            # for each position in the alignment,
264 265
            # allocate a position to put the each child symbol from which each current node symbol score was determined
            self.backptr = [[[None for _ in aln.alphabet] for _ in range(aln.alignlen)] for _ in range(self.nChildren())]
Mikael Boden's avatar
Mikael Boden committed
266
            for col in range(aln.alignlen):
267 268 269 270 271 272 273 274 275 276 277 278 279
                for i in range(self.nChildren()):
                    # left child will contribute first
                    for a_parent in range(len(aln.alphabet)):
                        best_score = +9999999
                        best_symb = 0
                        for a in range(len(aln.alphabet)):
                            score = (scores[i][col][a] + (
                            1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                            if score < best_score:
                                best_symb = a
                                best_score = score
                        self.seqscores[col][a_parent] += best_score
                        self.backptr[i][col][a_parent] = best_symb
Mikael Boden's avatar
Mikael Boden committed
280
        else:
Mikael Boden's avatar
Mikael Boden committed
281 282
            self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
                              self.sequence]  # if we want to weight scores, this would need to change
Mikael Boden's avatar
Mikael Boden committed
283
        if self._verbose: print('Forward:', self.label, '\n\t', self.seqscores)
Mikael Boden's avatar
Mikael Boden committed
284 285
        return self.seqscores

Mikael Boden's avatar
Mikael Boden committed
286
    def _backwardParsimony(self, aln, seq=None):
Mikael Boden's avatar
Mikael Boden committed
287 288
        """ Internal function that operates recursively to inspect scores to determine
            most parsimonious sequence, from root to leaves. """
Mikael Boden's avatar
Mikael Boden committed
289
        if self.sequence == None:  # no sequence has been assigned
290 291
            childbuf = [[] for _ in range(self.nChildren())]
            if self.nChildren() == 0:  # no children, so terminal, cannot propagate scores
Mikael Boden's avatar
Mikael Boden committed
292
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
Mikael Boden's avatar
Mikael Boden committed
293
            if seq == None:  # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
Mikael Boden's avatar
Mikael Boden committed
294 295 296 297
                currbuf = []
                for col in range(aln.alignlen):
                    min_score = 999999
                    min_symb = None
298
                    child_symb = [None for _ in range(self.nChildren())]
Mikael Boden's avatar
Mikael Boden committed
299 300 301 302
                    for a_parent in range(len(aln.alphabet)):
                        if self.seqscores[col][a_parent] < min_score:
                            min_score = self.seqscores[col][a_parent]
                            min_symb = a_parent
303 304
                            for i in range(self.nChildren()):
                                child_symb[i] = self.backptr[i][col][a_parent]
Mikael Boden's avatar
Mikael Boden committed
305
                    currbuf.append(aln.alphabet[min_symb])
306 307
                    for i in range(self.nChildren()):
                        childbuf[i].append(aln.alphabet[child_symb[i]])
Mikael Boden's avatar
Mikael Boden committed
308 309
                self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
            else:  # Non-root, but not leaf
Mikael Boden's avatar
Mikael Boden committed
310 311 312 313
                self.sequence = seq
                col = 0
                for sym_parent in self.sequence:
                    a_parent = aln.alphabet.index(sym_parent)
314 315 316
                    child_symb = [None for _ in range(self.nChildren())]
                    for i in range(self.nChildren()):
                        child_symb[i] = self.backptr[i][col][a_parent]
317
                        childbuf[i].append(aln.alphabet[child_symb[i]])
Mikael Boden's avatar
Mikael Boden committed
318
                    col += 1
319
            for i in range(self.nChildren()):
320
                self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.children[i].label or "Child of "+self.label, gappy=True))
Mikael Boden's avatar
Mikael Boden committed
321
        if self._verbose: print('Backward:', self.label, '\n\t', self.backptr)
Mikael Boden's avatar
Mikael Boden committed
322 323 324 325 326 327
        return self.sequence

    def getSequence(self):
        """ Get the sequence for the node. Return None if no sequence is assigned.
            Requires that an alignment is associated with the tree, and that sequence names match node labels.
            If the explored node is not a leaf, the sequence can be determined by parsimony. """
Mikael Boden's avatar
Mikael Boden committed
328
        if self.sequence != None:  # a sequence has been assigned
Mikael Boden's avatar
Mikael Boden committed
329
            return self.sequence
Mikael Boden's avatar
Mikael Boden committed
330 331
        elif self.seqscores != None:  # inferred by parsimony but not yet assigned
            return None  # determine most parsimonous sequence, not yet implemented
Mikael Boden's avatar
Mikael Boden committed
332

Mikael Boden's avatar
Mikael Boden committed
333
    def isAncestorOf(self, node, transitive=True):
Mikael Boden's avatar
Mikael Boden committed
334 335 336
        """ Decide if this node is the ancestor of specified node.
            If transitive is True (default), all descendants are included.
            If transitive is False, only direct descendants are included. """
337 338 339 340 341 342
        for i in range(self.nChildren()):
            if node == self.children[i]:
                return True
            elif transitive:
                status = self.children[i].isAncestorOf(node, transitive)
                if status: return True
Mikael Boden's avatar
Mikael Boden committed
343 344 345
        else:
            return False

346 347 348 349 350 351 352 353 354 355 356 357 358
    def getAncestors(self, transitive=False):
        """ Retrieve and return (list of) parent nodes.
            If transitive is False (default), only the direct parent is included.
            If transitive is True, all parents (parents of parents etc) are included. """
        if self.parent == None:
            return []
        if not transitive:
            return [self.parent]
        else:
            parents = self.parent.getAncestors(transitive)
            parents.append(self.parent)
        return parents

Mikael Boden's avatar
Mikael Boden committed
359
    def getDescendants(self, transitive=False):
Mikael Boden's avatar
Mikael Boden committed
360 361 362 363
        """ Retrieve and return (list of) nodes descendant of this.
            If transitive is False (default), only direct descendants are included.
            If transitive is True, all descendants are (recursively) included. """
        children = []
364 365
        for i in range(self.nChildren()):
            children.append(self.children[i])
Mikael Boden's avatar
Mikael Boden committed
366 367 368 369 370 371 372 373 374 375 376
        if not transitive:
            return children
        else:
            grandchildren = []
            for c in children:
                d = c.getDescendants(transitive)
                if d:
                    grandchildren.extend(d)
            children.extend(grandchildren)
            return children

Mikael Boden's avatar
Mikael Boden committed
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
    def annotate_node(self, annotations, annotation_symbols= None, exclude_annotations = [], use_symbols=False ):
        annotation_string = "[&"
        for key, val_list in annotations[self].items():
            if type(val_list) != list:
                val_list = [val_list]
            if key not in exclude_annotations:
                # If we are using annotation symbols and the annotation has an associated symbol
                for val in val_list:
                    if use_symbols and val in annotation_symbols:
                        sorted_symbols = sorted([annotation_symbols[val] for val in val_list])
                        annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in sorted_symbols]))
                    else:
                        annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in val_list]))
        # Remove the final comma and add in a closing bracket
        annotation_string = annotation_string[0: len(annotation_string) - 1] + "]"
        if len(annotation_string) > 2:
            if ":" in self.label:
                self.label = self.label.split(":")[0] + annotation_string + self.label.split(":")[1]
            else:
                self.label = self.label + annotation_string

Mikael Boden's avatar
Mikael Boden committed
398 399 400 401
""" ----------------------------------------------------------------------------------------
    Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
    ----------------------------------------------------------------------------------------"""

Mikael Boden's avatar
Mikael Boden committed
402
def runUPGMA(aln, measure, absoluteDistances=False):
Mikael Boden's avatar
Mikael Boden committed
403 404 405 406 407
    """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
        Use specified distance metric (see sequence.calcDistances).
        If absoluteDistances is True, the tree will be assigned the total distance from provided species.
        Otherwise, the relative addition at each path will be assigned."""
    D = {}
Mikael Boden's avatar
Mikael Boden committed
408 409
    N = {}  # The number of sequences in each node
    M = aln.calcDistances(measure)  # determine all pairwise distances
Mikael Boden's avatar
Mikael Boden committed
410
    print(M)
411
    nodes = [PhyloNode(label=seq.name) for seq in aln.seqs]  # construct all leaf nodes
Mikael Boden's avatar
Mikael Boden committed
412 413 414 415
    """ For each node-pair, assign the distance between them. """
    for i in range(len(nodes)):
        nodes[i].sequence = aln.seqs[i]
        nodes[i].dist = 0.0
Mikael Boden's avatar
Mikael Boden committed
416
        N[nodes[i]] = 1  # each cluster contains a single sequence
Mikael Boden's avatar
Mikael Boden committed
417
        for j in range(0, i):
Mikael Boden's avatar
Mikael Boden committed
418
            D[frozenset([nodes[i], nodes[j]])] = M[i, j]
Mikael Boden's avatar
Mikael Boden committed
419 420
    """ Treat each node as a cluster, until there is only one cluster left, find the *closest* 
        pair of clusters, and merge that pair into a new cluster (to replace the two that merged). 
Mikael Boden's avatar
Mikael Boden committed
421
        In each case, the new cluster is represented by the (phylo)node that is formed. """
Mikael Boden's avatar
Mikael Boden committed
422
    while len(N) > 1:  # N will contain all "live" clusters, to be reduced to a single below
Mikael Boden's avatar
Mikael Boden committed
423 424
        closest_pair = (None, None)  # The two nodes that are closest to one another according to supplied metric
        closest_dist = None  # The distance between them
Mikael Boden's avatar
Mikael Boden committed
425
        print(len(N), 'nodes remain')
Mikael Boden's avatar
Mikael Boden committed
426
        for pair in D:  # check all pairs which should be merged
Mikael Boden's avatar
Mikael Boden committed
427
            dist = D[pair]
Mikael Boden's avatar
Mikael Boden committed
428 429
            pair_as_list = list(pair)
            print('Inspecting \"' + str(pair_as_list[0]) + '\" and \"' + str(pair_as_list[1]) + '\" at distance %5.3f' % D[pair])
Mikael Boden's avatar
Mikael Boden committed
430
            if closest_dist == None or dist < closest_dist:
Mikael Boden's avatar
Mikael Boden committed
431
                closest_dist = dist
Mikael Boden's avatar
Mikael Boden committed
432
                closest_pair = list(pair)
Mikael Boden's avatar
Mikael Boden committed
433
        # So we know the closest, now we need to merge...
Mikael Boden's avatar
Mikael Boden committed
434
        x = closest_pair[0]  # See Zvelebil and Baum p. 278 for notation
Mikael Boden's avatar
Mikael Boden committed
435
        y = closest_pair[1]
Mikael Boden's avatar
Mikael Boden committed
436
        z = PhyloNode()  # create a new node for the cluster z
Mikael Boden's avatar
Mikael Boden committed
437
        z.dist = D.pop(frozenset([x, y])) / 2.0  # assign the absolute distance, change to relative distance later
Mikael Boden's avatar
Mikael Boden committed
438 439 440
        Nx = N.pop(x)  # find number of sequences in x, remove the cluster from list N
        Ny = N.pop(y)  # find number of sequences in y, remove the cluster from list N
        dz = {}  # new distances to cluster z
Mikael Boden's avatar
Mikael Boden committed
441 442 443 444
        x.parent = z
        y.parent = z
        z.children = [x, y]
        print('Closest pair is \"' + str(x) + '\" ('+str(Nx)+') and \"' + str(y) + '\" ('+str(Ny)+') at distance %5.3f' % (z.dist * 2), 'form new node ' + str(z))
Mikael Boden's avatar
Mikael Boden committed
445
        for w in N:  # for each node w ...
Mikael Boden's avatar
Mikael Boden committed
446
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
Mikael Boden's avatar
Mikael Boden committed
447 448 449
            dxw = D.pop(frozenset([x, w]))  # retrieve and remove distance from D: x to w
            dyw = D.pop(frozenset([y, w]))  # retrieve and remove distance from D: y to w
            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny)  # distance: z to w
Mikael Boden's avatar
Mikael Boden committed
450
            print(str(z) + ' gets distance to \"' + str(w) + '\": (', Nx, '* %5.3f' % dxw, '+', Ny, '* %5.3f' % dyw, ') / (', Nx, '+', Ny, ') = %5.3f' % dz[w])
Mikael Boden's avatar
Mikael Boden committed
451 452 453
        N[z] = Nx + Ny  # total number of sequences in new cluster, insert new cluster in list N
        for w in dz:  # we have to run through the nodes again, now not including the removed x and y
            D[frozenset([z, w])] = dz[w]  # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
Mikael Boden's avatar
Mikael Boden committed
454 455
        nodes.append(z)
    if not absoluteDistances:
Mikael Boden's avatar
Mikael Boden committed
456 457 458 459
        x._propagateDistance(z.dist)  # convert absolute distances to relative by recursing down left path
        y._propagateDistance(z.dist)  # convert absolute distances to relative by recursing down right path
        z.dist = 0.0  # root z is at distance 0 from merged x and y
    return PhyloTree(z)  # make it to tree, return
Mikael Boden's avatar
Mikael Boden committed
460 461 462 463 464

""" ----------------------------------------------------------------------------------------
    Methods for processing files of trees on the Newick format
    ----------------------------------------------------------------------------------------"""

Mikael Boden's avatar
Mikael Boden committed
465
def _findComma(string, level=0):
466
    """ Find all commas at specified level of embedding """
Mikael Boden's avatar
Mikael Boden committed
467
    mylevel = 0
468
    commas = []
Mikael Boden's avatar
Mikael Boden committed
469 470 471 472 473 474
    for i in range(len(string)):
        if string[i] == '(':
            mylevel += 1
        elif string[i] == ')':
            mylevel -= 1
        elif string[i] == ',' and mylevel == level:
475 476
            commas.append(i)
    return commas
Mikael Boden's avatar
Mikael Boden committed
477

Mikael Boden's avatar
Mikael Boden committed
478 479 480
def parseNewickNode(string):
    """ Utility function that recursively parses embedded string using Newick format. """
    first = string.find('(')
Mikael Boden's avatar
Mikael Boden committed
481 482
    last = string[::-1].find(')')  # look from the back
    if first == -1 and last == -1:  # we are at leaf
Mikael Boden's avatar
Mikael Boden committed
483
        y = string.split(':')
484
        node = PhyloNode(label=y[0])
Mikael Boden's avatar
Mikael Boden committed
485 486 487 488 489
        if len(y) >= 2:
            node.dist = float(y[1])
        return node
    elif first >= 0 and last >= 0:
        # remove parentheses
Mikael Boden's avatar
Mikael Boden committed
490
        last = len(string) - last - 1  # correct index to refer from start instead of end of string
Mikael Boden's avatar
Mikael Boden committed
491 492 493
        embed = string[first + 1:last]
        tail = string[last + 1:]
        # find where corresp comma is
494 495
        commas = _findComma(embed)
        if len(commas) < 1:
Mikael Boden's avatar
Mikael Boden committed
496
            raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
497 498 499 500 501 502
        prev_comma = 0
        child_tokens = []
        for comma in commas:
            child_tokens.append(embed[prev_comma:comma].strip())
            prev_comma = comma + 1
        child_tokens.append(embed[prev_comma:].strip())
Mikael Boden's avatar
Mikael Boden committed
503
        y = tail.split(':')
504
        node = PhyloNode(label=y[0])  # node is an instance of the PhyloNode() class
Mikael Boden's avatar
Mikael Boden committed
505 506
        if len(y) >= 2:
            node.dist = float(y[1])
507 508 509 510 511
        node.children = []
        for tok in child_tokens:
            child = parseNewickNode(tok)
            child.parent = node
            node.children.append(child)
Mikael Boden's avatar
Mikael Boden committed
512 513 514 515
        return node
    else:
        raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')

Mikael Boden's avatar
Mikael Boden committed
516

Mikael Boden's avatar
Mikael Boden committed
517 518 519 520 521 522 523 524
def parseNewick(string):
    """ Main method for parsing a Newick string into a (phylogenetic) tree.
        Handles labels (on both leaves and internal nodes), and includes distances (if provided).
        Returns an instance of a PhyloTree. """
    if string.find(';') != -1:
        string = string[:string.find(';')]
    return PhyloTree(parseNewickNode(string))

Mikael Boden's avatar
Mikael Boden committed
525

Mikael Boden's avatar
Mikael Boden committed
526 527 528 529 530 531 532
def readNewick(filename):
    """ Read file on Newick format.
        Returns an instance of a PhyloTree."""
    f = open(filename)
    string = ''.join(f)
    return parseNewick(string)

Mikael Boden's avatar
Mikael Boden committed
533

Mikael Boden's avatar
Mikael Boden committed
534 535 536
def writeNewickFile(filename, my_tree):
    with open(filename, 'w') as fh:
        print(my_tree, end="", file=fh)
Mikael Boden's avatar
Mikael Boden committed
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600


def read_nexus(filename):
    """
    Read a file in Nexus format
    :param filename:
    :return:
    """
    f = open(filename)
    return parse_nexus(f)

def parse_nexus(string):
    string = string.read()
    lines = string.split("\n")
    annotation_dict = defaultdict(dict)
    for num, line in enumerate(lines):
        # print (line)
        if line.strip().startswith("dimensions ntax="):
                taxon_number = line.strip().split("dimensions ntax=")[1].split(";")[0]
        if line.strip().startswith("taxlabels"):
            taxon_num = num + 1
            while not lines[taxon_num].strip().startswith(";"):
                taxon_name = lines[taxon_num].split("[")[0].strip()
                for annot_line in lines[taxon_num].split("[&")[1].split(","):
                    #TODO: Make these regex calls
                    # print ("Annotation Key is ", annot_line.split("=")[0])
                    annot_key = annot_line.split("=")[0]
                    # print (annot_line.split("=")[1])
                    if '"' in annot_line:
                        annot_val = annot_line.split("=")[1].split("\"")[1]
                    else:
                        annot_val = annot_line.split("=")[1].split("]")[0]

                    annotation_dict[taxon_name][annot_key.strip()] = annot_val
                taxon_num +=1
        if line.strip().startswith("begin trees"):
            tree_num = num + 1
            tree = (lines[tree_num].split("[&R]")[1])
    phylo_tree = parseNewick(tree)
    nexus_annotations = annotations.NexusAnnotations(tree=phylo_tree)
    nexus_annotations.add_annotations(annotation_dict)
    phylo_tree.putAnnotations(nexus_annotations)
    ## Extract all of the annotations from the tree and add them to the NexusAnnotations object
    print ("Number of taxons is %s " % (taxon_number))
    return phylo_tree

""" ----------------------------------------------------------------------------------------
    Method for generating a PhyloTree with unique tip names
    ----------------------------------------------------------------------------------------"""

def get_unique_tree(tree):
    unique_tree = tree
    unique_labels = {}
    for node in unique_tree.getNodes():
        if node.isLeaf() and node.label in unique_labels:
            unique_labels[node.label] = unique_labels[node.label] + 1
            node.label = node.label + str(unique_labels[node.label])
        elif node.isLeaf():
            unique_labels[node.label] = 1
    return unique_tree

def unpack_list(list):
    return (" ".join(["%s"] * len(list)) + "!") % (x for x in list)

601
if __name__ == '__main__1':
602
    tree = readNewick('/Users/mikael/simhome/ASR/edge1.nwk')
603 604 605
    print(tree)

if __name__ == '__main__':
Mikael Boden's avatar
Mikael Boden committed
606 607 608 609 610 611 612 613 614 615 616 617
    aln = sequence.readFastaFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.aln', sequence.Protein_Alphabet)
    tree = runUPGMA(sequence.Alignment(aln), "fractional")
    writeNewickFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.nwk', tree)

if __name__ == '__main__3':
    aln = sequence.readClustalFile('/Users/mikael/simhome/ASR/dp16_example.aln', sequence.Protein_Alphabet)
    tree = runUPGMA(aln, "poisson")
    writeNewickFile('/Users/mikael/simhome/ASR/dp16_example_UPGMA.nwk', tree)

if __name__ == '__main__4':
    tree = readNewick('/Users/mikael/simhome/ASR/parsitest2.nwk')
    tree.putAlignment(sequence.Alignment(sequence.readFastaFile('/Users/mikael/simhome/ASR/parsitest2.aln', sequence.DNA_Alphabet)))
618 619
    tree.parsimony()
    print(tree.strSequences())