phylo_bugfixes

ffe94c34 · Mikael Boden · 85897a23 · ffe94c34 · ffe94c34 · ffe94c34
Commit ffe94c34 authored Mar 09, 2017 by Mikael Boden
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 64 deletions

guide.py guide.py +10 -18

phylo.py phylo.py +29 -38

sequence.py sequence.py +10 -8

No files found.
--- a/guide.py
+++ b/guide.py
@@ -670,7 +670,7 @@ def readGeoFile(filename, id_column = 0):
 # Our implementations are mainly serviced by EBI.
 ###############################################################################

-def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta'):
+def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta', debug: bool = True):
    """ Retrieve a single entry from a database
    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
    dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
@@ -681,6 +681,8 @@ def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, form
        entryId = entryId.decode("utf-8")
    url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
    try:
+        if debug:
+            print('DEBUG: Querying URL: {0}'.format(url))
        data = urllib.request.urlopen(url).read()
        if format == 'fasta':
            return readFastaString(data.decode("utf-8"), alphabet)[0]
@@ -1200,7 +1202,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
        nodes[i].dist = 0.0
        N[nodes[i]] = 1  # each cluster contains a single sequence
        for j in range(0, i):
-            D[_getkey(nodes[i], nodes[j])] = M[i, j]
+            D[frozenset([nodes[i], nodes[j]])] = M[i, j]
    """ Now: treat each node as a cluster,
        until there is only one cluster left,
        find the *closest* pair of clusters, and
@@ -1211,26 +1213,25 @@ def runUPGMA(aln, measure, absoluteDistances=False):
        closest_dist = None  # The distance between them
        for pair in D:  # check all pairs which should be merged
            dist = D[pair]
-            if dist < closest_dist or closest_dist == None:
+            if closest_dist == None or dist < closest_dist:
                closest_dist = dist
-                closest_pair = pair
+                closest_pair = list(pair)
        # So we know the closest, now we need to merge...
        x = closest_pair[0]  # See Zvelebil and Baum p. 278 for notation
        y = closest_pair[1]
        z = PhyloNode()  # create a new node for the cluster z
-        z.dist = D.pop(_getkey(x,
-                               y)) / 2.0  # assign the absolute distance, travelled so far, note: this will change to relative distance later
+        z.dist = D.pop(frozenset([x, y])) / 2.0  # assign the absolute distance, travelled so far, note: this will change to relative distance later
        Nx = N.pop(x)  # find number of sequences in x, remove the cluster from list N
        Ny = N.pop(y)  # find number of sequences in y, remove the cluster from list N
        dz = {}  # new distances to cluster z
        for w in N:  # for each node w ...
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
-            dxw = D.pop(_getkey(x, w))  # retrieve and remove distance from D: x to w
-            dyw = D.pop(_getkey(y, w))  # retrieve and remove distance from D: y to w
+            dxw = D.pop(frozenset([x, w]))  # retrieve and remove distance from D: x to w
+            dyw = D.pop(frozenset([y, w]))  # retrieve and remove distance from D: y to w
            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny)  # distance: z to w
        N[z] = Nx + Ny  # total number of sequences in new cluster, insert new cluster in list N
        for w in dz:  # we have to run through the nodes again, now not including the removed x and y
-            D[_getkey(z, w)] = dz[w]  # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
+            D[frozenset([z, w])] = dz[w]  # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
        z.left = x  # link the phylogenetic tree
        z.right = y
        nodes.append(z)
@@ -1240,15 +1241,6 @@ def runUPGMA(aln, measure, absoluteDistances=False):
        z.dist = 0.0  # root z is at distance 0 from merged x and y
    return PhyloTree(z)  # make it to tree, return

-
-def _getkey(node1, node2):
-    """ Construct canonical (unordered) key for two symbols """
-    if node1 <= node2:
-        return tuple([node1, node2])
-    else:
-        return tuple([node2, node1])
-
-
 def _findComma(string, level=0):
    """ Find first comma at specified level of embedding """
    mylevel = 0

--- a/phylo.py
+++ b/phylo.py
@@ -343,68 +343,59 @@ class PhyloNode:
    Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
    ----------------------------------------------------------------------------------------"""

-def runUPGMA(aln, measure, absoluteDistances = False):
+def runUPGMA(aln, measure, absoluteDistances=False):
    """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
        Use specified distance metric (see sequence.calcDistances).
        If absoluteDistances is True, the tree will be assigned the total distance from provided species.
        Otherwise, the relative addition at each path will be assigned."""
    D = {}
-    N = {} # The number of sequences in each node
-    M = aln.calcDistances(measure) # determine all pairwise distances
-    nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
+    N = {}  # The number of sequences in each node
+    M = aln.calcDistances(measure)  # determine all pairwise distances
+    nodes = [PhyloNode(seq.name) for seq in aln.seqs]  # construct all leaf nodes
    """ For each node-pair, assign the distance between them. """
    for i in range(len(nodes)):
        nodes[i].sequence = aln.seqs[i]
        nodes[i].dist = 0.0
-        N[nodes[i]] = 1 # each cluster contains a single sequence
+        N[nodes[i]] = 1  # each cluster contains a single sequence
        for j in range(0, i):
-            D[_getkey(nodes[i], nodes[j])] = M[i, j]
+            D[frozenset([nodes[i], nodes[j]])] = M[i, j]
    """ Now: treat each node as a cluster,
        until there is only one cluster left,
        find the *closest* pair of clusters, and
        merge that pair into a new cluster (to replace the two that merged).
        In each case, the new cluster is represented by the (phylo)node that is formed. """
-    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
-        closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
-        closest_dist = None         # The distance between them
-        for pair in D:              # check all pairs which should be merged
+    while len(N) > 1:  # N will contain all "live" clusters, to be reduced to a signle below
+        closest_pair = (None, None)  # The two nodes that are closest to one another according to supplied metric
+        closest_dist = None  # The distance between them
+        for pair in D:  # check all pairs which should be merged
            dist = D[pair]
            if closest_dist == None or dist < closest_dist:
                closest_dist = dist
-                closest_pair = pair
+                closest_pair = list(pair)
        # So we know the closest, now we need to merge...
-        x = closest_pair[0]         # See Zvelebil and Baum p. 278 for notation
+        x = closest_pair[0]  # See Zvelebil and Baum p. 278 for notation
        y = closest_pair[1]
-        z = PhyloNode()             # create a new node for the cluster z
-        z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
-        Nx = N.pop(x, None)         # find number of sequences in x, remove the cluster from list N
-        Ny = N.pop(y, None)         # find number of sequences in y, remove the cluster from list N
-        if Nx == None or Ny == None:
-        	continue
-        dz = {}                     # new distances to cluster z
-        for w in N:                 # for each node w ...
+        z = PhyloNode()  # create a new node for the cluster z
+        z.dist = D.pop(frozenset([x, y])) / 2.0  # assign the absolute distance, travelled so far, note: this will change to relative distance later
+        Nx = N.pop(x)  # find number of sequences in x, remove the cluster from list N
+        Ny = N.pop(y)  # find number of sequences in y, remove the cluster from list N
+        dz = {}  # new distances to cluster z
+        for w in N:  # for each node w ...
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
-            dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
-            dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
-            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
-        N[z] = Nx + Ny              # total number of sequences in new cluster, insert new cluster in list N
-        for w in dz:                # we have to run through the nodes again, now not including the removed x and y
-            D[_getkey(z, w)] = dz[w]# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
-        z.left = x                  # link the phylogenetic tree
+            dxw = D.pop(frozenset([x, w]))  # retrieve and remove distance from D: x to w
+            dyw = D.pop(frozenset([y, w]))  # retrieve and remove distance from D: y to w
+            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny)  # distance: z to w
+        N[z] = Nx + Ny  # total number of sequences in new cluster, insert new cluster in list N
+        for w in dz:  # we have to run through the nodes again, now not including the removed x and y
+            D[frozenset([z, w])] = dz[w]  # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
+        z.left = x  # link the phylogenetic tree
        z.right = y
        nodes.append(z)
    if not absoluteDistances:
-        x._propagateDistance(z.dist)    # convert absolute distances to relative by recursing down left path
-        y._propagateDistance(z.dist)    # convert absolute distances to relative by recursing down right path
-        z.dist = 0.0                    # root z is at distance 0 from merged x and y
-    return PhyloTree(z)             # make it to tree, return
-
-def _getkey(node1, node2):
-    """ Construct canonical (unordered) key for two symbols """
-    if node1 <= node2:
-        return tuple([node1, node2])
-    else:
-        return tuple([node2, node1])
+        x._propagateDistance(z.dist)  # convert absolute distances to relative by recursing down left path
+        y._propagateDistance(z.dist)  # convert absolute distances to relative by recursing down right path
+        z.dist = 0.0  # root z is at distance 0 from merged x and y
+    return PhyloTree(z)  # make it to tree, return

 """ ----------------------------------------------------------------------------------------
    Methods for processing files of trees on the Newick format

--- a/sequence.py
+++ b/sequence.py
@@ -17,7 +17,9 @@ PWM -- defines a weight matrix that can score any site in actual sequences
 Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
 and methods for retrieving relevant data from web services

-This code has gone through many updates and has benefitted from kind contributions of course participants.
+This code has been adapted to Python 3.5 in 2017
+
+This code has gone through many updates and has benefited from kind contributions of course participants.
 Please keep suggestions coming!
 Email: m.boden@uq.edu.au
 """
@@ -91,7 +93,7 @@ class Sequence(object):
    def __len__(self):
        """ Defines what the "len" operator returns for an instance of Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
-        >>> print len(seq)
+        >>> print(len(seq))
        9
        """
        return len(self.sequence)
@@ -107,7 +109,7 @@ class Sequence(object):
        """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
        >>> seq = Sequence('AGGAT', DNA_Alphabet)
        >>> for sym in seq:
-                print sym
+                print(sym)
        will print A, G, G, A, T (each on a separate row)
        """
        tsyms = tuple(self.sequence)
@@ -116,12 +118,12 @@ class Sequence(object):
    def __contains__(self, item):
        """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
-        >>> print 'T' in seq
+        >>> print('T' in seq)
        True
            which is equivalent to
-        >>> print seq.__contains__('T')
+        >>> print(seq.__contains__('T'))
        True
-        >>> print 'X' in seq
+        >>> print('X' in seq)
        False
        """
        for sym in self.sequence:
@@ -319,7 +321,7 @@ class Alignment():
    one symbol is gap '-'
    Example usage:
    >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
-    >>> print Alignment(seqs)
+    >>> print(Alignment(seqs))
     THIS-LI-NE-
     --ISALIGNED
    """
@@ -351,7 +353,7 @@ class Alignment():
        """ Defines what the "len" operator returns for an instance of Alignment, e.g.
        >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
        >>> aln = Alignment(seqs)
-        >>> print len(aln)
+        >>> print(len(aln))
        2
        """
        return len(self.seqs)