Commit ffe94c34 authored by Mikael Boden's avatar Mikael Boden

phylo_bugfixes

parent 85897a23
...@@ -670,7 +670,7 @@ def readGeoFile(filename, id_column = 0): ...@@ -670,7 +670,7 @@ def readGeoFile(filename, id_column = 0):
# Our implementations are mainly serviced by EBI. # Our implementations are mainly serviced by EBI.
############################################################################### ###############################################################################
def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta'): def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta', debug: bool = True):
""" Retrieve a single entry from a database """ Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
...@@ -681,6 +681,8 @@ def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, form ...@@ -681,6 +681,8 @@ def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, form
entryId = entryId.decode("utf-8") entryId = entryId.decode("utf-8")
url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
try: try:
if debug:
print('DEBUG: Querying URL: {0}'.format(url))
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read()
if format == 'fasta': if format == 'fasta':
return readFastaString(data.decode("utf-8"), alphabet)[0] return readFastaString(data.decode("utf-8"), alphabet)[0]
...@@ -1200,7 +1202,7 @@ def runUPGMA(aln, measure, absoluteDistances=False): ...@@ -1200,7 +1202,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
nodes[i].dist = 0.0 nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i): for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j] D[frozenset([nodes[i], nodes[j]])] = M[i, j]
""" Now: treat each node as a cluster, """ Now: treat each node as a cluster,
until there is only one cluster left, until there is only one cluster left,
find the *closest* pair of clusters, and find the *closest* pair of clusters, and
...@@ -1211,26 +1213,25 @@ def runUPGMA(aln, measure, absoluteDistances=False): ...@@ -1211,26 +1213,25 @@ def runUPGMA(aln, measure, absoluteDistances=False):
closest_dist = None # The distance between them closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged for pair in D: # check all pairs which should be merged
dist = D[pair] dist = D[pair]
if dist < closest_dist or closest_dist == None: if closest_dist == None or dist < closest_dist:
closest_dist = dist closest_dist = dist
closest_pair = pair closest_pair = list(pair)
# So we know the closest, now we need to merge... # So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1] y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z dz = {} # new distances to cluster z
for w in N: # for each node w ... for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y) # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w dxw = D.pop(frozenset([x, w])) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w dyw = D.pop(frozenset([y, w])) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278) D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree z.left = x # link the phylogenetic tree
z.right = y z.right = y
nodes.append(z) nodes.append(z)
...@@ -1240,15 +1241,6 @@ def runUPGMA(aln, measure, absoluteDistances=False): ...@@ -1240,15 +1241,6 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z.dist = 0.0 # root z is at distance 0 from merged x and y z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
def _findComma(string, level=0): def _findComma(string, level=0):
""" Find first comma at specified level of embedding """ """ Find first comma at specified level of embedding """
mylevel = 0 mylevel = 0
......
...@@ -343,68 +343,59 @@ class PhyloNode: ...@@ -343,68 +343,59 @@ class PhyloNode:
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278 Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------""" ----------------------------------------------------------------------------------------"""
def runUPGMA(aln, measure, absoluteDistances = False): def runUPGMA(aln, measure, absoluteDistances=False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances. """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances). Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species. If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned.""" Otherwise, the relative addition at each path will be assigned."""
D = {} D = {}
N = {} # The number of sequences in each node N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """ """ For each node-pair, assign the distance between them. """
for i in range(len(nodes)): for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i] nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0 nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i): for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j] D[frozenset([nodes[i], nodes[j]])] = M[i, j]
""" Now: treat each node as a cluster, """ Now: treat each node as a cluster,
until there is only one cluster left, until there is only one cluster left,
find the *closest* pair of clusters, and find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged). merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """ In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged for pair in D: # check all pairs which should be merged
dist = D[pair] dist = D[pair]
if closest_dist == None or dist < closest_dist: if closest_dist == None or dist < closest_dist:
closest_dist = dist closest_dist = dist
closest_pair = pair closest_pair = list(pair)
# So we know the closest, now we need to merge... # So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1] y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x, None) # find number of sequences in x, remove the cluster from list N Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y, None) # find number of sequences in y, remove the cluster from list N Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
if Nx == None or Ny == None: dz = {} # new distances to cluster z
continue for w in N: # for each node w ...
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y) # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w dxw = D.pop(frozenset([x, w])) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w dyw = D.pop(frozenset([y, w])) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w]# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278) D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree z.left = x # link the phylogenetic tree
z.right = y z.right = y
nodes.append(z) nodes.append(z)
if not absoluteDistances: if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
""" ---------------------------------------------------------------------------------------- """ ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format Methods for processing files of trees on the Newick format
......
...@@ -17,7 +17,9 @@ PWM -- defines a weight matrix that can score any site in actual sequences ...@@ -17,7 +17,9 @@ PWM -- defines a weight matrix that can score any site in actual sequences
Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices) Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services and methods for retrieving relevant data from web services
This code has gone through many updates and has benefitted from kind contributions of course participants. This code has been adapted to Python 3.5 in 2017
This code has gone through many updates and has benefited from kind contributions of course participants.
Please keep suggestions coming! Please keep suggestions coming!
Email: m.boden@uq.edu.au Email: m.boden@uq.edu.au
""" """
...@@ -91,7 +93,7 @@ class Sequence(object): ...@@ -91,7 +93,7 @@ class Sequence(object):
def __len__(self): def __len__(self):
""" Defines what the "len" operator returns for an instance of Sequence, e.g. """ Defines what the "len" operator returns for an instance of Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet) >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print len(seq) >>> print(len(seq))
9 9
""" """
return len(self.sequence) return len(self.sequence)
...@@ -107,7 +109,7 @@ class Sequence(object): ...@@ -107,7 +109,7 @@ class Sequence(object):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g. """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet) >>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq: >>> for sym in seq:
print sym print(sym)
will print A, G, G, A, T (each on a separate row) will print A, G, G, A, T (each on a separate row)
""" """
tsyms = tuple(self.sequence) tsyms = tuple(self.sequence)
...@@ -116,12 +118,12 @@ class Sequence(object): ...@@ -116,12 +118,12 @@ class Sequence(object):
def __contains__(self, item): def __contains__(self, item):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g. """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet) >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print 'T' in seq >>> print('T' in seq)
True True
which is equivalent to which is equivalent to
>>> print seq.__contains__('T') >>> print(seq.__contains__('T'))
True True
>>> print 'X' in seq >>> print('X' in seq)
False False
""" """
for sym in self.sequence: for sym in self.sequence:
...@@ -319,7 +321,7 @@ class Alignment(): ...@@ -319,7 +321,7 @@ class Alignment():
one symbol is gap '-' one symbol is gap '-'
Example usage: Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)] >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs) >>> print(Alignment(seqs))
THIS-LI-NE- THIS-LI-NE-
--ISALIGNED --ISALIGNED
""" """
...@@ -351,7 +353,7 @@ class Alignment(): ...@@ -351,7 +353,7 @@ class Alignment():
""" Defines what the "len" operator returns for an instance of Alignment, e.g. """ Defines what the "len" operator returns for an instance of Alignment, e.g.
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)] >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> aln = Alignment(seqs) >>> aln = Alignment(seqs)
>>> print len(aln) >>> print(len(aln))
2 2
""" """
return len(self.seqs) return len(self.seqs)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment