Commit ffe94c34 authored by Mikael Boden

phylo_bugfixes

parent 85897a23
......@@ -670,7 +670,7 @@ def readGeoFile(filename, id_column = 0):
# Our implementations are mainly serviced by EBI.
###############################################################################
def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta'):
def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, format = 'fasta', debug: bool = True):
""" Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
......@@ -681,6 +681,8 @@ def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, form
entryId = entryId.decode("utf-8")
url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
try:
if debug:
print('DEBUG: Querying URL: {0}'.format(url))
data = urllib.request.urlopen(url).read()
if format == 'fasta':
return readFastaString(data.decode("utf-8"), alphabet)[0]
......@@ -1200,7 +1202,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
D[frozenset([nodes[i], nodes[j]])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
......@@ -1211,26 +1213,25 @@ def runUPGMA(aln, measure, absoluteDistances=False):
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
if dist < closest_dist or closest_dist == None:
if closest_dist == None or dist < closest_dist:
closest_dist = dist
closest_pair = pair
closest_pair = list(pair)
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x,
y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dxw = D.pop(frozenset([x, w])) # retrieve and remove distance from D: x to w
dyw = D.pop(frozenset([y, w])) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
......@@ -1240,15 +1241,6 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
    """Construct a canonical (unordered) key for two symbols.

    node1, node2: the two items to pair up.

    Returns a 2-tuple holding both items; the same tuple is produced whichever
    way round the arguments are given. Items are ordered with ``<=`` when they
    support it, and by ``id()`` otherwise.
    """
    try:
        in_order = node1 <= node2
    except TypeError:
        # Python 3 refuses to order objects that define no comparison (e.g. plain
        # class instances). Object identity is still a stable ordering for as
        # long as both objects are alive, so the key stays canonical.
        in_order = id(node1) <= id(node2)
    return (node1, node2) if in_order else (node2, node1)
def _findComma(string, level=0):
""" Find first comma at specified level of embedding """
mylevel = 0
......
......@@ -343,68 +343,59 @@ class PhyloNode:
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def runUPGMA(aln, measure, absoluteDistances=False):
    """Generate an ultra-metric, bifurcating, rooted tree from an alignment by UPGMA.

    Every sequence starts as its own cluster; the two closest clusters are merged
    repeatedly until a single cluster remains (Zvelebil and Baum, p. 278).

    aln: the alignment; ``aln.seqs`` supplies the leaves and
        ``aln.calcDistances(measure)`` the pairwise distances, read as ``M[i, j]``.
    measure: the distance metric, passed straight to ``aln.calcDistances``
        (see sequence.calcDistances).
    absoluteDistances: if True, the tree is assigned the total distance from the
        provided species; otherwise the relative addition at each path is assigned.

    Returns a PhyloTree rooted at the last cluster formed.
    Raises ValueError if the alignment holds fewer than two sequences, since no
    pair of clusters can then be merged.
    """
    # NOTE(review): the block structure below was reconstructed from a listing
    # without indentation -- confirm nesting against the repository file.
    nodes = [PhyloNode(seq.name) for seq in aln.seqs]  # construct all leaf nodes
    if len(nodes) < 2:
        raise ValueError('UPGMA requires an alignment with at least two sequences')

    D = {}  # distance for each unordered pair of live clusters, keyed by frozenset
    N = {}  # the number of sequences in each live cluster
    M = aln.calcDistances(measure)  # determine all pairwise distances

    # For each node-pair, assign the distance between them.
    for i, node in enumerate(nodes):
        node.sequence = aln.seqs[i]
        node.dist = 0.0
        N[node] = 1  # each cluster contains a single sequence
        for j in range(i):
            D[frozenset([node, nodes[j]])] = M[i, j]

    # Now: treat each node as a cluster; until there is only one cluster left,
    # find the *closest* pair of clusters and merge that pair into a new cluster
    # (to replace the two that merged). The new cluster is represented by the
    # (phylo)node that is formed.
    while len(N) > 1:  # N holds all "live" clusters, to be reduced to a single one
        # The first pair with the smallest distance wins, as in a linear scan.
        closest_pair = min(D, key=D.get)
        x, y = closest_pair  # see Zvelebil and Baum p. 278 for notation
        z = PhyloNode()  # create a new node for the cluster z
        # Absolute distance travelled so far; may become relative further down.
        z.dist = D.pop(closest_pair) / 2.0
        Nx = N.pop(x)  # number of sequences in x; x is no longer a live cluster
        Ny = N.pop(y)  # number of sequences in y; y is no longer a live cluster
        dz = {}  # new distances to cluster z
        for w in N:  # every remaining cluster w (which is neither x nor y)
            dxw = D.pop(frozenset([x, w]))  # retrieve and remove distance x to w
            dyw = D.pop(frozenset([y, w]))  # retrieve and remove distance y to w
            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny)  # distance: z to w
        N[z] = Nx + Ny  # total number of sequences in the new cluster
        for w, dist_zw in dz.items():
            # update distance per EQ8.16 (Z&B p. 278)
            D[frozenset([z, w])] = dist_zw
        z.left = x  # link the phylogenetic tree
        z.right = y
        nodes.append(z)

    if not absoluteDistances:
        x._propagateDistance(z.dist)  # convert absolute to relative down the left path
        y._propagateDistance(z.dist)  # convert absolute to relative down the right path
        z.dist = 0.0  # root z is at distance 0 from merged x and y
    return PhyloTree(z)  # wrap the root in a tree and return it


def _getkey(node1, node2):
    """Construct a canonical (unordered) key for two symbols.

    No longer used by runUPGMA, which keys distances by frozenset; kept so the
    module-level name remains available.

    Returns a 2-tuple holding both items; the same tuple is produced whichever
    way round the arguments are given. Items are ordered with ``<=`` when they
    support it, and by ``id()`` otherwise.
    """
    try:
        in_order = node1 <= node2
    except TypeError:
        # Python 3 cannot order objects that define no comparison; fall back to
        # object identity so the key is still canonical.
        in_order = id(node1) <= id(node2)
    return (node1, node2) if in_order else (node2, node1)
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
......
......@@ -17,7 +17,9 @@ PWM -- defines a weight matrix that can score any site in actual sequences
Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services
This code has gone through many updates and has benefitted from kind contributions of course participants.
This code has been adapted to Python 3.5 in 2017
This code has gone through many updates and has benefited from kind contributions of course participants.
Please keep suggestions coming!
Email: m.boden@uq.edu.au
"""
......@@ -91,7 +93,7 @@ class Sequence(object):
def __len__(self):
    """ Defines what the "len" operator returns for an instance of Sequence,
    namely the length of its ``sequence`` attribute, e.g.
    >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
    >>> print(len(seq))
    9
    """
    return len(self.sequence)
......@@ -107,7 +109,7 @@ class Sequence(object):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq:
print sym
print(sym)
will print A, G, G, A, T (each on a separate row)
"""
tsyms = tuple(self.sequence)
......@@ -116,12 +118,12 @@ class Sequence(object):
def __contains__(self, item):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print 'T' in seq
>>> print('T' in seq)
True
which is equivalent to
>>> print seq.__contains__('T')
>>> print(seq.__contains__('T'))
True
>>> print 'X' in seq
>>> print('X' in seq)
False
"""
for sym in self.sequence:
......@@ -319,7 +321,7 @@ class Alignment():
one symbol is gap '-'
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
>>> print(Alignment(seqs))
THIS-LI-NE-
--ISALIGNED
"""
......@@ -351,7 +353,7 @@ class Alignment():
""" Defines what the "len" operator returns for an instance of Alignment, e.g.
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> aln = Alignment(seqs)
>>> print len(aln)
>>> print(len(aln))
2
"""
return len(self.seqs)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment