Commit 8fa94535 authored by Mikael Boden's avatar Mikael Boden

added_regex_search_in_gappy_sequences

parent 9889428a
Pipeline #46 failed with stages
import annotations
import phylo
tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_mo
rganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
......@@ -10,17 +12,17 @@ import phylo
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
print (tree)
print (tree.nexus_annotations.annotations)
tree.swap_annotations("PDB")
print (tree)
print (tree.nexus_annotations.annotations)
# working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
# tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
# print (tree)
# print (tree.nexus_annotations.annotations)
#
# tree.swap_annotations("PDB")
#
# print (tree)
# print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
......
from collections import defaultdict
from phylo import *
import phylo
import matplotlib
import random
......@@ -146,3 +147,5 @@ class NexusAnnotations():
def generate_colour_list(self, num):
    """ Stub: returns num unchanged; no colours are generated here. """
    return num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
......@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f.close()
if __name__ == '__main__':
bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
print(bf.chroms.keys())
g = bf.generate('chr12')
print(next(g))
......
'''
Module with methods and classes for phylogeny.
Extended to handle n-ary trees (Jan 2019).
@author: mikael
'''
import sequence
from collections import defaultdict
import annotations
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
""" Rooted, n-ary tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
generation of rooted tree by UPGMA.
Known issues: Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def __init__(self, root):
......@@ -27,7 +27,6 @@ class PhyloTree:
def putAnnotations(self, nexus_annotations):
self.nexus_annotations = nexus_annotations
# Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
for node in self.getNodes():
if node.label in self.nexus_annotations.leaf_annotations:
......@@ -60,10 +59,18 @@ class PhyloTree:
node = queue.pop()
nodes.append(node)
# if strategy.upper().startswith('DEPTH'):
if node.left: queue.append(node.left)
if node.right: queue.append(node.right)
if not node.isLeaf():
queue.extend(node.children)
return nodes
def getLeaves(self):
    """ Retrieve and return the (list of) leaf nodes of the tree.
    Leaves are those nodes returned by getNodes() for which isLeaf() is true;
    they are listed in the order in which getNodes() produces them. """
    return [node for node in self.getNodes() if node.isLeaf()]
def getDescendantsOf(self, node, transitive=False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
......@@ -86,28 +93,7 @@ class PhyloTree:
if not isinstance(node, PhyloNode):
node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
# check if "myroot" is a leaf node, i.e. does not have children
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left != None: # myroot has a "left" child
# check if the "left" child of "myroot" is the ancestor of "node"
if myroot.left.isAncestorOf(node, transitive=True): # if yes,
myroot = myroot.left # move to the "left" child
else: # if not,
myroot = myroot.right # move to the "right" child
else: # myroot does NOT have a "left" child, so let's move "right"
myroot = myroot.right
if found and transitive:
return branching
elif found and len(branching) > 0:
return branching[len(branching) - 1]
return None
return node.getAncestors(transitive)
def parsimony(self):
""" Solve the "small parsimony problem",
......@@ -117,12 +103,8 @@ class PhyloTree:
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
def canonise(self):
self.root._canonise()
def swap_annotations(self, annotation_key):
try:
for node in self.getNodes():
if node.isLeaf():
node.label = self.nexus_annotations.leaf_annotations[node][annotation_key]
......@@ -135,103 +117,91 @@ class PhyloTree:
:param out_path: The path to write the NEXUS file to
:param nexus_annotations: The NexusAnnotations containing the annotations
"""
if write_annotations and not nexus_annotations:
if not self.nexus_annotations:
raise RuntimeError("This tree file has no associated annotation file. Either associate or supply one as a parameter.")
nexus_annotations = self.nexus_annotations
if nexus_annotations:
for node in self.getNodes():
if node in self.nexus_annotations.node_annotations:
node.annotate_node(self.nexus_annotations.node_annotations, self.nexus_annotations.annotation_symbols, exclude_annotations, use_symbols)
tree_annotation = str(self) + ";"
self.swap_annotations("Original")
for node in self.getNodes():
if node in self.nexus_annotations.leaf_annotations:
node.annotate_node(self.nexus_annotations.leaf_annotations, exclude_annotations)
leaves = []
for node in self.getNodes():
if node.isLeaf():
leaves.append(node.label)
leaf_annotation = ""
for leaf in leaves:
leaf_annotation += "\n\t%s" % (leaf)
with open(out_path, "w+") as file:
file.write(
"#NEXUS\nbegin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels%s\n;\nend;\n\nbegin trees;\n\ttree tree_1 = "
"[&R] %s\nend;" % (len(leaves), leaf_annotation, tree_annotation))
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
""" A class for a node in a rooted, n-ary tree.
Contains pointers to multiple descendants/daughters,
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, parent=None, label=''):
    """ Initialise a node.
    Set its parent (another PhyloNode), parent can be None.
    Set label to name it.
    Use field data for any type of information associated with node.
    Use dist to indicate the distance to its parent (if any).
    Other fields are used internally, including sequence for associated alignment,
    seqscores and backptr for maximum parsimony. """
    self.parent = parent
    self.children = None    # List of child nodes; None until children are linked (node is then a leaf)
    self.data = None
    self.label = label
    self.dist = None
    self.sequence = None    # The sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
    self.seqscores = None   # The scores propagated from leaves via children
    self.backptr = None     # Pointers back to children: what symbol rendered current/parent symbols
def isLeaf(self):
    """ Determine if this node is a leaf, i.e. it has no children. """
    return self.nChildren() == 0
def nChildren(self):
    """ Return the number of children directly below this node.
    A node whose children field is None (never linked) has 0 children. """
    if self.children is None:
        return 0
    return len(self.children)
def __str__(self):
    """ Returns string with node (incl descendants) in a Newick style.
    A leaf is rendered as label[:dist]; an internal node as
    (child1,child2,...)label[:dist]. A label of None is rendered as an empty label. """
    dist = ''
    if self.dist or self.dist == 0.0:  # a distance of exactly zero is still printed
        dist = ':' + str(self.dist)
    # Always produce a string: returning None from __str__ makes str() raise TypeError
    label = '' if self.label is None else str(self.label)
    if self.nChildren() == 0:
        return label + dist
    return '(' + ','.join(str(child) for child in self.children) + ')' + label + dist
# def __le__(self, other):
# """ Returns indication of less than other node. """
......@@ -247,38 +217,31 @@ class PhyloNode:
def _printSequences(self, start, end):
    """ Returns string with node (incl descendants) in a Newick style,
    where a node with an assigned sequence is shown as the symbols of that sequence
    in columns start (inclusive) to end (exclusive).
    A node without a sequence is shown by its label (empty if the label is None). """
    dist = ''
    if self.dist or self.dist == 0.0:  # a distance of exactly zero is still printed
        dist = ':' + str(self.dist)
    if self.sequence is not None:
        label = "".join(self.sequence[start:end])
    elif self.label is not None:
        label = str(self.label)
    else:
        label = ''
    if self.nChildren() == 0:
        return label + dist
    # Recurse on each child (the receiver must be the child, not this node)
    stubs = [child._printSequences(start, end) for child in self.children]
    return '(' + ','.join(stubs) + ')' + label + dist
def _findLabel(self, label):
    """ Find a node by label at this node or in any descendants (recursively).
    Children are searched depth-first in their stored order;
    the first node whose label matches is returned, or None if there is no match. """
    if self.label == label:
        return self
    for child in self.children or []:  # children is None for a leaf
        found = child._findLabel(label)
        if found is not None:
            return found
    return None
def _propagateDistance(self, parent_dist):
......@@ -286,24 +249,21 @@ class PhyloNode:
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
for i in range(self.nChildren()):
self.children[i]._propagateDistance(travelled) # pass absolute distance to this node
def _assignAlignment(self, aln):
    """ Assign an alignment to the node, which implies assigning a sequence to it if one is
    available in the alignment.
    Any previously assigned sequence is cleared first, the alignment is passed on to all
    children (recursively), and the first sequence in aln.seqs whose name equals the label
    of this node (if any) becomes the sequence of this node. """
    self.sequence = None
    for child in self.children or []:  # children is None for a leaf
        child._assignAlignment(aln)
    for seq in aln.seqs:
        if seq.name == self.label:
            self.sequence = seq
            break
""" # Not sure if this is required (putting nodes into a canonical ordering)
def _canonise(self):
if self.left == None and self.right == None: # at leaf
return self.label
......@@ -315,52 +275,38 @@ class PhyloNode:
self.right = tmpnode
return myright
return myleft
"""
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
if self.nChildren() == 0: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
scores = [None for _ in range(self.nChildren())]
for i in range(self.nChildren()):
scores[i] = self.children[i]._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the each child symbol from which each current node symbol score was determined
self.backptr = [[[None for _ in aln.alphabet] for _ in range(aln.alignlen)] for _ in range(self.nChildren())]
for col in range(aln.alignlen):
# left child will contribute first
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_symb_left = 0
for a in range(len(aln.alphabet)):
score = (scoresleft[col][a] + (
1 if a != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a
best_score_left = score
self.seqscores[col][a_parent] = best_score_left
self.backleft[col][a_parent] = best_symb_left
# right child will contribute next
for a_parent in range(len(aln.alphabet)):
best_score_right = +9999999
best_symb_right = 0
for a in range(len(aln.alphabet)):
score = (scoresright[col][a] + (
1 if a != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a
best_score_right = score
self.seqscores[col][a_parent] += best_score_right
self.backright[col][a_parent] = best_symb_right
for i in range(self.nChildren()):
# left child will contribute first
for a_parent in range(len(aln.alphabet)):
best_score = +9999999
best_symb = 0
for a in range(len(aln.alphabet)):
score = (scores[i][col][a] + (
1 if a != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score:
best_symb = a
best_score = score
self.seqscores[col][a_parent] += best_score
self.backptr[i][col][a_parent] = best_symb
else:
self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
self.sequence] # if we want to weight scores, this would need to change
......@@ -370,39 +316,37 @@ class PhyloNode:
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
childbuf = [[] for _ in range(self.nChildren())]
if self.nChildren() == 0: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
child_symb = [None for _ in range(self.nChildren())]
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
for i in range(self.nChildren()):
child_symb[i] = self.backptr[i][col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
for i in range(self.nChildren()):
childbuf[i].append(aln.alphabet[child_symb[i]])
self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
child_symb = [None for _ in range(self.nChildren())]
for i in range(self.nChildren()):
child_symb[i] = self.backptr[i][col][a_parent]
childbuf.append(aln.alphabet[child_symb[i]])
col += 1
self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy=True))
self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy=True))
for i in range(self.nChildren()):
self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.label, gappy=True))
return self.sequence
def getSequence(self):
......@@ -418,26 +362,35 @@ class PhyloNode:
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
elif transitive:
if self.left:
statusLeft = self.left.isAncestorOf(node, transitive)
if statusLeft: return True
if self.right:
return self.right.isAncestorOf(node, transitive)
for i in range(self.nChildren()):
if node == self.children[i]:
return True
elif transitive:
status = self.children[i].isAncestorOf(node, transitive)
if status: return True
else:
return False
def getAncestors(self, transitive=False):
    """ Retrieve and return (list of) parent nodes.
    If transitive is False (default), only the direct parent is included.
    If transitive is True, all parents (parents of parents etc) are included,
    ordered from the most distant ancestor down to the direct parent.
    A node without a parent yields an empty list. """
    if self.parent is None:
        return []
    if not transitive:
        return [self.parent]
    # Walk up the parent links iteratively so deep trees cannot exhaust the recursion limit
    lineage = []
    node = self
    while node.parent is not None:
        lineage.append(node.parent)
        node = node.parent
    lineage.reverse()  # most distant ancestor first, direct parent last
    return lineage
def getDescendants(self, transitive=False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
for i in range(self.nChildren()):
children.append(self.children[i])
if not transitive:
return children
else:
......@@ -450,13 +403,11 @@ class PhyloNode:
return children
def annotate_node(self, annotations, annotation_symbols= None, exclude_annotations = [], use_symbols=False ):
annotation_string = "[&"
for key, val_list in annotations[self].items():
if type(val_list) != list:
val_list = [val_list]
if key not in exclude_annotations:
# If we are using annotation symbols and the annotation has an associated symbol
for val in val_list:
if use_symbols and val in annotation_symbols:
......@@ -464,11 +415,8 @@ class PhyloNode:
annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in sorted_symbols]))
else:
annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in val_list]))
# Remove the final comma and add in a closing bracket
annotation_string = annotation_string[0: len(annotation_string) - 1] + "]"
if len(annotation_string) > 2:
if ":" in self.label:
self.label = self.label.split(":")[0] + annotation_string + self.label.split(":")[1]
......@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
nodes = [PhyloNode(label=seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
......@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
x.parent = z
y.parent = z
z.children = [x, y]
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
......@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def _findComma(string, level=0):
    """ Find all commas at specified level of embedding.
    The level is the number of parentheses currently open at the position of the comma
    (0 means outside all parentheses).
    Returns the list of indices of such commas, in increasing order (empty if none). """
    mylevel = 0
    commas = []
    for i, ch in enumerate(string):
        if ch == '(':
            mylevel += 1
        elif ch == ')':
            mylevel -= 1
        elif ch == ',' and mylevel == level:
            commas.append(i)
    return commas
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
......@@ -559,7 +506,7 @@ def parseNewickNode(string):
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
node = PhyloNode(label=y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
......@@ -569,17 +516,24 @@ def parseNewickNode(string):
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
commas = _findComma(embed)
if len(commas) < 1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
prev_comma = 0
child_tokens = []
for comma in commas:
child_tokens.append(embed[prev_comma:comma].strip())
prev_comma = comma + 1
child_tokens.append(embed[prev_comma:].strip())
y = tail.split(':')
node = PhyloNode(y[0]) # node is an instance of the PhyloNode() class
node = PhyloNode(label=y[0]) # node is an instance of the PhyloNode() class
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
node.children = []
for tok in child_tokens:
child = parseNewickNode(tok)
child.parent = node
node.children.append(child)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
......@@ -628,8 +582,6 @@ def parse_nexus(string):
taxon_num = num + 1
while not lines[taxon_num].strip().startswith(";"):
taxon_name = lines[taxon_num].split("[")[0].strip()
for annot_line in lines[taxon_num].split("[&")[1].split(","):
#TODO: Make these regex calls
# print ("Annotation Key is ", annot_line.split("=")[0])
......@@ -641,34 +593,18 @@ def parse_nexus(string):
annot_val = annot_line.split("=")[1].split("]")[0]
annotation_dict[taxon_name][annot_key.strip()] = annot_val
taxon_num +=1
if line.strip().startswith("begin trees"):
tree_num = num + 1
tree = (lines[tree_num].split("[&R]")[1])
phylo_tree = parseNewick(tree)
nexus_annotations = annotations.NexusAnnotations(tree=phylo_tree)
nexus_annotations.add_annotations(annotation_dict)
# print (nexus_annotations.annotations)
phylo_tree.putAnnotations(nexus_annotations)
## Extract all of the annotations from the tree and add them to the NexusAnnotations object
print ("Number of taxons is %s " % (taxon_number))
return phylo_tree
""" ----------------------------------------------------------------------------------------
Method for generating a PhyloTree with unique tip names
----------------------------------------------------------------------------------------"""
......@@ -676,7 +612,6 @@ def parse_nexus(string):
def get_unique_tree(tree):
unique_tree = tree
unique_labels = {}
for node in unique_tree.getNodes():
if node.isLeaf() and node.label in unique_labels:
unique_labels[node.label] = unique_labels[node.label] + 1
......@@ -688,3 +623,6 @@ def get_unique_tree(tree):
def unpack_list(list):
    """ Return the string forms of the items of list, separated by single spaces and
    terminated by "!". An empty list gives "!".
    (The parameter name shadows the builtin but is kept for compatibility with callers.) """
    # str.join avoids %-formatting with a generator operand, which is not unpacked
    # like a tuple and therefore fails for any list that is not of length one
    return " ".join(str(x) for x in list) + "!"
if __name__ == '__main__':
tree = readNewick('/Users/mikael/simhome/ASR/edge1.nwk')
print(tree)
\ No newline at end of file
......@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if parse_defline:
parsed = parseDefline(seqinfo[0])