Commit 8fa94535 authored by Mikael Boden

added_regex_search_in_gappy_sequences

parent 9889428a
Pipeline #46 failed with stages
import annotations
import phylo
tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk") # tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...@@ -10,17 +12,17 @@ import phylo ...@@ -10,17 +12,17 @@ import phylo
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/" # working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus") # tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
print (tree) # print (tree)
print (tree.nexus_annotations.annotations) # print (tree.nexus_annotations.annotations)
#
tree.swap_annotations("PDB") # tree.swap_annotations("PDB")
#
print (tree) # print (tree)
print (tree.nexus_annotations.annotations) # print (tree.nexus_annotations.annotations)
# #
# tree.write_to_nexus(working_dir + "output.nexus") # tree.write_to_nexus(working_dir + "output.nexus")
......
from collections import defaultdict
from phylo import *
import phylo
import matplotlib
import random
...@@ -146,3 +147,5 @@ class NexusAnnotations():
    def generate_colour_list(self, num):
        return num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
...@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
    f.close()

if __name__ == '__main__':
    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
    print(bf.chroms.keys())
    g = bf.generate('chr12')
    print(next(g))
...
'''
Module with methods and classes for phylogeny.
Extended to handle n-ary trees (Jan 2019).
@author: mikael
'''
import sequence
from collections import defaultdict
import annotations

class PhyloTree:
    """ Rooted, n-ary tree for representing phylogenetic relationships.
        Functionality includes labelling and traversing nodes; reading and writing to Newick format;
        association with sequence alignment; maximum parsimony inference of ancestral sequence;
        generation of rooted tree by UPGMA.
        Known issues: Parsimony does not handle gaps in alignment.
        Programmers should note that almost all functionality is implemented through recursion. """

    def __init__(self, root):
...@@ -27,7 +27,6 @@ class PhyloTree:
    def putAnnotations(self, nexus_annotations):
        self.nexus_annotations = nexus_annotations
        # Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
        for node in self.getNodes():
            if node.label in self.nexus_annotations.leaf_annotations:
...@@ -60,10 +59,18 @@ class PhyloTree:
            node = queue.pop()
            nodes.append(node)
            # if strategy.upper().startswith('DEPTH'):
            if not node.isLeaf():
                queue.extend(node.children)
        return nodes

    def getLeaves(self):
        all = self.getNodes()
        leaves = []
        for n in all:
            if n.isLeaf():
                leaves.append(n)
        return leaves
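    # Example (a sketch, not part of the module): list the leaf labels of a parsed tree.
    # The small Newick string below is illustrative only; parseNewick and getLeaves are
    # both defined in this module.
    #   tree = parseNewick("((A:1,B:1,C:1)x:1,D:2)root;")
    #   print(sorted(leaf.label for leaf in tree.getLeaves()))   # -> ['A', 'B', 'C', 'D']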
    def getDescendantsOf(self, node, transitive=False):
        """ Retrieve and return the (list of) descendants (children) of a specified node.
            Node can be the label or the instance.
...@@ -86,28 +93,7 @@ class PhyloTree:
        if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
        if node:
            return node.getAncestors(transitive)
        return None

    def parsimony(self):
        """ Solve the "small parsimony problem",
...@@ -117,12 +103,8 @@ class PhyloTree:
        self.root._backwardParsimony(self.aln)  # use scores to determine sequences
        return self.root.getSequence()          # return the sequence found at the root
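    # Example (a sketch): infer the ancestral sequence at the root by parsimony.
    # How the alignment is attached is an assumption here: parsimony() reads it from self.aln,
    # and _assignAlignment maps sequences onto leaves whose labels match sequence names.
    #   tree = parseNewick("((A:1,B:1):1,(C:1,D:1):1);")
    #   tree.aln = aln                     # aln: an Alignment with sequences named A, B, C, D
    #   tree.root._assignAlignment(aln)
    #   print(tree.parsimony())            # most parsimonious sequence at the root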
    def swap_annotations(self, annotation_key):
        try:
            for node in self.getNodes():
                if node.isLeaf():
                    node.label = self.nexus_annotations.leaf_annotations[node][annotation_key]
...@@ -135,103 +117,91 @@ class PhyloTree:
        :param out_path: The path to write the NEXUS file to
        :param nexus_annotations: The NexusAnnotations containing the annotations
        """
        if write_annotations and not nexus_annotations:
            if not self.nexus_annotations:
                raise RuntimeError("This tree file has no associated annotation file. Either associate or supply one as a parameter.")
            nexus_annotations = self.nexus_annotations
        if nexus_annotations:
            for node in self.getNodes():
                if node in self.nexus_annotations.node_annotations:
                    node.annotate_node(self.nexus_annotations.node_annotations, self.nexus_annotations.annotation_symbols, exclude_annotations, use_symbols)
            tree_annotation = str(self) + ";"
            self.swap_annotations("Original")
            for node in self.getNodes():
                if node in self.nexus_annotations.leaf_annotations:
                    node.annotate_node(self.nexus_annotations.leaf_annotations, exclude_annotations)
            leaves = []
            for node in self.getNodes():
                if node.isLeaf():
                    leaves.append(node.label)
            leaf_annotation = ""
            for leaf in leaves:
                leaf_annotation += "\n\t%s" % (leaf)
            with open(out_path, "w+") as file:
                file.write(
                    "#NEXUS\nbegin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels%s\n;\nend;\n\nbegin trees;\n\ttree tree_1 = "
                    "[&R] %s\nend;" % (len(leaves), leaf_annotation, tree_annotation))
class PhyloNode:
    """ A class for a node in a rooted, n-ary tree.
        Contains pointers to multiple descendants/daughters,
        optional fields include data, label, sequence and dist.
        If parsimony is used scores and traceback pointers are available.
        A number of methods are named with a _ prefix. These can be, but
        are not intended to be used from outside the class. """

    def __init__(self, parent = None, label=''):
        """ Initialise a node.
            Set its parent (another PhyloNode); parent can be None.
            Set label to name it.
            Use field data for any type of information associated with node.
            Use dist to indicate the distance to its parent (if any).
            Other fields are used internally, including sequence for associated alignment,
            and seqscores and back-pointers for maximum parsimony. """
        self.parent = parent
        self.children = None
        self.data = None
        self.label = label
        self.dist = None
        self.sequence = None    # The sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
        self.seqscores = None   # The scores propagated from leaves via children
        self.backptr = None     # Pointers back to children: what symbol rendered current/parent symbols

    def isLeaf(self):
        return self.nChildren() == 0

    def nChildren(self):
        if self.children == None:
            return 0
        else:
            return len(self.children)
    def __str__(self):
        """ Returns string with node (incl descendants) in a Newick style. """
        stubs = ['' for _ in range(self.nChildren())]
        label = dist = ''
        for i in range(self.nChildren()):
            stubs[i] = str(self.children[i])
        if self.dist or self.dist == 0.0:
            dist = ':' + str(self.dist)
        if self.label != None:
            label = str(self.label)
            if self.nChildren() == 0:
                return label + dist
            else:
                stubstr = '('
                for i in range(len(stubs) - 1):
                    stubstr += stubs[i] + ','
                return stubstr + stubs[-1] + ')' + label + dist
        # there is no label
        '''
        if not self.left and self.right:
            return ',' + right
        elif self.left and not self.right:
            return left + ','
        elif self.left and self.right:
            return '(' + left + ',' + right + ')' + dist
        '''

    # def __le__(self, other):
    #     """ Returns indication of less than other node. """
...@@ -247,38 +217,31 @@ class PhyloNode:
    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style. """
        stubs = ['' for _ in range(self.nChildren())]
        label = dist = ''
        for i in range(self.nChildren()):
            stubs[i] = self.children[i]._printSequences(start, end)  # recurse into each child
        if self.dist or self.dist == 0.0:
            dist = ':' + str(self.dist)
        if self.label != None:
            label = str(self.label)
            if self.nChildren() == 0:
                return label + dist
            else:
                stubstr = '('
                for i in range(len(stubs) - 1):
                    stubstr += stubs[i] + ','
                return stubstr + stubs[-1] + ')' + label + dist
    def _findLabel(self, label):
        """ Find a node by label at this node or in any descendants (recursively). """
        if self.label == label:
            return self
        else:
            for i in range(self.nChildren()):
                found = self.children[i]._findLabel(label)
                if found:
                    return found
            return None

    def _propagateDistance(self, parent_dist):
...@@ -286,24 +249,21 @@ class PhyloNode:
            The only parameter is the absolute distance to the parent of this node. """
        travelled = self.dist                # absolute distance to this node
        self.dist = parent_dist - self.dist  # relative distance to this node
        for i in range(self.nChildren()):
            self.children[i]._propagateDistance(travelled)  # pass absolute distance to this node

    def _assignAlignment(self, aln):
        """ Assign an alignment to the node, which implies assigning a sequence to it if one is
            available in the alignment. """
        self.sequence = None
        for i in range(self.nChildren()):
            self.children[i]._assignAlignment(aln)
        for seq in aln.seqs:
            if seq.name == self.label:
                self.sequence = seq
                break

    """ # Not sure if this is required (putting nodes into a canonical ordering)
    def _canonise(self):
        if self.left == None and self.right == None: # at leaf
            return self.label
...@@ -315,52 +275,38 @@ class PhyloNode:
            self.right = tmpnode
            return myright
        return myleft
    """
    def _forwardParsimony(self, aln):
        """ Internal function that operates recursively to first initialise each node (forward),
            stopping only once a sequence has been assigned to the node,
            then to propagate scores from sequence assigned nodes to root (backward). """
        if self.sequence == None:  # no sequence has been assigned
            if self.nChildren() == 0:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
            scores = [None for _ in range(self.nChildren())]
            for i in range(self.nChildren()):
                scores[i] = self.children[i]._forwardParsimony(aln)
            # for each position in the alignment,
            # introduce (initially zero) score for each symbol in alphabet
            self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
            # for each position in the alignment,
            # allocate a position to put each child symbol from which each current node symbol score was determined
            self.backptr = [[[None for _ in aln.alphabet] for _ in range(aln.alignlen)] for _ in range(self.nChildren())]
            for col in range(aln.alignlen):
                for i in range(self.nChildren()):
                    # each child contributes in turn
                    for a_parent in range(len(aln.alphabet)):
                        best_score = +9999999
                        best_symb = 0
                        for a in range(len(aln.alphabet)):
                            score = (scores[i][col][a] + (1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                            if score < best_score:
                                best_symb = a
                                best_score = score
                        self.seqscores[col][a_parent] += best_score
                        self.backptr[i][col][a_parent] = best_symb
        else:
            self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
                              self.sequence]  # if we want to weight scores, this would need to change
...@@ -370,39 +316,37 @@ class PhyloNode:
        """ Internal function that operates recursively to inspect scores to determine
            most parsimonious sequence, from root to leaves. """
        if self.sequence == None:  # no sequence has been assigned
            childbuf = [[] for _ in range(self.nChildren())]
            if self.nChildren() == 0:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
            if seq == None:  # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
                currbuf = []
                for col in range(aln.alignlen):
                    min_score = 999999
                    min_symb = None
                    child_symb = [None for _ in range(self.nChildren())]
                    for a_parent in range(len(aln.alphabet)):
                        if self.seqscores[col][a_parent] < min_score:
                            min_score = self.seqscores[col][a_parent]
                            min_symb = a_parent
                            for i in range(self.nChildren()):
                                child_symb[i] = self.backptr[i][col][a_parent]
                    currbuf.append(aln.alphabet[min_symb])
                    for i in range(self.nChildren()):
                        childbuf[i].append(aln.alphabet[child_symb[i]])
                self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
            else:  # Non-root, but not leaf
                self.sequence = seq
                col = 0
                for sym_parent in self.sequence:
                    a_parent = aln.alphabet.index(sym_parent)
                    child_symb = [None for _ in range(self.nChildren())]
                    for i in range(self.nChildren()):
                        child_symb[i] = self.backptr[i][col][a_parent]
                        childbuf[i].append(aln.alphabet[child_symb[i]])  # collect the traceback symbol for each child
                    col += 1
            for i in range(self.nChildren()):
                self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.label, gappy=True))
        return self.sequence
    def getSequence(self):
...@@ -418,26 +362,35 @@ class PhyloNode:
        """ Decide if this node is the ancestor of specified node.
            If transitive is True (default), all descendants are included.
            If transitive is False, only direct descendants are included. """
        for i in range(self.nChildren()):
            if node == self.children[i]:
                return True
            elif transitive:
                status = self.children[i].isAncestorOf(node, transitive)
                if status: return True
        else:
            return False

    def getAncestors(self, transitive=False):
        """ Retrieve and return (list of) parent nodes.
            If transitive is False (default), only the direct parent is included.
            If transitive is True, all parents (parents of parents etc) are included. """
        if self.parent == None:
            return []
        if not transitive:
            return [self.parent]
        else:
            parents = self.parent.getAncestors(transitive)
            parents.append(self.parent)
            return parents

    def getDescendants(self, transitive=False):
        """ Retrieve and return (list of) nodes descendant of this.
            If transitive is False (default), only direct descendants are included.
            If transitive is True, all descendants are (recursively) included. """
        children = []
        for i in range(self.nChildren()):
            children.append(self.children[i])
        if not transitive:
            return children
        else:
...@@ -450,13 +403,11 @@ class PhyloNode:
            return children

    def annotate_node(self, annotations, annotation_symbols=None, exclude_annotations=[], use_symbols=False):
        annotation_string = "[&"
        for key, val_list in annotations[self].items():
            if type(val_list) != list:
                val_list = [val_list]
            if key not in exclude_annotations:
                # If we are using annotation symbols and the annotation has an associated symbol
                for val in val_list:
                    if use_symbols and val in annotation_symbols:
...@@ -464,11 +415,8 @@ class PhyloNode:
                        annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in sorted_symbols]))
                    else:
                        annotation_string += '%s="%s",' % (key, ' '.join(['%s' % (val,) for val in val_list]))
        # Remove the final comma and add in a closing bracket
        annotation_string = annotation_string[0: len(annotation_string) - 1] + "]"
        if len(annotation_string) > 2:
            if ":" in self.label:
                self.label = self.label.split(":")[0] + annotation_string + self.label.split(":")[1]
...@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
    D = {}
    N = {}  # The number of sequences in each node
    M = aln.calcDistances(measure)  # determine all pairwise distances
    nodes = [PhyloNode(label=seq.name) for seq in aln.seqs]  # construct all leaf nodes
    """ For each node-pair, assign the distance between them. """
    for i in range(len(nodes)):
        nodes[i].sequence = aln.seqs[i]
...@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
        N[z] = Nx + Ny  # total number of sequences in new cluster, insert new cluster in list N
        for w in dz:    # we have to run through the nodes again, now not including the removed x and y
            D[frozenset([z, w])] = dz[w]  # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
        x.parent = z    # link the phylogenetic tree
        y.parent = z
        z.children = [x, y]
        nodes.append(z)
    if not absoluteDistances:
        x._propagateDistance(z.dist)  # convert absolute distances to relative by recursing down left path
...@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
    z.dist = 0.0  # root z is at distance 0 from merged x and y
    return PhyloTree(z)  # make it to tree, return
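# Example (a sketch): build a rooted tree from an Alignment by UPGMA and print it in Newick
# style. The distance measure name passed through to calcDistances is an assumption here.
#   tree = runUPGMA(aln, 'fractional', absoluteDistances=False)
#   print(tree)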
""" ---------------------------------------------------------------------------------------- """ ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------""" ----------------------------------------------------------------------------------------"""
def _findComma(string, level=0): def _findComma(string, level=0):
""" Find first comma at specified level of embedding """ """ Find all commas at specified level of embedding """
mylevel = 0 mylevel = 0
commas = []
for i in range(len(string)): for i in range(len(string)):
if string[i] == '(': if string[i] == '(':
mylevel += 1 mylevel += 1
elif string[i] == ')': elif string[i] == ')':
mylevel -= 1 mylevel -= 1
elif string[i] == ',' and mylevel == level: elif string[i] == ',' and mylevel == level:
return i commas.append(i)
return -1 return commas
def parseNewickNode(string): def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """ """ Utility function that recursively parses embedded string using Newick format. """
...@@ -559,7 +506,7 @@ def parseNewickNode(string): ...@@ -559,7 +506,7 @@ def parseNewickNode(string):
last = string[::-1].find(')') # look from the back last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf if first == -1 and last == -1: # we are at leaf
y = string.split(':') y = string.split(':')
node = PhyloNode(y[0]) node = PhyloNode(label=y[0])
if len(y) >= 2: if len(y) >= 2:
node.dist = float(y[1]) node.dist = float(y[1])
return node return node
...@@ -569,17 +516,24 @@ def parseNewickNode(string): ...@@ -569,17 +516,24 @@ def parseNewickNode(string):
embed = string[first + 1:last] embed = string[first + 1:last]
tail = string[last + 1:] tail = string[last + 1:]
# find where corresp comma is # find where corresp comma is
comma = _findComma(embed) commas = _findComma(embed)
if comma == -1: if len(commas) < 1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"') raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip() prev_comma = 0
right = embed[comma + 1:].strip() child_tokens = []
for comma in commas:
child_tokens.append(embed[prev_comma:comma].strip())
prev_comma = comma + 1
child_tokens.append(embed[prev_comma:].strip())
y = tail.split(':') y = tail.split(':')
node = PhyloNode(y[0]) # node is an instance of the PhyloNode() class node = PhyloNode(label=y[0]) # node is an instance of the PhyloNode() class
if len(y) >= 2: if len(y) >= 2:
node.dist = float(y[1]) node.dist = float(y[1])
node.left = parseNewickNode(left) node.children = []
node.right = parseNewickNode(right) for tok in child_tokens:
child = parseNewickNode(tok)
child.parent = node
node.children.append(child)
return node return node
else: else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"') raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
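# Example (a sketch): parse a small multifurcating Newick string; each node now keeps a list
# of children (node.children) rather than left/right pointers.
#   t = parseNewick("(A:1.0,B:1.0,C:1.0)root:0.0;")
#   print(len(t.root.children))   # -> 3
#   print(t)                      # prints the tree back in Newick style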
...@@ -628,8 +582,6 @@ def parse_nexus(string):
            taxon_num = num + 1
            while not lines[taxon_num].strip().startswith(";"):
                taxon_name = lines[taxon_num].split("[")[0].strip()
                for annot_line in lines[taxon_num].split("[&")[1].split(","):
                    # TODO: Make these regex calls
                    # print ("Annotation Key is ", annot_line.split("=")[0])
...@@ -641,34 +593,18 @@ def parse_nexus(string):
                    annot_val = annot_line.split("=")[1].split("]")[0]
                    annotation_dict[taxon_name][annot_key.strip()] = annot_val
                taxon_num += 1
        if line.strip().startswith("begin trees"):
            tree_num = num + 1
            tree = (lines[tree_num].split("[&R]")[1])
            phylo_tree = parseNewick(tree)
            nexus_annotations = annotations.NexusAnnotations(tree=phylo_tree)
            nexus_annotations.add_annotations(annotation_dict)
            phylo_tree.putAnnotations(nexus_annotations)
            ## Extract all of the annotations from the tree and add them to the NexusAnnotations object
    print ("Number of taxons is %s " % (taxon_number))
    return phylo_tree

""" ----------------------------------------------------------------------------------------
Method for generating a PhyloTree with unique tip names
----------------------------------------------------------------------------------------"""
...@@ -676,7 +612,6 @@ def parse_nexus(string):
def get_unique_tree(tree):
    unique_tree = tree
    unique_labels = {}
    for node in unique_tree.getNodes():
        if node.isLeaf() and node.label in unique_labels:
            unique_labels[node.label] = unique_labels[node.label] + 1
...@@ -688,3 +623,6 @@ def get_unique_tree(tree):
def unpack_list(list):
    return (" ".join(["%s"] * len(list)) + "!") % (x for x in list)

if __name__ == '__main__':
    tree = readNewick('/Users/mikael/simhome/ASR/edge1.nwk')
    print(tree)
\ No newline at end of file
...@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defline = True):
            if parse_defline:
                parsed = parseDefline(seqinfo[0])
                seqname = parsed[0]
                seqinfo = line[1:]
            else:  # we are not parsing the sequence name so no need to duplicate it in the info
                seqname = seqinfo[0]
                if len(seqinfo) > 0:  # more than a name
                    edited_info = ''
                    for infopart in seqinfo[1:]:
                        edited_info += infopart + ' '
                    seqinfo = edited_info
                else:
                    seqinfo = ''
        except IndexError as errmsg:
            if not ignore:
                raise RuntimeError(errmsg)
...@@ -717,60 +724,62 @@ class Alignment():
                distmat[i, j] = distmat[j, i] = dist
        return distmat
    def writeHTML(self, filename=None, col_start=None, col_end=None):
        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
        col_start = col_start or 0
        col_end = col_end or self.alignlen
        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
        html += '''<p style="font-size:12px">\n'''
        maxNameLength = self.getnamelen()
        html += ''.ljust(maxNameLength) + ' '
        for i in range(self.alignlen - 1):
            if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                html += str(i/10+1)[0]
            elif (i >= col_start and i < col_end):
                html += ' '
        # html += '%s\n' % (col_end)
        html += '\n'
        if self.alignlen > 10:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
                if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        if self.alignlen > 100:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
                if (i+1) % 10 == 0 and i >= 99 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) > 2) else '0'
                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        if self.alignlen > 1000:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
                if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        for seq in self.seqs:
            html += seq.name.ljust(maxNameLength) + ' '
            for sym in seq[col_start:col_end]:
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
        html += '</p></pre></body></html>'
        if filename:
            fh = open(filename, 'w')
            fh.write(html)
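    # Example (a sketch): write a window of the alignment to an HTML file using the new
    # col_start/col_end parameters; aln is assumed to be an Alignment over an alphabet
    # annotated with 'html-color' (see above).
    #   aln.writeHTML('aln_0_60.html', col_start=0, col_end=60)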
...@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        return self.pattern

    def search(self, sequence, gappy=False):
        """ Find matches to the motif in the specified sequence. Returns a list
            of triples, of the form (position, matched string, score). Note that
            the score is always 1.0 because a regexp either matches
            or doesn't. """
        if not type(sequence) is Sequence:
            sequence = Sequence(sequence)
        if gappy == False or sequence.gappy == False:
            sequenceString = sequence[:]
            results = []
            for match in self.regex.finditer(sequenceString):
                results.append((match.start(), match.group(), 1.0))
            return results
        else:  # if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
            degapped, idxs = sequence.getDegapped()
            results = []
            for match in self.regex.finditer(''.join(degapped)):
                results.append((idxs[match.start()], match.group(), 1.0))
            return results
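    # Example (a sketch): search a gapped sequence with a regular-expression motif; with
    # gappy=True the match is found on the de-gapped symbols but the reported position is an
    # index into the original, gappy sequence. The Regexp constructor call and the use of the
    # Protein_wGAP alphabet are assumptions here.
    #   motif = Regexp('N[^P][ST]')
    #   seq = Sequence('MA-NL-TS--K', Protein_wGAP, 'demo', gappy=True)
    #   print(motif.search(seq, gappy=True))   # -> [(3, 'NLT', 1.0)]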
class PWM(object):
...
...@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'Protein': Protein_Alphabet,
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP': Protein_wSTOP,
                   'ProteinwGAP': Protein_wGAP,
                   'DSSP_Alphabet': DSSP_Alphabet,
                   'DSSP3_Alphabet': DSSP3_Alphabet}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet.annotateAll('html-color', {
    # orange
    'G': "#F5A259",
    # green
    'N': "#00f900",
    'Q': "#00f900",
    'S': "#00f900",
    'T': "#00f900",
    # red
    'K': "#f62f00",
    'R': "#f62f00",
    # blue/purple
    'A': "#92b2f3",
    'I': "#92b2f3",
    'L': "#92b2f3",
    'M': "#92b2f3",
    'V': "#92b2f3",
    'W': "#92b2f3",
    'F': "#92b2f3",
    # yellow
    'P': "#FFFB00",
    # pink
    'C': "#F59692",
    # aqua
    'H': "#04B2B3",
    'Y': "#04B2B3",
    # purple
    'D': "#CE64CB",
    'E': "#CE64CB"})

# ------------------ Substitution Matrix ------------------
...
import urllib.request
import urllib.parse
import os
from time import sleep
import stats
...@@ -16,10 +17,11 @@ import json
http://www.ebi.ac.uk/Tools/webservices/tutorials
"""

__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__ = 'http://www.uniprot.org/'
__ebiSearchUrl__ = 'http://www.ebi.ac.uk/ebisearch/'

def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
...@@ -31,7 +33,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
    """
    # Construct URL
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db=' + dbName + '&format=' + format + '&id=' + entryId
    # Get the entry
    try:
...@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())

def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
...@@ -54,10 +57,14 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    """
    if dbName.startswith('uniprot'):
        # Construct URL
        if limit == None:  # no limit to number of results returned
            url = "{}{}/?format={}&query={}&columns={}".format(__uniprotUrl__, dbName, format,
                                                               urllib.parse.quote(query),
                                                               columns)
        else:
            url = "{}{}/?format={}&limit={}&query={}&columns={}".format(__uniprotUrl__, dbName, format, str(limit),
                                                                        urllib.parse.quote(query), columns)
        # Get the entries
        try:
...@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        url = base + "esearch.fcgi?db={}&term={}+AND+srcdb_refseq[" \
                     "prop]&retmax={}".format(dbName, urllib.parse.quote(query), str(limit))
        print (url)
        # Get the entries
        try:
            data = urllib.request.urlopen(url).read().decode("utf-8")
            words = data.split("</Id>")
            words = [w[w.find("<Id>") + 4:] for w in words[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
...@@ -93,9 +107,10 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
            raise RuntimeError(ex.read())
    return
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
3702: ['Arabidopsis thaliana', 'TAIR_ID'], authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'], 3702: ['Arabidopsis thaliana', 'TAIR_ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']} 10090: ['Mus musculus', 'MGI_ID']}
""" """
...@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html ...@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries. Note that this service can be slow for queries involving a large number of entries.
""" """
def getGOReport(positives, background = None):
def getGOReport(positives, background=None):
""" Generate a complete GO term report for a set of genes (positives). """ Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided). Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...@@ -135,7 +151,7 @@ def getGOReport(positives, background = None): ...@@ -135,7 +151,7 @@ def getGOReport(positives, background = None):
for t in term_set: for t in term_set:
term_cnt[t] = fg_list.count(t) term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
else: # a background is provided else: # a background is provided
for t in term_set: for t in term_set:
fg_hit = fg_list.count(t) fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t) bg_hit = bg_list.count(t)
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background != None:
            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0] + t[1][1], defin['name']))
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret
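
# Illustrative sketch (assumed identifiers, needs network access): reading the report from getGOReport().
# With no background, each row is (GO_Term_ID, Foreground_no, Term_description) as per the docstring;
# with a background, the loop above appears to scale the p-value by the number of terms before reporting it.
# report = getGOReport(['P63166', 'Q9LIR4'])   # hypothetical UniProt accessions
# for row in report[:10]:
#     print(row)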
def getGODef(goterm):
    """
    Retrieve information about a GO term
@@ -165,7 +182,7 @@ def getGODef(goterm):
    url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
    # Get the entry: fill in the fields specified below
    try:
        entry = {'id': None, 'name': None, 'aspect': None}
        data = urllib.request.urlopen(url).read().decode("utf-8")
        ret = json.loads(data)
        for row in ret['results']:
@@ -179,6 +196,7 @@ def getGODef(goterm):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
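
# Illustrative sketch: retrieving a single GO term definition from the EBI QuickGO REST endpoint.
# 'GO:0006915' is just an example accession; the call needs network access.
# info = getGODef('GO:0006915')
# print(info['id'], info['name'], info['aspect'])  # the fields initialised in the entry dict above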
def getGOTerms(genes):
    """
    Retrieve all GO terms for a given set of genes (or single gene).
@@ -187,9 +205,9 @@ def getGOTerms(genes):
    if type(genes) != list and type(genes) != set and type(genes) != tuple:
        genes = [genes]
    map = dict()
    batchsize = 100  # size of query batch
    genecnt = 0
    limitpage = 100  # number of records on each returned page
    while genecnt < len(genes):
        genebatch = []
        for index in range(batchsize):
@@ -237,6 +255,7 @@ def getGOTerms(genes):
            raise RuntimeError(ex.read())
    return map
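
# Illustrative sketch: genes are queried in batches of `batchsize` (100) with up to `limitpage`
# records per returned page; the result appears to be a dict keyed by gene identifier.
# go_map = getGOTerms(['P63166', 'Q9LIR4'])   # hypothetical accessions; a single string also works
# for gene, terms in go_map.items():
#     print(gene, terms)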
def getGenes(goterms, taxo=None):
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
@@ -247,9 +266,9 @@ def getGenes(goterms, taxo=None):
    if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
        goterms = [goterms]
    map = dict()
    batchsize = 10  # size of query batch
    termcnt = 0
    limitpage = 100  # number of records on each returned page
    while termcnt < len(goterms):
        termbatch = []
        for index in range(batchsize):
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
            else:
                break
            termcnt += 1
        uri_string = 'annotation/search?limit=' + str(
            limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
        for i in range(len(termbatch)):
            term = termbatch[i]
            uri_string += term + "," if i < len(termbatch) - 1 else term
@@ -295,11 +315,11 @@ def getGenes(goterms, taxo=None):
            raise RuntimeError(ex.read())
    return map
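
# Illustrative sketch: the inverse lookup, from GO terms to annotated gene products, optionally
# restricted to a taxon. Note that taxo is concatenated straight into the URI, so pass it as a string.
# hits = getGenes('GO:0006915', taxo='9606')   # example term and human taxon ID; needs network access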
class EBI(object):
    __email__ = 'anon@uq.edu.au'  # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/'  # Use UQ mirror when available
    __checkInterval__ = 2  # how long to wait between checking job status

    def __init__(self, service=None):
        """ Initialise service session.
@@ -349,7 +369,8 @@ class EBI(object):
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
%sstatus/%s to check the status of the job.""" % (
                self.service, self.__ebiServiceUrl__, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
@@ -423,8 +444,8 @@ class EBI(object):
        else:
            return results
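
# Illustrative sketch (method name assumed from context, not confirmed by this diff): EBI wraps the
# EBI REST job services under __ebiServiceUrl__; one job per session, guarded by isLocked(), with
# results returned once the job completes. 'ncbiblast' is the only service named explicitly above.
# ebi = EBI('ncbiblast')
# results = ebi.run(params)   # hypothetical call; the parameter dict depends on the chosen service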
def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    """
    :param ids: The list of UniProt IDs
@@ -439,11 +460,11 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    *** EXAMPLE USAGE ***
    Get a list of UniProt IDs and a list of UniProt columns you're interested in.
    Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
    uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
    cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]
    up_dict = getUniProtDict(uniprot_names, cols)
    for record in up_dict:
        print(record, up_dict[record].get("lineage(SUPERKINGDOM)"))
@@ -452,22 +473,21 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    for record in up_dict:
        print(record, up_dict[record].get("genes"))
    If a record doesn't have an entry in UniProt for that column it'll just return None
    print(up_dict['Q1JUQ1'])
    print(up_dict['Q1JUQ1']['lineage(KINGDOM)'])

    *** EXAMPLE USAGE FOR UNIREF SEARCHING ***
    up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities=1.0)
    You can either pass a list of identities for each UniProt identifier (in which case the list of
    identities must be the same size as the list of identifiers), or you can just pass a single
    identity to search UniRef at.
    """

    # Format the lists of IDs and columns correctly
    cols = ",".join(cols)
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
        if type(identities) != list:
            identities = [identities] * len(ids)
        elif len(identities) != len(ids):
            raise RuntimeError(
                'Either supply a single identity threshold or supply one for each identifier in the list')

        # Check that the identity thresholds are valid values
        for x in identities:
            if x not in [1.0, 0.9, 0.5]:
                raise RuntimeError(
                    "UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x))

        # Add the query syntax around the identifiers
        updated_ids = ""
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    url = 'https://www.uniprot.org/' + db + '/'
    params = {
        'format': 'tab',
        'query': updated_ids,
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    # For each record we retrieve, split the line by tabs and build up the UniProt dict
    for line in page.split("\n")[1:]:
        if line:
            splitlines = line.split("\t")
            id_dict = {}
            pos = 1
            for col in cols.split(","):
                id_dict[col] = None if splitlines[pos] == "" else splitlines[pos]
                pos += 1
            up_dict[splitlines[0]] = id_dict
    return up_dict
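
# Illustrative sketch: the docstring above also allows one UniRef identity per identifier, in which
# case the identities list must be the same length as the ids list (values limited to 1.0, 0.9, 0.5).
# up_dict = getUniProtDict(['Q9LIR4', 'P99999'], cols=['members'], db='uniref', identities=[1.0, 0.9])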