Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
8fa94535
Commit
8fa94535
authored
Jun 03, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_regex_search_in_gappy_sequences
parent
9889428a
Pipeline
#46
failed with stages
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
300 additions
and
291 deletions
+300
-291
annotation_test.py
annotation_test.py
+13
-11
annotations.py
annotations.py
+3
-0
gtf.py
gtf.py
+1
-1
phylo.py
phylo.py
+149
-211
sequence.py
sequence.py
+38
-23
sym.py
sym.py
+33
-2
webservice.py
webservice.py
+63
-43
No files found.
annotation_test.py
View file @
8fa94535
import
annotations
import
phylo
tree
=
phylo
.
parseNewick
(
"(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Esc
herichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);"
)
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...
...
@@ -10,17 +12,17 @@ import phylo
working_dir
=
"/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree
=
phylo
.
read_nexus
(
working_dir
+
"annotation_simple.nexus"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
tree
.
swap_annotations
(
"PDB"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
#
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
#
tree.swap_annotations("PDB")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
...
...
annotations.py
View file @
8fa94535
from
collections
import
defaultdict
from
phylo
import
*
import
phylo
import
matplotlib
import
random
...
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
def
generate_colour_list
(
self
,
num
):
return
num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
gtf.py
View file @
8fa94535
...
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f
.
close
()
if
__name__
==
'__main__'
:
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
77
.gtf'
)
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
89
.gtf'
)
print
(
bf
.
chroms
.
keys
())
g
=
bf
.
generate
(
'chr12'
)
print
(
next
(
g
))
...
...
phylo.py
View file @
8fa94535
'''
Module with methods and classes for phylogeny.
Extended to handle n-ary trees (Jan 2019).
@author: mikael
'''
import
sequence
from
collections
import
defaultdict
import
annotations
class
PhyloTree
:
""" Rooted,
binary (bifurcating)
tree for representing phylogenetic relationships.
""" Rooted,
n-ary
tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of
single, bifurcating
rooted tree by UPGMA.
Known issues:
Binary only;
Parsimony does not handle gaps in alignment.
generation of rooted tree by UPGMA.
Known issues: Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def
__init__
(
self
,
root
):
...
...
@@ -27,7 +27,6 @@ class PhyloTree:
def
putAnnotations
(
self
,
nexus_annotations
):
self
.
nexus_annotations
=
nexus_annotations
# Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
for
node
in
self
.
getNodes
():
if
node
.
label
in
self
.
nexus_annotations
.
leaf_annotations
:
...
...
@@ -60,10 +59,18 @@ class PhyloTree:
node
=
queue
.
pop
()
nodes
.
append
(
node
)
# if strategy.upper().startswith('DEPTH'):
if
no
de
.
left
:
queue
.
append
(
node
.
left
)
if
node
.
right
:
queue
.
append
(
node
.
right
)
if
no
t
node
.
isLeaf
():
queue
.
extend
(
node
.
children
)
return
nodes
def
getLeaves
(
self
):
all
=
self
.
getNodes
()
leaves
=
[]
for
n
in
all
:
if
n
.
isLeaf
():
leaves
.
append
(
n
)
return
leaves
def
getDescendantsOf
(
self
,
node
,
transitive
=
False
):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
...
...
@@ -86,28 +93,7 @@ class PhyloTree:
if
not
isinstance
(
node
,
PhyloNode
):
node
=
self
.
findLabel
(
node
)
if
node
:
myroot
=
self
.
root
found
=
False
branching
=
[]
while
not
found
and
myroot
!=
None
:
branching
.
append
(
myroot
)
# check if "myroot" is a leaf node, i.e. does not have children
if
myroot
.
left
==
node
or
myroot
.
right
==
node
:
found
=
True
break
if
myroot
.
left
!=
None
:
# myroot has a "left" child
# check if the "left" child of "myroot" is the ancestor of "node"
if
myroot
.
left
.
isAncestorOf
(
node
,
transitive
=
True
):
# if yes,
myroot
=
myroot
.
left
# move to the "left" child
else
:
# if not,
myroot
=
myroot
.
right
# move to the "right" child
else
:
# myroot does NOT have a "left" child, so let's move "right"
myroot
=
myroot
.
right
if
found
and
transitive
:
return
branching
elif
found
and
len
(
branching
)
>
0
:
return
branching
[
len
(
branching
)
-
1
]
return
None
return
node
.
getAncestors
(
transitive
)
def
parsimony
(
self
):
""" Solve the "small parsimony problem",
...
...
@@ -117,12 +103,8 @@ class PhyloTree:
self
.
root
.
_backwardParsimony
(
self
.
aln
)
# use scores to determine sequences
return
self
.
root
.
getSequence
()
# return the sequence found at the root
def
canonise
(
self
):
self
.
root
.
_canonise
()
def
swap_annotations
(
self
,
annotation_key
):
try
:
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
node
.
label
=
self
.
nexus_annotations
.
leaf_annotations
[
node
][
annotation_key
]
...
...
@@ -135,103 +117,91 @@ class PhyloTree:
:param out_path: The path to write the NEXUS file to
:param nexus_annotations: The NexusAnnotations containing the annotations
"""
if
write_annotations
and
not
nexus_annotations
:
if
not
self
.
nexus_annotations
:
raise
RuntimeError
(
"This tree file has no associated annotation file. Either associate or supply one as a parameter."
)
nexus_annotations
=
self
.
nexus_annotations
if
nexus_annotations
:
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
node_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
node_annotations
,
self
.
nexus_annotations
.
annotation_symbols
,
exclude_annotations
,
use_symbols
)
tree_annotation
=
str
(
self
)
+
";"
self
.
swap_annotations
(
"Original"
)
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
leaf_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
leaf_annotations
,
exclude_annotations
)
leaves
=
[]
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
leaves
.
append
(
node
.
label
)
leaf_annotation
=
""
for
leaf
in
leaves
:
leaf_annotation
+=
"
\n\t
%
s"
%
(
leaf
)
with
open
(
out_path
,
"w+"
)
as
file
:
file
.
write
(
"#NEXUS
\n
begin taxa;
\n\t
dimensions ntax=
%
d;
\n\t
taxlabels
%
s
\n
;
\n
end;
\n\n
begin trees;
\n\t
tree tree_1 = "
"[&R]
%
s
\n
end;"
%
(
len
(
leaves
),
leaf_annotation
,
tree_annotation
))
class
PhyloNode
:
""" A class for a node in a rooted,
binary (bifurcating)
tree.
Contains pointers to
descendants/daughters (left and right)
,
""" A class for a node in a rooted,
n-ary
tree.
Contains pointers to
multiple descendants/daughters
,
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def
__init__
(
self
,
label
=
''
):
""" Initialise a
n initially unlinked
node.
Populate fields left and right to link it with other nodes
.
def
__init__
(
self
,
parent
=
None
,
label
=
''
):
""" Initialise a node.
Set its parent (another PhyloNode), parent can be None
.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, back
left and backright
for maximum parsimony. """
self
.
left
=
None
self
.
right
=
None
seqscores, back for maximum parsimony. """
self
.
parent
=
parent
self
.
children
=
None
self
.
data
=
None
self
.
label
=
label
self
.
dist
=
None
self
.
sequence
=
None
# The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
backleft
=
None
# Pointers back to left child: what symbol rendered current/parent symbols
self
.
backright
=
None
# Pointers back to right child: what symbol rendered current/parent symbols
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
backptr
=
None
# Pointers back to children: what symbol rendered current/parent symbols
def
isLeaf
(
self
):
return
self
.
left
==
self
.
right
==
None
return
self
.
nChildren
()
==
0
def
nChildren
(
self
):
if
self
.
children
==
None
:
return
0
else
:
return
len
(
self
.
children
)
def
__str__
(
self
):
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
if
self
.
left
:
left
=
str
(
self
.
left
)
if
self
.
right
:
right
=
str
(
self
.
right
)
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
label
=
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
str
(
self
.
children
[
i
])
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
if
not
self
.
left
and
not
self
.
right
:
return
label
+
dist
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
else
:
# there is no label
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
# there is no label
'''
if not self.left and self.right:
return ',' + right
elif self.left and not self.right:
return left + ','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
'''
# def __le__(self, other):
# """ Returns indication of less than other node. """
...
...
@@ -247,38 +217,31 @@ class PhyloNode:
def
_printSequences
(
self
,
start
,
end
):
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
if
self
.
left
:
left
=
self
.
left
.
_printSequences
(
start
,
end
)
if
self
.
right
:
right
=
self
.
right
.
_printSequences
(
start
,
end
)
if
self
.
dist
:
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
label
=
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
self
.
_printSequences
(
self
.
children
[
i
],
start
,
end
)
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
sequence
!=
None
:
label
=
""
.
join
(
self
.
sequence
[
start
:
end
])
+
""
if
not
self
.
left
and
not
self
.
right
:
return
label
+
dist
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
else
:
# there is no label
if
not
self
.
left
and
self
.
right
:
return
','
+
right
elif
self
.
left
and
not
self
.
right
:
return
left
+
','
elif
self
.
left
and
self
.
right
:
return
'('
+
left
+
','
+
right
+
')'
+
dist
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
def
_findLabel
(
self
,
label
):
""" Find a node by label at this node or in any descendants (recursively). """
if
self
.
label
==
label
:
return
self
else
:
if
self
.
left
:
foundLeft
=
self
.
left
.
_findLabel
(
label
)
if
foundLeft
:
return
foundLeft
if
self
.
right
:
return
self
.
right
.
_findLabel
(
label
)
for
i
in
range
(
self
.
nChildren
()):
found
=
self
.
children
[
i
]
.
_findLabel
(
label
)
if
found
:
return
found
return
None
def
_propagateDistance
(
self
,
parent_dist
):
...
...
@@ -286,24 +249,21 @@ class PhyloNode:
The only parameter is the absolute distance to the parent of this node. """
travelled
=
self
.
dist
# absolute distance to this node
self
.
dist
=
parent_dist
-
self
.
dist
# relative distance to this node
if
self
.
left
!=
None
:
# if there is a child node...
self
.
left
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
if
self
.
right
!=
None
:
self
.
right
.
_propagateDistance
(
travelled
)
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
def
_assignAlignment
(
self
,
aln
):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self
.
sequence
=
None
if
self
.
left
!=
None
:
self
.
left
.
_assignAlignment
(
aln
)
if
self
.
right
!=
None
:
self
.
right
.
_assignAlignment
(
aln
)
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_assignAlignment
(
aln
)
for
seq
in
aln
.
seqs
:
if
seq
.
name
==
self
.
label
:
self
.
sequence
=
seq
break
""" # Not sure if this is required (putting nodes into a canonical ordering)
def _canonise(self):
if self.left == None and self.right == None: # at leaf
return self.label
...
...
@@ -315,52 +275,38 @@ class PhyloNode:
self.right = tmpnode
return myright
return myleft
"""
def
_forwardParsimony
(
self
,
aln
):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if
self
.
sequence
==
None
:
# no sequence has been assigned
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
scoresleft
=
scoresright
=
None
if
self
.
left
!=
None
:
scoresleft
=
self
.
left
.
_forwardParsimony
(
aln
)
if
self
.
right
!=
None
:
scoresright
=
self
.
right
.
_forwardParsimony
(
aln
)
scores
=
[
None
for
_
in
range
(
self
.
nChildren
())]
for
i
in
range
(
self
.
nChildren
()):
scores
[
i
]
=
self
.
children
[
i
]
.
_forwardParsimony
(
aln
)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self
.
seqscores
=
[[
0
for
_
in
aln
.
alphabet
]
for
col
in
range
(
aln
.
alignlen
)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self
.
backleft
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self
.
backright
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
# allocate a position to put the each child symbol from which each current node symbol score was determined
self
.
backptr
=
[[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
for
_
in
range
(
self
.
nChildren
())]
for
col
in
range
(
aln
.
alignlen
):
# left child will contribute first
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score_left
=
+
9999999
best_symb_left
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scoresleft
[
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score_left
:
best_symb_left
=
a
best_score_left
=
score
self
.
seqscores
[
col
][
a_parent
]
=
best_score_left
self
.
backleft
[
col
][
a_parent
]
=
best_symb_left
# right child will contribute next
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score_right
=
+
9999999
best_symb_right
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scoresright
[
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score_right
:
best_symb_right
=
a
best_score_right
=
score
self
.
seqscores
[
col
][
a_parent
]
+=
best_score_right
self
.
backright
[
col
][
a_parent
]
=
best_symb_right
for
i
in
range
(
self
.
nChildren
()):
# left child will contribute first
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score
=
+
9999999
best_symb
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scores
[
i
][
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score
:
best_symb
=
a
best_score
=
score
self
.
seqscores
[
col
][
a_parent
]
+=
best_score
self
.
backptr
[
i
][
col
][
a_parent
]
=
best_symb
else
:
self
.
seqscores
=
[[
0
if
a
==
sym
else
999999
for
a
in
aln
.
alphabet
]
for
sym
in
self
.
sequence
]
# if we want to weight scores, this would need to change
...
...
@@ -370,39 +316,37 @@ class PhyloNode:
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if
self
.
sequence
==
None
:
# no sequence has been assigned
leftbuf
=
[]
rightbuf
=
[]
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
childbuf
=
[[]
for
_
in
range
(
self
.
nChildren
())]
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
if
seq
==
None
:
# Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf
=
[]
for
col
in
range
(
aln
.
alignlen
):
min_score
=
999999
min_symb
=
None
left_symb
=
None
right_symb
=
None
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())]
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
if
self
.
seqscores
[
col
][
a_parent
]
<
min_score
:
min_score
=
self
.
seqscores
[
col
][
a_parent
]
min_symb
=
a_parent
left_symb
=
self
.
backleft
[
col
][
a_parent
]
right_symb
=
self
.
backright
[
col
][
a_parent
]
for
i
in
range
(
self
.
nChildren
()):
child_symb
[
i
]
=
self
.
backptr
[
i
]
[
col
][
a_parent
]
currbuf
.
append
(
aln
.
alphabet
[
min_symb
])
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
for
i
in
range
(
self
.
nChildren
()):
childbuf
[
i
]
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
self
.
sequence
=
sequence
.
Sequence
(
currbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
)
else
:
# Non-root, but not leaf
self
.
sequence
=
seq
col
=
0
for
sym_parent
in
self
.
sequence
:
a_parent
=
aln
.
alphabet
.
index
(
sym_parent
)
left_symb
=
self
.
backleft
[
col
][
a_parent
]
right_symb
=
self
.
backright
[
col
][
a_parent
]
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())
]
for
i
in
range
(
self
.
nChildren
()):
child_symb
[
i
]
=
self
.
backptr
[
i
][
col
][
a_parent
]
childbuf
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
col
+=
1
self
.
left
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
leftbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
self
.
right
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
rightbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
childbuf
[
i
]
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
return
self
.
sequence
def
getSequence
(
self
):
...
...
@@ -418,26 +362,35 @@ class PhyloNode:
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if
node
==
self
.
left
or
node
==
self
.
right
:
return
True
elif
transitive
:
if
self
.
left
:
statusLeft
=
self
.
left
.
isAncestorOf
(
node
,
transitive
)
if
statusLeft
:
return
True
if
self
.
right
:
return
self
.
right
.
isAncestorOf
(
node
,
transitive
)
for
i
in
range
(
self
.
nChildren
()):
if
node
==
self
.
children
[
i
]:
return
True
elif
transitive
:
status
=
self
.
children
[
i
]
.
isAncestorOf
(
node
,
transitive
)
if
status
:
return
True
else
:
return
False
def
getAncestors
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) parent nodes.
If transitive is False (default), only the direct parent is included.
If transitive is True, all parents (parents of parents etc) are included. """
if
self
.
parent
==
None
:
return
[]
if
not
transitive
:
return
[
self
.
parent
]
else
:
parents
=
self
.
parent
.
getAncestors
(
transitive
)
parents
.
append
(
self
.
parent
)
return
parents
def
getDescendants
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children
=
[]
if
self
.
left
:
children
.
append
(
self
.
left
)
if
self
.
right
:
children
.
append
(
self
.
right
)
for
i
in
range
(
self
.
nChildren
()):
children
.
append
(
self
.
children
[
i
])
if
not
transitive
:
return
children
else
:
...
...
@@ -450,13 +403,11 @@ class PhyloNode:
return
children
def
annotate_node
(
self
,
annotations
,
annotation_symbols
=
None
,
exclude_annotations
=
[],
use_symbols
=
False
):
annotation_string
=
"[&"
for
key
,
val_list
in
annotations
[
self
]
.
items
():
if
type
(
val_list
)
!=
list
:
val_list
=
[
val_list
]
if
key
not
in
exclude_annotations
:
# If we are using annotation symbols and the annotation has an associated symbol
for
val
in
val_list
:
if
use_symbols
and
val
in
annotation_symbols
:
...
...
@@ -464,11 +415,8 @@ class PhyloNode:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
sorted_symbols
]))
else
:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
val_list
]))
# Remove the final comma and add in a closing bracket
annotation_string
=
annotation_string
[
0
:
len
(
annotation_string
)
-
1
]
+
"]"
if
len
(
annotation_string
)
>
2
:
if
":"
in
self
.
label
:
self
.
label
=
self
.
label
.
split
(
":"
)[
0
]
+
annotation_string
+
self
.
label
.
split
(
":"
)[
1
]
...
...
@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D
=
{}
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
nodes
=
[
PhyloNode
(
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
nodes
=
[
PhyloNode
(
label
=
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for
i
in
range
(
len
(
nodes
)):
nodes
[
i
]
.
sequence
=
aln
.
seqs
[
i
]
...
...
@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
z
.
right
=
y
x
.
parent
=
z
y
.
parent
=
z
z
.
children
=
[
x
,
y
]
nodes
.
append
(
z
)
if
not
absoluteDistances
:
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
...
...
@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def
_findComma
(
string
,
level
=
0
):
""" Find
first comma
at specified level of embedding """
""" Find
all commas
at specified level of embedding """
mylevel
=
0
commas
=
[]
for
i
in
range
(
len
(
string
)):
if
string
[
i
]
==
'('
:
mylevel
+=
1
elif
string
[
i
]
==
')'
:
mylevel
-=
1
elif
string
[
i
]
==
','
and
mylevel
==
level
:
return
i
return
-
1
commas
.
append
(
i
)
return
commas
def
parseNewickNode
(
string
):
""" Utility function that recursively parses embedded string using Newick format. """
...
...
@@ -559,7 +506,7 @@ def parseNewickNode(string):
last
=
string
[::
-
1
]
.
find
(
')'
)
# look from the back
if
first
==
-
1
and
last
==
-
1
:
# we are at leaf
y
=
string
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
node
=
PhyloNode
(
label
=
y
[
0
])
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
return
node
...
...
@@ -569,17 +516,24 @@ def parseNewickNode(string):
embed
=
string
[
first
+
1
:
last
]
tail
=
string
[
last
+
1
:]
# find where corresp comma is
comma
=
_findComma
(
embed
)
if
comma
==
-
1
:
comma
s
=
_findComma
(
embed
)
if
len
(
commas
)
<
1
:
raise
RuntimeError
(
'Invalid format: invalid placement of "," in sub-string "'
+
embed
+
'"'
)
left
=
embed
[
0
:
comma
]
.
strip
()
right
=
embed
[
comma
+
1
:]
.
strip
()
prev_comma
=
0
child_tokens
=
[]
for
comma
in
commas
:
child_tokens
.
append
(
embed
[
prev_comma
:
comma
]
.
strip
())
prev_comma
=
comma
+
1
child_tokens
.
append
(
embed
[
prev_comma
:]
.
strip
())
y
=
tail
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
# node is an instance of the PhyloNode() class
node
=
PhyloNode
(
label
=
y
[
0
])
# node is an instance of the PhyloNode() class
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
node
.
left
=
parseNewickNode
(
left
)
node
.
right
=
parseNewickNode
(
right
)
node
.
children
=
[]
for
tok
in
child_tokens
:
child
=
parseNewickNode
(
tok
)
child
.
parent
=
node
node
.
children
.
append
(
child
)
return
node
else
:
raise
RuntimeError
(
'Invalid format: unbalanced parentheses in sub-string "'
+
string
+
'"'
)
...
...
@@ -628,8 +582,6 @@ def parse_nexus(string):
taxon_num
=
num
+
1
while
not
lines
[
taxon_num
]
.
strip
()
.
startswith
(
";"
):
taxon_name
=
lines
[
taxon_num
]
.
split
(
"["
)[
0
]
.
strip
()
for
annot_line
in
lines
[
taxon_num
]
.
split
(
"[&"
)[
1
]
.
split
(
","
):
#TODO: Make these regex calls
# print ("Annotation Key is ", annot_line.split("=")[0])
...
...
@@ -641,34 +593,18 @@ def parse_nexus(string):
annot_val
=
annot_line
.
split
(
"="
)[
1
]
.
split
(
"]"
)[
0
]
annotation_dict
[
taxon_name
][
annot_key
.
strip
()]
=
annot_val
taxon_num
+=
1
if
line
.
strip
()
.
startswith
(
"begin trees"
):
tree_num
=
num
+
1
tree
=
(
lines
[
tree_num
]
.
split
(
"[&R]"
)[
1
])
phylo_tree
=
parseNewick
(
tree
)
nexus_annotations
=
annotations
.
NexusAnnotations
(
tree
=
phylo_tree
)
nexus_annotations
.
add_annotations
(
annotation_dict
)
# print (nexus_annotations.annotations)
phylo_tree
.
putAnnotations
(
nexus_annotations
)
## Extract all of the annotations from the tree and add them to the NexusAnnotations object
print
(
"Number of taxons is
%
s "
%
(
taxon_number
))
return
phylo_tree
""" ----------------------------------------------------------------------------------------
Method for generating a PhyloTree with unique tip names
----------------------------------------------------------------------------------------"""
...
...
@@ -676,7 +612,6 @@ def parse_nexus(string):
def
get_unique_tree
(
tree
):
unique_tree
=
tree
unique_labels
=
{}
for
node
in
unique_tree
.
getNodes
():
if
node
.
isLeaf
()
and
node
.
label
in
unique_labels
:
unique_labels
[
node
.
label
]
=
unique_labels
[
node
.
label
]
+
1
...
...
@@ -688,3 +623,6 @@ def get_unique_tree(tree):
def
unpack_list
(
list
):
return
(
" "
.
join
([
"
%
s"
]
*
len
(
list
))
+
"!"
)
%
(
x
for
x
in
list
)
if
__name__
==
'__main__'
:
tree
=
readNewick
(
'/Users/mikael/simhome/ASR/edge1.nwk'
)
print
(
tree
)
\ No newline at end of file
sequence.py
View file @
8fa94535
...
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
else
:
seqinfo
=
line
[
1
:]
else
:
# we are not parsing the sequence name so no need to duplicate it in the info
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
if
len
(
seqinfo
)
>
0
:
# more than a name
edited_info
=
''
for
infopart
in
seqinfo
[
1
:]:
edited_info
+=
infopart
+
' '
seqinfo
=
edited_info
else
:
seqinfo
=
''
except
IndexError
as
errmsg
:
if
not
ignore
:
raise
RuntimeError
(
errmsg
)
...
...
@@ -717,60 +724,62 @@ class Alignment():
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
def
writeHTML
(
self
,
filename
=
None
,
col_start
=
None
,
col_end
=
None
):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
col_start
=
col_start
or
0
col_end
=
col_end
or
self
.
alignlen
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
+=
'''<p style="font-size:12px">
\n
'''
maxNameLength
=
self
.
getnamelen
()
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
str
(
i
/
10
+
1
)[
0
]
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
%
s
\n
'
%
(
self
.
alignlen
)
# html += '%s\n' % (col_end)
html
+=
'
\n
'
if
self
.
alignlen
>
10
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
100
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
:
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
1000
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
for
seq
in
self
.
seqs
:
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
for
sym
in
seq
:
for
sym
in
seq
[
col_start
:
col_end
]
:
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
if
not
color
:
color
=
'white'
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'
\n
'
html
+=
'</pre></body></html>'
html
+=
'</p
></p
re></body></html>'
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
html
)
...
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        """ Return the textual regular-expression pattern of this motif. """
        return self.pattern
def
search
(
self
,
sequence
):
def
search
(
self
,
sequence
,
gappy
=
False
):
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a regexp either matches
or doesn't. """
if
not
type
(
sequence
)
is
Sequence
:
sequence
=
Sequence
(
sequence
)
sequenceString
=
sequence
[:]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
if
gappy
==
False
or
sequence
.
gappy
==
False
:
sequenceString
=
sequence
[:]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
else
:
# if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
degapped
,
idxs
=
sequence
.
getDegapped
()
results
=
[]
for
match
in
self
.
regex
.
finditer
(
''
.
join
(
degapped
)):
results
.
append
((
idxs
[
match
.
start
()],
match
.
group
(),
1.0
))
return
results
class
PWM
(
object
):
...
...
sym.py
View file @
8fa94535
...
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein'
:
Protein_Alphabet
,
'ProteinwX'
:
Protein_wX
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwGAP'
:
Protein_wGAP
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
# The preferred order in which a predefined alphabet is assigned to a
# sequence (e.g., we'd want to assign DNA to 'AGCT', even though Protein
# is also valid). 'ProteinwGAP' is tried before the DSSP alphabets so
# gapped protein sequences are recognised as protein.
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN',
                  'Protein', 'ProteinwX', 'ProteinwSTOP', 'ProteinwGAP',
                  'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
'G'
:
'orange'
,
'P'
:
'orange'
,
'S'
:
'orange'
,
'T'
:
'orange'
,
'H'
:
'red'
,
'K'
:
'red'
,
'R'
:
'red'
,
'F'
:
'#66bbff'
,
'Y'
:
'#66bbff'
,
'W'
:
'#66bbff'
,
'I'
:
'green'
,
'L'
:
'green'
,
'M'
:
'green'
,
'V'
:
'green'
})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
#orange*/
'G'
:
"#F5A259"
,
#green*/
'N'
:
"#00f900"
,
'Q'
:
"#00f900"
,
'S'
:
"#00f900"
,
'T'
:
"#00f900"
,
#red*/
'K'
:
"#f62f00"
,
'R'
:
"#f62f00"
,
#blue/purple*/
'A'
:
"#92b2f3"
,
'I'
:
"#92b2f3"
,
'L'
:
"#92b2f3"
,
'M'
:
"#92b2f3"
,
'V'
:
"#92b2f3"
,
'W'
:
"#92b2f3"
,
'F'
:
"#92b2f3"
,
#yellow*/
'P'
:
"#FFFB00"
,
#pink*/
'C'
:
"#F59692"
,
#aqua*/
'H'
:
"#04B2B3"
,
'Y'
:
"#04B2B3"
,
#purple*/
'D'
:
"#CE64CB"
,
'E'
:
"#CE64CB"
})
# ------------------ Substitution Matrix ------------------
...
...
webservice.py
View file @
8fa94535
import
urllib.request
import
urllib.parse
import
os
from
time
import
sleep
import
stats
...
...
@@ -16,10 +17,11 @@ import json
http://www.ebi.ac.uk/Tools/webservices/tutorials
"""
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
"""
...
...
@@ -31,7 +33,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
"""
# Construct URL
# Construct URL
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
# Get the entry
try
:
...
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
...
...
@@ -54,10 +57,14 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
"""
if
dbName
.
startswith
(
'uniprot'
):
# Construct URL
if
limit
==
None
:
# no limit to number of results returned
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&query='
+
query
+
'&columns='
+
columns
if
limit
==
None
:
# no limit to number of results returned
url
=
"{}{}/?format={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
urllib
.
parse
.
quote
(
query
),
columns
)
else
:
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&limit={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
str
(
limit
),
urllib
.
parse
.
quote
(
query
),
columns
)
# Get the entries
try
:
...
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
dbs
=
dbName
.
split
(
":"
)
if
len
(
dbs
)
>
1
:
dbName
=
dbs
[
1
]
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db={}&term={}+AND+srcdb_refseq["
\
"prop]&retmax={}"
.
format
(
dbName
,
urllib
.
parse
.
quote
(
query
),
str
(
limit
))
print
(
url
)
# Get the entries
try
:
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
return
words
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
...
...
@@ -93,9 +107,10 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
raise
RuntimeError
(
ex
.
read
())
return
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
10090
:
[
'Mus musculus'
,
'MGI_ID'
]}
"""
...
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def
getGOReport
(
positives
,
background
=
None
):
def
getGOReport
(
positives
,
background
=
None
):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...
...
@@ -135,7 +151,7 @@ def getGOReport(positives, background = None):
for
t
in
term_set
:
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
else
:
# a background is provided
else
:
# a background is provided
for
t
in
term_set
:
fg_hit
=
fg_list
.
count
(
t
)
bg_hit
=
bg_list
.
count
(
t
)
...
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for
t
in
sorted_cnt
:
defin
=
getGODef
(
t
[
0
])
if
background
!=
None
:
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
else
:
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
return
ret
def
getGODef
(
goterm
):
"""
Retrieve information about a GO term
...
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
# Get the entry: fill in the fields specified below
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
ret
=
json
.
loads
(
data
)
for
row
in
ret
[
'results'
]:
...
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
):
"""
Retrieve all GO terms for a given set of genes (or single gene).
...
...
@@ -187,9 +205,9 @@ def getGOTerms(genes):
if
type
(
genes
)
!=
list
and
type
(
genes
)
!=
set
and
type
(
genes
)
!=
tuple
:
genes
=
[
genes
]
map
=
dict
()
batchsize
=
100
# size of query batch
batchsize
=
100
# size of query batch
genecnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
genecnt
<
len
(
genes
):
genebatch
=
[]
for
index
in
range
(
batchsize
):
...
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
raise
RuntimeError
(
ex
.
read
())
return
map
def
getGenes
(
goterms
,
taxo
=
None
):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
...
...
@@ -247,9 +266,9 @@ def getGenes(goterms, taxo=None):
if
type
(
goterms
)
!=
list
and
type
(
goterms
)
!=
set
and
type
(
goterms
)
!=
tuple
:
goterms
=
[
goterms
]
map
=
dict
()
batchsize
=
10
# size of query batch
batchsize
=
10
# size of query batch
termcnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
termcnt
<
len
(
goterms
):
termbatch
=
[]
for
index
in
range
(
batchsize
):
...
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
else
:
break
termcnt
+=
1
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
for
i
in
range
(
len
(
termbatch
)):
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
...
...
@@ -295,11 +315,11 @@ def getGenes(goterms, taxo=None):
raise
RuntimeError
(
ex
.
read
())
return
map
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
def
__init__
(
self
,
service
=
None
):
""" Initialise service session.
...
...
@@ -349,7 +369,8 @@ class EBI(object):
if
self
.
isLocked
():
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
wait until it is complete before submitting another job. Go to
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
# ncbiblast database parameter needs special handling
if
self
.
service
==
'ncbiblast'
:
...
...
@@ -423,8 +444,8 @@ class EBI(object):
else
:
return
results
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
"""
:param ids: The list of UniProt IDs
...
...
@@ -439,11 +460,11 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
*** EXAMPLE USAGE ***
Get a list of UniProt IDs and a list of UniProt columns you're interested in.
Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]
up_dict = getUniProtDict(uniprot_names, cols)
for record in up_dict:
print (record, up_dict[record].get("lineage(SUPERKINGDOM)"))
...
...
@@ -452,22 +473,21 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
for record in up_dict:
print (record, up_dict[record].get("genes"))
If a record doesn't have an entry in UniProt for that column it'll just return None
print (up_dict['Q1JUQ1'])
print (up_dict['Q1JUQ1']['lineage(KINGDOM)'])
*** EXAMPLE USAGE FOR UNIREF SEARCHING ***
up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities = 1.0)
You can either pass a list of identities for each UniProt identifier (in which case the list of identities must be
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
"""
# Format the lists of IDs and columns correctly
cols
=
","
.
join
(
cols
)
...
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
if
type
(
identities
)
!=
list
:
identities
=
[
identities
]
*
len
(
ids
)
elif
len
(
identities
)
!=
len
(
ids
):
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
# Check that the identity thresholds are valid values
for
x
in
identities
:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
# Add the query syntax around the identifiers
updated_ids
=
""
...
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
url
=
'https://www.uniprot.org/'
+
db
+
'/'
params
=
{
'format'
:
'tab'
,
'query'
:
updated_ids
,
...
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
# For each record we retrieve, split the line by tabs and build up the UniProt dict
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
if
line
:
splitlines
=
line
.
split
(
"
\t
"
)
splitlines
=
line
.
split
(
"
\t
"
)
id_dict
=
{}
pos
=
1
for
col
in
cols
.
split
(
","
):
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
pos
+=
1
pos
+=
1
up_dict
[
splitlines
[
0
]]
=
id_dict
return
up_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment