EDITOR=emacsclient

eff5f374 · Mikael Boden · f5d19d0a · eff5f374 · eff5f374 · eff5f374
Commit eff5f374 authored Jul 01, 2018 by Mikael Boden
16 changed files
--- a/annotation_test.py
+++ b/annotation_test.py
+# Ad-hoc manual check of NEXUS annotation handling: reads an annotated NEXUS
+# tree from a hard-coded local directory, prints the tree and its annotations,
+# swaps the annotations to "PDB" and prints both again.
+import annotations
+import phylo
+
+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
+
+# print (tree)
+# unique_tree = get_unique_tree(tree)
+#
+# print (unique_tree)
+
+
+
+# NOTE(review): machine-specific absolute path; the script only runs where this directory exists.
+working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
+
+tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
+
+# State before the swap
+print (tree)
+print (tree.nexus_annotations.annotations)
+
+# presumably switches which annotation is used as the node label; verify against phylo.swap_annotations
+tree.swap_annotations("PDB")
+
+# State after the swap
+print (tree)
+print (tree.nexus_annotations.annotations)
+#
+# tree.write_to_nexus(working_dir + "output.nexus")
+
+# nexus_annotations = annotations.NexusAnnotations2()
+
+# for node in tree.getNodes():
+#     if node.isLeaf():
+#         nexus_annotations.add_annotation(node, "node", "annotation")
+#
+# print (nexus_annotations.annotations)
+
+
+# REGEX CODE
+# import re
+#
+# pattern = r"<person>(.*?)</person>"
+# re.findall(pattern, str, flags=0) #you may need to add flags= re.DOTALL if your str is multiline
\ No newline at end of file
--- a/annotations.py
+++ b/annotations.py
+from collections import defaultdict
+from phylo import *
+import matplotlib
+import random
+
+# Named colours (CSS/matplotlib names) mapped to their hex codes.
+# 'sandybrown' is '#F4A460' in the CSS named-colour table (older matplotlib releases carried the typo '#FAA460').
+matplotlib_colours={'aliceblue':'#F0F8FF','aqua':'#00FFFF','aquamarine':'#7FFFD4','azure':'#F0FFFF','beige':'#F5F5DC','bisque':'#FFE4C4','blanchedalmond':'#FFEBCD','blue':'#0000FF','blueviolet':'#8A2BE2','brown':'#A52A2A','burlywood':'#DEB887','cadetblue':'#5F9EA0','chartreuse':'#7FFF00','chocolate':'#D2691E','coral':'#FF7F50','cornflowerblue':'#6495ED','crimson':'#DC143C','cyan':'#00FFFF','darkblue':'#00008B','darkcyan':'#008B8B','darkgoldenrod':'#B8860B','darkgreen':'#006400','darkkhaki':'#BDB76B','darkmagenta':'#8B008B','darkolivegreen':'#556B2F','darkorange':'#FF8C00','darkorchid':'#9932CC','darkred':'#8B0000','darksalmon':'#E9967A','darkseagreen':'#8FBC8F','darkslateblue':'#483D8B','darkturquoise':'#00CED1','darkviolet':'#9400D3','deeppink':'#FF1493','deepskyblue':'#00BFFF','dodgerblue':'#1E90FF','firebrick':'#B22222','floralwhite':'#FFFAF0','forestgreen':'#228B22','fuchsia':'#FF00FF','gainsboro':'#DCDCDC','ghostwhite':'#F8F8FF','gold':'#FFD700','goldenrod':'#DAA520','green':'#008000','greenyellow':'#ADFF2F','honeydew':'#F0FFF0','hotpink':'#FF69B4','indianred':'#CD5C5C','indigo':'#4B0082','khaki':'#F0E68C','lavender':'#E6E6FA','lavenderblush':'#FFF0F5','lawngreen':'#7CFC00','lemonchiffon':'#FFFACD','lightblue':'#ADD8E6','lightcoral':'#F08080','lightcyan':'#E0FFFF','lightgoldenrodyellow':'#FAFAD2','lightgreen':'#90EE90','lightgray':'#D3D3D3','lightpink':'#FFB6C1','lightsalmon':'#FFA07A','lightseagreen':'#20B2AA','lightskyblue':'#87CEFA','lightslategray':'#778899','lightsteelblue':'#B0C4DE','lightyellow':'#FFFFE0','lime':'#00FF00','limegreen':'#32CD32','magenta':'#FF00FF','maroon':'#800000','mediumaquamarine':'#66CDAA','mediumblue':'#0000CD','mediumorchid':'#BA55D3','mediumpurple':'#9370DB','mediumseagreen':'#3CB371','mediumslateblue':'#7B68EE','mediumspringgreen':'#00FA9A','mediumturquoise':'#48D1CC','mediumvioletred':'#C71585','midnightblue':'#191970','mintcream':'#F5FFFA','mistyrose':'#FFE4E1','moccasin':'#FFE4B5','navajowhite':'#FFDEAD','navy':'#000080','oldlace':'#FDF5E6','olive':'#808000','olivedrab':'#6B8E23','orange':'#FFA500','orangered':'#FF4500','orchid':'#DA70D6','palegoldenrod':'#EEE8AA','palegreen':'#98FB98','paleturquoise':'#AFEEEE','palevioletred':'#DB7093','papayawhip':'#FFEFD5','peachpuff':'#FFDAB9','peru':'#CD853F','pink':'#FFC0CB','plum':'#DDA0DD','powderblue':'#B0E0E6','purple':'#800080','red':'#FF0000','rosybrown':'#BC8F8F','royalblue':'#4169E1','saddlebrown':'#8B4513','salmon':'#FA8072','sandybrown':'#F4A460','seagreen':'#2E8B57','seashell':'#FFF5EE','sienna':'#A0522D','skyblue':'#87CEEB','slateblue':'#6A5ACD','springgreen':'#00FF7F','steelblue':'#4682B4','tan':'#D2B48C','teal':'#008080','thistle':'#D8BFD8','tomato':'#FF6347','turquoise':'#40E0D0','violet':'#EE82EE','wheat':'#F5DEB3','yellow':'#FFFF00','yellowgreen':'#9ACD32'}
+
+twenty_colours ={'dodgerblue':'#1E90FF', 'orangered':'#FF4500', 'greenyellow':'#ADFF2F', 'orchid':'#DA70D6'}
+
+symbols = ["*", "^", "!", "#", "~", "+", ":", "<", ">", "@", "%", "=", "-"]
+
+class NexusAnnotations():
+    """
+    Defines a set of annotations for a phylogenetic tree, to be written to NEXUS format
+
+    Instance state:
+      tree               -- the annotated tree (may be None)
+      node_annotations   -- maps node -> {key: annotation}
+      leaf_annotations   -- maps node -> {key: value}; seeded with {"Original": label} for leaves
+      annotation_symbols -- maps annotation -> symbol (as filled in by add_annotation)
+      colour_dict        -- maps colour name -> hex code
+      symbol_list        -- pool of symbols that can be assigned to annotations
+      used_colours       -- hex codes already handed out by add_colour_annotation
+      annotated_nodes    -- nodes that carry a node annotation or colour
+    """
+    # Class-level fallbacks kept for backward compatibility only;
+    # __init__ always shadows them with per-instance containers.
+    node_annotations = defaultdict(list)
+    leaf_annotations = defaultdict(list)
+    used_colours = set()
+    annotated_nodes = set()
+
+    def __init__(self, colour_dict=matplotlib_colours, symbol_list = symbols, tree=None, node_annotations=None, leaf_annotations=None, annotation_symbols=None):
+        """
+        :param colour_dict: mapping of colour name to hex code to draw colours from
+        :param symbol_list: symbols available for marking annotations
+        :param tree: tree to annotate; when given, the original leaf labels are recorded
+        :param node_annotations: existing node annotations (a fresh container is created if None)
+        :param leaf_annotations: existing leaf annotations (a fresh container is created if None)
+        :param annotation_symbols: existing annotation-to-symbol mapping (a fresh dict is created if None)
+        """
+        self.tree = tree
+        # Containers are created per instance: mutable defaults in the signature
+        # would be shared by every NexusAnnotations object.
+        self.node_annotations = defaultdict(list) if node_annotations is None else node_annotations
+        self.leaf_annotations = defaultdict(list) if leaf_annotations is None else leaf_annotations
+        self.annotation_symbols = {} if annotation_symbols is None else annotation_symbols
+        self.colour_dict = colour_dict
+        self.symbol_list = symbol_list
+        self.used_colours = set()
+        self.annotated_nodes = set()
+
+        # if type(self.tree) != PhyloTree:
+        #     raise RuntimeError("NexusAnnotations need the tree to be a PhyloTree object")
+
+        if tree is not None:
+            self.add_original_annotations()
+
+    def add_original_annotations(self):
+        """
+        Add an entry for the original labels of the tree, so we can map back if we need to
+        """
+        for node in self.tree.getNodes():
+            if node.isLeaf():
+                self.leaf_annotations[node] = {"Original" : node.label}
+
+    def add_annotations(self, annotation_dict):
+        """
+        Merge annotations into the entries of all tree nodes whose label is a key of annotation_dict
+        :param annotation_dict: maps node label -> {key: value}
+        """
+        for node in self.tree.getNodes():
+            if node.label in annotation_dict:
+                # setdefault: a node without an entry must start from a dict,
+                # not from the list that defaultdict(list) would create
+                node_entry = self.leaf_annotations.setdefault(node, {})
+                for key, val in annotation_dict[node.label].items():
+                    node_entry[key] = val
+
+    def add_annotation(self, node, key="", annotation=None, annotate_descendants=False):
+        """
+        Set the annotation of a node (replacing any previous one) and make sure
+        every annotation value has a symbol assigned.
+        :param node: the node to annotate
+        :param key: the annotation key
+        :param annotation: iterable of annotation values (defaults to an empty list)
+        :param annotate_descendants: also annotate the nodes returned by node.getDescendants()
+        """
+        if annotation is None:
+            annotation = []
+
+        self.node_annotations[node] = {key:annotation}
+        self.annotated_nodes.add(node)
+
+        # Add in a symbol to represent each annotation if one doesn't already exist
+        for annot in annotation:
+            if annot not in self.annotation_symbols:
+                symbol = self.generate_annotation_symbol(annot)
+                self.add_annotation_symbol(annot, symbol)
+
+        if annotate_descendants:
+            for descendant in node.getDescendants():
+                self.add_annotation(descendant, key, annotation)
+
+    def add_colour_annotation(self, node, colour=None, random_colour=False, colour_descendants=True):
+        """
+        Colour the descendants of a node
+        :param node: the node whose descendants (transitively) are coloured
+        :param colour: the colour to annotate with; an unused colour is generated if not given
+        :param random_colour: currently unused
+        :param colour_descendants: when False nothing is annotated
+        """
+        # NOTE(review): the node itself is only coloured if getDescendants(transitive=True) includes it -- confirm intent.
+        if not colour:
+            colour = self.generate_colour()
+
+        if colour_descendants:
+            for descendant in node.getDescendants(transitive=True):
+                self.node_annotations[descendant] = {"!color": colour}
+                self.annotated_nodes.add(descendant)
+                self.used_colours.add(colour)
+
+    def add_colour_annotations(self, nodes, colour_list=None, colour_descendants=False):
+        """
+        Add multiple colour annotations to a list of nodes
+        :param nodes: A list of lists of nodes to annotate
+        :param colour_list: The colour name (a key of colour_dict) to annotate each list of nodes with;
+                            generated when not given
+        """
+        if not colour_list:
+            colour_list = self.generate_colour_list(len(nodes))
+
+        for index, node_set in enumerate(nodes):
+            set_colour = self.colour_dict[colour_list[index]]
+            for node in node_set:
+                self.add_colour_annotation(node, set_colour, colour_descendants=colour_descendants)
+
+    def generate_annotation_symbol(self, annotation):
+        """
+        Pick a symbol from symbol_list that is not yet assigned to an annotation
+        :param annotation: the annotation the symbol is for (not used for the choice)
+        :return: an unused symbol, or None when every symbol is already taken
+        """
+        taken = list(self.annotation_symbols.values())
+        unused = [symbol for symbol in self.symbol_list if symbol not in taken]
+        # Choosing among the unused symbols always terminates, also when the pool is exhausted
+        return random.choice(unused) if unused else None
+
+    def add_annotation_symbol(self, symbol, annotation):
+        """
+        Store annotation_symbols[symbol] = annotation.
+        NOTE(review): add_annotation calls this as (annotation, symbol), so the mapping
+        effectively is annotation -> symbol and the parameter names read the other way around.
+        """
+        self.annotation_symbols[symbol] = annotation
+
+    def generate_colour(self):
+        """
+        Generate a colour that hasn't been used yet in this set of Nexus Annotations
+        :return: A unique colour (hex code), or None when all colours have been used
+        """
+        unused = [colour for colour in self.colour_dict.values() if colour not in self.used_colours]
+        return random.choice(unused) if unused else None
+
+    def generate_colour_list(self, num):
+        """
+        Generate names of distinct colours that haven't been used yet
+        :param num: the number of colour names required
+        :return: A list of num keys of colour_dict, each mapping to a different unused hex code
+        :raises ValueError: if fewer than num unused colours are left
+        """
+        # Several names can share one hex code (e.g. aqua/cyan); keep one name per code
+        name_by_code = {}
+        for name, code in self.colour_dict.items():
+            if code not in self.used_colours:
+                name_by_code.setdefault(code, name)
+        available = list(name_by_code.values())
+        if num > len(available):
+            raise ValueError("Requested %d colours but only %d unused colours are available" % (num, len(available)))
+        return random.sample(available, num)
+
+
--- a/bed.py
+++ b/bed.py
@@ -6,10 +6,10 @@ class BedEntry():
        self.chrom = chrom
        self.chromStart = chromStart
        self.chromEnd = chromEnd
-        self.blockCount = None
        self.usestrand = False
        self.strand = None
        self.name = ''
+        self.blocks = None # interval tree with blocks

    def addOption(self,
                  name = None,
@@ -32,46 +32,64 @@ class BedEntry():
                  zscore = None,
                  bg = None):
        if name: self.name = name
-        if score: self.score = score
+        if score != None: self.score = score
        if strand:
            self.strand = strand
            self.usestrand = True # use reverse complement when sequence is requested from genome
-        if thickStart: self.thickStart = thickStart
-        if thickEnd: self.thickEnd = thickEnd
-        if itemRgb: self.itemRgb = [int(color) for color in itemRgb.split(',')]
-        if blockCount:
-            self.blockCount = max(0, blockCount)
+        if thickStart != None: self.thickStart = thickStart
+        if thickEnd != None: self.thickEnd = thickEnd
+        if itemRgb != None: self.itemRgb = [int(color) for color in itemRgb.split(',')]
+        if blockCount != None:
            if blockCount > 0:
-                self.blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')]
-                self.blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')]
-                if len(self.blockSizes) != blockCount or len(self.blockStarts) != blockCount:
+                blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')]
+                blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')]
+                if len(blockSizes) != blockCount or len(blockStarts) != blockCount:
                    raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self))
-        if signalValue: self.signalValue = signalValue
-        if pValue: self.pValue = pValue
-        if qValue: self.qValue = qValue
-        if peak: self.peak = peak
-        if tags: self.tags = tags
-        if summit: self.summit = summit
-        if fold: self.fold = fold
-        if fdr: self.fdr = fdr
-        if bg: self.bg = bg
-        if zscore: self.zscore = zscore
+                for i in range(blockCount):
+                    self.addBlock(blockStarts[i], blockSizes[i])
+        if signalValue != None: self.signalValue = signalValue
+        if pValue != None: self.pValue = pValue
+        if qValue != None: self.qValue = qValue
+        if peak != None: self.peak = peak
+        if tags != None: self.tags = tags
+        if summit != None: self.summit = summit
+        if fold != None: self.fold = fold
+        if fdr != None: self.fdr = fdr
+        if bg != None: self.bg = bg
+        if zscore != None: self.zscore = zscore

-    def __str__(self):
-        return str((self.chrom, self.chromStart, self.chromEnd))
+    def __len__(self):
+        if self.blocks:
+            return len(self.blocks)
+        else:
+            return 0

-    def __getitem__(self, i):
-        if self.blockCount:
-            return (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i])
+    def addBlock(self, relative_start, size):
+        if not self.blocks:
+            self.blocks = ival.IntervalTree()
+        self.blocks.put(ival.Interval(self.chromStart + relative_start, self.chromStart + relative_start + size))

-    def __iter__(self):
-        if self.blockCount:
-            for i in range(self.blockCount):
-                if self.blockSizes[i] > 0:
-                    yield (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i])
+    def getBlocks(self):
+        return self.blocks

-    def __len__(self):
-        return self.blockCount
+    def __str__(self):
+        if self.strand == '+' or self.strand == '-':
+           return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd) + self.strand
+        return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd)
+
+    def __iter__(self):
+        if self.blocks:
+            for b in self.blocks:
+                yield (self.chrom, b.ival.min, b.ival.max)
+
+    def isBlockOverlap(self, entry):
+        if not self.blocks:
+            return None
+        if isinstance(entry, BedEntry):
+            if (entry.chrom == self.chrom):
+                return self.blocks.isect(entry.getInterval()) != None
+        elif isinstance(entry, ival.Interval):
+            return self.blocks.isect(entry) != None

    def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None):
        """ Retrieve the genomic location for BED entry, or sequence if genome is provided
@@ -357,7 +375,7 @@ def readBedFile(filename, format = 'Limited'):
                chromStart = int(words[1])
                chromEnd = int(words[2])
            entry = BedEntry(chrom, chromStart, chromEnd)
-            if format.lower().startswith('opt'):
+            if format.lower().startswith('opt') or format.lower().startswith('bed12'):
                if len(words) >= 12:
                    entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11])
                elif len(words) >= 9:
@@ -417,30 +435,73 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
    if header:
        f.write(header + '\n')
    for row in entries:
-        if format == 'Peaks':
-            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
-        elif format == 'Limited':
-            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
-        elif format == 'Strand':
-            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name))
+        if row.blocks: # BED12
+            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.thickStart, row.thickEnd))
+            if row.itemRgb:
+                if len(row.itemRgb) == 3:
+                    f.write("%d,%d,%d\t" % (row.itemRgb[0],row.itemRgb[1],row.itemRgb[2]))
+                else:
+                    f.write("0\t")
+            else:
+                f.write("0\t")
+            f.write("%d\t" % (len(row)))
+            blockStarts = []
+            blockSizes = []
+            for b in row.blocks:
+                blockStarts.append(b.ival.min - row.chromStart)
+                blockSizes.append(len(b.ival))
+            for b in blockSizes:
+                f.write("%d," % (b))
+            f.write("\t")
+            for b in blockStarts:
+                f.write("%d," % (b))
+            f.write("\n")
        else:
-            f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
-        f.write("\n")
+            if format == 'Peaks':
+                f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
+            elif format == 'Limited':
+                f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
+            elif format == 'Strand':
+                f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name))
+            else:
+                f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
+            f.write("\n")
    f.close()

 if __name__ == '__main__':
-    bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional')
+#    bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional')
+    bf = BedFile('/Volumes/Share/ARCDP19/Analysis/Young/Young_flat.bed', 'optional')
    print(bf.chroms.keys())
    g = bf.generate('chr1')
    print(next(g))
    print(next(g))
    print(next(g))
    cnt = 0
+    collect = []
    for entry in bf:
        cnt += 1
        print(str(cnt) + '\t' + str(entry))
-        if cnt == 100:
+        collect.append(entry)
+        if cnt == 7:
+            for b in entry:
+                print('\t', b)
+        if cnt == 10:
            break
+    writeBedFile(collect, '/Users/mikael/Desktop/test.bed')
+    bf2 = BedFile('/Users/mikael/Desktop/test.bed', 'opt')
+    q = ival.Interval(3805000, 3806000)
+    t2 = ival.IntervalTree()
+    t2.put(q, "blah")
+    for entry in bf2:
+        if entry.isBlockOverlap(q):
+            print('Found:', entry)
+            tree = entry.getBlocks()
+            t2.putAll(tree)
+            for t in t2:
+                print(t)
+
+
+
    entry1 = BedEntry('chrX', 3858266, 3858530)
    print(entry1 in bf)
    entry2 = BedEntry('chrX', 10047550, 10067694)

--- a/cell.py
+++ b/cell.py
+def readOBOFile(obofile):
+    """
+    Read/load OBO file that contains ontology defs for
+    Uber anatomy ontology (Uberon), Cell Ontology (CL) and Experimental Factor Ontology (EFO)
+    http://cellontology.org
+    see also http://www.obofoundry.org/
+
+    Returns a dictionary terms[term_id] = (term_name, set((related_term_id, relation))),
+    where relation is 'is_a' or the name given in a 'relationship:' field.
+    Terms flagged with an 'is_obsolete:' field and all [Typedef] entries are ignored.
+
+    Example of one "term" and one "typedef" entry (note CL refers to cell ontology and UBERON to the anatomy ontology:
+
+    [Term]
+    id: CL:0000513
+    name: cardiac muscle myoblast
+    namespace: cell
+    alt_id: CL:0000714
+    def: "A precursor cell destined to differentiate into cardiac muscle cell." [GOC:tfm, MESH:A11.635.470]
+    synonym: "cardiac muscle progenitor cell" EXACT []
+    synonym: "cardiomyocyte progenitor cell" EXACT []
+    xref: FMA:84797
+    is_a: CL:0002494 ! cardiocyte
+    is_a: CL:0010021 ! cardiac myoblast
+    intersection_of: CL:0000056 ! myoblast
+    intersection_of: develops_into CL:0000746 ! cardiac muscle cell
+    intersection_of: part_of UBERON:0001133 ! cardiac muscle tissue
+    relationship: develops_into CL:0000746 ! cardiac muscle cell
+    relationship: part_of UBERON:0001133 ! cardiac muscle tissue
+
+    [Typedef]
+    id: part_of
+    name: part of
+    def: "a core relation that holds between a part and its whole" []
+    xref: BFO:0000050
+    is_transitive: true
+    is_a: overlaps ! overlaps
+    inverse_of: has_part ! has part
+    """
+    terms = {}
+    in_term_def = False
+    term_id = None
+    term_name = None
+    term_is = set()
+    # The with-statement closes the file also when parsing fails
+    with open(obofile, 'r') as src:
+        for line in src:
+            if in_term_def:
+                word = line.split()
+                if line.startswith('id: '):
+                    term_id = word[1].strip()
+                    term_is = set()
+                elif line.startswith('name: '):
+                    term_name = line[6:].strip()
+                elif line.startswith('def: '):
+                    # Note this is a multi-line field, delimited by "'s
+                    pass
+                elif line.startswith('is_a: '):
+                    term_is.add((word[1].strip(), 'is_a'))
+                elif line.startswith('relationship: '):
+                    term_is.add((word[2], word[1]))
+                elif line.startswith('intersection_of: '):
+                    pass
+                elif line.startswith('is_obsolete: '):
+                    in_term_def = False # ignore this entry
+            starts_term = line.startswith('[Term]')
+            if starts_term or line.startswith('[Typedef]'):
+                # already defining a term? stash it before moving on to the next stanza...
+                if in_term_def and term_id is not None:
+                    terms[term_id] = (term_name, term_is)
+                # start from a clean slate so that a stanza without 'id:'/'name:' cannot inherit stale values
+                term_id = None
+                term_name = None
+                term_is = set()
+                in_term_def = starts_term
+    if in_term_def and term_id is not None: #  defining one, stash it
+        terms[term_id] = (term_name, term_is)
+    return terms
+
+def getChildren(terms, parent):
+    """ Collect (term_id, relation) for every term that lists 'parent' among its relations.
+    terms: dictionary terms[term_id] = (name, set((related_term_id, relation)))
+    parent: identifier of the term whose children are sought
+    """
+    return [(term_id, rel)
+            for term_id, (_name, links) in terms.items()
+            for (target, rel) in links
+            if target == parent]
+
+def getParents(terms, child):
+    """ List the (term_id, relation) pairs recorded for the term 'child'.
+    terms: dictionary terms[term_id] = (name, set((related_term_id, relation)))
+    child: identifier of the term whose parents are sought; must be a key of terms
+    """
+    _name, links = terms[child]
+    return [(parent_id, rel) for (parent_id, rel) in links]
+
+def getID(terms, query_name):
+    """ Find the identifier of the first term whose name equals query_name.
+    Returns None if no term carries that name.
+    """
+    for term_id, (name, _links) in terms.items():
+        if name == query_name:
+            return term_id
+    return None
+
+def getName(terms, id):
+    """ Look up the name stored for the term with the given identifier (must be a key of terms). """
+    entry = terms[id]
+    return entry[0]
+
+def listParents(terms, query_name):
+    """ Name all parents of the term called query_name.
+    The term is resolved by name with getID; each parent found by getParents is translated back to its name.
+    """
+    term_id = getID(terms, query_name)
+    return [getName(terms, parent_id) for (parent_id, _rel) in getParents(terms, term_id)]
+
+# Manual smoke test: parse a local OBO file (hard-coded, machine-specific path)
+# and print the number of terms that were read.
+if __name__ == '__main__':
+    terms = readOBOFile('/Users/mikael/simhome/share/cl.obo')
+    print(len(terms))
\ No newline at end of file
--- a/colours.py
+++ b/colours.py
+
+
+
+# Print every colour name together with its hex code.
+# NOTE(review): `colours` is neither defined nor imported in the part of this file that is visible here -- confirm where it comes from.
+for colour_name, hex_code in colours.items():
+    print (colour_name, hex_code)
\ No newline at end of file
--- a/dictionary_test.py
+++ b/dictionary_test.py
+# Scratch check of string slicing: keep the first two characters of a name.
+name = "Gabe"
+
+name = name[:2]
+
+print(name)
\ No newline at end of file
--- a/godata.py
+++ b/godata.py
 '''
-Created on Jul 12, 2012, amended April 2015
+Created on Jul 12, 2012, amended April 2015 and again May 2018

 Module for managing Gene Ontology data, in particular gene:terms
 annotations and term definitions
@@ -26,8 +26,9 @@ Subsequently you can construct instances of BinGO and query terms and genes, rou
 from struct import pack, unpack, calcsize, error
 import operator
 import time
-import os
+import sys
 import stats
+import gzip

 # Character codes used by binary format to identify ontology
 onto_codes = {
@@ -82,7 +83,7 @@ class GO():
    termdefs = {}   # definitions: termdefs[term] = (onto, set((term, rel)), name)
    children = {}   # redundant, parent-to-child structure: children[term] = set((term, rel))

-    def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8)):
+    def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8), include_genes = None):
        """ Start GO session with specified data loaded:
        annotfile: name of annotation file, e.g.'gene_association.tair'
        OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
@@ -91,6 +92,7 @@ class GO():
        (The default seems to work for most annotation files, but sometime if you wish to cross reference
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
+        Optionally, specify what genes should be included when reading the annotations; None (default) means include everything.
        """
        print(("Started at", time.asctime()))
        # Get GO definitions
@@ -110,7 +112,10 @@ class GO():
                    pass
        print(("Read %d GO definitions" % len(terms)))
        # open annotation file to analyse and index data
-        src = open(annotFile, 'r')
+        if annotFile.endswith(".gz"):
+            src = gzip.open(annotFile, 'rt')
+        else:
+            src = open(annotFile, 'rt')
        gene_cnt = 0
        cnt = 0
        for line in src:
@@ -118,14 +123,14 @@ class GO():
            if line.startswith('!'):
                continue
            (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
-            #print(gene, symb, qual, term, evid, onto, taxa)
-            try:
-                (taxa_q, terms_map) = self.annots[gene]
-                terms_map[term] = (evid, qual != 'NOT')
-            except KeyError: # not a previously encountered gene
-                gene_cnt += 1
-                terms_map = {term: (evid, qual != 'NOT')}
-                self.annots[gene] = (taxa, terms_map)
+            if include_genes == None or gene in include_genes:
+                try:
+                    (taxa_q, terms_map) = self.annots[gene]
+                    terms_map[term] = (evid, qual != 'NOT')
+                except KeyError: # not a previously encountered gene
+                    gene_cnt += 1
+                    terms_map = {term: (evid, qual != 'NOT')}
+                    self.annots[gene] = (taxa, terms_map)
        src.close()
        print(("Read annotations for %d genes" % gene_cnt))

@@ -896,6 +901,7 @@ def readOBOFile(obofile):
            in_term_def = True
        if line.startswith('[Typedef]'):
            if in_term_def: # already defining one, stash it before moving on to the next...
+                terms[term_id] = (term_name, term_onto, term_is)
                in_term_def= False
            in_type_def = True
    if in_term_def: #  defining one, stash it
@@ -1037,3 +1043,52 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
    # done, close
    dst.close()
    print(("Completed at", time.asctime()))
+
+def reduceAnnotFile(annotfile, newannotfile, include_genes, annotfile_column_gene = 1):
+    """ Reduce size of annotation file by specifying the genes of interest:
+    annotfile: name of annotation file, e.g. 'goa_uniprot_all.gaf'
+    newannotfile: name of new, reduced-size annotation file, e.g. 'goa_uniprot_some.gaf'
+    include_genes: set/list of gene names to include
+    Optionally, specify what column in the annotation file that contains name of gene product
+    (annotfile_column_gene, zero-based index into the tab-separated fields).
+    If annotfile ends with '.gz' both files are read/written gzip-compressed.
+    Comment lines (starting with '!') are not copied; lines with too few fields are skipped.
+    """
+    print(("Started at", time.asctime()))
+    # membership is tested once per annotation line; a set makes each test O(1)
+    wanted = set(include_genes)
+    opener = gzip.open if annotfile.endswith(".gz") else open
+    cnt0 = 0
+    cnt1 = 0
+    # the with-statement closes both files also when reading or writing fails
+    with opener(annotfile, 'rt') as src, opener(newannotfile, 'wt') as dst:
+        for line in src:
+            if line.startswith('!'):
+                continue
+            # only strip the line terminator: leading empty fields must keep their position
+            fields = line.rstrip('\r\n').split('\t')
+            if len(fields) <= annotfile_column_gene:
+                continue # blank or truncated line, there is no gene field to test
+            cnt0 += 1
+            if fields[annotfile_column_gene] in wanted:
+                dst.write(line)
+                cnt1 += 1
+    print(("Read %d annotations; wrote %d annotations" % (cnt0, cnt1)))
+
+
+# Command line entry point: godata include <gaf-file-src> <gaf-file-dest> <gene-txt-file>
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print('Usage: godata <COMMAND> where <COMMAND> is one of the following',
+        "\n\tinclude <gaf-file-src> <gaf-file-dest> <gene-txt-file> where",
+        "\n\t\t<gaf-file-src> is the original annotation file",
+        "\n\t\t<gaf-file-dest> is the new, reduced-size annotation file",
+        "\n\t\t<gene-txt-file> lists gene names to include in new gaf-file-dest")
+        sys.exit(1)
+    if sys.argv[1].upper() == 'INCLUDE' and len(sys.argv) > 4:
+        # one gene name per line; blank lines are not gene names
+        with open(sys.argv[4]) as f:
+            include_genes = {line.strip() for line in f if line.strip()}
+        reduceAnnotFile(sys.argv[2], sys.argv[3], include_genes)
+    else:
+        # message and offending arguments on a single line
+        print('Unknown command \"' + sys.argv[1] + '\" or arguments:', *sys.argv[2:])
+        sys.exit(1)
--- a/gtf.py
+++ b/gtf.py
+import shlex
+import ival
+
+class GtfEntry():
+
+    '''
+    One feature (line) of a GTF/GFF file.
+
+    GFF fields:
+    seqname - The name of the sequence. Must be a chromosome or scaffold.
+    source - The program that generated this feature.
+    feature - The name of this type of feature. Some examples of standard feature types are "CDS" "start_codon" "stop_codon" and "exon"
+    start - The starting position of the feature in the sequence. The first base is numbered 1.
+    end - The ending position of the feature (inclusive).
+    score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ".".
+    strand - Valid entries include "+", "-", or "." (for don't know/don't care).
+    frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be ".".
+    group - All lines with the same group are linked together into a single item.
+
+    The group field is additionally parsed into the dictionary 'attr': every ';'-separated
+    part that consists of exactly two (shell-style quoted) words becomes attr[word1] = word2.
+    Entries support entry[key] and 'key in entry' on these attributes.
+    '''
+    def __init__(self, chrom, start, end, feature, score = ".", source = "unknown", strand = ".", frame = ".", group = None):
+        self.seqname = chrom
+        self.start = start
+        self.end = end
+        self.feature = feature
+        self.score = score
+        self.strand = strand
+        self.source = source
+        self.frame = frame
+        self.group = group
+        self.attr = {}
+        if self.group:
+            for field in self.group.split(';'):
+                pair = shlex.split(field.strip())
+                if len(pair) == 2: # parts that are not a key/value pair are ignored
+                    self.attr[pair[0]] = pair[1]
+
+    @property
+    def chrom(self):
+        """ Alias for seqname, so that code using the BedEntry-style attribute name 'chrom' works on GTF entries too. """
+        return self.seqname
+
+    def __getitem__(self, item):
+        """ Retrieve the value of a group attribute; raises KeyError if the attribute is absent. """
+        return self.attr[item]
+
+    def __contains__(self, item):
+        """ Check if a group attribute with the given name exists. """
+        return item in self.attr
+
+    def __str__(self):
+        return str((self.seqname, self.start, self.end))
+
+    def __len__(self):
+        """ Difference between end and start. """
+        return self.end - self.start
+
+    def getInterval(self):
+        """ Interval spanning start to end of this entry. """
+        return ival.Interval(self.start, self.end)
+
+
+def dist(entry1, entry2, signed = False, centre2centre = False):
+    """ Distance between the intervals of two GtfEntry instances, as computed by ival.dist
+        on their intervals with the given 'signed' and 'centre2centre' flags.
+        Returns None unless both arguments are GtfEntry instances with the same seqname.
+    """
+    if not (isinstance(entry1, GtfEntry) and isinstance(entry2, GtfEntry)):
+        return None
+    if entry1.seqname == entry2.seqname:
+        first = entry1.getInterval()
+        second = entry2.getInterval()
+        return ival.dist(first, second, signed, centre2centre)
+    return None
+
+
+class GtfFile:
+    """ Read GTF/GFF file.
+
+        Entries are held in self.chroms, a dictionary that maps a sequence name to an
+        ival.IntervalTree holding the entries found on that sequence.
+
+        See http://genome.ucsc.edu/FAQ/FAQformat#format1
+    """
+
+    def __init__(self, entries):
+        """
+        Create a GtfFile instance.
+        :param entries: an iterable of entries or a filename
+        """
+        if isinstance(entries, str): # filename
+            self.chroms = readGtfFile(entries)
+        else:
+            self.chroms = dict()
+            for entry in entries:
+                # NOTE(review): entries are looked up via entry.chrom here, whereas readGtfFile
+                # keys its trees by seqname -- confirm that GtfEntry exposes a chrom attribute
+                # check if the chromosome has been seen before
+                tree = self.chroms.get(entry.chrom)
+                if tree is None:
+                    tree = ival.IntervalTree()
+                    self.chroms[entry.chrom] = tree
+                # put the entry in the interval tree for the appropriate chromosome
+                iv = ival.Interval(entry.start, entry.end)
+                tree.put(iv, entry)
+
+    def __len__(self):
+        """ Return the sum of the sizes reported by the interval trees of all chromosomes. """
+        return sum(len(tree) for tree in self.chroms.values())
+
+    def generate(self, chrom):
+        """ Generate the entries stored for the specified chromosome, one at a time.
+            Nothing is generated if the chromosome is unknown. """
+        mytree = self.chroms.get(chrom)
+        if mytree is not None:
+            for node in mytree: # a node holds one interval and (potentially) multiple values
+                for entry in node.values:
+                    yield entry
+
+    def __iter__(self):
+        """ Prepare iteration over all entries, chromosome by chromosome in sorted order.
+            The iteration state is kept on the instance, so only one iteration can be active at a time. """
+        self.chromqueue = ival.Stack()
+        # push in reverse order so that chromosomes are popped in sorted order
+        for c in sorted(self.chroms, reverse=True):
+            self.chromqueue.push(self.generate(c))
+        if self.chromqueue.isEmpty():
+            self.current = iter(()) # no chromosomes: iteration ends immediately
+        else:
+            self.current = self.chromqueue.pop()
+        return self
+
+    def __next__(self):
+        """ Return the next entry, moving on to the next chromosome when the current one is exhausted. """
+        while True:
+            try:
+                return next(self.current)
+            except StopIteration:
+                # keep going until a chromosome with entries is found, or none are left
+                if self.chromqueue.isEmpty():
+                    raise
+                self.current = self.chromqueue.pop()
+
+    def __contains__(self, item):
+        """ Check if the interval of the specified GtfEntry is found in the tree of its chromosome. """
+        if not isinstance(item, GtfEntry):
+            return False
+        tree = self.chroms.get(item.chrom)
+        if tree is None:
+            return False
+        return ival.Interval(item.start, item.end) in tree
+
+    def getOverlap(self, item):
+        """ Return the list of stored entries that overlap with the specified GtfEntry.
+            Returns None if item is not a GtfEntry or if its chromosome is unknown. """
+        if not isinstance(item, GtfEntry):
+            return None
+        tree = self.chroms.get(item.chrom)
+        if tree is None:
+            return None
+        iv = ival.Interval(item.start, item.end)
+        ret = []
+        for node in tree.isectall(iv):
+            ret.extend(node.values)
+        return ret
+
+    def getClosest(self, item):
+        """ Return the values of the node that the tree reports as closest to the specified GtfEntry.
+            Returns None if item is not a GtfEntry, if its chromosome is unknown or if nothing is found. """
+        if not isinstance(item, GtfEntry):
+            return None
+        tree = self.chroms.get(item.chrom)
+        if tree is None:
+            return None
+        node = tree.closest(ival.Interval(item.start, item.end))
+        if node is None:
+            return None
+        return node.values
+
+    def getOneOfClosest(self, item):
+        """ Return one (arbitrary) entry from getClosest, or None if there is none. """
+        closest = self.getClosest(item)
+        if closest is None:
+            return None
+        return next(iter(closest), None) # None rather than StopIteration if there are no values
+
+    def getOneOfOverlap(self, item):
+        """ Return the first entry from getOverlap, or None if there is none. """
+        overlapping = self.getOverlap(item)
+        if not overlapping: # None or empty list
+            return None
+        return overlapping[0]
+
+def readGtfFile(filename):
+    """ Read a GTF/GFF file.
+
+        Empty lines, comment lines (starting with '#') and 'browser'/'track' lines are ignored.
+        One row that cannot be parsed is tolerated (treated as a header row); any further
+        such row raises a RuntimeError that names the row.
+
+        :param filename: name of the tab-separated file to read
+        :return: a dictionary that maps sequence name to an ival.IntervalTree of GtfEntry instances
+    """
+    chroms = dict()
+    acceptHeaderRows = 1 # the number of header rows that can occur
+    with open(filename) as f: # the file is closed also when parsing fails
+        for row, line in enumerate(f, 1):
+            stripped = line.strip()
+            if not stripped:
+                continue # ignore empty lines
+            words = stripped.split('\t')
+            if words[0].strip().startswith(('#', 'browser', 'track')):
+                continue # comment or browser/track line, ignore
+            try:
+                seqname = words[0]
+                source = words[1]
+                feature = words[2]
+                start = int(words[3])
+                end = int(words[4])
+                # only non-negative integer scores are recognised; anything else is kept as None
+                score = None
+                if words[5].isnumeric():
+                    score = int(words[5])
+                strand = '.'
+                if words[6] == '+' or words[6] == '-':
+                    strand = words[6]
+                frame = None
+                if words[7].isdigit():
+                    frame = int(words[7])
+                group = None
+                if len(words) > 8:
+                    group = words[8]
+                entry = GtfEntry(seqname, start, end, feature, score, source, strand, frame, group)
+                # check if the chromosome has been seen before
+                tree = chroms.get(seqname)
+                if tree is None:
+                    tree = ival.IntervalTree()
+                    chroms[seqname] = tree
+                # put the entry in the interval tree for the appropriate chromosome
+                iv = ival.Interval(entry.start, entry.end)
+                tree.put(iv, entry)
+            except (RuntimeError, ValueError, IndexError) as e:
+                # ValueError: a coordinate is not an integer; IndexError: too few columns
+                if not acceptHeaderRows:
+                    raise RuntimeError('Error in GTF/GFF file at row %d (%s)' % (row, e)) from e
+                acceptHeaderRows -= 1 # count down the number of header rows that can occur
+    return chroms
+
+def writeGtfFile(entries, filename, header = None):
+    """ Save the GTF entries to a file.
+
+        Each entry is written as one tab-separated row. A score or frame that is None is
+        written as '.', the placeholder the GFF/GTF format uses for a missing value.
+
+        :param entries: an iterable of entries (with attributes chrom, source, feature, start, end, score, strand, frame and group)
+        :param filename: name of the file to write (an existing file is overwritten)
+        :param header: optional text written on a line of its own before the entries
+    """
+    with open(filename, 'w') as f: # the file is closed also when writing fails
+        if header:
+            f.write(header + '\n')
+        for row in entries:
+            # '%d' cannot format None, which is what readGtfFile assigns to a missing score
+            score = '.' if row.score is None else '%d' % row.score
+            frame = '.' if row.frame is None else row.frame
+            f.write("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s" % (row.chrom, row.source, row.feature, row.start, row.end, score, row.strand, frame, row.group))
+            f.write("\n")
+
+if __name__ == '__main__':
+    # Ad hoc demonstration: load a GTF file, list its sequence names, print the first
+    # three entries generated for 'chr12', then print (at most) the first 100 entries overall.
+    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
+    print(bf.chroms.keys())
+    g = bf.generate('chr12')
+    for _ in range(3):
+        print(next(g))
+    for cnt, entry in enumerate(bf, 1):
+        print(str(cnt) + '\t' + str(entry))
+        if cnt == 100:
+            break
--- a/guide.py
+++ b/guide.py
@@ -1074,25 +1074,29 @@ class PhyloNode:
            # allocate a position to put the right child symbol from which each current node symbol score was determined
            self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
            for col in range(aln.alignlen):
+                # left child will contribute first
                for a_parent in range(len(aln.alphabet)):
                    best_score_left = +9999999
-                    best_score_right = +9999999
                    best_symb_left = 0
-                    best_symb_right = 0
-                    for a_left in range(len(aln.alphabet)):
-                        score = (scoresleft[col][a_left] + (
-                        1 if a_left != a_parent else 0))  # if we want to weight scores, this would need to change
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresleft[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_left:
-                            best_symb_left = a_left
+                            best_symb_left = a
                            best_score_left = score
-                    for a_right in range(len(aln.alphabet)):
-                        score = (scoresright[col][a_right] + (
-                        1 if a_right != a_parent else 0))  # if we want to weight scores, this would need to change
+                    self.seqscores[col][a_parent] = best_score_left
+                    self.backleft[col][a_parent] = best_symb_left
+                # right child will contribute next
+                for a_parent in range(len(aln.alphabet)):
+                    best_score_right = +9999999
+                    best_symb_right = 0
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresright[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_right:
-                            best_symb_right = a_right
+                            best_symb_right = a
                            best_score_right = score
-                    self.seqscores[col][a_parent] = best_score_left + best_score_right
-                    self.backleft[col][a_parent] = best_symb_left
+                    self.seqscores[col][a_parent] += best_score_right
                    self.backright[col][a_parent] = best_symb_right
        else:
            self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in

--- a/ival.py
+++ b/ival.py
@@ -10,6 +10,10 @@ class IntervalTree:
    stack = None

    def __iter__(self):
+        """
+        Create an iterator for the tree; this will iterate through nodes of the tree. Note that a node has one interval and (potentially) multiple values.
+        :return: iterator of IntervalNodes stored in this tree
+        """
        self.current = self.root
        self.stack = Stack()
        return self
@@ -26,7 +30,10 @@ class IntervalTree:
        return ret

    def __len__(self):
-        return self.root.N
+        if self.root != None:
+            return self.root.N
+        else:
+            return 0

    def __contains__(self, ival):
        return self.get(ival) != None
@@ -88,6 +95,10 @@ class IntervalTree:
        else:
            self.root = self._randomizedInsert(self.root, ival, value)

+    def putAll(self, tree):
+        """
+        Add every interval of another tree, together with each of its values, to this tree.
+        Values are added one at a time, since put associates a single value with an interval.
+        :param tree: the IntervalTree to copy intervals and values from
+        """
+        for node in tree: # iteration visits nodes; a node has one interval and a set of values
+            iv = node.getInterval()
+            if not node.values:
+                # keep intervals that carry no values
+                # NOTE(review): assumes put accepts None as "no value", as IntervalNode does -- confirm
+                self.put(iv, None)
+            for value in node.values:
+                self.put(iv, value)
+
    def _randomizedInsert(self, node, ival, value):
        if node == None: return IntervalNode(ival, value)
        if random.uniform(0,1) * node.N < 1.0: return self._rootInsert(node, ival, value)
@@ -176,7 +187,33 @@ class IntervalNode:
        if value != None:
            self.values.add(value)

+    def getInterval(self):
+        """
+        Retrieve the interval that defines this node
+        :return: the interval (the object held in self.ival, not a copy)
+        """
+        return self.ival
+
+    def getMin(self):
+        """
+        Retrieve the min value for the interval of this node
+        :return: the min value of the interval (self.ival.min)
+        """
+        return self.ival.min
+
+    def getMax(self):
+        """
+        Retrieve the max value for the interval of this node
+        :return: the max value of the interval (self.ival.max)
+        """
+        return self.ival.max
+
    def add(self, value):
+        """
+        Add a new value to this node, to the set of values associated with the interval
+        :param value:
+        :return:
+        """
        if value:
            self.values.add(value)

@@ -294,6 +331,9 @@ class Interval:
    def __sizeof__(self):
        return self.max - self.min

+    def __len__(self):
+        """ Length of the interval, computed as max minus min (the same value that __sizeof__ returns).
+            NOTE(review): len() only accepts a non-negative integer result, so this presumes
+            integer bounds with max >= min -- confirm. """
+        return self.max - self.min
+
    def dist(self, that, signed = False, centre2centre = False):
        """ Calculate and return the closest distance (from one end of the interval of this to the end of the interval of that).
            If centre2centre is True, use the centre-to-centre distance instead.

--- a/ml.py
+++ b/ml.py
@@ -299,6 +299,17 @@ def eucdist(v1, v2):
        diff += (v1[i] - v2[i])**2
    return math.sqrt(diff)

+def cosdist(v1, v2):
+    """ Calculate and return the cosine distance between two vectors,
+        i.e. 1 minus the cosine of the angle between them.
+        Returns None if the vectors differ in length, or if the distance is undefined
+        because one of the vectors has zero magnitude (this includes empty vectors). """
+    if len(v1) != len(v2):
+        return None
+    dot = 0
+    sqsum1 = 0
+    sqsum2 = 0
+    for a, b in zip(v1, v2):
+        dot += a*b
+        sqsum1 += a*a
+        sqsum2 += b*b
+    norm = math.sqrt(sqsum1*sqsum2)
+    if norm == 0:
+        return None # no angle can be defined for a zero vector; avoid division by zero
+    return 1 - (dot / norm)



--- a/phylo.py
+++ b/phylo.py
@@ -3,6 +3,9 @@ Module with methods and classes for phylogeny.
 @author: mikael
 '''
 import sequence
+from collections import defaultdict
+import annotations
+

 class PhyloTree:
    """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
@@ -22,11 +25,20 @@ class PhyloTree:
        self.aln = aln
        self.root._assignAlignment(aln)

+    def putAnnotations(self, nexus_annotations):
+        """ Associate annotations with this tree.
+            Entries in the leaf annotations that are keyed by the label of a node in this tree
+            are re-keyed by the node (PhyloNode) itself; other entries are left as they are. """
+        self.nexus_annotations = nexus_annotations
+        leaf_annotations = nexus_annotations.leaf_annotations
+        for node in self.getNodes():
+            label = node.label
+            if label not in leaf_annotations:
+                continue
+            # move the entry from the text label to the node
+            leaf_annotations[node] = leaf_annotations.pop(label)
+
    def __str__(self):
        """ Produce a printable representation of the tree, specifically the root of the tree. """
        return str(self.root)

-    def strSequences(self, start = None, end = None):
+    def strSequences(self, start=None, end=None):
        """ Produce a sequence representation of the tree, specifically the root of the tree.
            Specify the start and end positions in the alignment for the sequence to be printed
            (if None the min and max positions will be used). """
@@ -40,7 +52,19 @@ class PhyloTree:
            Returns None if not found."""
        return self.root._findLabel(label)

-    def getDescendantsOf(self, node, transitive = False):
+    def getNodes(self, strategy = 'DEPTH-FIRST'):
+        """ Returns all nodes as a list.
+            strategy decides the order of the nodes: if it starts with 'DEPTH' (case ignored, the default),
+            nodes are listed depth-first, with the right subtree of a node before its left subtree;
+            otherwise nodes are listed breadth-first, level by level from the root, left before right.
+            Returns an empty list if the tree has no root. """
+        from collections import deque
+        if self.root is None:
+            return []
+        depth_first = strategy.upper().startswith('DEPTH')
+        nodes = []
+        pending = deque([self.root])
+        while pending:
+            # a stack (pop from the end) gives depth-first, a queue (pop from the front) gives breadth-first
+            node = pending.pop() if depth_first else pending.popleft()
+            nodes.append(node)
+            if node.left: pending.append(node.left)
+            if node.right: pending.append(node.right)
+        return nodes
+
+    def getDescendantsOf(self, node, transitive=False):
        """ Retrieve and return the (list of) descendants (children) of a specified node.
            Node can be the label or the instance.
            transitive indicates if only the direct descendants (False) or if all descendants
@@ -53,7 +77,7 @@ class PhyloTree:
            return node.getDescendants(transitive)
        return None

-    def getAncestorsOf(self, node, transitive = False):
+    def getAncestorsOf(self, node, transitive=False):
        """ Retrieve and return the ancestor (transitive=False) or
            ancestors (transitive=True) of a specified node.
            Node can be the label or the instance.
@@ -71,18 +95,18 @@ class PhyloTree:
                if myroot.left == node or myroot.right == node:
                    found = True
                    break
-                if myroot.left != None: # myroot has a "left" child
+                if myroot.left != None:  # myroot has a "left" child
                    # check if the "left" child of "myroot" is the ancestor of "node"
-                    if myroot.left.isAncestorOf(node, transitive = True): # if yes,
-                        myroot = myroot.left    # move to the "left" child
-                    else:                       # if not,
-                        myroot = myroot.right   # move to the "right" child
-                else: # myroot does NOT have a "left" child, so let's move "right"
+                    if myroot.left.isAncestorOf(node, transitive=True):  # if yes,
+                        myroot = myroot.left  # move to the "left" child
+                    else:  # if not,
+                        myroot = myroot.right  # move to the "right" child
+                else:  # myroot does NOT have a "left" child, so let's move "right"
                    myroot = myroot.right
            if found and transitive:
                return branching
            elif found and len(branching) > 0:
-                return branching[len(branching)-1]
+                return branching[len(branching) - 1]
            return None

    def parsimony(self):
@@ -90,12 +114,73 @@ class PhyloTree:
            i.e. find the sequences on each of the internal nodes.
            See Jones and Pevzner, p. 368 and onwards, for details. """
        self.root._forwardParsimony(self.aln)  # setup and compute scores for all nodes
-        self.root._backwardParsimony(self.aln) # use scores to determine sequences
-        return self.root.getSequence() # return the sequence found at the root
+        self.root._backwardParsimony(self.aln)  # use scores to determine sequences
+        return self.root.getSequence()  # return the sequence found at the root

    def canonise(self):
        self.root._canonise()

+    def swap_annotations(self, annotation_key):
+        """ Replace the label of each leaf with the value that the leaf annotations of this tree
+            hold for that leaf under annotation_key.
+            This is done on a best-effort basis: if the tree has no annotations, or a leaf has no
+            value for the key, the method stops quietly; leaves visited before that point keep
+            their new label. """
+        try:
+            for node in self.getNodes():
+                if node.isLeaf():
+                    node.label = self.nexus_annotations.leaf_annotations[node][annotation_key]
+        except (AttributeError, LookupError, TypeError):
+            # missing annotations, leaf or key; anything else (e.g. KeyboardInterrupt) is not swallowed
+            return
+
+    def write_to_nexus(self, out_path, write_annotations=True, nexus_annotations=None, exclude_annotations=None, use_symbols=False):
+        """
+        Writes out the tree to NEXUS format, with any annotations stored in nexus_annotations added to the file.
+        The labels of the nodes are only changed while the file is written; they are restored afterwards.
+        :param out_path: The path to write the NEXUS file to
+        :param write_annotations: If True, add annotations to the file; if False, write the tree as it is
+        :param nexus_annotations: The NexusAnnotations containing the annotations; if not supplied, the annotations associated with the tree are used
+        :param exclude_annotations: Annotation keys that should not be written (default is to exclude none)
+        :param use_symbols: If True, write node annotations using their associated annotation symbols
+        :raises RuntimeError: if annotations are to be written but none are supplied or associated with the tree
+        """
+        if exclude_annotations is None:
+            exclude_annotations = []
+
+        if write_annotations and not nexus_annotations:
+            # the tree may never have been associated with annotations (see putAnnotations)
+            nexus_annotations = getattr(self, 'nexus_annotations', None)
+            if not nexus_annotations:
+                raise RuntimeError("This tree file has no associated annotation file. Either associate or supply one as a parameter.")
+
+        nodes = self.getNodes()
+        # annotating rewrites the labels of the nodes, so remember them to be able to undo that
+        original_labels = [(node, node.label) for node in nodes]
+
+        try:
+            if write_annotations:
+                for node in nodes:
+                    if node in nexus_annotations.node_annotations:
+                        node.annotate_node(nexus_annotations.node_annotations, nexus_annotations.annotation_symbols, exclude_annotations, use_symbols)
+
+            # the tree is rendered before the leaves are annotated: leaf annotations go in the taxa block
+            tree_annotation = str(self) + ";"
+
+            if write_annotations:
+                self.swap_annotations("Original")
+
+                for node in nodes:
+                    if node in nexus_annotations.leaf_annotations:
+                        node.annotate_node(nexus_annotations.leaf_annotations, exclude_annotations=exclude_annotations)
+
+            leaves = [node.label for node in nodes if node.isLeaf()]
+            leaf_annotation = "".join("\n\t%s" % (leaf,) for leaf in leaves)
+
+            with open(out_path, "w") as file:
+                file.write(
+                    "#NEXUS\nbegin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels%s\n;\nend;\n\nbegin trees;\n\ttree tree_1 = "
+                    "[&R] %s\nend;" % (len(leaves), leaf_annotation, tree_annotation))
+        finally:
+            for node, label in original_labels:
+                node.label = label
+
+
+
 class PhyloNode:
    """ A class for a node in a rooted, binary (bifurcating) tree.
        Contains pointers to descendants/daughters (left and right),
@@ -104,7 +189,7 @@ class PhyloNode:
        A number of methods are named with a _ prefix. These can be, but
        are not intended to be used from outside the class. """

-    def __init__(self, label = ''):
+    def __init__(self, label=''):
        """ Initialise an initially unlinked node.
            Populate fields left and right to link it with other nodes.
            Set label to name it.
@@ -117,10 +202,13 @@ class PhyloNode:
        self.data = None
        self.label = label
        self.dist = None
-        self.sequence = None # The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
-        self.seqscores = None # The scores propagated from leaves via children
-        self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
-        self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
+        self.sequence = None  # The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
+        self.seqscores = None  # The scores propagated from leaves via children
+        self.backleft = None  # Pointers back to left child: what symbol rendered current/parent symbols
+        self.backright = None  # Pointers back to right child: what symbol rendered current/parent symbols
+
+    def isLeaf(self):
+        """ Returns True if this node is a leaf, i.e. has neither a left nor a right child. """
+        return self.left is None and self.right is None

    def __str__(self):
        """ Returns string with node (incl descendants) in a Newick style. """
@@ -137,26 +225,26 @@ class PhyloNode:
                return label + dist
            else:
                return '(' + left + ',' + right + ')' + label + dist
-        else: # there is no label
+        else:  # there is no label
            if not self.left and self.right:
-                return ','+right
+                return ',' + right
            elif self.left and not self.right:
-                return left+','
+                return left + ','
            elif self.left and self.right:
                return '(' + left + ',' + right + ')' + dist
-                
-    def __le__(self, other):
-        """ Returns indication of less than other node. """
-        return other and self.__hash__() <= other.__hash__()
-    	
-    def __eq__(self, other):
-        """ Returns indication of equivalence to other node. """
-        return other and self.__hash__() == other.__hash__()
-
-    def __hash__(self):
-        """ Returns hash of object. """
-        return hash((self.label, self.dist, self.sequence))
-        
+
+    # def __le__(self, other):
+    #     """ Returns indication of less than other node. """
+    #     return other and self.__hash__() <= other.__hash__()
+    #
+    # def __eq__(self, other):
+    #     """ Returns indication of equivalence to other node. """
+    #     return other and self.__hash__() == other.__hash__()
+    #
+    # def __hash__(self):
+    #     """ Returns hash of object. """
+    #     return hash((self.label, self.dist, self.sequence))
+
    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style. """
        left = right = label = dist = ''
@@ -172,11 +260,11 @@ class PhyloNode:
                return label + dist
            else:
                return '(' + left + ',' + right + ')' + label + dist
-        else: # there is no label
+        else:  # there is no label
            if not self.left and self.right:
-                return ','+right
+                return ',' + right
            elif self.left and not self.right:
-                return left+','
+                return left + ','
            elif self.left and self.right:
                return '(' + left + ',' + right + ')' + dist

@@ -196,10 +284,10 @@ class PhyloNode:
    def _propagateDistance(self, parent_dist):
        """ Convert absolute distances to relative.
            The only parameter is the absolute distance to the parent of this node. """
-        travelled = self.dist               # absolute distance to this node
-        self.dist = parent_dist - self.dist # relative distance to this node
-        if self.left != None:               # if there is a child node...
-            self.left._propagateDistance(travelled) # pass absolute distance to this node
+        travelled = self.dist  # absolute distance to this node
+        self.dist = parent_dist - self.dist  # relative distance to this node
+        if self.left != None:  # if there is a child node...
+            self.left._propagateDistance(travelled)  # pass absolute distance to this node
        if self.right != None:
            self.right._propagateDistance(travelled)

@@ -217,7 +305,7 @@ class PhyloNode:
                break

    def _canonise(self):
-        if self.left == None and self.right == None: # at leaf
+        if self.left == None and self.right == None:  # at leaf
            return self.label
        myleft = self.left._canonise()
        myright = self.right._canonise();
@@ -232,8 +320,8 @@ class PhyloNode:
        """ Internal function that operates recursively to first initialise each node (forward),
            stopping only once a sequence has been assigned to the node,
            then to propagate scores from sequence assigned nodes to root (backward). """
-        if self.sequence == None: # no sequence has been assigned
-            if self.left == None and self.right == None:    # no children, so terminal, cannot propagate scores
+        if self.sequence == None:  # no sequence has been assigned
+            if self.left == None and self.right == None:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
            scoresleft = scoresright = None
            if self.left != None:
@@ -242,7 +330,6 @@ class PhyloNode:
                scoresright = self.right._forwardParsimony(aln)
            # for each position in the alignment,
            # introduce (initially zero) score for each symbol in alphabet
-	#Project "Substitution weights" should focus on this line of code
            self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
            # for each position in the alignment,
            # allocate a position to put the left child symbol from which each current node symbol score was determined
@@ -250,37 +337,44 @@ class PhyloNode:
            # allocate a position to put the right child symbol from which each current node symbol score was determined
            self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
            for col in range(aln.alignlen):
+                # left child will contribute first
                for a_parent in range(len(aln.alphabet)):
                    best_score_left = +9999999
-                    best_score_right = +9999999
                    best_symb_left = 0
-                    best_symb_right = 0
-                    for a_left in range(len(aln.alphabet)):
-                        score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresleft[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_left:
-                            best_symb_left = a_left
+                            best_symb_left = a
                            best_score_left = score
-                    for a_right in range(len(aln.alphabet)):
-                        score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
+                    self.seqscores[col][a_parent] = best_score_left
+                    self.backleft[col][a_parent] = best_symb_left
+                # right child will contribute next
+                for a_parent in range(len(aln.alphabet)):
+                    best_score_right = +9999999
+                    best_symb_right = 0
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresright[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_right:
-                            best_symb_right = a_right
+                            best_symb_right = a
                            best_score_right = score
-                    self.seqscores[col][a_parent] = best_score_left + best_score_right
-                    self.backleft[col][a_parent] = best_symb_left
+                    self.seqscores[col][a_parent] += best_score_right
                    self.backright[col][a_parent] = best_symb_right
        else:
-            self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
+            self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
+                              self.sequence]  # if we want to weight scores, this would need to change
        return self.seqscores

-    def _backwardParsimony(self, aln, seq = None):
+    def _backwardParsimony(self, aln, seq=None):
        """ Internal function that operates recursively to inspect scores to determine
            most parsimonious sequence, from root to leaves. """
-        if self.sequence == None: # no sequence has been assigned
+        if self.sequence == None:  # no sequence has been assigned
            leftbuf = []
            rightbuf = []
-            if self.left == None and self.right == None:    # no children, so terminal, cannot propagate scores
+            if self.left == None and self.right == None:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
-            if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
+            if seq == None:  # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
                currbuf = []
                for col in range(aln.alignlen):
                    min_score = 999999
@@ -296,8 +390,8 @@ class PhyloNode:
                    currbuf.append(aln.alphabet[min_symb])
                    leftbuf.append(aln.alphabet[left_symb])
                    rightbuf.append(aln.alphabet[right_symb])
-                self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy = True)
-            else: # Non-root, but not leaf
+                self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
+            else:  # Non-root, but not leaf
                self.sequence = seq
                col = 0
                for sym_parent in self.sequence:
@@ -307,20 +401,20 @@ class PhyloNode:
                    leftbuf.append(aln.alphabet[left_symb])
                    rightbuf.append(aln.alphabet[right_symb])
                    col += 1
-            self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
-            self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
+            self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy=True))
+            self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy=True))
        return self.sequence

    def getSequence(self):
        """ Get the sequence for the node. Return None if no sequence is assigned.
            Requires that an alignment is associated with the tree, and that sequence names match node labels.
            If the explored node is not a leaf, the sequence can be determined by parsimony. """
-        if self.sequence != None: # a sequence has been assigned
+        if self.sequence != None:  # a sequence has been assigned
            return self.sequence
-        elif self.seqscores != None: # inferred by parsimony but not yet assigned
-            return None # determine most parsimonous sequence, not yet implemented
+        elif self.seqscores != None:  # inferred by parsimony but not yet assigned
+            return None  # determine most parsimonous sequence, not yet implemented

-    def isAncestorOf(self, node, transitive = True):
+    def isAncestorOf(self, node, transitive=True):
        """ Decide if this node is the ancestor of specified node.
            If transitive is True (default), all descendants are included.
            If transitive is False, only direct descendants are included. """
@@ -335,7 +429,7 @@ class PhyloNode:
        else:
            return False

-    def getDescendants(self, transitive = False):
+    def getDescendants(self, transitive=False):
        """ Retrieve and return (list of) nodes descendant of this.
            If transitive is False (default), only direct descendants are included.
            If transitive is True, all descendants are (recursively) included. """
@@ -355,10 +449,37 @@ class PhyloNode:
            children.extend(grandchildren)
            return children

+    def annotate_node(self, annotations, annotation_symbols= None, exclude_annotations = None, use_symbols=False ):
+        """ Add the annotations of this node to its label, formatted as [&key="value value",key="value"].
+            annotations is a dictionary that maps a node to a dictionary of its annotations (key to value or list of values).
+            annotation_symbols maps annotation values to symbols; it is only used if use_symbols is True.
+            exclude_annotations holds the keys that should be left out (default is to exclude none).
+            If use_symbols is True and every value of a key has a symbol, the (sorted) symbols are written instead of the values.
+            If the label contains a ':' the annotation is inserted before it; otherwise it is put at the end of the label.
+            The label is not changed if there is nothing to add. """
+        if exclude_annotations is None:
+            exclude_annotations = ()
+
+        parts = []
+        for key, values in annotations[self].items():
+            if key in exclude_annotations:
+                continue
+            if not isinstance(values, list):
+                values = [values]
+            if not values:
+                continue # nothing to write for this key
+            # symbols replace the values only if each value has one; a partial replacement is not attempted
+            if use_symbols and annotation_symbols and all(val in annotation_symbols for val in values):
+                values = sorted(annotation_symbols[val] for val in values)
+            # each key is written once, with all of its values separated by spaces
+            parts.append('%s="%s"' % (key, ' '.join('%s' % (val,) for val in values)))
+
+        if not parts:
+            return
+
+        annotation_string = "[&" + ",".join(parts) + "]"
+        # keep the ':' (and everything that follows it) when the label contains one
+        name, colon, rest = self.label.partition(":")
+        self.label = name + annotation_string + colon + rest
+
 """ ----------------------------------------------------------------------------------------
    Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
    ----------------------------------------------------------------------------------------"""

+
 def runUPGMA(aln, measure, absoluteDistances=False):
    """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
        Use specified distance metric (see sequence.calcDistances).
@@ -413,11 +534,13 @@ def runUPGMA(aln, measure, absoluteDistances=False):
        z.dist = 0.0  # root z is at distance 0 from merged x and y
    return PhyloTree(z)  # make it to tree, return

+
 """ ----------------------------------------------------------------------------------------
    Methods for processing files of trees on the Newick format
    ----------------------------------------------------------------------------------------"""

-def _findComma(string, level = 0):
+
+def _findComma(string, level=0):
    """ Find first comma at specified level of embedding """
    mylevel = 0
    for i in range(len(string)):
@@ -429,11 +552,12 @@ def _findComma(string, level = 0):
            return i
    return -1

+
 def parseNewickNode(string):
    """ Utility function that recursively parses embedded string using Newick format. """
    first = string.find('(')
-    last = string[::-1].find(')') # look from the back
-    if first == -1 and last == -1: # we are at leaf
+    last = string[::-1].find(')')  # look from the back
+    if first == -1 and last == -1:  # we are at leaf
        y = string.split(':')
        node = PhyloNode(y[0])
        if len(y) >= 2:
@@ -441,7 +565,7 @@ def parseNewickNode(string):
        return node
    elif first >= 0 and last >= 0:
        # remove parentheses
-        last = len(string) - last - 1 # correct index to refer from start instead of end of string
+        last = len(string) - last - 1  # correct index to refer from start instead of end of string
        embed = string[first + 1:last]
        tail = string[last + 1:]
        # find where corresp comma is
@@ -451,7 +575,7 @@ def parseNewickNode(string):
        left = embed[0:comma].strip()
        right = embed[comma + 1:].strip()
        y = tail.split(':')
-        node = PhyloNode(y[0]) #node is an instance of the PhyloNode() class
+        node = PhyloNode(y[0])  # node is an instance of the PhyloNode() class
        if len(y) >= 2:
            node.dist = float(y[1])
        node.left = parseNewickNode(left)
@@ -460,6 +584,7 @@ def parseNewickNode(string):
    else:
        raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')

+
 def parseNewick(string):
    """ Main method for parsing a Newick string into a (phylogenetic) tree.
        Handles labels (on both leaves and internal nodes), and includes distances (if provided).
@@ -468,6 +593,7 @@ def parseNewick(string):
        string = string[:string.find(';')]
    return PhyloTree(parseNewickNode(string))

+
 def readNewick(filename):
    """ Read file on Newick format.
        Returns an instance of a PhyloTree."""
@@ -475,6 +601,90 @@ def readNewick(filename):
    string = ''.join(f)
    return parseNewick(string)

+
 def writeNewickFile(filename, my_tree):
     """ Write the string form of my_tree to filename (no trailing newline is added).
         The file is created, or overwritten if it already exists. """
     with open(filename, 'w') as out:
         out.write(str(my_tree))
+
+
+def read_nexus(filename):
+    """ Read file on Nexus format.
+        filename: path of the Nexus file
+        Returns whatever parse_nexus builds from the file content. """
+    # parse_nexus reads the whole handle before doing anything else, so the file can be closed on return
+    with open(filename) as f:
+        return parse_nexus(f)
+
+def parse_nexus(string):
+    """ Parse Nexus content into a PhyloTree that carries the taxon annotations.
+        string: an open file-like object (anything with a read() method) or the Nexus text itself
+        The following parts of the content are used:
+          - "dimensions ntax=N;"  the taxon count, which is only reported on standard output
+          - "taxlabels"           one taxon per line up to the line starting with ";", each optionally
+                                  followed by annotations written as [&key=value,key="quoted value"]
+          - "begin trees"         the Newick string after "[&R]" on the line that follows
+        Returns the tree produced by parseNewick, after a NexusAnnotations object holding the parsed
+        annotations has been attached with putAnnotations.
+        Raises RuntimeError if no "[&R]" tree line is found. """
+    import re  # local import: the block cannot assume that the module already imports re
+
+    text = string.read() if hasattr(string, "read") else string
+    lines = text.split("\n")
+
+    # key=value pairs; a value is either double-quoted (and may then hold commas) or runs to the next "," or "]"
+    pair_pattern = re.compile(r'([^=,\[\]]+)=(?:"([^"]*)"|([^,\]]*))')
+
+    annotation_dict = defaultdict(dict)
+    taxon_number = None
+    tree = None
+    for num, line in enumerate(lines):
+        stripped = line.strip()
+        if stripped.startswith("dimensions ntax="):
+            taxon_number = stripped.split("dimensions ntax=")[1].split(";")[0]
+
+        if stripped.startswith("taxlabels"):
+            taxon_num = num + 1
+            # Stop at the terminating ";" line, or at the end of the content if it is missing
+            while taxon_num < len(lines) and not lines[taxon_num].strip().startswith(";"):
+                taxon_line = lines[taxon_num]
+                taxon_name = taxon_line.split("[")[0].strip()
+                # Taxa without an annotation comment are left out of the dictionary
+                if "[&" in taxon_line:
+                    for found in pair_pattern.finditer(taxon_line.split("[&")[1]):
+                        annot_key, quoted_val, bare_val = found.groups()
+                        annot_val = quoted_val if quoted_val is not None else bare_val
+                        annotation_dict[taxon_name][annot_key.strip()] = annot_val
+                taxon_num += 1
+
+        if stripped.startswith("begin trees"):
+            tree_num = num + 1
+            if tree_num < len(lines) and "[&R]" in lines[tree_num]:
+                tree = lines[tree_num].split("[&R]")[1]
+
+    if tree is None:
+        raise RuntimeError('Invalid format: no "[&R]" tree found after "begin trees"')
+
+    phylo_tree = parseNewick(tree)
+
+    # Hand the annotations collected from the taxlabels block over to the tree
+    nexus_annotations = annotations.NexusAnnotations(tree=phylo_tree)
+    nexus_annotations.add_annotations(annotation_dict)
+    phylo_tree.putAnnotations(nexus_annotations)
+
+    if taxon_number is not None:
+        print("Number of taxons is %s " % (taxon_number))
+    return phylo_tree
+
+
+""" ----------------------------------------------------------------------------------------
+    Method for generating a PhyloTree with unique tip names
+    ----------------------------------------------------------------------------------------"""
+
+def get_unique_tree(tree):
+    """ Make the leaf labels of a tree unique by appending a running number to repeated labels.
+        The first leaf with a given label keeps it; later leaves become label2, label3, ...
+        A number is skipped if the resulting label is already used by another leaf.
+        Internal nodes are not touched.
+        The tree is modified in place and the same tree is returned. """
+    leaves = [node for node in tree.getNodes() if node.isLeaf()]
+    taken = set(leaf.label for leaf in leaves)  # every label in use, original or generated
+    seen = set()    # original labels that have already been met once
+    counters = {}   # last number handed out for each repeated label
+
+    for leaf in leaves:
+        base = leaf.label
+        if base not in seen:
+            seen.add(base)
+            continue
+        count = counters.get(base, 1)
+        while True:
+            count += 1
+            candidate = base + str(count)
+            if candidate not in taken:
+                break
+        counters[base] = count
+        taken.add(candidate)
+        leaf.label = candidate
+    return tree
+
+def unpack_list(list):
+    """ Return the items of list as text, separated by single spaces and followed by "!".
+        Each item is formatted with %s; an empty list gives "!".
+        The parameter name shadows the builtin but is kept so that keyword callers still work. """
+    # Format item by item: handing a generator to % would count as one argument, not one per %s
+    return " ".join("%s" % (item,) for item in list) + "!"
+
--- a/sequence.py
+++ b/sequence.py
@@ -234,7 +234,7 @@ def parseDefline(string):
    if len(string) == 0: return ('', '', '', '')
    s = string.split()[0]
    if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s):            arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
-    elif re.match("^tr\|[A-Z][A-Z0-9]{5}\|\S+", s):          arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
+    elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("^gi\|[0-9]*\|\S+\|\S+", s):               arg = s.split('|');  return (arg[1], arg[3], arg[0], arg[2])
    elif re.match("gb\|\S+\|\S+", s):                        arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("emb\|\S+\|\S+", s):                       arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
@@ -766,9 +766,9 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
            match  = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
-            delete = S[i-1, j  ] + gap
-            insert = S[i  , j-1] + gap
-            S[i, j] = max([match, delete, insert])
+            fromTop = S[i-1, j  ] + gap 
+            fromLeft = S[i  , j-1] + gap 
+            S[i, j] = max([match, fromTop, fromLeft])
    # Traceback the optimal alignment
    alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
    alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
@@ -825,9 +825,9 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
            match  = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
-            delete = S[i-1, j  ] + gap
-            insert = S[i  , j-1] + gap
-            S[i, j] = max([match, delete, insert, 0])  # Local: add option that we re-start alignment from "0"
+            fromTop = S[i-1, j  ] + gap 
+            fromLeft = S[i  , j-1] + gap 
+            S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
    # Trace back the optimal alignment
    alignA = ''
    alignB = ''

--- a/stats.py
+++ b/stats.py
@@ -430,7 +430,8 @@ def getRSpval(a, b):
    for elem in b:
        lst.append([elem, -1, 0])
    # ok sort it
-    lst.sort(lambda p, q: cmp(p[0], q[0]))
+    # Sort in place by value: sorted() returns a new list, which would leave lst itself unordered
+    lst.sort(key=lambda x: x[0])

    # let's go through it and edit each rank
    rank=0
@@ -492,7 +493,7 @@ def getRSpval(a, b):
    za=((ta_obs-ta_null)+da)/sd           # the z value for A which is the mirror of ...
    zb=((tb_obs-tb_null)+db)/sd           # the z value for B (we only need one)
    p=f(za)                        # figure out the area of the normal distribution
-    u=ua;                                 # remember one of the U values
+    u=ua                                 # remember one of the U values
    return p                              # the p-value: null is that a==b, one-sided (a has lower values)



--- a/sym.py
+++ b/sym.py
@@ -126,6 +126,7 @@ RNA_Alphabet = Alphabet('ACGU')
 Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
 Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
 Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
+Protein_wGAP  = Alphabet('ACDEFGHIKLMNPQRSTVWY-')
 DSSP_Alphabet = Alphabet('GHITEBSC')
 DSSP3_Alphabet = Alphabet('HEC')


--- a/webservice.py
+++ b/webservice.py
@@ -33,7 +33,6 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    # Get the entry
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
-        print (type(data))
        if data.startswith("ERROR"):
            raise RuntimeError(data)
        return data