EDITOR=emacsclient

eff5f374 · Mikael Boden · f5d19d0a · eff5f374 · eff5f374 · eff5f374
Commit eff5f374 authored Jul 01, 2018 by Mikael Boden
16 changed files
--- a/annotation_test.py
+++ b/annotation_test.py
+import annotations
+import phylo
+
+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
+
+# print (tree)
+# unique_tree = get_unique_tree(tree)
+#
+# print (unique_tree)
+
+
+
+working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
+
+tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
+
+print (tree)
+print (tree.nexus_annotations.annotations)
+
+tree.swap_annotations("PDB")
+
+print (tree)
+print (tree.nexus_annotations.annotations)
+#
+# tree.write_to_nexus(working_dir + "output.nexus")
+
+# nexus_annotations = annotations.NexusAnnotations2()
+
+# for node in tree.getNodes():
+#     if node.isLeaf():
+#         nexus_annotations.add_annotation(node, "node", "annotation")
+#
+# print (nexus_annotations.annotations)
+
+
+# REGEX CODE
+# import re
+#
+# pattern = r"<person>(.*?)</person>"
+# re.findall(pattern, str, flags=0) #you may need to add flags= re.DOTALL if your str is multiline
\ No newline at end of file
--- a/annotations.py
+++ b/annotations.py
+from collections import defaultdict
+from phylo import *
+import matplotlib
+import random
+
+matplotlib_colours={'aliceblue':'#F0F8FF','aqua':'#00FFFF','aquamarine':'#7FFFD4','azure':'#F0FFFF','beige':'#F5F5DC','bisque':'#FFE4C4','blanchedalmond':'#FFEBCD','blue':'#0000FF','blueviolet':'#8A2BE2','brown':'#A52A2A','burlywood':'#DEB887','cadetblue':'#5F9EA0','chartreuse':'#7FFF00','chocolate':'#D2691E','coral':'#FF7F50','cornflowerblue':'#6495ED','crimson':'#DC143C','cyan':'#00FFFF','darkblue':'#00008B','darkcyan':'#008B8B','darkgoldenrod':'#B8860B','darkgreen':'#006400','darkkhaki':'#BDB76B','darkmagenta':'#8B008B','darkolivegreen':'#556B2F','darkorange':'#FF8C00','darkorchid':'#9932CC','darkred':'#8B0000','darksalmon':'#E9967A','darkseagreen':'#8FBC8F','darkslateblue':'#483D8B','darkturquoise':'#00CED1','darkviolet':'#9400D3','deeppink':'#FF1493','deepskyblue':'#00BFFF','dodgerblue':'#1E90FF','firebrick':'#B22222','floralwhite':'#FFFAF0','forestgreen':'#228B22','fuchsia':'#FF00FF','gainsboro':'#DCDCDC','ghostwhite':'#F8F8FF','gold':'#FFD700','goldenrod':'#DAA520','green':'#008000','greenyellow':'#ADFF2F','honeydew':'#F0FFF0','hotpink':'#FF69B4','indianred':'#CD5C5C','indigo':'#4B0082','khaki':'#F0E68C','lavender':'#E6E6FA','lavenderblush':'#FFF0F5','lawngreen':'#7CFC00','lemonchiffon':'#FFFACD','lightblue':'#ADD8E6','lightcoral':'#F08080','lightcyan':'#E0FFFF','lightgoldenrodyellow':'#FAFAD2','lightgreen':'#90EE90','lightgray':'#D3D3D3','lightpink':'#FFB6C1','lightsalmon':'#FFA07A','lightseagreen':'#20B2AA','lightskyblue':'#87CEFA','lightslategray':'#778899','lightsteelblue':'#B0C4DE','lightyellow':'#FFFFE0','lime':'#00FF00','limegreen':'#32CD32','magenta':'#FF00FF','maroon':'#800000','mediumaquamarine':'#66CDAA','mediumblue':'#0000CD','mediumorchid':'#BA55D3','mediumpurple':'#9370DB','mediumseagreen':'#3CB371','mediumslateblue':'#7B68EE','mediumspringgreen':'#00FA9A','mediumturquoise':'#48D1CC','mediumvioletred':'#C71585','midnightblue':'#191970','mintcream':'#F5FFFA','mistyrose':'#FFE4E1','moccasin':'#FFE4B5','navajowhite':'#FFDEAD','navy':'#000080','oldlace':'#FDF5E6','olive':'#808000','olivedrab':'#6B8E23','orange':'#FFA500','orangered':'#FF4500','orchid':'#DA70D6','palegoldenrod':'#EEE8AA','palegreen':'#98FB98','paleturquoise':'#AFEEEE','palevioletred':'#DB7093','papayawhip':'#FFEFD5','peachpuff':'#FFDAB9','peru':'#CD853F','pink':'#FFC0CB','plum':'#DDA0DD','powderblue':'#B0E0E6','purple':'#800080','red':'#FF0000','rosybrown':'#BC8F8F','royalblue':'#4169E1','saddlebrown':'#8B4513','salmon':'#FA8072','sandybrown':'#FAA460','seagreen':'#2E8B57','seashell':'#FFF5EE','sienna':'#A0522D','skyblue':'#87CEEB','slateblue':'#6A5ACD','springgreen':'#00FF7F','steelblue':'#4682B4','tan':'#D2B48C','teal':'#008080','thistle':'#D8BFD8','tomato':'#FF6347','turquoise':'#40E0D0','violet':'#EE82EE','wheat':'#F5DEB3','yellow':'#FFFF00','yellowgreen':'#9ACD32'}
+
+twenty_colours ={'dodgerblue':'#1E90FF', 'orangered':'#FF4500', 'greenyellow':'#ADFF2F', 'orchid':'#DA70D6'}
+
+symbols = ["*", "^", "!", "#", "~", "+", ":", "<", ">", "@", "%", "=", "-"]
+
+class NexusAnnotations():
+    """
+    Defines a set of annotations for a phylogenetic tree, to be written to NEXUS format
+    """
+    node_annotations = defaultdict(list)
+    leaf_annotations = defaultdict(list)
+    used_colours = set()
+    annotated_nodes = set()
+
+    def __init__(self, colour_dict=matplotlib_colours, symbol_list = symbols, tree=None, node_annotations=defaultdict(list), leaf_annotations=defaultdict(list), annotation_symbols={}):
+        self.tree = tree
+        self.node_annotations = node_annotations
+        self.leaf_annotations = leaf_annotations
+        self.annotation_symbols = annotation_symbols
+        self.colour_dict = colour_dict
+        self.symbol_list = symbol_list
+        self.used_colours = set()
+        self.annotated_nodes = set()
+
+        # if type(self.tree) != PhyloTree:
+        #     raise RuntimeError("NexusAnnotations need the tree to be a PhyloTree object")
+
+        if tree:
+            self.add_original_annotations()
+
+    def add_original_annotations(self):
+        """
+        Add an entry for the original labels of the tree, so we can map back if we need to
+        """
+        for node in self.tree.getNodes():
+            if node.isLeaf():
+                self.leaf_annotations[node] = {"Original" : node.label}
+
+    def add_annotations(self, annotation_dict):
+        for node in self.tree.getNodes():
+            if node.label in annotation_dict:
+                for key, val in annotation_dict[node.label].items():
+                    self.leaf_annotations[node][key] = val
+        # print (self.leaf_annotations)
+
+
+    def add_annotation(self, node, key="", annotation=[], annotate_descendants=False):
+
+        self.node_annotations[node] = {key:annotation}
+        self.annotated_nodes.add(node)
+
+        for annot in annotation:
+            if annot not in self.annotation_symbols:
+                symbol = self.generate_annotation_symbol(annot)
+                self.add_annotation_symbol(annot, symbol)
+
+        # Add in a symbol to represent this annotation if one doesn't already exist
+
+
+
+        if annotate_descendants:
+            for descendant in node.getDescendants():
+                self.add_annotation(descendant, key, annotation)
+
+
+    def add_colour_annotation(self, node, colour=None, random_colour=False, colour_descendants=True):
+        """
+        Add a single colour annotation to a set of nodes
+        :param labels: The nodes to annotate
+        :param colour: The colour to annotate with
+        """
+
+        if not colour:
+            colour = self.generate_colour()
+
+
+        if colour_descendants:
+            for descendant in node.getDescendants(transitive=True):
+                self.node_annotations[descendant] = {"!color": colour}
+                self.annotated_nodes.add(descendant)
+                self.used_colours.add(colour)
+
+
+    def add_colour_annotations(self, nodes, colour_list=None, colour_descendants=False):
+        """
+        Add multiple colour annotations to a list of nodes
+        :param nodes: A list of lists of nodes to annotate
+        :param colour_list: The colour to annotate each list of nodes with
+        """
+
+        if not colour_list:
+            colour_list = self.generate_colour_list(len(nodes))
+
+        for index, node_set in enumerate(nodes):
+            set_colour = self.colour_dict[colour_list[index]]
+
+
+            print(set_colour)
+            print(index, node_set)
+            for node in node_set:
+                self.add_colour_annotation(node, set_colour, colour_descendants=colour_descendants)
+
+    def generate_annotation_symbol(self, annotation):
+
+        i = 0
+        while i < len(self.symbol_list):
+            symbol = random.choice(self.symbol_list)
+            if symbol not in self.annotation_symbols.values():
+                return symbol
+        i+=1
+
+    def add_annotation_symbol(self, symbol, annotation):
+
+        self.annotation_symbols[symbol] = annotation
+
+
+
+    def generate_colour(self):
+        """
+        Generate a colour that hasn't been used yet in this set of Nexus Annotations
+        :return: A unique colour
+        """
+
+        i = 0
+        while i < len(self.colour_dict.values()):
+            colour = random.choice(list(self.colour_dict.values()))
+            if colour not in self.used_colours:
+                return colour
+        i+=1
+
+    def generate_colour_list(self, num):
+        return num
+
--- a/bed.py
+++ b/bed.py
@@ -6,10 +6,10 @@ class BedEntry():
        self.chrom = chrom
        self.chromStart = chromStart
        self.chromEnd = chromEnd
-        self.blockCount = None
        self.usestrand = False
        self.strand = None
        self.name = ''
+        self.blocks = None # interval tree with blocks

    def addOption(self,
                  name = None,
@@ -32,46 +32,64 @@ class BedEntry():
                  zscore = None,
                  bg = None):
        if name: self.name = name
-        if score: self.score = score
+        if score != None: self.score = score
        if strand:
            self.strand = strand
            self.usestrand = True # use reverse complement when sequence is requested from genome
-        if thickStart: self.thickStart = thickStart
-        if thickEnd: self.thickEnd = thickEnd
-        if itemRgb: self.itemRgb = [int(color) for color in itemRgb.split(',')]
-        if blockCount:
-            self.blockCount = max(0, blockCount)
+        if thickStart != None: self.thickStart = thickStart
+        if thickEnd != None: self.thickEnd = thickEnd
+        if itemRgb != None: self.itemRgb = [int(color) for color in itemRgb.split(',')]
+        if blockCount != None:
            if blockCount > 0:
-                self.blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')]
-                self.blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')]
-                if len(self.blockSizes) != blockCount or len(self.blockStarts) != blockCount:
+                blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')]
+                blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')]
+                if len(blockSizes) != blockCount or len(blockStarts) != blockCount:
                    raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self))
-        if signalValue: self.signalValue = signalValue
-        if pValue: self.pValue = pValue
-        if qValue: self.qValue = qValue
-        if peak: self.peak = peak
-        if tags: self.tags = tags
-        if summit: self.summit = summit
-        if fold: self.fold = fold
-        if fdr: self.fdr = fdr
-        if bg: self.bg = bg
-        if zscore: self.zscore = zscore
+                for i in range(blockCount):
+                    self.addBlock(blockStarts[i], blockSizes[i])
+        if signalValue != None: self.signalValue = signalValue
+        if pValue != None: self.pValue = pValue
+        if qValue != None: self.qValue = qValue
+        if peak != None: self.peak = peak
+        if tags != None: self.tags = tags
+        if summit != None: self.summit = summit
+        if fold != None: self.fold = fold
+        if fdr != None: self.fdr = fdr
+        if bg != None: self.bg = bg
+        if zscore != None: self.zscore = zscore

-    def __str__(self):
-        return str((self.chrom, self.chromStart, self.chromEnd))
+    def __len__(self):
+        if self.blocks:
+            return len(self.blocks)
+        else:
+            return 0

-    def __getitem__(self, i):
-        if self.blockCount:
-            return (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i])
+    def addBlock(self, relative_start, size):
+        if not self.blocks:
+            self.blocks = ival.IntervalTree()
+        self.blocks.put(ival.Interval(self.chromStart + relative_start, self.chromStart + relative_start + size))

-    def __iter__(self):
-        if self.blockCount:
-            for i in range(self.blockCount):
-                if self.blockSizes[i] > 0:
-                    yield (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i])
+    def getBlocks(self):
+        return self.blocks

-    def __len__(self):
-        return self.blockCount
+    def __str__(self):
+        if self.strand == '+' or self.strand == '-':
+           return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd) + self.strand
+        return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd)
+
+    def __iter__(self):
+        if self.blocks:
+            for b in self.blocks:
+                yield (self.chrom, b.ival.min, b.ival.max)
+
+    def isBlockOverlap(self, entry):
+        if not self.blocks:
+            return None
+        if isinstance(entry, BedEntry):
+            if (entry.chrom == self.chrom):
+                return self.blocks.isect(entry.getInterval()) != None
+        elif isinstance(entry, ival.Interval):
+            return self.blocks.isect(entry) != None

    def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None):
        """ Retrieve the genomic location for BED entry, or sequence if genome is provided
@@ -357,7 +375,7 @@ def readBedFile(filename, format = 'Limited'):
                chromStart = int(words[1])
                chromEnd = int(words[2])
            entry = BedEntry(chrom, chromStart, chromEnd)
-            if format.lower().startswith('opt'):
+            if format.lower().startswith('opt') or format.lower().startswith('bed12'):
                if len(words) >= 12:
                    entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11])
                elif len(words) >= 9:
@@ -417,30 +435,73 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
    if header:
        f.write(header + '\n')
    for row in entries:
-        if format == 'Peaks':
-            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
-        elif format == 'Limited':
-            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
-        elif format == 'Strand':
-            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name))
+        if row.blocks: # BED12
+            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.thickStart, row.thickEnd))
+            if row.itemRgb:
+                if len(row.itemRgb) == 3:
+                    f.write("%d,%d,%d\t" % (row.itemRgb[0],row.itemRgb[1],row.itemRgb[2]))
+                else:
+                    f.write("0\t")
+            else:
+                f.write("0\t")
+            f.write("%d\t" % (len(row)))
+            blockStarts = []
+            blockSizes = []
+            for b in row.blocks:
+                blockStarts.append(b.ival.min - row.chromStart)
+                blockSizes.append(len(b.ival))
+            for b in blockSizes:
+                f.write("%d," % (b))
+            f.write("\t")
+            for b in blockStarts:
+                f.write("%d," % (b))
+            f.write("\n")
        else:
-            f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
-        f.write("\n")
+            if format == 'Peaks':
+                f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
+            elif format == 'Limited':
+                f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
+            elif format == 'Strand':
+                f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name))
+            else:
+                f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
+            f.write("\n")
    f.close()

 if __name__ == '__main__':
-    bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional')
+#    bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional')
+    bf = BedFile('/Volumes/Share/ARCDP19/Analysis/Young/Young_flat.bed', 'optional')
    print(bf.chroms.keys())
    g = bf.generate('chr1')
    print(next(g))
    print(next(g))
    print(next(g))
    cnt = 0
+    collect = []
    for entry in bf:
        cnt += 1
        print(str(cnt) + '\t' + str(entry))
-        if cnt == 100:
+        collect.append(entry)
+        if cnt == 7:
+            for b in entry:
+                print('\t', b)
+        if cnt == 10:
            break
+    writeBedFile(collect, '/Users/mikael/Desktop/test.bed')
+    bf2 = BedFile('/Users/mikael/Desktop/test.bed', 'opt')
+    q = ival.Interval(3805000, 3806000)
+    t2 = ival.IntervalTree()
+    t2.put(q, "blah")
+    for entry in bf2:
+        if entry.isBlockOverlap(q):
+            print('Found:', entry)
+            tree = entry.getBlocks()
+            t2.putAll(tree)
+            for t in t2:
+                print(t)
+
+
+
    entry1 = BedEntry('chrX', 3858266, 3858530)
    print(entry1 in bf)
    entry2 = BedEntry('chrX', 10047550, 10067694)

--- a/cell.py
+++ b/cell.py
+def readOBOFile(obofile):
+    """
+    Read/load OBO file that contains ontology defs for
+    Uber anatomy ontology (Uberon), Cell Ontology (CL) and Experimental Factor Ontology (EFO)
+    http://cellontology.org
+    see also http://www.obofoundry.org/
+
+    Example of one "term" and one "typedef" entry (note CL refers to cell ontology and UBERON to the anatomy ontology:
+
+    [Term]
+    id: CL:0000513
+    name: cardiac muscle myoblast
+    namespace: cell
+    alt_id: CL:0000714
+    def: "A precursor cell destined to differentiate into cardiac muscle cell." [GOC:tfm, MESH:A11.635.470]
+    synonym: "cardiac muscle progenitor cell" EXACT []
+    synonym: "cardiomyocyte progenitor cell" EXACT []
+    xref: FMA:84797
+    is_a: CL:0002494 ! cardiocyte
+    is_a: CL:0010021 ! cardiac myoblast
+    intersection_of: CL:0000056 ! myoblast
+    intersection_of: develops_into CL:0000746 ! cardiac muscle cell
+    intersection_of: part_of UBERON:0001133 ! cardiac muscle tissue
+    relationship: develops_into CL:0000746 ! cardiac muscle cell
+    relationship: part_of UBERON:0001133 ! cardiac muscle tissue
+
+    [Typedef]
+    id: part_of
+    name: part of
+    def: "a core relation that holds between a part and its whole" []
+    xref: BFO:0000050
+    is_transitive: true
+    is_a: overlaps ! overlaps
+    inverse_of: has_part ! has part
+    """
+    src = open(obofile, 'r')
+    terms = {}
+    in_term_def = False
+    in_type_def = False
+    for line in src:
+        if in_term_def:
+            word = line.split()
+            if line.startswith('id: '):
+                term_id = word[1].strip()
+                term_is = set()
+            elif line.startswith('name: '):
+                term_name = line[6:].strip()
+            elif line.startswith('def: '):
+                # Note this is a multi-line field, delimited by "'s
+                pass
+            elif line.startswith('is_a: '):
+                term_is.add((word[1].strip(), 'is_a'))
+            elif line.startswith('relationship: '):
+                term_is.add((word[2], word[1]))
+            elif line.startswith('intersection_of: '):
+                pass
+            elif line.startswith('is_obsolete: '):
+                in_term_def = False # ignore this entry
+        if line.startswith('[Term]'):
+            if in_term_def: # already defining one, stash it before moving on to the next...
+                terms[term_id] = (term_name, term_is)
+            elif in_type_def:
+                in_type_def = False
+            in_term_def = True
+        if line.startswith('[Typedef]'):
+            if in_term_def: # already defining one, stash it before moving on to the next...
+                terms[term_id] = (term_name, term_is)
+                in_term_def= False
+            in_type_def = True
+    if in_term_def: #  defining one, stash it
+        terms[term_id] = (term_name, term_is)
+    return terms
+
+def getChildren(terms, parent):
+    all = []
+    for t in terms:
+        (name, isa) = terms[t]
+        for (id, rel) in isa:
+            if id == parent:
+                all.append((t, rel))
+    return all
+
+def getParents(terms, child):
+    all = []
+    (name, isa) = terms[child]
+    for (id, rel) in isa:
+        all.append((id, rel))
+    return all
+
+def getID(terms, query_name):
+    for t in terms:
+        (name, isa) = terms[t]
+        if name == query_name:
+            return t
+
+def getName(terms, id):
+    return terms[id][0]
+
+def listParents(terms, query_name):
+    child_ID = getID(terms, query_name)
+    all = []
+    for (parent, rel) in getParents(terms, child_ID):
+        all.append(getName(terms, parent))
+    return all
+
+if __name__ == '__main__':
+    terms = readOBOFile('/Users/mikael/simhome/share/cl.obo')
+    print(len(terms))
\ No newline at end of file
--- a/colours.py
+++ b/colours.py
+
+
+
+for name, hex in colours.items():
+    print (name, hex)
\ No newline at end of file
--- a/dictionary_test.py
+++ b/dictionary_test.py
+name = "Gabe"
+
+name = name[0: 2]
+
+print (name)
\ No newline at end of file
--- a/godata.py
+++ b/godata.py
 '''
-Created on Jul 12, 2012, amended April 2015
+Created on Jul 12, 2012, amended April 2015 and again May 2018

 Module for managing Gene Ontology data, in particular gene:terms
 annotations and term definitions
@@ -26,8 +26,9 @@ Subsequently you can construct instances of BinGO and query terms and genes, rou
 from struct import pack, unpack, calcsize, error
 import operator
 import time
-import os
+import sys
 import stats
+import gzip

 # Character codes used by binary format to identify ontology
 onto_codes = {
@@ -82,7 +83,7 @@ class GO():
    termdefs = {}   # definitions: termdefs[term] = (onto, set((term, rel)), name)
    children = {}   # redundant, parent-to-child structure: children[term] = set((term, rel))

-    def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8)):
+    def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8), include_genes = None):
        """ Start GO session with specified data loaded:
        annotfile: name of annotation file, e.g.'gene_association.tair'
        OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
@@ -91,6 +92,7 @@ class GO():
        (The default seems to work for most annotation files, but sometime if you wish to cross reference
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
+        Optionally, specify what genes should be included when reading the annotations; None (default) means include everything.
        """
        print(("Started at", time.asctime()))
        # Get GO definitions
@@ -110,7 +112,10 @@ class GO():
                    pass
        print(("Read %d GO definitions" % len(terms)))
        # open annotation file to analyse and index data
-        src = open(annotFile, 'r')
+        if annotFile.endswith(".gz"):
+            src = gzip.open(annotFile, 'rt')
+        else:
+            src = open(annotFile, 'rt')
        gene_cnt = 0
        cnt = 0
        for line in src:
@@ -118,14 +123,14 @@ class GO():
            if line.startswith('!'):
                continue
            (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
-            #print(gene, symb, qual, term, evid, onto, taxa)
-            try:
-                (taxa_q, terms_map) = self.annots[gene]
-                terms_map[term] = (evid, qual != 'NOT')
-            except KeyError: # not a previously encountered gene
-                gene_cnt += 1
-                terms_map = {term: (evid, qual != 'NOT')}
-                self.annots[gene] = (taxa, terms_map)
+            if include_genes == None or gene in include_genes:
+                try:
+                    (taxa_q, terms_map) = self.annots[gene]
+                    terms_map[term] = (evid, qual != 'NOT')
+                except KeyError: # not a previously encountered gene
+                    gene_cnt += 1
+                    terms_map = {term: (evid, qual != 'NOT')}
+                    self.annots[gene] = (taxa, terms_map)
        src.close()
        print(("Read annotations for %d genes" % gene_cnt))

@@ -896,6 +901,7 @@ def readOBOFile(obofile):
            in_term_def = True
        if line.startswith('[Typedef]'):
            if in_term_def: # already defining one, stash it before moving on to the next...
+                terms[term_id] = (term_name, term_onto, term_is)
                in_term_def= False
            in_type_def = True
    if in_term_def: #  defining one, stash it
@@ -1037,3 +1043,52 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
    # done, close
    dst.close()
    print(("Completed at", time.asctime()))
+
+def reduceAnnotFile(annotfile, newannotfile, include_genes, annotfile_column_gene = 1):
+    """ Reduce size of annotation file by specifying the genes of interest:
+    annotfile: name of annotation file, e.g. 'goa_uniprot_all.gaf'
+    newannotfile: name of new, reduced-size annotation file, e.g. 'goa_uniprot_some.gaf'
+    include_genes: set/list of gene names to include
+    Optionally, specify what column in the annotation file that contains name of gene product:
+    """
+    print(("Started at", time.asctime()))
+    # open annotation file to analyse and index data
+    if annotfile.endswith(".gz"):
+        src = gzip.open(annotfile, 'rt')
+        dst = gzip.open(newannotfile, 'wt')
+    else:
+        src = open(annotfile, 'rt')
+        dst = open(newannotfile, 'wt')
+    cnt0 = 0
+    cnt1 = 0
+    for line in src:
+        if line.startswith('!'):
+            continue
+        cnt0 += 1
+        fields = line.strip().split('\t')
+        gene = fields[annotfile_column_gene]
+        if gene in include_genes:
+            dst.write(line)
+            cnt1 += 1
+    src.close()
+    dst.close()
+    print(("Read %d annotations; wrote %d annotations" % (cnt0, cnt1)))
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print('Usage: godata <COMMAND> where <COMMAND> is one of the following', \
+        "\n\tinclude <gaf-file-src> <gaf-file-dest> <gene-txt-file> where",\
+        "\n\t\t<gaf-file-src> is the original annotation file",\
+        "\n\t\t<gaf-file-dest> is the new, reduced-size annotation file",\
+        "\n\t\t<gene-txt-file> lists gene names to include in new gaf-file-dest")
+        sys.exit(1)
+    if sys.argv[1].upper() == 'INCLUDE' and len(sys.argv) > 4:
+        f = open(sys.argv[4])
+        include_genes = set()
+        for line in f:
+            include_genes.add(line.strip())
+        reduceAnnotFile(sys.argv[2], sys.argv[3], include_genes)
+    else:
+        print('Unknown command \"' + sys.argv[1] + '\" or arguments:'),
+        for arg in sys.argv[2:]:
+            print(arg),
--- a/gtf.py
+++ b/gtf.py
+import shlex
+import ival
+
+class GtfEntry():
+
+    '''
+    GFF fields:
+    seqname - The name of the sequence. Must be a chromosome or scaffold.
+    source - The program that generated this feature.
+    feature - The name of this type of feature. Some examples of standard feature types are "CDS" "start_codon" "stop_codon" and "exon"li>
+    start - The starting position of the feature in the sequence. The first base is numbered 1.
+    end - The ending position of the feature (inclusive).
+    score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ":.":.
+    strand - Valid entries include "+", "-", or "." (for don't know/don't care).
+    frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be ".".
+    group - All lines with the same group are linked together into a single item.
+    '''
+    def __init__(self, chrom, start, end, feature, score = ".", source = "unknown", strand = ".", frame = ".", group = None):
+        self.seqname = chrom
+        self.start = start
+        self.end = end
+        self.feature = feature
+        self.score = score
+        self.strand = strand
+        self.source = source
+        self.frame = frame
+        self.group = group
+        self.attr = {}
+        if self.group:
+            fields = self.group.split(';')
+            for f in fields:
+                pair = shlex.split(f.strip())
+                if len(pair) == 2:
+                    self.attr[pair[0]] = pair[1]
+
+    def __getitem__(self, item):
+        return self.attr[item]
+
+    def __contains__(self, item):
+        return item in self.attr
+
+    def __str__(self):
+        return str((self.seqname, self.start, self.end))
+
+    def __len__(self):
+        return self.end - self.start
+
+    def getInterval(self):
+        return ival.Interval(self.start, self.end)
+
+def dist(entry1, entry2, signed = False, centre2centre = False):
+    """ Calculate and return the BedEntry with the closest distance (from one end of the interval of this to the end of the interval of that).
+        If centre2centre is True, use the centre-to-centre distance instead.
+        If signed is True, the distance is negative if this interval is after the that.
+    """
+    if isinstance(entry1, GtfEntry) and isinstance(entry2, GtfEntry):
+        if (entry1.seqname == entry2.seqname):
+            return ival.dist(entry1.getInterval(), entry2.getInterval(), signed, centre2centre)
+    return None
+
+class GtfFile:
+    """ Read GTF/GFF file.
+
+        See http://genome.ucsc.edu/FAQ/FAQformat#format1
+    """
+
+    def __init__(self, entries):
+        """
+        Create a GtfFile instance.
+        :param entries: an iterable of entries or a filename
+        """
+        if isinstance(entries, str): # filename
+            self.chroms = readGtfFile(entries)
+        else:
+            self.chroms = dict()
+            for entry in entries:
+                # check if the chromosome has been seen before
+                tree = self.chroms.get(entry.chrom)
+                if not tree:
+                    tree = ival.IntervalTree()
+                    self.chroms[entry.chrom] = tree
+                # put the entry in the interval tree for the appropriate chromosome
+                iv = ival.Interval(entry.start, entry.end)
+                tree.put(iv, entry)
+
+    def __len__(self):
+        n = 0
+        for c in self.chroms:
+            n += len(self.chroms[c])
+        return n
+
+    def generate(self, chrom):
+        mytree = self.chroms.get(chrom)
+        if mytree != None:
+            for e in mytree:
+                for entry in e.values:
+                    yield entry
+
+    def __iter__(self):
+        self.chromqueue = ival.Stack()
+        for c in sorted(self.chroms.keys())[::-1]:
+            self.chromqueue.push(self.generate(c))
+        self.current = self.chromqueue.pop()
+        return self
+
+    def __next__(self):
+        try:
+            ret = next(self.current)
+        except StopIteration:
+            if not self.chromqueue.isEmpty():
+                self.current = self.chromqueue.pop()
+                ret = next(self.current)
+            else:
+                raise StopIteration
+        return ret
+
+    def __contains__(self, item):
+        if isinstance(item, GtfEntry):
+            tree = self.chroms.get(item.chrom)
+            if tree == None: return False
+            else: return ival.Interval(item.start, item.end) in tree
+        else:
+            return False
+
+    def getOverlap(self, item):
+        if isinstance(item, GtfEntry):
+            tree = self.chroms.get(item.chrom)
+            if tree == None: return None
+            else:
+                iv = ival.Interval(item.start, item.end)
+                res = tree.isectall(iv)
+                ret = []
+                for r in res:
+                    ret.extend(r.values)
+                return ret
+        else: return None
+
+    def getClosest(self, item):
+        if isinstance(item, GtfEntry):
+            tree = self.chroms.get(item.chrom)
+            if tree == None: return None
+            else:
+                iv = ival.Interval(item.start, item.end)
+                node = tree.closest(iv)
+                if node != None: return node.values
+                else: return None
+        else: return None
+
+    def getOneOfClosest(self, item):
+        all = self.getClosest(item)
+        if all == None: return None
+        else: return next(iter(all))
+
+    def getOneOfOverlap(self, item):
+        all = self.getOverlap(item)
+        if all == None: return None
+        elif len(all) == 0: return None
+        else: return next(iter(all))
+
+def readGtfFile(filename):
+    """ Read a GTF/GFF file.
+    """
+    f = open(filename)
+    row = 0
+    acceptHeaderRows = 1
+    headerRow = None
+    chroms = dict()
+    for line in f:
+        row += 1
+        words = line.strip().split('\t')
+        if len(words) == 0:
+            continue # ignore empty lines
+        if words[0].strip().startswith('#'):
+            continue # comment
+        if words[0].strip().startswith('browser'):
+            continue # ignore
+        if words[0].strip().startswith('track'):
+            continue # ignore
+        try:
+            seqname = words[0]
+            source = words[1]
+            feature = words[2]
+            start = int(words[3])
+            end = int(words[4])
+            score = None
+            if words[5].isnumeric():
+                score = int(words[5])
+            strand = '.'
+            if words[6] == '+' or words[6] == '-':
+                strand = words[6]
+            frame = None
+            if words[7].isdigit():
+                frame = int(words[7])
+            group = None
+            if len(words) > 8:
+                group = words[8]
+            entry = GtfEntry(seqname, start, end, feature, score, source, strand, frame, group)
+            # check if the chromosome has been seen before
+            tree = chroms.get(seqname)
+            if not tree:
+                tree = ival.IntervalTree()
+                chroms[seqname] = tree
+            # put the entry in the interval tree for the appropriate chromosome
+            iv = ival.Interval(entry.start, entry.end)
+            tree.put(iv, entry)
+        except RuntimeError as e:
+            if not acceptHeaderRows:
+                raise RuntimeError('Error in GTF/GFF file at row %d (%s)' % (row, e.strerror))
+            else:
+                headerRow = words
+                acceptHeaderRows -= 1 # count down the number of header rows that can occur
+    f.close()
+    return chroms
+
+def writeGtfFile(entries, filename, header = None):
+    """ Save the GTF entries to a file.
+    """
+    f = open(filename, 'w')
+    if header:
+        f.write(header + '\n')
+    for row in entries:
+        f.write("%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%s" % (row.chrom, row.source, row.feature, row.start, row.end, row.score, row.strand, row.frame, row.group))
+        f.write("\n")
+    f.close()
+
+if __name__ == '__main__':
+    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
+    print(bf.chroms.keys())
+    g = bf.generate('chr12')
+    print(next(g))
+    print(next(g))
+    print(next(g))
+    cnt = 0
+    for entry in bf:
+        cnt += 1
+        print(str(cnt) + '\t' + str(entry))
+        if cnt == 100:
+            break
--- a/guide.py
+++ b/guide.py
@@ -1074,25 +1074,29 @@ class PhyloNode:
            # allocate a position to put the right child symbol from which each current node symbol score was determined
            self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
            for col in range(aln.alignlen):
+                # left child will contribute first
                for a_parent in range(len(aln.alphabet)):
                    best_score_left = +9999999
-                    best_score_right = +9999999
                    best_symb_left = 0
-                    best_symb_right = 0
-                    for a_left in range(len(aln.alphabet)):
-                        score = (scoresleft[col][a_left] + (
-                        1 if a_left != a_parent else 0))  # if we want to weight scores, this would need to change
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresleft[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_left:
-                            best_symb_left = a_left
+                            best_symb_left = a
                            best_score_left = score
-                    for a_right in range(len(aln.alphabet)):
-                        score = (scoresright[col][a_right] + (
-                        1 if a_right != a_parent else 0))  # if we want to weight scores, this would need to change
+                    self.seqscores[col][a_parent] = best_score_left
+                    self.backleft[col][a_parent] = best_symb_left
+                # right child will contribute next
+                for a_parent in range(len(aln.alphabet)):
+                    best_score_right = +9999999
+                    best_symb_right = 0
+                    for a in range(len(aln.alphabet)):
+                        score = (scoresright[col][a] + (
+                        1 if a != a_parent else 0))  # if we want to weight scores, this would need to change
                        if score < best_score_right:
-                            best_symb_right = a_right
+                            best_symb_right = a
                            best_score_right = score
-                    self.seqscores[col][a_parent] = best_score_left + best_score_right
-                    self.backleft[col][a_parent] = best_symb_left
+                    self.seqscores[col][a_parent] += best_score_right
                    self.backright[col][a_parent] = best_symb_right
        else:
            self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in

--- a/ival.py
+++ b/ival.py
@@ -10,6 +10,10 @@ class IntervalTree:
    stack = None

    def __iter__(self):
+        """
+        Create an iterator for the tree; this will iterate through nodes of the tree. Note that a node has one interval and (potentially) multiple values.
+        :return: iterator of IntervalNodes stored in this tree
+        """
        self.current = self.root
        self.stack = Stack()
        return self
@@ -26,7 +30,10 @@ class IntervalTree:
        return ret

    def __len__(self):
-        return self.root.N
+        if self.root != None:
+            return self.root.N
+        else:
+            return 0

    def __contains__(self, ival):
        return self.get(ival) != None
@@ -88,6 +95,10 @@ class IntervalTree:
        else:
            self.root = self._randomizedInsert(self.root, ival, value)

+    def putAll(self, tree):
+        for i in tree:
+            self.put(i.getInterval(), tree.get(i.getInterval()))
+
    def _randomizedInsert(self, node, ival, value):
        if node == None: return IntervalNode(ival, value)
        if random.uniform(0,1) * node.N < 1.0: return self._rootInsert(node, ival, value)
@@ -176,7 +187,33 @@ class IntervalNode:
        if value != None:
            self.values.add(value)

+    def getInterval(self):
+        """
+        Retrieve the interval that defines this node
+        :return: the interval
+        """
+        return self.ival
+
+    def getMin(self):
+        """
+        Retrieve the min value for the interval of this node
+        :return: the min value of the interval
+        """
+        return self.ival.min
+
+    def getMax(self):
+        """
+        Retrieve the max value for the interval of this node
+        :return: the max value of the interval
+        """
+        return self.ival.max
+
    def add(self, value):
+        """
+        Add a new value to this node, to the set of values associated with the interval
+        :param value:
+        :return:
+        """
        if value:
            self.values.add(value)

@@ -294,6 +331,9 @@ class Interval:
    def __sizeof__(self):
        return self.max - self.min

+    def __len__(self):
+        return self.max - self.min
+
    def dist(self, that, signed = False, centre2centre = False):
        """ Calculate and return the closest distance (from one end of the interval of this to the end of the interval of that).
            If centre2centre is True, use the centre-to-centre distance instead.

--- a/ml.py
+++ b/ml.py
@@ -299,6 +299,17 @@ def eucdist(v1, v2):
        diff += (v1[i] - v2[i])**2
    return math.sqrt(diff)

+def cosdist(v1, v2):
+    if len(v1) != len(v2):
+        return None
+    sum0 = 0
+    sum1 = 0
+    sum2 = 0
+    for i in range(len(v1)):
+        sum0 += v1[i]*v2[i]
+        sum1 += v1[i]*v1[i]
+        sum2 += v2[i]*v2[i]
+    return 1 - (sum0 / (math.sqrt(sum1*sum2)))



--- a/phylo.py
+++ b/phylo.py
--- a/sequence.py
+++ b/sequence.py
@@ -234,7 +234,7 @@ def parseDefline(string):
    if len(string) == 0: return ('', '', '', '')
    s = string.split()[0]
    if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s):            arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
-    elif re.match("^tr\|[A-Z][A-Z0-9]{5}\|\S+", s):          arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
+    elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("^gi\|[0-9]*\|\S+\|\S+", s):               arg = s.split('|');  return (arg[1], arg[3], arg[0], arg[2])
    elif re.match("gb\|\S+\|\S+", s):                        arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("emb\|\S+\|\S+", s):                       arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
@@ -766,9 +766,9 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
            match  = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
-            delete = S[i-1, j  ] + gap
-            insert = S[i  , j-1] + gap
-            S[i, j] = max([match, delete, insert])
+            fromTop = S[i-1, j  ] + gap 
+            fromLeft = S[i  , j-1] + gap 
+            S[i, j] = max([match, fromTop, fromLeft])
    # Traceback the optimal alignment
    alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
    alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
@@ -825,9 +825,9 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
            match  = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
-            delete = S[i-1, j  ] + gap
-            insert = S[i  , j-1] + gap
-            S[i, j] = max([match, delete, insert, 0])  # Local: add option that we re-start alignment from "0"
+            fromTop = S[i-1, j  ] + gap 
+            fromLeft = S[i  , j-1] + gap 
+            S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
    # Trace back the optimal alignment
    alignA = ''
    alignB = ''

--- a/stats.py
+++ b/stats.py
@@ -430,7 +430,8 @@ def getRSpval(a, b):
    for elem in b:
        lst.append([elem, -1, 0])
    # ok sort it
-    lst.sort(lambda p, q: cmp(p[0], q[0]))
+    #lst.sort(lambda p, q: cmp(p[0], q[0]))
+    sorted(lst, key=lambda x: x[0])

    # let's go through it and edit each rank
    rank=0
@@ -492,7 +493,7 @@ def getRSpval(a, b):
    za=((ta_obs-ta_null)+da)/sd           # the z value for A which is the mirror of ...
    zb=((tb_obs-tb_null)+db)/sd           # the z value for B (we only need one)
    p=f(za)                        # figure out the area of the normal distribution
-    u=ua;                                 # remember one of the U values
+    u=ua                                 # remember one of the U values
    return p                              # the p-value: null is that a==b, one-sided (a has lower values)



--- a/sym.py
+++ b/sym.py
@@ -126,6 +126,7 @@ RNA_Alphabet = Alphabet('ACGU')
 Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
 Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
 Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
+Protein_wGAP  = Alphabet('ACDEFGHIKLMNPQRSTVWY-')
 DSSP_Alphabet = Alphabet('GHITEBSC')
 DSSP3_Alphabet = Alphabet('HEC')


--- a/webservice.py
+++ b/webservice.py
@@ -33,7 +33,6 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    # Get the entry
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
-        print (type(data))
        if data.startswith("ERROR"):
            raise RuntimeError(data)
        return data