Commit 3ff492de authored by Mikael Boden

release 2016.1

from math import log, exp
import sys
MAXIT = 100
EPS = 3.0e-7
FPMIN = 1.0e-300
gamma_c = [76.18009172947146,
-86.50532032941677,
24.01409824083091,
-1.231739572450155,
0.1208650973866179e-2,
-0.5395239384953e-5]
def log_binomial_ncdf(N, k, p):
"""
Log of one minus the cumulative distribution function of the binomial dist.
The binomial density gives the probability of k successes in N independent
trials each with probability p of success.
"""
if (k==0):
return 0
else:
return log_betai(k, N-k+1, p)
def betai (a, b, x):
"""
Incomplete beta function
"""
if (x<0 or x>1): die("Bad x=`" + str(x) + "' in routine betai")
if (x==0 or x==1):
bt = 0
else:
bt = exp(gammaln(a+b)-gammaln(a)-gammaln(b)+a*log(x)+b*log(1-x))
thresh = (a+1)/(a+b+2.0)
if (x<thresh):
return(bt*betacf(a,b,x)/a)
else:
return(1.0-bt*betacf(b,a,1.0-x)/b)
def log_betai(a, b, x):
"""
log incomplete beta function
"""
    if (x<0 or x>1): die("Bad x=`" + str(x) + "' in routine log_betai")
if (x==0 or x==1):
log_bt = -1e300 # log(0)
else:
log_bt = gammaln(a+b)-gammaln(a)-gammaln(b)+a*log(x)+b*log(1.0-x)
thresh = (a+1.0)/(a+b+2.0)
if (x<thresh):
return(log_bt + log(betacf(a,b,x)/a))
else:
return(log(1.0 - exp(log_bt)*betacf(b,a,1.0-x)/b))
def betacf(a, b, x):
"""
used by betai
"""
qab = a+b
qap = a+1.0
qam = a-1.0
c = 1.0
d = 1.0-qab*x/qap
if (abs(d) < FPMIN): d = FPMIN
d = 1.0/d
h = d
for m in range(1, MAXIT+1):
m2 = 2.0*m
aa = m*(b-m)*x/((qam+m2)*(a+m2))
d=1.0+aa*d
if (abs(d) < FPMIN): d=FPMIN
c=1.0+aa/c
if (abs(c) < FPMIN): c=FPMIN
d = 1.0/d
h *= d*c
aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2))
d=1.0+aa*d
if (abs(d) < FPMIN): d=FPMIN
c=1.0+aa/c
if (abs(c) < FPMIN): c=FPMIN
d = 1.0/d
delta = d*c
h *= delta
if (abs(delta-1.0) < EPS): break
    else: # for-else: the loop finished without converging (no break occurred)
        print >> sys.stderr, ("a or b too big or MAXIT too small in betacf")
return h
def gammaln(x):
"""
Compute log gamma function
"""
xx = x
s = 1.000000000190015
for i in range(0, 6):
xx += 1
s += gamma_c[i]/xx
res = ((x+0.5) * log(x+5.5)) - (x+5.5) + log(2.5066282746310005*s/x)
if (res >= 0):
return res
else:
return 0 # avoid roundoff error
def die(string):
    print >> sys.stderr, string
    sys.exit(1)
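# A quick sanity check of the routines above (illustrative only). The exact
# binomial tail P(X >= 2) for X ~ Bin(10, 0.5) is 1 - 11/1024 = 0.9892578125,
# and the regularised incomplete beta gives the same tail, since
# I_p(k, N-k+1) = P(X >= k):
# >>> round(exp(log_binomial_ncdf(10, 2, 0.5)), 7)
# 0.9892578
# >>> round(betai(2, 9, 0.5), 7)
# 0.9892578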
import math
import numpy as np
class GeneExpression:
dataset = '' # name of data set (if any)
genes = {} # a dictionary of gene names to profile matrix index
matrix = None # a numpy two-dim array holding all expression values
headers = [] # the names of the samples/experiments, e.g. GSM123
default_value_if_null = None # Default value to use if entry is not set (e.g. addSamples may not add values for all genes)
    def __init__(self, datasetname='', headerlist=None, genedict=None):
""" Create a gene expression data set.
        The class stores gene names and their associated profiles (in which values correspond to "samples").
        It also stores headers (the names of the samples, i.e. experiments).
Data should be provided as
(0) a name of the set
(1) a list of sample names (headerlist; must agree with the number of values in each gene profile)
(2) a gene name dictionary where values contain the expression profile (genedict; profile is an iterable with the same number of elements)
For example
>>> g = GeneExpression("MySet", ['Sample1', 'Sample2'], {'G1': [0.13, 1.23], 'G2': [4.1, -0.9], 'G3': [2.1, -2.1]})
"""
        self.dataset = datasetname
        genedict = genedict or {}     # avoid the shared-mutable-default pitfall
        headerlist = headerlist or []
        self.genes = {}
ndx = 0
for gene in genedict:
self.genes[gene] = ndx
ndx += 1
self.matrix = self._createMatrix(genedict)
if len(self.matrix) == 0:
nsamples = 0
else:
nsamples = len(self.matrix[0])
if isinstance(headerlist, str):
headerlist = [headerlist]
        if headerlist and len(headerlist) != nsamples:
            raise RuntimeError("The number of headers (%d) is not equal to the number of samples (%d)" % (len(headerlist), nsamples))
        self.headers = headerlist or ['S%d' % cnt for cnt in range(nsamples)]
def _createMatrix(self, genedict):
""" Internal method for constructing a numpy matrix from a gene-profile dictionary. """
ngenes = len(self.genes)
allow_new_genes = False
if ngenes == 0: # if instance is empty, include all genes in dict
ngenes = len(genedict)
allow_new_genes = True
nsamples = 0
for gene in genedict:
profile = genedict[gene]
try:
actual = len(profile)
except TypeError:
actual = 1
genedict[gene] = [profile]
if nsamples == 0:
nsamples = actual
elif nsamples != actual:
raise RuntimeError("Each gene must have the same number of samples (see %s)" % gene)
matrix = np.empty((ngenes, nsamples))
matrix[:] = self.default_value_if_null
ndx = 0
for gene in genedict:
try:
ndx = self.genes[gene]
matrix[ndx] = genedict[gene]
            except KeyError: # no match in current gene list
if allow_new_genes:
matrix[ndx] = genedict[gene]
self.genes[gene] = ndx
ndx += 1
return matrix
def getHeaders(self, indices = None):
""" Retrieve headers (names of experiments/samples).
If indices is None (default), all headers are returned, e.g.
>>> g.getHeaders()
['Sample1', 'Sample2']
If indices is a single integer, the header for the corresponding entry is returned, e.g.
>>> g.getHeaders(1)
'Sample2'
If indices is an iterable of integers (multiple indices), the list of corresponding headers is returned, e.g.
>>> g.getHeaders([1,0])
['Sample2', 'Sample1']
"""
        if indices is None:
            return self.headers
        elif isinstance(indices, (int, slice)):
            return self.headers[indices]
else:
ret = []
for index in indices:
ret.append(self.headers[index])
return ret
def getGenes(self, names = None):
""" Retrieve applicable gene-profile entries.
If names is None (default), all gene names are returned, e.g.
>>> g.getGenes()
['G1', 'G2', 'G3']
If names is a single string, the profile for the corresponding entry is returned, e.g.
>>> g.getGenes('G2')
array([ 4.1, -0.9])
If names is an iterable of strings (multiple gene names), a dictionary with gene name as key and profile as value is returned.
>>> g.getGenes(['G3','G2'])
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
"""
        if names is None:
return self.genes.keys()
elif isinstance(names, str):
return self.matrix[self.genes[names],:]
else:
ret = {}
for name in names:
ret[name] = self.matrix[self.genes[name],:]
return ret
def __getitem__(self, ndx):
""" Retrieve a specified sample (or a "slice" of samples) for all genes, e.g.
>>> g[0:2]
array([[ 2.1 , -2.1 ],
[ 4.1 , -0.9 ],
[ 0.13, 1.23]])
Note that the order of rows/genes is NOT necessarily the same as that used for inserting the data.
"""
return self.matrix[:,ndx]
def getHeaderIndex(self, headers):
""" Find the index of the named experiment.
Raises a ValueError if not in list. """
if isinstance(headers, str):
return self.headers.index(headers)
else:
return [self.headers.index(header) for header in headers]
def getSamples(self, samples):
"""Construct a gene dictionary including only samples in specified indices, e.g.
>>> g.getSamples(0)
{'G1': 0.13, 'G2': 4.0999999999999996, 'G3': 2.1000000000000001}
>>> g.getSamples('Sample2')
{'G1': 1.23, 'G2': -0.90000000000000002, 'G3': -2.1000000000000001}
>>> g.getSamples(['Sample2','Sample1'])
{'G1': array([ 1.23, 0.13]),
'G2': array([-0.9, 4.1]),
'G3': array([-2.1, 2.1])}
"""
try:
index = self.getHeaderIndex(samples)
        except ValueError: # samples given by index rather than by name
            index = samples
mygenes = {}
for (name, ndx) in self.genes.items():
mygenes[name] = self.matrix[ndx, index]
return mygenes
def sort(self, sample, descending=True):
"""Get a list of gene names, sorted by order of value in specified sample, e.g.
>>> g.sort(0)
['G2', 'G3', 'G1']
Then retrieve actual genes using e.g.
>>> g.getGenes('G2')
        array([ 4.1, -0.9])
"""
try:
index = self.getHeaderIndex(sample)
sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
        except ValueError: # sample given by index rather than by name
sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
names = []
if descending:
for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order
names.append(name)
else:
for (name, index) in [name_tuples[index] for index in sort_ndx]: # maintain order
names.append(name)
return names
def addSamples(self, headerlist, genedict):
"""Add a sample or multiple samples to the current data set.
genedict is a dictionary with the same keys as the current gene set.
Only values for genes in the current set will be added (others are ignored).
>>> g.addSamples('Sample3', {'G1': 3.4, 'G2': -3.0})
"""
newmat = self._createMatrix(genedict)
nsamples = len(newmat[0])
        if headerlist is not None:
if isinstance(headerlist, str):
headerlist = [headerlist]
if len(headerlist) != nsamples:
raise RuntimeError("The number of headers (%d) is not equal to the number of samples (%d)" % (len(headerlist), nsamples))
if len(self.matrix) == 0:
self.matrix = newmat
else:
self.matrix = np.hstack((self.matrix, newmat))
        self.headers.extend(headerlist or ['S%d' % (cnt + len(self.headers)) for cnt in range(nsamples)])
def getRatio(self, index1, index2):
""" Get the ratio of two samples in the data set (index1 and index2).
Creates and returns a gene dictionary with the corresponding ratios. """
mygenes = {}
mdiv = self.matrix[:, index1] / self.matrix[:, index2]
for (name, ndx) in self.genes.items():
mygenes[name] = mdiv[ndx]
return mygenes
def getLogRatio(self, index1, index2):
""" Get the log2-transformed ratio of two samples (index1 and index2)
Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes = {}
mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
for (name, ndx) in self.genes.items():
mygenes[name] = mlr[ndx]
return mygenes
def getPearson(self, probeID):
""" Given a probe identifier, returns a gene/probe dictionary:
identifiers to correlation coefficients with the specified probe. """
index = self.genes[probeID]
profile = self.matrix[index, :]
mygenes = {}
for (name, ndx) in self.genes.items():
other = self.matrix[ndx, :]
mygenes[name] = pearson(profile, other)
return mygenes
def writeGEOFile(self, filename):
""" Save data as a truncated GEO SOFT file named filename. """
line = '^DATASET = ' + self.dataset + '\n'
line += '!dataset_table_begin\nID_REF\tIDENTIFIER\t'
for header in self.headers:
line += header + '\t'
line += '\n'
for gene in self.genes:
line += gene + '\t' + gene + '\t'
index = self.genes[gene]
for value in self.matrix[index, :]:
line += format(value, '5.3f') + '\t'
line += '\n'
line += '!dataset_table_end\n'
fh = open(filename, 'w')
fh.write(line)
fh.close()
def getZScore(self, index):
""" Get the Z-score of each expression value.
index can be a list of indices (for which the z-score is computed independently).
Important: assumes that values are normally distributed.
For example use log-transformed ratios. """
# Calculate mean and standard deviation of the list of values
mu = np.mean(self.matrix[:, index], axis=0)
sd = np.std(self.matrix[:, index], axis=0)
# Calculate Z-score for the given column for each gene
zscore = (self.matrix[:, index] - mu) / sd
mygenes = {}
for (name, ndx) in self.genes.items():
try:
mygenes[name] = zscore[ndx, :]
except IndexError:
mygenes[name] = zscore[ndx]
# Return the dictionary of Z-scores
return mygenes
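    # A minimal sketch of the intended z-score workflow (illustrative; 'mydata.soft'
    # is a placeholder file name). Ratios are not normally distributed, so build a
    # data set of log-ratios first and take z-scores of those:
    # >>> ge = readGEOFile('mydata.soft')
    # >>> lr = GeneExpression('my_logratios')
    # >>> lr.addSamples('T1/T0', ge.getLogRatio(1, 0))
    # >>> z = lr.getZScore(0)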
# Utility functions
def readGEOFile(filename, id_column=0):
"""Read a Gene Expression Omnibus file; return a GeneExpression instance.
    id_column indicates which field of each row should be taken as the
    gene identifier.
"""
fh = open(filename, "rU")
manylines = fh.read()
fh.close()
# If True, ignore genes with null samples; if False, use default value
ignore_gene_if_null = False
default_value_if_null = None # Default value to use if entry is null and not ignored
# Indicates whether we're reading the data section or metadata
data_rows = False
cnt_data = 0
cnt_null = 0
dataset = '' # name of dataset
headers = [] # list of headers
genes = {} # dict with gene-name as key, expression profile as a list of floats
for line in manylines.splitlines():
if line.startswith('^DATASET'):
dataset = line.split('= ')[1]
continue
if line.startswith('!dataset_table_begin'):
data_rows = True
continue
if line.startswith('!dataset_table_end'):
data_rows = False
continue
if line.startswith('!') or line.startswith('#') \
or line.startswith('^'):
continue
if len(line.strip()) == 0:
continue
if data_rows:
cnt_data += 1
ignore = False
name = line.split('\t')[id_column]
# Ignore control probes
if name.startswith("AFFX"):
continue
if (cnt_data == 1): # First line contains the headers
headers = line.split('\t')
else:
values = []
cnt_word = 0
for word in line.split('\t'):
cnt_word += 1
if cnt_word <= (id_column + 1):
continue
if word == 'null':
cnt_null += 1
if ignore_gene_if_null:
ignore = True
break
else:
word = default_value_if_null
                    try:
                        if word is None:
                            values.append(None)
                        else:
                            values.append(float(word))
                    except ValueError: # ignore values that are not "float"
                        continue
if ignore:
pass
elif not name in genes:
genes[name] = values
if len(genes) == 0:
raise RuntimeError('No data in file')
print 'Data set %s contains %d entries' % (dataset, len(genes))
if cnt_null > 0:
print 'Data set has %d null-values' % (cnt_null)
return GeneExpression(dataset, headers[2:], genes)
# ------------------ Helpful Extra Functions ------------------
def pearson(X, Y):
""" Pearson correlation coefficient (r).
    Note that we use the population standard deviation (divide by n), NOT the
    sample standard deviation (divide by n-1); see http://en.wikipedia.org/wiki/Standard_deviation. """
    if len(X) != len(Y):
        raise RuntimeError('vectors are of uneven length')
    n = len(X)
    Xmu = np.mean(X)
    Xvar = np.var(X)
    Ymu = np.mean(Y)
    Yvar = np.var(Y)
    total = 0.0 # sum of element-wise products (avoids shadowing the built-in "sum")
    for i in range(n):
        total += (X[i] * Y[i])
    if n == 0 or Xvar == 0 or Yvar == 0:
        return 0
    return (total - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
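# Quick check of pearson (illustrative only): with the population variance used
# consistently in numerator and denominator, it agrees with numpy's corrcoef.
# >>> pearson([1, 2, 3, 4], [2, 4, 6, 8])
# 1.0
# >>> abs(pearson([1, 2, 3, 4], [1, 3, 2, 4]) - np.corrcoef([1, 2, 3, 4], [1, 3, 2, 4])[0][1]) < 1e-9
# True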
# ------------------- Example ---------------------
ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft')
ratio = GeneExpression('GDS3716_ratio')
ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33, 0))
ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34, 1))
ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35, 2))
ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36, 3))
ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37, 4))
ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38, 5))
ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39, 6))
ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40, 7))
ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41, 8))
ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24, 9))
ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
print ge3716.getHeaders()
z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
cln2_profile = ge38.getGenes('CLN2')
pcorr = ge38.getPearson('CLN2')
gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
sorted_genes = gp.sort('PC_CLN2', True) # avoid shadowing the built-in "sorted"
print sorted_genes[0], ge38.getGenes(sorted_genes[0])
print sorted_genes[1], ge38.getGenes(sorted_genes[1])
"""
Motif discovery using Gibbs sampling
@author: mikael
"""
import math
import random
import sym
import prob
import sequence
class GibbsMotif():
"""
A class for discovering linear motifs in sequence data.
    Uses Gibbs sampling (Lawrence et al., Science 262:208-214, 1993).
Also see http://bayesweb.wadsworth.org/gibbs/content.html which has info
on "site sampling", "motif sampling", "recursive sampling" and "centroid
sampling". The first is implemented (roughly) below.
"""
def __init__(self, seqs, length, alignment = None):
""" Construct a "discovery" session by providing the sequences that will be used.
seqs: sequences in which the motif is sought
length: length of sought pattern (W)
alignment: positions in each sequence for the initial alignment (use only if the alignment
has been determined from a previous run).
"""
self.seqs = seqs
self.length = length # length of motif 1..W
seqs = self.seqs
self.alphabet = None
k = 0
for s in seqs:
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Sequences invalid: different alphabets")
self.alphabet = s.alphabet
if alignment:
if alignment[k] < 0 or alignment[k] >= len(s):
raise RuntimeError("Initial alignment invalid: does not match sequence " + s.name)
k += 1
""" Initialise parameters that are part of the setup (below) """
self.alignment = alignment or [ random.randint(0, len(s) - length) for s in seqs ] # starting positions defining alignment
def discover(self, pseudocount = None, niter = None):
""" Find the most probable common pattern represented by a
position weight matrix (PWM), based on W+1 distributions
pseudocount: the distribution used for pseudo-counts (default is uniform)
niter: number of iterations (if None, 100*N is used; where N is number of seqs).
"""
""" Initialise parameters necessary for the discovery run (below) """
N = len(self.seqs) # number of sequences 1..N
seqs = self.seqs
W = self.length # motif width
""" background that will be used as pseudo-counts """
pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
""" q: the foreground distribution (specifying the W distributions in aligned columns)
p: the background distribution (for non-aligned positions in all sequences) """
q = [ prob.Distrib(self.alphabet, pseudocount) for _ in range(W) ]
p = prob.Distrib(self.alphabet, pseudocount)
a = self.alignment
new_z = random.randint(0, N-1) # pick a random sequence to withhold
for k in range(N):
if k != new_z:
k_len = len(seqs[k]) # length of current seq
offset = 0
for i in range(k_len):
if i >= a[k] and i < a[k] + W: # within pattern
q[offset].observe(seqs[k][i])
offset += 1
else: # outside pattern
p.observe(seqs[k][i])
""" Main loop: predictive update step THEN sampling step, repeat... """
niter = niter or 100 * N # use specified number of iterations or default
for round in range(niter):
""" Predictive update step:
                One of the N sequences is chosen at random: z.
We will not use it in the profile, nor background so we
exclude it from our counts. """
prev_z = new_z
new_z = random.randint(0, N - 1)
            # q's and p's are updated from current a's and all sequences except z,
            # which is the same as using the old q's and p's and subtracting z's contributions...
offset = 0
for i in range(len(seqs[new_z])):
if i >= a[new_z] and i < a[new_z] + W: # within pattern
q[offset].observe(seqs[new_z][i], -1) # subtract the count
offset += 1
else: # outside pattern
p.observe(seqs[new_z][i], -1) # subtract the count
# ... and add back the previous and now updated z
offset = 0
for i in range(len(seqs[prev_z])):
if i >= a[prev_z] and i < a[prev_z] + W: # within pattern
q[offset].observe(seqs[prev_z][i], +1) # add the count
offset += 1
else: # outside pattern
p.observe(seqs[prev_z][i], +1) # add the count
""" Sampling step:
Consider each position x in z as a match: find a weight Ax """
z_len = len(seqs[new_z]) # length of seq z
A = [ 0.0 for _ in range(z_len) ]
Asum = 0.0
for x in range(z_len - W + 1): # look at all starts for a W-wide pattern
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[new_z][x+w]]
Qx *= q[w][seqs[new_z][x+w]]
try:
A[x] = Qx / Px
except ZeroDivisionError:
pass
Asum += A[x]
for x in range(z_len - W + 1): # score all starts for a W-wide pattern
A[x] /= Asum # normalise so that all Ax's sum to 1.0
# Pick the next a[z], with a probability proportional to Ax
pick = random.random() # any value between 0 and 1
cumul = 0.0 # cumulative probability
for x in range(z_len - W + 1): # check starts for a W-wide pattern
cumul += A[x]
if pick <= cumul: # check if our random pick is smaller than the cumulative prob
a[new_z] = x
break
""" Evaluate data log-likelihood """
if round % 100 == 0: # but only every 100th round
LL = 0.0
for k in range(N):
Pk = 1.0; Qk = 1.0
for w in range(W):
Pk *= p[seqs[k][a[k]+w]]
Qk *= q[w][seqs[k][a[k]+w]]
try:
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
# end main for-loop
self.q = q
self.p = p
self.alignment = a
return q
def getForeground(self):
""" Return the probability distributions for columns in the discovered alignment. """
return self.q
def getBackground(self):
""" Return the probability distributions for the background used in the discovery. """
return self.p
def getAlignment(seqs, motif, background):
""" Retrieve the best alignment (positions) in provided sequences defined by the specified
motif params.
seqs: sequence data
motif: the foreground distribution (specifying the W distributions in aligned columns)
background: the background distribution (for non-aligned positions in all sequences)
Note that this is similar but not the same as the stochastically selected alignment that
is kept while training. It can be implemented using a PWM constructed from a previous session.
Note also that this alignment can be used as input to continue an earlier discovery session
when motif distributions had been saved. """
N = len(seqs)
q = motif
p = background
W = len(q)
a = [0 for _ in range(N)] # start positions unknown
for k in range(N):
k_len = len(seqs[k]) # length of seq k
Amax = None
xmax = 0
for x in range(k_len - W + 1):
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[k][x+w]]
Qx *= q[w][seqs[k][x+w]]
            try:
                Atmp = math.log(Qx / Px)
            except ZeroDivisionError:
                continue # background probability was zero; skip this start position
            if Amax is None or Amax < Atmp:
                Amax = Atmp
                xmax = x
a[k] = xmax
return a
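# A minimal usage sketch (assumptions flagged): 'myseqs.fa' is a placeholder file
# name, and readFastaFile / Protein_Alphabet are assumed to be provided by this
# package's sequence and sym modules. Discover a width-8 motif, then recover the
# best (non-stochastic) alignment from the trained distributions:
# >>> seqs = sequence.readFastaFile('myseqs.fa', sym.Protein_Alphabet)
# >>> gm = GibbsMotif(seqs, 8)
# >>> q = gm.discover()
# >>> a = getAlignment(seqs, q, gm.getBackground())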
class GibbsAlign():
""" A class for performing ungapped sequence alignment.
    Uses Gibbs sampling (Lawrence et al., Science 262:208-214, 1993).
"""
def __init__(self, seqs, length, alignment = None):
""" Construct a "discover" session by providing the sequences that will be aligned.
seqs: sequences that will be aligned
length: maximum length of alignment (must be equal or greater than max sequence length)
alignment: positions in each sequence for the initial alignment (use only if the alignment
has been determined from a previous run).
"""
self.seqs = seqs
        self.length = length # maximum length of the (ungapped) alignment
seqs = self.seqs
self.alphabet = None
k = 0
for s in seqs:
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Sequences invalid: different alphabets")
self.alphabet = s.alphabet
if alignment:
if alignment[k] < 0 or alignment[k] >= len(s):
raise RuntimeError("Initial alignment invalid: does not match sequence " + s.name)
k += 1
""" Initialise parameters that are part of the setup (below) """
self.alignment = alignment or [ random.randint(0, length - len(s)) for s in seqs ] # starting offsets defining alignment
def discover(self, pseudocount = None, niter = None):
""" Find the most probable common pattern represented by a
position weight matrix (PWM), based on W+1 distributions
pseudocount: the distribution used for pseudo-counts (default is uniform)
niter: number of iterations (if None, 100*N is used; where N is number of seqs).
"""
""" Initialise parameters necessary for the discovery run (below) """
N = len(self.seqs) # number of sequences 1..N
seqs = self.seqs
W = self.length # alignment width
""" background that will be used as pseudo-counts """
pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
""" q: the foreground distribution (specifying the W distributions in aligned columns)
p: the background distribution (for non-aligned positions in all sequences) """
q = [ prob.Distrib(self.alphabet, pseudocount) for _ in range(W) ]
p = prob.Distrib(self.alphabet, pseudocount)
a = self.alignment
new_z = random.randint(0, N-1) # pick a random sequence to withhold
for k in range(N):
if k != new_z:
k_len = len(seqs[k]) # length of current seq
offset = 0
for i in range(k_len):
if i >= a[k] and i < a[k] + W: # within pattern
q[offset].observe(seqs[k][i])
offset += 1
else: # outside pattern
p.observe(seqs[k][i])
""" Main loop: predictive update step THEN sampling step, repeat... """
niter = niter or 100 * N # use specified number of iterations or default
for round in range(niter):
""" Predictive update step:
                One of the N sequences is chosen at random: z.
We will not use it in the profile, nor background so we
exclude it from our counts. """
prev_z = new_z
new_z = random.randint(0, N - 1)
            # q's and p's are updated from current a's and all sequences except z,
            # which is the same as using the old q's and p's and subtracting z's contributions...
offset = 0
for i in range(len(seqs[new_z])):
if i >= a[new_z] and i < a[new_z] + W: # within pattern
q[offset].observe(seqs[new_z][i], -1) # subtract the count
offset += 1
else: # outside pattern
p.observe(seqs[new_z][i], -1) # subtract the count
# ... and add back the previous and now updated z
offset = 0
for i in range(len(seqs[prev_z])):
if i >= a[prev_z] and i < a[prev_z] + W: # within pattern
q[offset].observe(seqs[prev_z][i], +1) # add the count
offset += 1
else: # outside pattern
p.observe(seqs[prev_z][i], +1) # add the count
""" Sampling step:
Consider each position x in z as a match: find a weight Ax """
z_len = len(seqs[new_z]) # length of seq z
A = [ 0.0 for _ in range(z_len) ]
Asum = 0.0
for x in range(z_len - W + 1): # look at all starts for a W-wide pattern
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[new_z][x+w]]
Qx *= q[w][seqs[new_z][x+w]]
try:
A[x] = Qx / Px
except ZeroDivisionError:
pass
Asum += A[x]
for x in range(z_len - W + 1): # score all starts for a W-wide pattern
A[x] /= Asum # normalise so that all Ax's sum to 1.0
# Pick the next a[z], with a probability proportional to Ax
pick = random.random() # any value between 0 and 1
cumul = 0.0 # cumulative probability
for x in range(z_len - W + 1): # check starts for a W-wide pattern
cumul += A[x]
if pick <= cumul: # check if our random pick is smaller than the cumulative prob
a[new_z] = x
break
""" Evaluate data log-likelihood """
if round % 100 == 0: # but only every 100th round
LL = 0.0
for k in range(N):
Pk = 1.0; Qk = 1.0
for w in range(W):
Pk *= p[seqs[k][a[k]+w]]
Qk *= q[w][seqs[k][a[k]+w]]
try:
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
# end main for-loop
self.q = q
self.p = p
self.alignment = a
return q
def getForeground(self):
""" Return the probability distributions for columns in the discovered alignment. """
return self.q
def getBackground(self):
""" Return the probability distributions for the background used in the discovery. """
return self.p
'''
Created on Jul 12, 2012, amended April 2015
Module for managing Gene Ontology data, in particular gene:terms
annotations and term definitions
It can be used on files you can download from geneontology.org.
The class GO is constructed from:
- annotation file which is (usually) specific to the species of interest
- OBO file which defines the GO terms and their relationships
e.g.
> go = GO('gene_association.goa_ref_human', 'go-basic.obo')
Internal data structures are created so that you can query
- what are the terms of my gene (or genes)? Use getTerms
- what are the genes of my term? Use getGenes
- what terms occur amongst my genes, ranked by their absolute count? Use getGOReport without background
- what terms are statistically enriched in my genes, relative to a background set of genes? Use getGOReport with background
The class BinGO works with a compact (memory saving) binary format that aggregates information from an annotation
file and an OBO file. Therefore, you first need to construct this binary file, using writeBitFile.
Subsequently you can construct instances of BinGO and query terms and genes, roughly in the manner identified above for GO.
@author: mikael
'''
from struct import pack, unpack, calcsize, error
import operator
import sys
import time
import os
import stats
# Character codes used by binary format to identify ontology
onto_codes = {
'P': 'Biological process',
'F': 'Molecular function',
'C': 'Cellular component'}
# Labels for edges in the ontology graph, index is used in binary format
onto_rel = ['is_a', 'isect', 'part_of', 'has_part', 'regulates']
# Evidence codes assigned to annotations, an index is assigned when creating binary file and is stored in its header
evid_codes = { # Experimental Evidence Codes
'EXP': 'Inferred from Experiment',
'IDA': 'Inferred from Direct Assay',
'IPI': 'Inferred from Physical Interaction',
'IMP': 'Inferred from Mutant Phenotype',
'IGI': 'Inferred from Genetic Interaction',
'IEP': 'Inferred from Expression Pattern',
#Computational Analysis Evidence Codes
'ISS': 'Inferred from Sequence or Structural Similarity',
'ISO': 'Inferred from Sequence Orthology',
'ISA': 'Inferred from Sequence Alignment',
'ISM': 'Inferred from Sequence Model',
'IGC': 'Inferred from Genomic Context',
'IBA': 'Inferred from Biological aspect of Ancestor',
'IBD': 'Inferred from Biological aspect of Descendant',
'IKR': 'Inferred from Key Residues',
'IRD': 'Inferred from Rapid Divergence',
    'RCA': 'Inferred from Reviewed Computational Analysis',
'TAS': 'Traceable Author Statement',
'NAS': 'Non-traceable Author Statement',
#Curator Statement Evidence Codes
'IC': 'Inferred by Curator',
'ND': 'No biological Data available',
#Automatically-assigned Evidence Codes
'IEA': 'Inferred from Electronic Annotation',
#Obsolete Evidence Codes
'NR': 'Not Recorded'}
class GO():
""" Classical interface for working with GO terms usually within the same species and when memory is not a major issue.
Implementations are relatively efficient (for Python at least).
Major functions:
__init__: construct instance of GO session from an annotation file and an OBO file (geneontology.org)
getTerms: get GO terms from gene or genes (transitively or not)
getGenes: get genes that are annotated with given term or terms
getGOReport: perform basic gene set enrichment
"""
# Structures to hold all data relevant to session
annots = {} # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
termdefs = {} # definitions: termdefs[term] = (onto, set((term, rel)), name)
children = {} # redundant, parent-to-child structure: children[term] = set((term, rel))
def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8)):
""" Start GO session with specified data loaded:
annotfile: name of annotation file, e.g.'gene_association.tair'
OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
Optionally, specify what columns in the annotation file that contains in order:
gene, symb, qual, term, evid, onto. Note that index starts at 0 NOT 1.
        (The default seems to work for most annotation files, but sometimes, if you wish to cross-reference
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8)) )
"""
print "Started at", time.asctime()
# Get GO definitions
terms = readOBOFile(obofile)
for term in terms:
(term_name, term_onto, term_is) = terms[term]
self.termdefs[term] = (term_onto, term_is, term_name)
self.children[term] = set()
for term in self.termdefs:
(term_onto, term_is, term_name) = self.termdefs[term]
for (parent, prel) in term_is:
try:
cset = self.children[parent]
cset.add((term, prel))
except KeyError:
pass
print "Read %d GO definitions" % len(terms)
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_cnt = 0
cnt = 0
for line in src:
cnt += 1
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
try:
(taxa_q, terms_map) = self.annots[gene]
terms_map[term] = (evid, qual != 'NOT')
except KeyError: # not a previously encountered gene
gene_cnt += 1
terms_map = {term: (evid, qual != 'NOT')}
self.annots[gene] = (taxa, terms_map)
src.close()
print "Read annotations for %d genes" % gene_cnt
def _makeIntoList(self, id_or_ids):
        if not isinstance(id_or_ids, (list, set, tuple)):
return [id_or_ids]
return id_or_ids
def getTerms(self, genes_or_gene, evid = None, onto = None, include_more_general = True):
""" Retrieve all terms for a gene or a set/list/tuple of genes.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is true, terms that are transitively related are included.
With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
When only one gene is provided, the result is simply a set of terms.
"""
        if not isinstance(genes_or_gene, (list, set, tuple)):
return self.getTerms4Gene(genes_or_gene, evid, onto, include_more_general)
else:
return self.getTerms4Genes(genes_or_gene, evid, onto, include_more_general)
def getTerms4Genes(self, genes, evid = None, onto = None, include_more_general = True):
""" Retrieve all GO terms for a given set/list/tuple of genes.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is True (default) then transitively related terms are included.
With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
"""
gomap = {} # gene to GO terms map
genes = self._makeIntoList(genes)
for gene in genes:
gomap[gene] = self.getTerms4Gene(gene, evid, onto, include_more_general)
return gomap
def getTerms4Gene(self, gene, evid = None, onto = None, include_more_general = True):
""" Retrieve all GO terms for a given (single) gene.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is True (default) then transitively related terms are included
When only one gene is provided, the result is simply a set of terms.
"""
direct = set()
# STEP 1: Find all terms directly associated with specified genes
try:
(taxa, terms_map) = self.annots[gene]
for term in terms_map:
(term_evid, term_qual) = terms_map[term]
if (evid == None or evid == term_evid) and term_qual:
direct.add(term)
except KeyError:
return set() # gene was not found, hence no annotations for it
# STEP 2: Find terms associated with (indirect) parents of terms from STEP 1
indirect = set()
if include_more_general:
for term in direct:
parents = self.getParents(term, include_more_general)
for parent in parents:
indirect.add(parent)
return direct.union(indirect)
def getGenes(self, terms_or_term, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified term or terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
If multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
With a single term provided, a set of genes is returned.
"""
        if not isinstance(terms_or_term, (list, set, tuple)):
return self.getGenes4Term(terms_or_term, evid, taxa, rel, include_more_specific)
else:
return self.getGenes4Terms(terms_or_term, evid, taxa, rel, include_more_specific)
def getGenes4Terms(self, terms, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
Since multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
"""
gomap = {} # term to genes map
terms = self._makeIntoList(terms)
for term in terms:
gomap[term] = self.getGenes4Term(term, evid, taxa, rel, include_more_specific)
return gomap
def getGenes4Term(self, term, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified term or terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
With a single term provided, a set of genes is returned.
"""
genes = self._getGenes4Term(term, evid, taxa, rel)
if include_more_specific:
            terms = self.getChildren(term, rel, True) # recursively includes more specific terms
for t in terms:
tgenes = self._getGenes4Term(t, evid, taxa, rel)
for g in tgenes:
genes.add(g)
return genes
def _getGenes4Term(self, term, evid = None, taxa = None, rel = None):
""" Retrieve all genes that are annotated with specified term, and qualified by evidence, taxa etc. """
genes = set()
# Scour through all genes
for gene in self.annots: # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
(qtaxa, qterms) = self.annots[gene]
if taxa == None or taxa == qtaxa:
for qterm in qterms:
if qterm != term:
continue
(qevid, qqual) = qterms[term]
if (evid == None or evid == qevid) and qqual:
genes.add(gene)
break
return genes
def getChildren(self, parent_term_id_or_ids, rel = None, include_more_specific = False):
""" Retrieve all direct children of the given (parent) term.
"""
parent_terms = self._makeIntoList(parent_term_id_or_ids)
cset = set()
for parent in parent_terms:
# definitions: children[term] = set((term, relation), ...)
current = self.children[parent]
for (child_term, child_rel) in current:
if rel == None or rel == child_rel:
cset.add(child_term)
if len(cset) > 0 and include_more_specific:
grandkids = self.getChildren(cset, rel, True)
for grandkid in grandkids:
cset.add(grandkid)
return cset
def getParents(self, child_term_id, include_more_general = True):
""" Retrieve all parents of the given term, transitively or not.
"""
direct = set() # all GO terms which are parents to given term
try:
(onto_ch, terms_ch, name_ch) = self.termdefs[child_term_id]
for (parent_id, parent_rel) in terms_ch:
(onto_pa, terms_pa, name_pa) = self.termdefs[parent_id]
direct.add(parent_id)
if (include_more_general):
parents = self.getParents(parent_id, True)
for parent in parents:
direct.add(parent)
except KeyError:
pass # term was not found, possibly throw error?
return direct
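    # Illustrative example using real GO relationships: GO:0000001 (mitochondrion
    # inheritance) has the direct is_a parents GO:0048308 and GO:0048311, so
    # >>> go.getParents('GO:0000001', False)
    # set(['GO:0048308', 'GO:0048311'])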
def getTermdef(self, term_id):
""" Retrieve information about a given term:
ontology, parent terms, and name as a tuple.
"""
try:
(onto_ch, terms_set, term_name) = self.termdefs[term_id]
return (onto_ch, terms_set, term_name)
except KeyError:
return ('Unknown', 'Unknown', 'Unknown')
def getAllAnnots(self):
""" Retrieve all annotated gene products """
return self.annots.keys()
def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
""" Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
"""
# (taxa, terms[term] = (evid, T/F))
bg_genes = set()
bg_list = []
        for gene in self.annots:
            if gene not in positives:
bg_genes.add(gene)
(qtaxa, qterms) = self.annots[gene]
if taxa == None or qtaxa == taxa:
for t in qterms:
(qevid, qqual) = qterms[t]
if (evid == None or qevid == evid) and qqual:
bg_list.append(t)
if include_more_general:
for parent in self.getParents(t, True):
bg_list.append(parent)
return (bg_genes, bg_list)
def getCountReport(self, positives, threshold = None, include_more_general = True):
""" For a set of named gene products (positives) this method determines the counts of GO terms.
        Returns a list of tuples (GO_Term_ID[str], Count[int], Term_description[str], Ontology[str]) sorted by count.
        positives: names of gene products
        threshold: the count that must be reached for term to be reported (default is 0)
        include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
"""
fg_list = [] # all terms, with multiple copies for counting
fg_map = self.getTerms4Genes(positives, include_more_general = include_more_general) #
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(positives)
if threshold == None:
threshold = 0 # include all terms
for t in term_set:
cnt = fg_list.count(t)
if cnt >= threshold:
term_cnt[t] = cnt
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1], defin[2], defin[0]))
return ret
def getEnrichmentReport(self, positives, background = None, evid = None, threshold = None, include_more_general = True):
""" For a set of named gene products (positives) this method determines the enrichment of GO terms.
Each GO term is also assigned an enrichment p-value (on basis of provided background, or on basis of all annotated genes, if not provided).
        Note that using the full set of annotated genes as background can be computationally expensive.
        Returns a list of tuples (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str], Ontology[str]).
        E-value is a Bonferroni-corrected p-value.
positives: names of gene products
background: names of gene products (or None if all annotated gene products should be used; default)
threshold: E-value that must be reached for term to be reported (default is 0.05)
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
"""
# Process foreground: find terms of genes
fg_list = [] # all terms, with multiple copies for counting
fg_map = self.getTerms4Genes(positives, evid = evid, include_more_general = include_more_general) #
for fg_gene in fg_map:
for t in fg_map[fg_gene]:
fg_list.append(t)
nPos = len(positives)
# Process background: find terms of genes
bg_list = []
if background == None: # need to use the full set
background = self.annots.keys()
negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
nNeg = len(negatives)
bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
for bg_gene in bg_map:
for t in bg_map[bg_gene]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
if threshold == None:
threshold = 0.05
for t in term_set:
fg_hit = fg_list.count(t) # number of foreground genes WITH GO term (number of terms in the list for the collective set of foreground genes)
bg_hit = bg_list.count(t) # number of background genes WITH GO term (number of terms in the list for the collective set of background genes)
fg_nohit = nPos - fg_hit # total number of genes in foreground minus that number of hits
bg_nohit = nNeg - bg_hit # total number of genes in background minus that number of hits
pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False) # one-tailed FET
evalue = pvalue * len(term_set) # Bonferroni correction
if evalue <= threshold: # check if significance req is fulfilled
term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
return ret
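# A minimal usage sketch for the GO class (file names as in the module docstring;
# 'mygenes' is a placeholder list of gene identifiers):
# >>> go = GO('gene_association.goa_ref_human', 'go-basic.obo')
# >>> terms = go.getTerms(mygenes)               # terms annotated to my genes
# >>> report = go.getEnrichmentReport(mygenes)   # terms enriched relative to all annotated genes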
class BinGO():
# Structures to hold all data relevant to session, all keys are "encoded"
annots = {} # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
termdefs = {} # definitions: termdefs[term] = (onto, terms[term] = relation, name)
# Codes for encoding and decoding
gene_code = None
term_code = None
evid_code = None
# indices
annot_index = {}
# Files
f = None
def __init__(self, filename, taxa = None):
""" The binary file contains all the data and will initialise
gene annotations (annots) and term definitions (termdefs)
and the encoding/decoding keys. """
        self.annot_index = {}  # per-instance state, so separate BinGO sessions don't share data
        self.termdefs = {}
        self.f = self._readBitFile(filename, taxa = taxa)
def _decodeGeneIDs(self, gene_codes):
        if not isinstance(gene_codes, (list, set, tuple)):
gene_codes = [gene_codes]
ids = []
for i in gene_codes:
s = decode(i, self.gene_code)
ids.append(s)
return ids
def _encodeGeneIDs(self, gene_names):
        if not isinstance(gene_names, (list, set, tuple)):
gene_names = [gene_names]
ids = []
for i in gene_names:
y = encode(i, self.gene_code)
ids.append(y)
return ids
def _getGeneEntry(self, gene):
peek = self.annot_index[gene]
self.f.seek(peek, 0)
buf = self.f.read(calcsize('IIH'))
(gene_int, taxa_int, nterms) = unpack('IIH', buf)
buf = self.f.read(nterms * calcsize('?BI'))
terms_dict = {}
for pos in range(0, len(buf) - 1, calcsize('?BI')):
(qual_bool, evid_int, term_int) = unpack('?BI', buf[pos:pos+calcsize('?BI')])
terms_dict[term_int] = (evid_int, qual_bool)
return (taxa_int, terms_dict)
def _getSuperTerms(self, term, rel = None):
""" Recursively compute the transitive closure. """
found = set()
try:
(_, closure, _) = self.termdefs[term]
for (t, r) in closure.items():
if (not rel) or r == rel:
found.add(t)
found.update(self._getSuperTerms(t, rel))
except KeyError:
print 'Could not find GO:%s' % (''.join(decode(term, self.term_code)))
return found
def _getChildTerms(self, term, rel = None):
found = set()
for (child, termdef) in self.termdefs.items():
(_, parents_dict, _) = termdef
try:
myrel = parents_dict[term]
if rel == myrel or not rel: found.add(child)
except KeyError:
pass
return found
def _getSpecificTerms(self, term, rel = None):
direct = self._getChildTerms(term, rel)
found = set()
for t in direct:
found.add(t)
found.update(self._getSpecificTerms(t, rel))
return found
def getTerms(self, genes, evid = None, onto = None, include_more_general = True):
"""
Retrieve all GO terms for a given set of genes (or single gene).
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If include_more_general is True (default) then transitively related terms are included
"""
mymap = dict()
# STEP 1: Find all terms directly associated with specified genes
direct = set() # all GO terms (encoded)
ids = self._encodeGeneIDs(genes)
for i in ids:
gene_name = ''.join(decode(i, self.gene_code))
mymap[gene_name] = set()
try:
(taxa, terms) = self._getGeneEntry(i)
for (term, evid_and_qual) in terms.items():
                    (term_evid, term_qual) = evid_and_qual
                    if term_qual and (evid is None or self.evid_code[term_evid] == evid): # positive annotation, matching the evidence filter (if any)
                        direct.add(term)
                        mymap[gene_name].add(term)
except KeyError:
pass
#print 'Failed to find annotations for gene %s' % gene_name
if include_more_general:
# STEP 2: Find the transitive closure of each term identified, store as a dictionary
indirect = {}
for t in direct:
                if t not in indirect:
indirect[t] = set(self._getSuperTerms(t))
# STEP 3: compile and return results
for gene in mymap:
term_ids = mymap[gene]
all_ids = set(term_ids)
if include_more_general:
for term_id in term_ids:
all_ids.update(indirect[term_id])
mymap[gene] = set()
for term_enc in all_ids:
mymap[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
return mymap
def getAllGenes(self):
names = []
for g in self._decodeGeneIDs(self.annot_index.keys()):
names.append(''.join(g))
return names
def getGenes(self, terms, evid = None, taxa = None, rel = None, include_more_specific = True):
""" Retrieve all genes that are annotated with specified terms, and qualified by evidence, taxa etc. """
""" TODO: Debug--suspect this implementation is incorrect. """
term_ids = set()
for t in terms:
term_ids.add(encode(t[3:], self.term_code))
# STEP 1 (optional): determine more specific terms to be included in query
if include_more_specific:
myterms = set()
for t in term_ids:
myterms.add(t)
children = self._getSpecificTerms(t, rel)
myterms.update(children)
term_ids = myterms
# STEP 2: identify genes with those terms
found = {}
for g in self.annot_index:
            gene_name = ''.join(decode(g, self.gene_code)) # join: decode returns a list of characters
(mytaxa, tdict) = self._getGeneEntry(g)
if not taxa or taxa == mytaxa:
                for annot_term in tdict.keys():
                    (evid_int, qual_bool) = tdict[annot_term]
                    if not qual_bool: # negated (NOT) annotation
                        continue
                    if evid is not None and self.evid_code[evid_int] != evid:
                        continue
                    if annot_term in term_ids: # compare encoded term IDs, not the raw 'GO:...' strings
                        try:
                            added = found[gene_name]
                            added.add(annot_term)
                        except KeyError:
                            found[gene_name] = set([annot_term])
# STEP 3: compile and return results
for gene in found:
term_ids = found[gene]
all_ids = set(term_ids)
found[gene] = set()
for term_enc in all_ids:
found[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
return found
def getTermdef(self, term):
term_id = encode(term[3:], self.term_code)
try:
(onto_ch, terms_dict, name_peek) = self.termdefs[term_id]
self.f.seek(name_peek, 0)
term_name = self.f.readline()
return (onto_codes[onto_ch], terms_dict, term_name)
except KeyError:
return ('Unknown', 'Unknown', 'Unknown')
def _readBitFile(self, filename, taxa, termnames = False):
        f = open(filename, 'rb') # binary mode: the file mixes packed structs and text lines
# STEP 1: header info
ngene_code = None
nterm_code = None
nevid_code = None
ngene_cnt = 0
nterm_cnt = 0
nevid_cnt = 0
header = True
total_gene_cnt = None
current_gene_cnt = 0
current_terms_cnt = 0
annot_offset = 0
obo_offset = 0
        while True: # exits via break on reaching the footer, or on a read error
if not ngene_code:
line = f.readline()
fields = line.split()
total_gene_cnt = int(fields[0])
total_terms_cnt = int(fields[1])
ngene_code = int(fields[2])
nterm_code = int(fields[3])
nevid_code = int(fields[4])
self.gene_code = ['' for _ in range(ngene_code)]
self.term_code = ['' for _ in range(nterm_code)]
self.evid_code = ['' for _ in range(nevid_code)]
elif ngene_cnt < ngene_code:
line = f.readline()
self.gene_code[ngene_cnt] = line.strip()
ngene_cnt += 1
elif nterm_cnt < nterm_code:
line = f.readline()
self.term_code[nterm_cnt] = line.strip()
nterm_cnt += 1
elif nevid_cnt < nevid_code:
line = f.readline()
self.evid_code[nevid_cnt] = line.strip()
nevid_cnt += 1
else: # we're not in the header
if header: offset = f.tell()
header = False
try:
if current_gene_cnt < total_gene_cnt: # we are reading gene:terms annotations
peek = f.tell()
buf = f.read(calcsize('IIH'))
(gene_int, taxa_int, nterms) = unpack('IIH', buf)
current_gene_cnt += 1
                        if (not taxa) or (taxa_int == taxa) or (isinstance(taxa, (list, set, tuple)) and taxa_int in taxa):
self.annot_index[gene_int] = peek
bufsize = calcsize('?BI')
f.read(nterms * bufsize)
elif current_terms_cnt < total_terms_cnt: # we are reading term definitions (term is_a term, term, term, ...)
buf = f.read(calcsize('IcH'))
(term_int, onto_ch, nterms) = unpack('IcH', buf)
current_terms_cnt += 1
bufsize = calcsize('BI')
buf = f.read(nterms * bufsize)
terms_dict = {}
for pos in range(0, len(buf) - 1, bufsize):
(rel_ndx, sup_int) = unpack('BI', buf[pos:pos+bufsize])
terms_dict[sup_int] = rel_ndx
name_peek = f.tell()
f.readline() # skip putting name in memory, instead refer to the position in the file
self.termdefs[term_int] = (onto_ch, terms_dict, name_peek)
else:
buf = f.read(calcsize('II'))
(annot_offset, obo_offset) = unpack('II', buf)
break
                except error as inst:
                    print "Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()
                    sys.exit(3)
print "Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)
print "Annotations start at", annot_offset, "\nDefinitions start at", obo_offset
return f
#FIXME: write code to perform test of taxa enrichment
def getGOReport_byScore(self, gene_score_map, negatives_score_map = {}, include_more_general = True, descending_order = True):
""" Generate a complete GO term report for a set of genes with associated scores.
Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
indicating the enrichment of term to "top" genes in descending order by score (by default).
"""
fg_map = self.getTerms(gene_score_map.keys(), include_more_general = include_more_general)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
term_set = set(fg_list)
term_pval = {}
if len(negatives_score_map) > 0:
bg_map = self.getTerms(negatives_score_map.keys(), include_more_general = include_more_general)
for t in term_set:
pos = []
neg = []
for gene in gene_score_map:
annot = fg_map[gene]
                if annot is not None:
if t in annot:
pos.append(gene_score_map[gene])
else:
neg.append(gene_score_map[gene])
if len(pos) > 0 and len(neg) > 0:
if descending_order:
p = stats.getRSpval(neg, pos)
else:
p = stats.getRSpval(pos, neg)
if len(negatives_score_map) > 0 and p <= 0.05:
mpos = pos # scores of foreground genes with matching GO term
mneg = [] # scores of background genes with matching GO terms
for gene in negatives_score_map:
annot = bg_map[gene]
                        if annot is not None:
if t in annot:
mneg.append(negatives_score_map[gene])
if len(mneg) > 0:
if descending_order:
p2 = stats.getRSpval(mneg, mpos)
else:
p2 = stats.getRSpval(mpos, mneg)
else:
p2 = 0.0
term_pval[t] = (p, p2)
else:
term_pval[t] = (p, 1.0)
sorted_pval = sorted(term_pval.items(), key=lambda v: v[1][0], reverse=False)
ret = []
for t in sorted_pval:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
return ret
def getGOReport(self, positives, background = None, taxa = None, include_more_general = True):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = self.getTerms(pos, include_more_general = include_more_general)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = self.getTerms(neg, include_more_general = include_more_general)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
else:
ret.append((t[0], t[1], defin[2], defin[0]))
return ret
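# A minimal usage sketch for the binary workflow (per the module docstring; file
# names are placeholders): aggregate an annotation file and an OBO file into a
# bit file once, then query it cheaply.
# >>> writeBitFile('gene_association.goa_ref_human', 'go-basic.obo', 'human.bgo')
# >>> bgo = BinGO('human.bgo')
# >>> terms = bgo.getTerms(['MYC_HUMAN'])        # placeholder gene product ID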
def encode(code_me, encode_strings):
code = 0
accum = 1
try:
for pos in range(len(code_me)):
codelen = len(encode_strings[pos])
for i in range(codelen):
if encode_strings[pos][i] == code_me[pos]:
code += accum * i
accum *= codelen
break
except IndexError as e:
print e, code_me
return code
def decode(code, encode_strings):
    npos = len(encode_strings)
    accum = [1 for _ in range(npos)]
    string = [] # fallback in case decoding fails below
    try:
        for pos in range(1, npos): accum[pos] = accum[pos - 1] * len(encode_strings[pos - 1])
        indices = [-1 for _ in range(npos)]
        for pos in range(npos - 1, -1, -1): # go backwards, start at last (most significant) position
            indices[pos] = code / accum[pos] # integer division (Python 2)
            code -= accum[pos] * indices[pos]
        string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
    except IndexError as e:
        print e, code
    return string
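# Round-trip check of the fixed-radix encoding (illustrative only): with the
# per-position alphabets 'ABC' and '0123', 'B2' encodes to 1*1 + 2*3 = 7.
# >>> encode('B2', ['ABC', '0123'])
# 7
# >>> decode(7, ['ABC', '0123'])
# ['B', '2']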
def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
""" Extract appropriate details from annotation file. This typically follows:
1. DB
Database from which entry has been taken.
Example: PDB
2. DB_Object_ID
A unique identifier in the DB for the item being annotated.
Here: PDB ID and chain ID of the PDB entry.
Example: 2EKB_A
3. DB_Object_Symbol
Here: PDB ID and chain ID of the PDB entry.
        Example: 2EKB_A
4. Qualifiers
This column is used for flags that modify the interpretation of an annotation.
This field may be equal to: NOT, colocalizes_with, contributes_to,
NOT|contributes_to, NOT|colocalizes_with
Example: NOT
5. GO Identifier
The GO identifier for the term attributed to the DB_Object_ID.
Example: GO:0005625
6. DB:Reference
A single reference cited to support an annotation.
Where an annotation cannot reference a paper, this field will contain
a GO_REF identifier. See section 8 and
http://www.geneontology.org/doc/GO.references
for an explanation of the reference types used.
Example: PMID:9058808
7. Evidence
One of either EXP, IMP, IC, IGI, IPI, ISS, IDA, IEP, IEA, TAS, NAS,
NR, ND or RCA.
Example: TAS
9. Aspect
One of the three ontologies: P (biological process), F (molecular function)
or C (cellular component).
Example: P
        In columns, specify the index (0-based, NOT 1-based) of the gene, symb, qual, term, evid and onto fields.
"""
fields = line.strip().split('\t')
gene = fields[columns[0]]
symb = fields[columns[1]]
qual = fields[columns[2]]
term = fields[columns[3]]
    if not term.startswith('GO:'):
        term = None
        raise RuntimeError('No GO term on line: ' + line)
evid = fields[columns[4]]
    if evid not in evid_codes:
evid = None
onto = fields[columns[5]]
    if onto not in onto_codes:
onto = None
taxa_idx = line.find('taxon:')
if taxa_idx == -1:
taxa = None
else:
taxa = line[taxa_idx:]
taxa = taxa.split('\t')
taxa_spec = taxa[0].split(':')
taxa = int(taxa_spec[len(taxa_spec) - 1]) # pick last taxon ID
return (gene, symb, qual, term, evid, onto, taxa)
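# Example (illustrative only; fields are tab-separated in real GAF files):
#   UniProtKB  P63166  SUMO1  <qual>  GO:0005634  PMID:9058808  IDA  ...  C  ...  taxon:10090
# With the default columns (1,2,3,4,6,8) this would be parsed as
#   gene='P63166', symb='SUMO1', term='GO:0005634', evid='IDA', onto='C', taxa=10090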
def readOBOFile(obofile):
"""
http://www.geneontology.org/GO.format.obo-1_2.shtml
"""
src = open(obofile, 'r')
terms = {}
in_term_def = False
in_type_def = False
for line in src:
if in_term_def:
if line.startswith('id: '):
term_id = line[4:14]
term_is = set()
elif line.startswith('name: '):
term_name = line[6:].strip()
elif line.startswith('def: '):
# Note this is a multi-line field, delimited by "'s
pass
elif line.startswith('namespace: '):
if line[11] == 'b': term_onto = 'P'
elif line[11] == 'm': term_onto = 'F'
elif line[11] == 'c': term_onto = 'C'
elif line.startswith('is_a: '):
term_is.add((line[6:16], 'is_a'))
elif line.startswith('relationship: '):
fields = line.split()
term_is.add((fields[2], fields[1]))
elif line.startswith('intersection_of: '):
fields = line.split()
if fields[1].startswith('GO:'):
term_is.add((fields[1], 'isect'))
else:
term_is.add((fields[2], fields[1]))
elif line.startswith('is_obsolete: '):
in_term_def = False # ignore this entry
if line.startswith('[Term]'):
if in_term_def: # already defining one, stash it before moving on to the next...
terms[term_id] = (term_name, term_onto, term_is)
elif in_type_def:
in_type_def = False
in_term_def = True
if line.startswith('[Typedef]'):
if in_term_def: # already defining one, stash it before moving on to the next...
in_term_def= False
in_type_def = True
if in_term_def: # defining one, stash it
terms[term_id] = (term_name, term_onto, term_is)
return terms
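# An abridged example of the [Term] stanza format parsed above:
#   [Term]
#   id: GO:0000001
#   name: mitochondrion inheritance
#   namespace: biological_process
#   is_a: GO:0048308 ! organelle inheritance
# which would be stored as
#   terms['GO:0000001'] = ('mitochondrion inheritance', 'P', set([('GO:0048308', 'is_a')]))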
def writeBitFile(annotFile, obofile, destFile, taxas = None):
print "Started at", time.asctime()
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_index = [{} for _ in range(6)] # count different characters in different positions
term_index = [{} for _ in range(7)] # count different characters in different positions
evid_index = {}
gene_cnt = 0
cnt = 0
prev_gene = None
for line in src:
cnt += 1
#if cnt > 100000:
# break
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if taxas and not ((taxa == taxas) or (taxa in taxas)): # the gene does NOT belong to a nominated taxon
            continue
        if gene != prev_gene: # not the same gene as the previous line
            gene_cnt += 1
            try:
                evid_index[evid]
            except KeyError: # a new evidence code
                evid_index[evid] = len(evid_index)
pos = 0
for ch in gene[0:6]:
try:
gene_index[pos][ch]
except KeyError: # no match
gene_index[pos][ch] = len(gene_index[pos])
pos += 1
pos = 0
for ch in term[3:10]:
try:
term_index[pos][ch]
except KeyError: # no match
term_index[pos][ch] = len(term_index[pos])
pos += 1
prev_gene = gene
src.close()
print "Read annotations for %d genes" % gene_cnt
gene_code = ['' for _ in range(6)]
term_code = ['' for _ in range(7)]
for d in range(len(gene_index)):
arr = ['?' for _ in gene_index[d]]
for (ch, index) in gene_index[d].items():
arr[index] = ch
gene_code[d] = ''.join(arr)
for d in range(len(term_index)):
arr = ['?' for _ in term_index[d]]
        for (ch, index) in term_index[d].items():
arr[index] = ch
term_code[d] = ''.join(arr)
evid_code = ['' for _ in range(len(evid_index))]
for (e, ndx) in evid_index.items():
evid_code[ndx] = e
# Get GO definitions
terms = readOBOFile(obofile)
print "Read %d GO definitions" % len(terms)
# re-open, now with the aim of copying info
src = open(annotFile, 'r')
dst = open(destFile, 'w')
# STEP 1: header info
dst.write("%d\t%d\t%d\t%d\t%d\n" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index)))
for code_str in gene_code:
dst.write(code_str+"\n")
for code_str in term_code:
dst.write(code_str+"\n")
for e_str in evid_code:
dst.write(e_str+'\n')
print "Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())
# STEP 2: write annotations
    annot_offset = dst.tell()
    prev_gene = None
    prev_taxa = None # taxon of the previous line's gene (written with its record)
    concat_terms = {}
cnt = 0
for line in src:
cnt += 1
#if cnt > 100000:
# break
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if taxas and not ((taxa == taxas) or (taxa in taxas)): # The gene does NOT belong to a nominated taxon
            continue
        if gene != prev_gene: # a new gene is found
            if prev_gene != None:
                # write data for the previous gene, using its own taxon
                s = pack('IIH', encode(prev_gene, gene_code), prev_taxa, len(concat_terms))
                dst.write(s)
                for t in concat_terms:
                    (o, q, e) = concat_terms[t]
                    s = pack('?BI', q, evid_index[e], encode(t, term_code))
                    dst.write(s)
            # re-init
            prev_gene = gene
            prev_taxa = taxa
            concat_terms = {}
        concat_terms[term[3:]] = (onto, qual, evid)
    if len(concat_terms) > 0:
        # write data for the last gene in the buffer
        s = pack('IIH', encode(prev_gene, gene_code), prev_taxa, len(concat_terms))
dst.write(s)
for t in concat_terms:
(o, q, e) = concat_terms[t]
s = pack('?BI', q, evid_index[e], encode(t, term_code))
dst.write(s)
print "Wrote GO annotations, now at @%d" % dst.tell()
# Next, the ontology definition...
obo_offset = dst.tell() # remember the position where the OBO starts
    sorted_terms = sorted(terms.items(), key=operator.itemgetter(0))
for [t, _] in sorted_terms:
(term_name, term_onto, term_is) = terms[t]
s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
dst.write(s)
for (sup_term, sup_rel) in term_is:
try:
index = onto_rel.index(sup_rel)
except ValueError:
index = 9
s = pack('BI', index, encode(sup_term[3:], term_code))
dst.write(s)
dst.write(term_name + '\n')
print "Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())
# Finally, write the offsets to quickly access annotations and definitions, resp
dst.write(pack('II', annot_offset, obo_offset))
# done, close
dst.close()
print "Completed at", time.asctime()
###################################################
# This module is a supplement to the Python guide #
# Version 2.2016.1 (8/3/2016) #
###################################################
'''
This module contains code that can help solve bioinformatics problems.
See the accompanying Python guide for more explanations and examples.
Alphabet is a class that defines valid symbols that we then use to make up valid
biological sequences. Note that we also define variables corresponding to
DNA, RNA and Protein sequences that can be used directly.
Sequence defines basic parts and operations on biological sequences.
Alignment defines an alignment of sequences (how symbols in different sequences line
up when placed on top of one another). Alignment methods should generate instances of this class.
SubstMatrix defines a substitution matrix, i.e. a scoring system for performing
alignments. You can read these from files or construct them manually.
GeneProfile defines parts and operations for gene expression profiles. Essentially,
the class will help to index expression data by gene name (rows) and by sample name (columns).
There are several methods not tied to a particular class because they construct new instances,
e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc.
You need to have numpy installed (see http://www.numpy.org/).
Should work with Python v2.6-2.7 (see http://www.python.org/).
Has not been written to work with Python v3 and later--but this should be easy to do.
The code may contain bugs--please report to m.boden@uq.edu.au
'''
import math, numpy, urllib, urllib2
###############################################################################
# Alphabet #
###############################################################################
class Alphabet():
""" A minimal class for alphabets """
def __init__(self, symbolString):
self.symbols = symbolString
    def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ'))" results in 3
return len(self.symbols)
def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
return sym in self.symbols
def __iter__(self): # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print sym" prints A, C, G and T on separate lines
tsyms = tuple(self.symbols)
return tsyms.__iter__()
def __getitem__(self, ndx):
""" Retrieve the symbol(s) at the specified index (or slice of indices) """
return self.symbols[ndx]
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
return self.symbols.index(sym)
def __str__(self):
return self.symbols
""" Below we declare alphabet variables that are going to be available when
this module is imported """
DNA_Alphabet = Alphabet('ACGT')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
###############################################################################
# Sequence #
###############################################################################
class Sequence():
""" A biological sequence class. Stores the sequence itself,
the alphabet and a name.
Usage:
>>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC')
>>> print seq1
ABC: ACGGGAGAGG
>>> 'C' in seq1
True
>>> for sym in seq1:
... print sym
"""
def __init__(self, sequence, alphabet, name = '', gappy = False, annot = ''):
""" Construct a sequence from a string, an alphabet (gappy or not) and a name.
The parameter gappy is for sequences when used in alignments, which means that '-' is allowed. """
for sym in sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: ' + sym)
self.sequence = sequence
self.alphabet = alphabet
self.name = name
self.gappy = gappy
self.annot = annot # some annotation, e.g. species
def __len__(self): # the "len" operator
return len(self.sequence)
def __iter__(self): # method that allows us to iterate over a sequence
tsyms = tuple(self.sequence)
return tsyms.__iter__()
def __contains__(self, item): # test for membership (the "in" operator)
for sym in self.sequence:
if sym == item:
return True
return False
def __getitem__(self, ndx): # [ndx] operator (retrieve a specified index (or a "slice" of indices) of the sequence data.
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.annot + '\n'
data = self.sequence
nlines = (len(self.sequence) - 1) / 60 + 1
for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
return fasta
def __str__(self): # "pretty" print sequence
str = self.name + ': '
for sym in self:
str += sym
return str
def count(self, findme):
""" Get the number of occurrences of specified symbol """
cnt = 0
for sym in self.sequence:
if findme == sym:
cnt = cnt + 1
return cnt
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.find(findme)
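def _demo_writeFasta():
    """ A small sketch (not called by this module) of FASTA output. """
    seq = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC', annot = 'demo')
    print seq.writeFasta() # ">ABC demo" followed by the sequence data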
###############################################################################
# Alignment #
###############################################################################
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
THIS-LI-NE-
--ISALIGNED """
def __init__(self, seqs):
self.alphabet = None
self.alignlen = -1
self.seqs = seqs
self.namelen = 0
for s in seqs:
if self.alphabet == None:
self.alphabet = s.alphabet
elif self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: contains a mix of alphabets")
if self.alignlen == -1:
self.alignlen = len(s)
elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: lengths vary")
self.namelen = max(len(s.name), self.namelen)
def __str__(self):
string = ''
for seq in self.seqs:
string += seq.name.ljust(self.namelen+1)
for sym in seq:
string += sym
string += '\n'
return string
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment: the number of sequences. """
return len(self.seqs)
def __getitem__(self, ndx):
return self.seqs[ndx]
def calcDistances(self, measure, a=1.0):
""" Calculate the evolutionary distance between all pairs of sequences
in this alignment, using the given measure. Measure can be one of
'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
given, then the parameter a must also be specified (or else it will use
the default value of 1.0).
Definitions of each distance metric are found in Zvelebil and Baum p268-276.
These are mostly intended for DNA, but adapted for protein (as below).
Note however that there are alternative distance matrices for proteins (p276).
"""
measure = measure.lower()
if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
a = float(a)
distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
# Loop through each pair of sequences
for i in range(len(self.seqs)):
for j in range(i + 1, len(self.seqs)):
seqA = self.seqs[i]
seqB = self.seqs[j]
# Calculate the fractional distance (p) first
# The two sequences of interest are in seqA and seqB
L = 0
D = 0
for k in range(self.alignlen):
# For every non-gapped column, put to L
# For every non-gapped column where the sequences are
# different, put to D
if seqA[k] != '-' and seqB[k] != '-':
L += 1
if seqA[k] != seqB[k]:
D += 1
p = float(D)/L
# Now calculate the specified measure based on p
if measure == 'fractional':
dist = p
else:
raise RuntimeError('Not implemented: %s' % measure)
distmat[i, j] = distmat[j, i] = dist
return distmat
def writeClustal(self):
""" Write the alignment to a string using the Clustal file format. """
symbolsPerLine = 60
maxNameLength = self.namelen + 1
mystring = ''
wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows):
for j in range(len(self.seqs)):
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
mystring += '\n'
# Possible last row
lastRowLength = self.alignlen - wholeRows*symbolsPerLine
if lastRowLength > 0:
for j in range(len(self.seqs)):
if maxNameLength > 0:
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][-lastRowLength:] + '\n'
return mystring
def writeHTML(self, filename):
""" Generate HTML that displays the alignment in colour.
"""
fh = open(filename, 'w')
fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += str(i/10+1)[-1]
else:
html += ' '
html += '%s\n' % (self.alignlen)
fh.write(html)
if self.alignlen > 10:
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0'
else:
html += ' '
html += '\n'
fh.write(html)
if len(self.alphabet) <= 5: # DNA or RNA
colours = {'A':'green','C':'orange','G':'red','T':'#66bbff','U':'#66bbff'}
else: # amino acids
colours = {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'}
for seq in self.seqs:
html = seq.name.ljust(self.namelen) + ' '
for sym in seq:
try:
colour = colours[sym]
except KeyError:
colour = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (colour, sym)
html += '\n'
fh.write(html)
fh.write('</pre></body></html>\n')
fh.close()
def scoreAlignment(self, substmat = None, gap = -1):
"""Score the alignment using a substitution matrix (substmat).
If the alignment consists of more than two sequences, the minimum
score of each column is used.
If substmat is not specified (None), the count of matches is returned.
"""
nseqs = len(self.seqs)
total = 0
for pos in range(self.alignlen):
            minscore = None
for i in range(nseqs):
for j in range(i+1, nseqs):
gap_here = self.seqs[i][pos] == '-' or self.seqs[j][pos] == '-'
score = 0
if substmat == None:
if self.seqs[i][pos] == self.seqs[j][pos]:
score = 1
else: # we have a substitution matrix
if gap_here:
score = gap
else:
score = substmat.get(self.seqs[i][pos], self.seqs[j][pos])
                    if minscore == None:
                        minscore = score
                    elif minscore > score:
                        minscore = score
            total += minscore
return total
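def _demo_scoreAlignment():
    """ A minimal sketch (not called by this module): score a toy alignment
        by match count (no substitution matrix supplied). """
    seqs = [Sequence('AC-GT', DNA_Alphabet, 'a', gappy = True),
            Sequence('ACAGT', DNA_Alphabet, 'b', gappy = True)]
    print Alignment(seqs).scoreAlignment() # 4 matching columns score 1 each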
###############################################################################
# Methods to create instances of Alignment #
###############################################################################
def align(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Needleman-Wunsch
(global) algorithm. substMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
stringA, stringB = seqA.sequence, seqB.sequence
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S) and a matrix for traceback
S = numpy.zeros((lenA + 1, lenB + 1))
Traceback = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = i * gap
for j in range(lenB + 1):
S[0, j] = j * gap
# Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(stringA[i-1], stringB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
Traceback[i, j] = numpy.argmax([match, delete, insert])
S[i, j] = max([match, delete, insert])
# Trace back the optimal alignment
alignA = ''
alignB = ''
# Start at the end
i = lenA
j = lenB
# Stop when we hit the end of a sequence
while i > 0 and j > 0:
if Traceback[i, j] == 1:
# Got here by a gap in sequence B (go up)
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif Traceback[i, j] == 2:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = stringB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = stringA[i-1] + alignA
alignB = stringB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., trace back all the way to S[0, 0])
while i > 0:
# Go up
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = stringB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
###############################################################################
# SubstMatrix #
###############################################################################
class SubstMatrix():
""" Create a substitution matrix for an alphabet.
Example usage:
>>> sm = SubstMatrix(DNA_Alphabet)
>>> for a in DNA_Alphabet:
... for b in DNA_Alphabet:
... if a > b:
... sm.set(a, b, -1)
... elif a == b:
... sm.set(a, b, +1)
...
>>> print sm
A 1
C -1 1
G -1 -1 1
T -1 -1 -1 1
A C G T
>>> sm.get('C', 'T')
-1
"""
def __init__(self, alphabet, scoremat = None):
self.scoremat = scoremat or {} # start with empty dictionary
self.alphabet = alphabet
def _getkey(self, sym1, sym2):
""" Construct canonical (unordered) key for two symbols """
if sym1 <= sym2:
return tuple([sym1, sym2])
else:
return tuple([sym2, sym1])
def set(self, sym1, sym2, score):
""" Add a score to the substitution matrix """
self.scoremat[self._getkey(sym1, sym2)] = score
def get(self, sym1, sym2):
return self.scoremat[self._getkey(sym1, sym2)]
def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols)
string = ''
for a in symbols:
string += a + ' '
for b in symbols[:len(symbols)-i+1]:
score = self.scoremat[self._getkey(a, b)]
if score != None:
string += str(score).rjust(3) + ' '
else:
string += "?".rjust(3) + ' '
string += '\n'
i -= 1
string += ' ' + ' '.join(self.alphabet.symbols)
return string
def writeFile(self, filename):
""" Write this substitution matrix to the given file. """
fh = open(filename, 'w')
        contents = ''
        for key in self.scoremat:
            contents += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
        fh.write(contents)
fh.close()
###############################################################################
# Below are some useful methods for loading data from strings and files. #
# They recognize the FASTA and Clustal formats (nothing fancy). #
###############################################################################
def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet)
fh = open(filename, 'r')
data = fh.read()
fh.close()
lines = data.splitlines()
for line in lines:
if len(line.strip()) == 0:
continue
symbols, score = line.split(':')
score = int(score)
mat.set(symbols[0], symbols[1], score)
return mat
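def _demo_substmatrix_file():
    """ A minimal sketch (not called by this module): write a substitution
        matrix to a file and read it back. 'demo.matrix' is a made-up name. """
    sm = SubstMatrix(DNA_Alphabet)
    for a in DNA_Alphabet:
        for b in DNA_Alphabet:
            sm.set(a, b, +1 if a == b else -1)
    sm.writeFile('demo.matrix')
    sm2 = readSubstMatrix('demo.matrix', DNA_Alphabet)
    print sm2.get('A', 'A'), sm2.get('A', 'C') # 1 -1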
def readFastaString(string, alphabet, gappy = False):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it. """
seqlist = [] # list of sequences contained in the string
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines
continue
if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current
current = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
seqlist.append(current)
# now collect data about the new sequence
parts = line[1:].split() # skip first char
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
if len(parts) > 0: seqname = parts[0]
if len(parts) > 1: seqannot = line[len(seqname) + 2:] # the rest of the line
seqdata = []
else: # we assume this is (more) data for current
cleanline = line.split()
for thisline in cleanline:
seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains
if seqname:
lastseq = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
seqlist.append(lastseq)
return seqlist
def readFastaFile(filename, alphabet, gappy = False):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. """
fh = open(filename)
data = fh.read()
fh.close()
seqlist = readFastaString(data, alphabet, gappy)
return seqlist
def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """
fh = open(filename, 'w')
for seq in seqs:
fh.write(seq.writeFasta())
fh.close()
def readClustalString(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an
Alignment object. """
seqs = {} # sequence data
for line in string.splitlines():
if line.startswith('CLUSTAL') or line.startswith('STOCKHOLM') \
or line.startswith('#'):
continue
if len(line.strip()) == 0:
continue
if line[0] == ' ' or '*' in line or ':' in line:
continue
sections = line.split()
name, seq = sections[0:2]
        if name in seqs:
seqs[name] += seq
else:
seqs[name] = seq
sequences = []
for name, seq in seqs.items():
sequences.append(Sequence(seq, alphabet, name, gappy = True))
return Alignment(sequences)
def readClustalFile(filename, alphabet):
""" Read a ClustalW2 alignment file and return an Alignment object
containing the alignment. """
fh = open(filename)
data = fh.read()
fh.close()
aln = readClustalString(data, alphabet)
return aln
def writeClustalFile(filename, aln):
""" Write the specified alignment to a Clustal file. """
fh = open(filename, 'w')
fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
fh.write(aln.writeClustal())
fh.close()
###############################################################################
# GeneProfile #
###############################################################################
class GeneProfile():
""" A class for gene expression data.
Example usage:
>>> gp = GeneProfile('MyMicroarray', ['Exp1', 'Exp2'])
>>> gp['gene1'] = [0.1, 0.5]
>>> gp['gene2'] = [2, 1]
>>> gp.getSample('Exp2')
{'gene1': [0.5], 'gene2': [1.0]}
"""
def __init__(self, dataset_name='', sample_names=[], profiles = None):
""" Create a gene profile set. """
self.name = dataset_name
self.samples = sample_names
self.genes = profiles or {} # dictionary for storing all gene--measurement pairs
def __setitem__(self, name, probevalues):
if len(probevalues) == len(self.samples):
self.genes[name] = [float(y) for y in probevalues]
else:
raise RuntimeError('Invalid number of measurements for probe ' + name)
def __getitem__(self, name):
return self.genes[name]
def getSorted(self, index, descending=True):
"""Get a list of (gene, value) tuples in descending order by value"""
key_fn = lambda v: v[1][index]
return sorted(self.genes.items(), key=key_fn, reverse=descending)
def addSample(self, sample_name, sample_dict):
"""Add a sample to the current data set.
sample_dict is a dictionary with the same keys as the current gene set.
Only values for genes in the current set will be added. """
        self.samples.extend(sample_name) # note: sample_name is expected to be a list of sample names
if not self.genes:
self.genes = sample_dict
else:
for gene in self.genes:
values = sample_dict[gene]
if values:
self.genes[gene].extend([float(y) for y in values])
else:
self.genes[gene].extend([0.0 for _ in sample_name])
return self.genes
def getSample(self, sample_name):
"""Construct a gene dictionary including only named samples. """
mygenes = {}
if isinstance(sample_name, str): # a single sample-name
mysamples = [sample_name]
else: # a list of sample-names
mysamples = sample_name
for gene in self.genes:
mygenes[gene] = []
for name in mysamples:
mygenes[gene].append(self.genes[gene][self.samples.index(name)])
return mygenes
def getRatio(self, sample1, sample2):
"""Get the ratio of two samples in the data set. """
mygenes = {}
index1 = self.samples.index(sample1)
index2 = self.samples.index(sample2)
for gene in self.genes:
mygenes[gene] = []
mygenes[gene].append(self.genes[gene][index1] / self.genes[gene][index2])
return mygenes
    def __str__(self):
        """ Return the data as a truncated GEO SOFT formatted string. """
        line = '^DATASET = ' + self.name + '\n'
        line += '!dataset_table_begin\nID_REF\t'
        for header in self.samples:
            line += header + '\t'
        line += '\n'
        for gene in self.genes:
            line += gene + '\t'
            values = self.genes[gene]
            for value in values:
                line += format(value, '5.3f') + '\t'
            line += '\n'
        line += '!dataset_table_end\n'
        return line
def writeGeoFile(self, filename):
fh = open(filename, 'w')
fh.write(str(self))
fh.close()
def getLog(genedict, base=2):
"""Get the log-transformed value of a sample/column. """
mygenes = {}
for gene in genedict:
mygenes[gene] = []
for sample in genedict[gene]:
mygenes[gene].append(math.log(sample, base))
return mygenes
def readGeoFile(filename, id_column = 0):
""" Read a Gene Expression Omnibus SOFT file. """
dataset = None
fh = open(filename, "rU")
manylines = fh.read()
fh.close()
data_rows = False # Indicates whether we're reading the data section or metadata
name = 'Unknown'
cnt_data = 0
for line in manylines.splitlines():
if line.startswith('^DATASET'):
name = line.split('= ')[1]
continue
        if line.startswith('!dataset_table_begin'):
            data_rows = True
        if line.startswith('!dataset_table_end'):
            data_rows = False
if len(line.strip()) == 0 or line.startswith('!') or line.startswith('#') or line.startswith('^'):
continue
if data_rows:
cnt_data += 1
if (cnt_data == 1): # First line contains the headers
headers = line.split('\t')
dataset = GeneProfile(name, headers[2:]) # Create the data set
continue
ignore = (dataset == None) # ignore the row if the dataset is not initialised
id = line.split('\t')[id_column]
values = []
cnt_word = 0
for word in line.split('\t'):
cnt_word += 1
if cnt_word <= (id_column + 1): # ignore the gene names
continue
if word == 'null':
ignore = True # ignore gene if a value is null
continue
            try:
                values.append(float(word))
            except ValueError: # skip values that cannot be parsed as floats
                continue
if not ignore:
dataset[id] = tuple(values)
print 'Data set %s contains %d genes' % (name, len(dataset.genes))
return dataset
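def _demo_geneprofile():
    """ A minimal sketch (not called by this module) combining GeneProfile
        operations: per-gene ratios between samples and log-transformed values. """
    gp = GeneProfile('Demo', ['Exp1', 'Exp2'])
    gp['gene1'] = [4.0, 2.0]
    gp['gene2'] = [1.0, 8.0]
    print gp.getRatio('Exp1', 'Exp2')  # {'gene1': [2.0], 'gene2': [0.125]}
    print getLog(gp.getSample('Exp2')) # {'gene1': [1.0], 'gene2': [3.0]}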
###############################################################################
# Web service methods that find data in online databases.
# Our implementations are mainly serviced by EBI.
###############################################################################
def getSequence(entryId, dbName, alphabet):
""" Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' (Uniprot Accession) or 'SUMO1_MOUSE' (Uniprot Identifier)
dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'.
See: http://www.uniprot.org/faq/28. """
url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' +\
dbName + '&format=fasta&id=' + entryId
try:
data = urllib2.urlopen(url).read()
return readFastaString(data, alphabet)[0]
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def searchSequences(query, dbName = 'uniprot'):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
return data.splitlines()
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
return words
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
return
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
"""
Map identifiers between databases (based on UniProtKB;
see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the abbreviation for the identifier FROM which to idmap
to: the abbreviation for the identifier TO which to idmap
Returns a dictionary with key (from) -> value (to).
ACC is Uniprot Accession (e.g. 'P42813').
"""
url = 'http://www.uniprot.org/mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : 'tab',
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
d[pair[0]] = pair[1]
return d
else:
return dict()
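# Example usage (not run here; requires network access). The returned
# dictionary maps each input identifier to its counterpart, e.g.
#   >>> idmap(['P63166'], frm = 'ACC', to = 'P_REFSEQ_AC')
# would map the UniProt accession to a RefSeq protein identifier.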
###############################################################################
# Gene Ontology services.
# See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
###############################################################################
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = 'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces
if field in entry.keys(): # check if we need field
if entry[field] == None: # check if assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, db='UniProtKB'):
"""
Retrieve all GO terms for a given set of genes (or single gene).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'.
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
    if not isinstance(genes, (list, set, tuple)):
        genes = [genes] # if 'genes' is a single gene, we make a single item list
    gomap = dict()
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein='
for gene in genes:
terms = set() # empty result set
url = uri + gene.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore header row
values = row.split('\t')
if len(values) >= 7:
terms.add(values[6]) # add term to result set
            gomap[gene] = list(terms) # make a list of the set
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
    if len(genes) == 1:
        return gomap[genes[0]]
    else:
        return gomap
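# Example usage (not run here; requires network access):
#   >>> getGOTerms('P63166')             # list of GO terms for one gene
#   >>> getGOTerms(['P63166', 'P63165']) # dictionary of gene -> list of terms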
def getGenes(goterms, db='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms
(or single GO term).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
    if not isinstance(goterms, (list, set, tuple)):
        goterms = [goterms]
    gomap = dict()
if taxo == None:
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&term='
else:
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&tax='+\
str(taxo)+'&term='
for goterm in goterms:
genes = set() # start with empty result set
url = uri + goterm.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1]) # add gene name to result set
            gomap[goterm] = list(genes)
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return gomap[goterms[0]]
    else:
        return gomap
###############################################################################
# PhyloTree #
###############################################################################
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
    def __init__(self, root):
        """ Create a tree from a node that is "root" in the tree."""
        self.root = root
        self.aln = None # the alignment associated via putAlignment (if any)
def putAlignment(self, aln):
""" Associate the tree with a set of sequences/alignment.
Involves assigning the sequence to the leaf nodes. """
self.aln = aln
self.root._assignAlignment(aln)
def __str__(self):
""" Produce a printable representation of the tree, specifically the root of the tree. """
return str(self.root)
def strSequences(self, start = None, end = None):
""" Produce a sequence representation of the tree, specifically the root of the tree.
Specify the start and end positions in the alignment for the sequence to be printed
(if None the min and max positions will be used). """
if self.aln != None:
my_start = start or 0
my_end = end or self.aln.alignlen
return self.root._printSequences(my_start, my_end)
def findLabel(self, label):
""" Retrieve/return the node with the specified label.
Returns None if not found."""
return self.root._findLabel(label)
def getDescendantsOf(self, node, transitive = False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
transitive indicates if only the direct descendants (False) or if all descendants
should be returned.
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
def getAncestorsOf(self, node, transitive = False):
""" Retrieve and return the ancestor (transitive=False) or
ancestors (transitive=True) of a specified node.
Node can be the label or the instance.
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
myroot = myroot.right
if found and transitive:
return branching
            elif found and len(branching) > 0:
                return branching[-1]
return None
def parsimony(self):
""" Solve the "small parsimony problem",
i.e. find the sequences on each of the internal nodes.
See Jones and Pevzner, p. 368 and onwards, for details. """
self.root._forwardParsimony(self.aln) # setup and compute scores for all nodes
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
###############################################################################
# PhyloNode #
###############################################################################
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, label = ''):
""" Initialise an initially unlinked node.
Populate fields left and right to link it with other nodes.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, backleft and backright for maximum parsimony. """
self.left = None
self.right = None
self.data = None
self.label = label
self.dist = None
        self.sequence = None # the sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
self.seqscores = None # The scores propagated from leaves via children
self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = str(self.left)
if self.right:
right = str(self.right)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.label != None:
label = str(self.label)
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = self.left._printSequences(start, end)
if self.right:
right = self.right._printSequences(start, end)
if self.dist:
dist = ':' + str(self.dist)
if self.sequence != None:
label = "".join(self.sequence[start:end]) + ""
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _findLabel(self, label):
""" Find a node by label at this node or in any descendants (recursively). """
if self.label == label:
return self
else:
if self.left:
foundLeft = self.left._findLabel(label)
if foundLeft:
return foundLeft
if self.right:
return self.right._findLabel(label)
return None
def _propagateDistance(self, parent_dist):
""" Convert absolute distances to relative.
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
def _assignAlignment(self, aln):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self.sequence = None
if self.left != None:
self.left._assignAlignment(aln)
if self.right != None:
self.right._assignAlignment(aln)
for seq in aln.seqs:
if seq.name == self.label:
self.sequence = seq
break
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen):
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0
best_symb_right = 0
for a_left in range(len(aln.alphabet)):
score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a_left
best_score_left = score
for a_right in range(len(aln.alphabet)):
score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a_right
best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right
else:
self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
return self.seqscores
def _backwardParsimony(self, aln, seq = None):
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
self.sequence = Sequence(currbuf, aln.alphabet, self.label, gappy = True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
col += 1
self.left._backwardParsimony(aln, Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
self.right._backwardParsimony(aln, Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
return self.sequence
def getSequence(self):
""" Get the sequence for the node. Return None if no sequence is assigned.
Requires that an alignment is associated with the tree, and that sequence names match node labels.
If the explored node is not a leaf, the sequence can be determined by parsimony. """
if self.sequence != None: # a sequence has been assigned
return self.sequence
elif self.seqscores != None: # inferred by parsimony but not yet assigned
            return None # determine the most parsimonious sequence; not yet implemented
def isAncestorOf(self, node, transitive = True):
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
        elif transitive:
            if self.left:
                statusLeft = self.left.isAncestorOf(node, transitive)
                if statusLeft: return True
            if self.right:
                return self.right.isAncestorOf(node, transitive)
            return False
        else:
            return False
def getDescendants(self, transitive = False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
if not transitive:
return children
else:
grandchildren = []
for c in children:
d = c.getDescendants(transitive)
if d:
grandchildren.extend(d)
children.extend(grandchildren)
return children
###############################################################################
# Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
# Methods for processing files of trees on the Newick format
###############################################################################
def runUPGMA(aln, measure, absoluteDistances = False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned."""
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single one below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
            if closest_dist == None or dist < closest_dist:
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
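def _demo_upgma():
    """ A minimal sketch (not called by this module): cluster three aligned
        sequences into a rooted tree using UPGMA and fractional distances. """
    seqs = [Sequence('ACGT', DNA_Alphabet, 'a', gappy = True),
            Sequence('ACGA', DNA_Alphabet, 'b', gappy = True),
            Sequence('TTGA', DNA_Alphabet, 'c', gappy = True)]
    tree = runUPGMA(Alignment(seqs), 'fractional')
    print tree # a Newick-style string; a and b should be grouped first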
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
def _findComma(string, level = 0):
""" Find first comma at specified level of embedding """
mylevel = 0
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
return i
return -1
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
y = tail.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def parseNewick(string):
""" Main method for parsing a Newick string into a (phylogenetic) tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns an instance of a PhyloTree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return PhyloTree(parseNewickNode(string))
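def _demo_parseNewick():
    """ A minimal sketch (not called by this module) of Newick parsing. """
    tree = parseNewick('((a:0.1,b:0.2)ab:0.3,c:0.4);')
    print tree.findLabel('ab') # prints the sub-tree rooted at node 'ab'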
def readNewickFile(filename):
""" Read file on Newick format.
Returns an instance of a PhyloTree."""
f = open(filename)
string = ''.join(f)
return parseNewick(string)
def writeNewickFile(filename, tree):
""" Write the specified tree to a Newick file. """
fh = open(filename, 'w')
    fh.write(str(tree))
fh.close()
###############################################################################
# Below is code that will be run if the module is "run", and not just "imported".
###############################################################################
if __name__=='__main__':
x = Sequence('ACTGA', DNA_Alphabet, 'x')
print "Sequence", x, "is constructed from the symbols", x.alphabet.symbols
print "( There are", x.count('A'), "occurrences of the symbol 'A' in", x.sequence, ")"
y = Sequence('TACGA', DNA_Alphabet, 'y')
print "Sequence", y, "is constructed from the symbols", y.alphabet.symbols
print
print "( The sub-sequence 'CG' starts at index", y.find('CG'), "of", y.sequence, ")"
print
sm = SubstMatrix(DNA_Alphabet)
for a in DNA_Alphabet:
for b in DNA_Alphabet:
if a==b:
sm.set(a, b, +2) # match
else:
sm.set(a, b, -1) # mismatch
print "Below is a substitution matrix for the alphabet", DNA_Alphabet.symbols
print sm
print
aln = align(x, y, sm, -2)
print "Below is the alignment between x and y"
print aln
import numpy
import numpy.random
import math
import random
class NN():
"""
A basic implementation of a standard, multi-layer, feed-forward neural network
and back-propagation learning.
"""
def __init__(self, nInput, nHidden, nOutput):
""" Constructs a neural network and initializes its weights to small random values.
nInput Number of input nodes
nHidden Number of hidden nodes
nOutput Number of output nodes
"""
self.ninput = nInput
self.hidden = numpy.empty(nHidden) # hidden nodes
self.output = numpy.empty(nOutput) # output nodes
self.w_hid = numpy.random.randn(nHidden, nInput) # weights in -> hid
self.b_hid = numpy.random.randn(nHidden) # biases hidden layer
self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out
self.b_out = numpy.random.randn(nOutput) # biases output layer
print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output))
def writeFile(self, filename):
""" Save NN to a file. """
f = open(filename, 'w')
f.write(str(self.ninput)+'\n')
f.write(str(len(self.hidden))+'\n')
f.write(str(len(self.output))+'\n')
for row in self.w_hid:
for w in row:
f.write(str(w)+'\n')
for b in self.b_hid:
f.write(str(b)+'\n')
for row in self.w_out:
for w in row:
f.write(str(w)+'\n')
for b in self.b_out:
f.write(str(b)+'\n')
f.close()
def _fLogistic(self, net):
""" The logistic output function.
Computes the output value of a node given the summed incoming activation,
values bounded between 0 and 1.
net: The summed incoming activation. """
return 1.0 / (1.0 + numpy.exp(-net))
def _fSoftmax(self, net):
""" The softmax output function.
Computes the output value of a node given the summed incoming activation,
values bounded between 0 and 1, where all add to 1.0.
net: The summed incoming activation for each output (must be the full layer). """
        tmp = numpy.exp(net)
        total = numpy.sum(tmp)
        out = tmp / total
        return out
def _fprimeLogistic(self, y):
""" The derivative of the logistic output function.
y: The value by which the gradient is determined.
returns the gradient at output y. """
return y * (1.0 - y)
def feedforward(self, input):
""" Computes the output values of the output nodes in the network given input values.
input: the one-dim array of input values
returns the one-dim array of computed output values. """
# compute the activation of each hidden node (depends on supplied input values)
self.hidden = self._fLogistic(self.w_hid.dot(input) + self.b_hid)
# compute the activation of each output node (depends on hidden node activations computed above)
if len(self.output) == 1:
self.output = self._fLogistic(self.w_out.dot(self.hidden) + self.b_out)
else:
self.output = self._fSoftmax(self.w_out.dot(self.hidden) + self.b_out)
return self.output
def test(self, inputs, targets):
""" Create a confusion matrix for all predictions with known target classes. """
cm = numpy.zeros((len(self.output), len(self.output))) # confusion matrix
for p in range(len(inputs)):
input = inputs[p]
target = targets[p]
# present the input and calculate the outputs
output = self.feedforward(input)
# which class?
c_targ = maxIndex(target)
c_pred = maxIndex(output)
cm[c_targ, c_pred] += 1
return cm
def train(self, input, target, eta = 0.1, niter = 1, shuffle = True):
""" Adapts weights in the network given the values that should appear at the output (target)
when the input has been presented. The procedure is known as error back-propagation.
This implementation is "online" rather than "batched", that is, the change is not based
on the gradient of the global error, merely the local, pattern-specific error.
input: The input pattern(s): either a single one-dim array, or an array of patterns
target: The desired output values (one per pattern if multiple are given)
niter: The number of iterations (epochs) over the training patterns (default 1)
eta: The learning rate, always between 0 and 1, typically a small value (default 0.1)
shuffle: If true, input rows are shuffled before training (reduces bias imposed by order
in online training)
returns an error value (the root-mean-squared-error). """
try:
len(input[0])
multi_input = input
multi_targ = target
except TypeError:
multi_input = [ input ]
multi_targ = [ target ]
for i in range(niter):
mse = 0.0
entries = range(len(multi_input))
if shuffle:
random.shuffle(entries)
for p in entries:
input = multi_input[p]
target = multi_targ[p]
# present the input and calculate the outputs
self.feedforward(input)
# compute the error of output nodes (explicit target is available -- so quite simple)
# also, calculate the root-mean-squared-error to indicate progress
dif_out = (target - self.output)
if len(self.output) == 1:
err_out = dif_out * self._fprimeLogistic(self.output)
else:
err_out = dif_out #* self._fprimeSoftmax(self.output)
# compute the error of hidden nodes (indirect contribution to error at output layer)
err_hid = self.w_out.T.dot(err_out) * self._fprimeLogistic(self.hidden)
# change weights according to errors
self.w_out += numpy.outer(err_out, self.hidden) * eta
self.b_out += err_out * eta
self.w_hid += numpy.outer(err_hid, input) * eta
self.b_hid += err_hid * eta
if i == niter - 1: # last round
mse += float(numpy.mean(numpy.square(dif_out)))
return math.sqrt(mse / len(entries)) # Root of mean squared error (RMSE)
def readNNFile(filename):
""" Load a NN from a file. """
f = open(filename, 'r')
nInput = int(f.readline())
nHidden = int(f.readline())
nOutput = int(f.readline())
nn = NN(nInput, nHidden, nOutput)
for i in range(nHidden):
for j in range(nInput):
nn.w_hid[i, j] = float(f.readline())
for i in range(nHidden):
nn.b_hid[i] = float(f.readline())
for i in range(nOutput):
for j in range(nHidden):
nn.w_out[i, j] = float(f.readline())
for i in range(nOutput):
nn.b_out[i] = float(f.readline())
f.close()
return nn
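# A minimal usage sketch (not part of the original module): train the network above
# on the XOR problem. The data and names here are illustrative; convergence depends
# on the random initial weights, so more iterations or a re-run may be needed.
def _example_xor():
    nn = NN(2, 2, 1) # 2 inputs, 2 hidden nodes, 1 output
    inputs = numpy.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    targets = [0.0, 1.0, 1.0, 0.0]
    rmse = nn.train(inputs, targets, eta = 0.5, niter = 5000) # online back-propagation
    print "RMSE after training:", rmse
    for inp in inputs:
        print inp, "->", nn.feedforward(inp) # outputs should approach the targets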
def maxIndex(output):
    """ Figure out the index of the largest value in the specified array/list. """
    if len(output) > 1: # multi-class
        best = 0
        for i in range(len(output)):
            if output[i] > output[best]:
                best = i
    else: # two-class, single output 0/1
        best = int(round(output[0]))
    return best
def Qk(cm, alpha):
""" Compute the Q accuracy from a confusion matrix (see test method above) """
Q = {}
for a in alpha:
i = alpha.index(a)
Q[a] = (cm[i, i] / numpy.sum(cm[i])) * 100
tp = 0; pos = 0
for a in alpha:
i = alpha.index(a)
tp += cm[i, i]
pos += sum(cm[i])
return (float(tp) / float(pos)) * 100, Q
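# A small illustrative check (hand-made numbers, not from any data set): Qk on a
# two-class confusion matrix of the kind produced by NN.test above, with rows as
# target classes and columns as predicted classes.
def _example_qk():
    cm = numpy.array([[8.0, 2.0],  # 8 of 10 'A's predicted correctly
                      [1.0, 9.0]]) # 9 of 10 'B's predicted correctly
    overall, Q = Qk(cm, ['A', 'B'])
    print "Overall accuracy: %.1f%%" % overall # (8+9)/20 = 85.0%
    print "Per-class accuracy:", Q             # {'A': 80.0, 'B': 90.0}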
def readDenseDataFile(filename):
""" Read data from file for training a neural network.
The file follows the "dense" row-format:
<i1> <i2> ... <im> | <o1> ... <on>
where ix are m input values and ox are n output values """
# first check format
ninputs = None
noutputs = None
nexamples = 0
f = open(filename)
cnt = 0
for row in f:
cnt += 1
inp, outp = row.split('|')
indata = [ float(token) for token in inp.split() ]
if ninputs:
if len(indata) != ninputs:
raise RuntimeError('Error reading file: Invalid input at row %d' % cnt)
ninputs = len(indata)
outdata = [ float(token) for token in outp.split() ]
if noutputs:
if len(outdata) != noutputs:
raise RuntimeError('Error reading file: Invalid output at row %d' % cnt)
noutputs = len(outdata)
f.close()
nexamples = cnt
inm = numpy.zeros((nexamples, ninputs))
outm = numpy.zeros((nexamples, noutputs))
f = open(filename)
cnt = 0
for row in f:
inp, outp = row.split('|')
inm[cnt] = [ float(token) for token in inp.split() ]
outm[cnt] = [ float(token) for token in outp.split() ]
cnt += 1
f.close()
return inm, outm
def fGaussian(x, mu = 0.0, sigma2 = 1.0):
""" Gaussian PDF for numpy arrays """
num = (x - mu) ** 2
den = 2 * sigma2
expon = numpy.exp(-num/den)
return expon / numpy.sqrt(2.0 * numpy.pi * sigma2)
class KMeans():
"""
K-means clustering is a special case of Expectation-Maximization (EM).
    In K-means clustering we consider samples
    x1,...,xn labeled with z1,...,zn, with each xt a vector in R^D and zt \in {1,...,K}.
    In other words, zt is a class label, or cluster label, for the data point xt.
    We can define a K-means probability model as follows, where N(mu, I) denotes the
    D-dimensional Gaussian distribution with mean mu \in R^D and the
    identity covariance matrix:
        theta = <mu_1,...,mu_K>, mu_k \in R^D
        P(x1,...,xn, z1,...,zn) = PROD_t P(zt) P(xt|zt) = PROD_t (1/K) N(mu_zt, I)(xt)
    We now consider the optimization problem defined for this model:
        (mu_1,...,mu_K)* = argmin_{mu} min_z SUM_t || mu_zt - xt ||^2
The optimization problem defines K-means clustering (under quadratic distortion).
This problem is non-convex and in fact is NP-hard. The K-means algorithm is coordinate
descent applied to this objective and is equivalent to EM under the above probability
model. The K-means clustering algorithm can be written as follows where we specify a
typical initialization step.
1. Initialize mu_z to be equal to a randomly selected point xt.
2. Repeat the following until (z1, . . . zn) stops changing.
(a) zt := argmin_z || mu_z - xt || ^2
(b) Nz := |{t: zt = z}|
(c) mu_z := 1 / Nz SUM_t:zt=z xt
In words, the K-means algorithm first assigns a class center mu_z for each class z.
It then repeatedly classifies each point xt as belonging to the class whose center is
nearest xt and then recomputes the class centers to be the mean of the point placed in that class.
Because it is a coordinate descent algorithm, the sum of squares of the difference
between each point and its class center is reduced by each update. This implies that the
classification must eventually stabilize.
The procedure terminates when the class labels stop changing.
"""
def __init__(self, data):
""" Construct a K-means classifier using the provided data.
data: a two-dim numpy array, with one row corresponding to a data point.
If training is not performed, the provided data is used as the "means". """
assert len(data) > 0, "Data must be supplied"
self.data = data
self.means = data.copy()
self.samplelen = len(data[0])
self.vars = numpy.empty((len(data), self.samplelen))
def classify(self, sample):
assert len(sample) == self.samplelen, "Sample vector has invalid length: " + str(len(sample))
sqrdist = numpy.sum((self.means - sample) ** 2, 1)
return sqrdist.argmin(0)
def train(self, K):
data = self.data
N = len(data)
clusters = numpy.zeros((N, 1))
self.means = self.data[numpy.random.randint(N, size = K),:] # pick K random samples
while True:
previous = clusters.copy()
""" Compute cluster memberships GIVEN means """
kdist = numpy.empty((len(data), K))
for i in range(K):
kdist[:,i] = numpy.sum((data - self.means[i]) ** 2, 1)
clusters[:,0] = kdist.argmin(1)
nsame = numpy.sum(previous[:,0] == clusters[:,0])
if nsame == N:
break
""" Compute means GIVEN cluster memberships """
for i in range(K):
members = data[clusters[:,0] == i]
self.means[i] = members.mean(0) # mean over rows per column
def eucdist(v1, v2):
diff = 0
for i in range(len(v1)):
diff += (v1[i] - v2[i])**2
return math.sqrt(diff)
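# A minimal usage sketch of the K-means algorithm described above (made-up points):
# cluster six 2-D points into two groups. Initial means are picked at random, so
# cluster indices may swap between runs, and an unlucky draw may need a re-run.
def _example_kmeans():
    data = numpy.array([[0.0, 0.1], [0.2, 0.0], [0.1, 0.2],
                        [5.0, 5.1], [5.2, 4.9], [4.9, 5.0]])
    km = KMeans(data)
    km.train(2) # coordinate descent until cluster memberships stop changing
    print "Cluster means:", km.means
    print "[0, 0] belongs to cluster", km.classify(numpy.array([0.0, 0.0]))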
'''
Module with methods and classes for phylogeny.
@author: mikael
'''
import sequence # needed below: _backwardParsimony constructs sequence.Sequence objects
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def __init__(self, root):
""" Create a tree from a node that is "root" in the tree."""
self.root = root
def putAlignment(self, aln):
""" Associate the tree with a set of sequences/alignment.
Involves assigning the sequence to the leaf nodes. """
self.aln = aln
self.root._assignAlignment(aln)
def __str__(self):
""" Produce a printable representation of the tree, specifically the root of the tree. """
return str(self.root)
def strSequences(self, start = None, end = None):
""" Produce a sequence representation of the tree, specifically the root of the tree.
Specify the start and end positions in the alignment for the sequence to be printed
(if None the min and max positions will be used). """
if self.aln != None:
my_start = start or 0
my_end = end or self.aln.alignlen
return self.root._printSequences(my_start, my_end)
def findLabel(self, label):
""" Retrieve/return the node with the specified label.
Returns None if not found."""
return self.root._findLabel(label)
def getDescendantsOf(self, node, transitive = False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
transitive indicates if only the direct descendants (False) or if all descendants
should be returned.
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
def getAncestorsOf(self, node, transitive = False):
""" Retrieve and return the ancestor (transitive=False) or
ancestors (transitive=True) of a specified node.
Node can be the label or the instance.
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
myroot = myroot.right
if found and transitive:
return branching
elif found and len(branching) > 0:
return branching[len(branching)-1]
return None
def parsimony(self):
""" Solve the "small parsimony problem",
i.e. find the sequences on each of the internal nodes.
See Jones and Pevzner, p. 368 and onwards, for details. """
self.root._forwardParsimony(self.aln) # setup and compute scores for all nodes
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, label = ''):
""" Initialise an initially unlinked node.
Populate fields left and right to link it with other nodes.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, backleft and backright for maximum parsimony. """
self.left = None
self.right = None
self.data = None
self.label = label
self.dist = None
        self.sequence = None # The sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
self.seqscores = None # The scores propagated from leaves via children
self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = str(self.left)
if self.right:
right = str(self.right)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.label != None:
label = str(self.label)
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = self.left._printSequences(start, end)
if self.right:
right = self.right._printSequences(start, end)
if self.dist:
dist = ':' + str(self.dist)
if self.sequence != None:
label = "".join(self.sequence[start:end]) + ""
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _findLabel(self, label):
""" Find a node by label at this node or in any descendants (recursively). """
if self.label == label:
return self
else:
if self.left:
foundLeft = self.left._findLabel(label)
if foundLeft:
return foundLeft
if self.right:
return self.right._findLabel(label)
return None
def _propagateDistance(self, parent_dist):
""" Convert absolute distances to relative.
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
def _assignAlignment(self, aln):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self.sequence = None
if self.left != None:
self.left._assignAlignment(aln)
if self.right != None:
self.right._assignAlignment(aln)
for seq in aln.seqs:
if seq.name == self.label:
self.sequence = seq
break
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
#Project "Substitution weights" should focus on this line of code
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen):
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0
best_symb_right = 0
for a_left in range(len(aln.alphabet)):
score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a_left
best_score_left = score
for a_right in range(len(aln.alphabet)):
score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a_right
best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right
else:
self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
return self.seqscores
def _backwardParsimony(self, aln, seq = None):
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy = True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
col += 1
self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
return self.sequence
def getSequence(self):
""" Get the sequence for the node. Return None if no sequence is assigned.
Requires that an alignment is associated with the tree, and that sequence names match node labels.
If the explored node is not a leaf, the sequence can be determined by parsimony. """
if self.sequence != None: # a sequence has been assigned
return self.sequence
elif self.seqscores != None: # inferred by parsimony but not yet assigned
            return None # determine the most parsimonious sequence, not yet implemented
def isAncestorOf(self, node, transitive = True):
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
elif transitive:
if self.left:
statusLeft = self.left.isAncestorOf(node, transitive)
if statusLeft: return True
if self.right:
return self.right.isAncestorOf(node, transitive)
else:
return False
def getDescendants(self, transitive = False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
if not transitive:
return children
else:
grandchildren = []
for c in children:
d = c.getDescendants(transitive)
if d:
grandchildren.extend(d)
children.extend(grandchildren)
return children
""" ----------------------------------------------------------------------------------------
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def runUPGMA(aln, measure, absoluteDistances = False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned."""
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single one below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
if dist < closest_dist or closest_dist == None:
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w]# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def _findComma(string, level = 0):
""" Find first comma at specified level of embedding """
mylevel = 0
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
return i
return -1
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
y = tail.split(':')
node = PhyloNode(y[0]) #node is an instance of the PhyloNode() class
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def parseNewick(string):
""" Main method for parsing a Newick string into a (phylogenetic) tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns an instance of a PhyloTree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return PhyloTree(parseNewickNode(string))
def readNewick(filename):
""" Read file on Newick format.
Returns an instance of a PhyloTree."""
f = open(filename)
string = ''.join(f)
return parseNewick(string)
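# A short usage sketch: parse a made-up Newick string, print it back, and query
# node relationships. Labels and distances here are illustrative only.
def _example_newick():
    tree = parseNewick('((A:0.1,B:0.2)ab:0.3,C:0.5)root;')
    print tree                   # the tree, again in Newick style
    a = tree.findLabel('A')
    print tree.getAncestorsOf(a) # the direct ancestor, labelled "ab"
    print [n.label for n in tree.getDescendantsOf(tree.root, transitive = True)]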
'''
Module for classes and functions that are representing and processing basic probabilities.
Uses and depends on "Alphabet" that is used to define discrete random variables.
'''
import random
from sym import *
from copy import deepcopy
import math
#################################################################################################
# Generic utility functions
#################################################################################################
def _getMeTuple(alphas, key):
    """ Handy function that resolves which entries are being referred to in the case
        of written wildcards etc.
        Example y = _getMeTuple([DNA_Alphabet, Protein_Alphabet], '*R') gives y = (None, 'R')
        alphas: the alphabets
        key: the string or tuple that specifies entries (may include '*' and '-' signifying any symbol) """
    assert len(key) == len(alphas), "Entry invalid"
    if not type(key) is tuple:
        entries = []
        for ndx in range(len(alphas)):
            if key[ndx] == '*' or key[ndx] == '-':
                entries.append(None)
            else:
                entries.append(key[ndx])
        return tuple(entries)
    else:
        return key
#################################################################################################
# Distrib class
#################################################################################################
class Distrib():
""" A class for a discrete probability distribution, defined over a specified "Alphabet"
TODO: Fix pseudo counts
Exclude from counts, specify in constructor,
include only when computing probabilities by standard formula (n_a + pseudo_a * N^(1/2)) / (N + N^(1/2))
Exclude from filesaves, include with filereads (optional)
"""
def __init__(self, alpha, pseudo = 0.0):
""" Construct a new distribution for a specified alphabet, using an optional pseudo-count.
alpha: alphabet
pseudo: either a single "count" that applies to all symbols, OR a distribution/dictionary with counts.
"""
self.pseudo = pseudo or 0.0
self.alpha = alpha
self.cnt = [0.0 for _ in alpha]
try: # assume pseudo is a dictionary or a Distrib itself
self.tot = 0
symndx = 0
for sym in alpha:
cnt = float(pseudo[sym])
self.cnt[symndx] = cnt
self.tot = self.tot + cnt
symndx += 1
except TypeError: # assume pseudo is a single count for each symbol
self.cnt = [float(self.pseudo) for _ in alpha]
self.tot = float(self.pseudo) * len(alpha) # track total counts (for efficiency)
def observe(self, sym, cntme = 1.0):
""" Make an observation of a symbol
sym: symbol that is being observed
cntme: number/weight of observation (default is 1)
"""
ndx = self.alpha.symbols.index(sym)
self.cnt[ndx] = self.cnt[ndx] + cntme
self.tot = self.tot + cntme
return
def reset(self):
""" Re-set the counts of this distribution. Pseudo-counts are re-applied. """
try:
self.tot = 0
symndx = 0
for sym in self.alpha: # assume it is a Distribution
cnt = float(self.pseudo[sym])
self.cnt[symndx] = cnt
self.tot = self.tot + cnt
symndx += 1
except TypeError: # assume pseudo is a single count for each symbol
self.cnt = [float(self.pseudo) for _ in self.alpha]
self.tot = float(self.pseudo) * len(self.alpha) # track total counts (for efficiency)
def reduce(self, new_alpha):
""" Create new distribution from self, using (smaller) alphabet new_alpha. """
d = Distrib(new_alpha, self.pseudo)
for sym in new_alpha:
d.observe(sym, self.cnt[self.alpha.index(sym)])
return d
def count(self, sym = None):
""" Return the absolute count(s) of the distribution
or the count for a specified symbol. """
if sym != None:
ndx = self.alpha.symbols.index(sym)
return self.cnt[ndx]
else:
d = {}
index = 0
for a in self.alpha:
d[a] = self.cnt[index]
index += 1
return d
def add(self, distrib):
""" Add the counts for the provided distribution to the present. """
for i in range(len(self.cnt)):
cnt = distrib.count(self.alpha[i])
self.cnt[i] += cnt
self.tot += cnt
def subtract(self, distrib):
""" Subtract the counts for the provided distribution from the present. """
for i in range(len(self.cnt)):
cnt = distrib.count(self.alpha[i])
self.cnt[i] -= cnt
self.tot -= cnt
def getSymbols(self):
return self.alpha.symbols
def __getitem__(self, sym):
""" Retrieve the probability of a symbol (ascertained by counts incl pseudo-counts) """
if self.tot > 0.0:
return self.count(sym) / self.tot
else:
return 1.0 / len(self.alpha) # uniform
def prob(self, sym = None):
""" Retrieve the probability of a symbol OR the probabilities of all symbols
(listed in order of the alphabet index). """
if sym != None:
return self.__getitem__(sym)
elif self.tot > 0:
return [ s / self.tot for s in self.cnt ]
else:
return [ 1.0 / len(self.alpha) for _ in self.cnt ]
def __iter__(self):
        return iter(self.alpha)
def __str__(self):
""" Return a readable representation of the distribution """
str = '< '
for s in self.alpha:
str += (s + ("=%4.2f " % self[s]))
return str + ' >'
def swap(self, sym1, sym2):
""" Swap the entries for specified symbols. Useful for reverse complement etc.
Note that changes are made to the current instance. Use swapxcopy if you
want to leave this instance intact. """
sym1ndx = self.alpha.index(sym1)
sym2ndx = self.alpha.index(sym2)
tmpcnt = self.cnt[sym1ndx]
self.cnt[sym1ndx] = self.cnt[sym2ndx]
self.cnt[sym2ndx] = tmpcnt
def swapxcopy(self, sym1, sym2):
""" Create a new instance with swapped entries for specified symbols.
Useful for reverse complement etc.
Note that changes are NOT made to the current instance.
Use swap if you want to modify this instance. """
newdist = Distrib(self.alpha, self.count())
newdist.swap(sym1, sym2)
return newdist
def writeDistrib(self, filename = None):
""" Write the distribution to a file or string.
Note that the total number of counts is also saved, e.g.
* 1000 """
str = ''
for s in self.alpha:
str += (s + ("\t%f\n" % self[s]))
str += "*\t%d\n" % self.tot
if filename != None:
fh = open(filename, 'w')
fh.write(str)
fh.close()
return str
def generate(self):
""" Generate and return a symbol from the distribution using assigned probabilities. """
alpha = self.alpha
p = random.random() # get a random value between 0 and 1
q = 0.0
for sym in alpha: # pick a symbol with a frequency proportional to its probability
q = q + self[sym]
if p < q:
return sym
        return alpha[len(alpha)-1] # fallback for floating-point rounding: return the last symbol
def getmax(self):
""" Generate the symbol with the largest probability. """
maxprob = 0.0
maxsym = None
for sym in self.alpha:
if self[sym] > maxprob or maxprob == 0.0:
maxsym = sym
maxprob = self[sym]
return maxsym
def getsort(self):
""" Return the list of symbols, in order of their probability. """
symlist = [sym for (sym, _) in self.getProbsort()]
return symlist
def getProbsort(self):
""" Return the list of symbol-probability pairs, in order of their probability. """
s = [(sym, self.prob(sym)) for sym in self.alpha]
ss = sorted(s, key=lambda y: y[1], reverse=True)
return ss
def divergence(self, distrib2):
""" Calculate the Kullback-Leibler divergence between two discrete distributions.
Note that when self.prob(x) is 0, the divergence for x is 0.
When distrib2.prob(x) is 0, it is replaced by 0.0001.
"""
assert self.alpha == distrib2.alpha
        sum = 0.0
for sym in self.alpha:
if self[sym] > 0:
if distrib2[sym] > 0:
sum += math.log(self[sym] / distrib2[sym]) * self[sym]
else:
sum += math.log(self[sym] / 0.0001) * self[sym]
return sum
def entropy(self):
""" Calculate the information (Shannon) entropy of the distribution.
Note that the base is the size of the alphabet, so maximum entropy is by definition 1.
Also note that if the probability is exactly zero, it is replaced by a small value to
avoid numerical issues with the logarithm. """
sum = 0.0
base = len(self.alpha)
for sym in self.alpha:
p = self.__getitem__(sym)
if p == 0:
p = 0.000001
sum += p * math.log(p, base)
return -sum
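# A brief usage sketch (assumes DNA_Alphabet is provided by the sym module, as used
# elsewhere in this file): count symbols, then query the resulting probabilities.
def _example_distrib():
    d = Distrib(DNA_Alphabet, pseudo = 1.0) # one pseudo-count per symbol
    for s in 'ACGGGT':
        d.observe(s)
    print d          # < A=0.20 C=0.20 G=0.40 T=0.20 > with the pseudo-counts above
    print d['G']     # probability of 'G', here 0.4
    print d.getmax() # the most probable symbol, here 'G'
    print d.entropy()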
def writeDistribs(distribs, filename):
""" Write a list/set of distributions to a single file. """
str = ''
k = 0
for d in distribs:
str += "[%d]\n%s" % (k, d.writeDistrib())
k += 1
fh = open(filename, 'w')
fh.write(str)
fh.close()
def _readDistrib(linelist):
""" Extract distribution from a pre-processed list if strings. """
symstr = ''
d = {}
for line in linelist:
line = line.strip()
if len(line) == 0 or line.startswith('#'):
continue
sections = line.split()
sym, value = sections[0:2]
if len(sym) == 1:
if sym != '*':
symstr += sym
else:
raise RuntimeError("Invalid symbol in distribution: " + sym)
try:
d[sym] = float(value)
except ValueError:
raise RuntimeError("Invalid value in distribution for symbol " + sym + ": " + value)
if len(d) == 0:
return None
alpha = Alphabet(symstr)
if '*' in d.keys(): # tot provided
for sym in d:
if sym != '*':
d[sym] = d[sym] * d['*']
distrib = Distrib(alpha, d)
return distrib
def readDistribs(filename):
""" Load a list of distributions from file.
Note that if a row contains '* <number>' then it is assumed that each probability
associated with the specific distribution is based on <number> counts. """
fh = open(filename)
string = fh.read()
distlist = []
linelist = []
for line in string.splitlines():
line = line.strip()
if line.startswith('['):
if len(linelist) != 0:
distlist.append(_readDistrib(linelist))
linelist = []
elif len(line) == 0 or line.startswith('#'):
pass # comment or blank line --> ignore
else:
linelist.append(line)
# end for-loop, reading the file
if len(linelist) != 0:
distlist.append(_readDistrib(linelist))
fh.close()
return distlist
def readDistrib(filename):
""" Load a distribution from file.
Note that if a row contains '* <number>' then it is assumed that each probability
is based on <number> counts. """
dlist = readDistribs(filename)
if len(dlist) > 0: # if at least one distribution was in the file...
return dlist[0] # return the first
import re
def _readMultiCount(linelist, format = 'JASPAR'):
    """ Extract a list of Distribs from a pre-processed list of count rows (JASPAR formats). """
ncol = 0
symcount = {}
if format == 'JASPAR2010':
for line in linelist:
line = line.strip()
if len(line) > 0:
name = line.split()[0]
counts = []
for txt in re.findall(r'\w+', line):
try:
y = float(txt)
counts.append(y)
except ValueError:
pass # ignore non-numeric entries
if len(counts) != ncol and ncol != 0:
raise RuntimeError('Invalid row in file: ' + line)
ncol = len(counts)
if len(name) == 1: # proper symbol
symcount[name] = counts
alpha = Alphabet(''.join(symcount.keys()))
distribs = []
for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount])
distribs.append(Distrib(alpha, d))
elif format == 'JASPAR':
alpha_str = 'ACGT'
alpha = Alphabet(alpha_str)
cnt = 0
for sym in alpha_str:
line = linelist[cnt].strip()
counts = []
for txt in re.findall(r'\w+', line):
try:
y = float(txt)
counts.append(y)
except ValueError:
pass # ignore non-numeric entries
if len(counts) != ncol and ncol != 0:
raise RuntimeError('Invalid row in file: ' + line)
ncol = len(counts)
symcount[sym] = counts
cnt += 1
distribs = []
for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount])
distribs.append(Distrib(alpha, d))
else:
raise RuntimeError('Unsupported format: ' + format)
return distribs
def readMultiCounts(filename, format = 'JASPAR'):
""" Read a file of raw counts for multiple distributions over the same set of symbols
for (possibly) multiple (named) entries.
filename: name of file
format: format of file, default is 'JASPAR' exemplified below
>MA0001.1 SEP4
0 3 79 40 66 48 65 11 65 0
94 75 4 3 1 2 5 2 3 3
1 0 3 4 1 0 5 3 28 88
2 19 11 50 29 47 22 81 1 6
returns a dictionary of Distrib's, key:ed by entry name (e.g. MA001.1)
"""
fh = open(filename)
linelist = []
entryname = ''
entries = {}
for row in fh:
row = row.strip()
if len(row) < 1: continue
if row.startswith('>'):
if len(linelist) > 0:
entries[entryname] = _readMultiCount(linelist, format=format)
linelist = []
entryname = row[1:].split()[0]
else:
linelist.append(row)
if len(linelist) > 0:
entries[entryname] = _readMultiCount(linelist, format=format)
fh.close()
return entries
def readMultiCount(filename, format = 'JASPAR'):
""" Read a file of raw counts for multiple distributions over the same set of symbols.
filename: name of file
format: format of file, default is 'JASPAR' exemplified below
0 3 79 40 66 48 65 11 65 0
94 75 4 3 1 2 5 2 3 3
1 0 3 4 1 0 5 3 28 88
2 19 11 50 29 47 22 81 1 6
returns a list of Distrib's
"""
d = readMultiCounts(filename, format=format)
if len(d) > 0:
return d.values()[0]
#################################################################################################
# Joint class
#################################################################################################
class Joint(object):
""" A joint probability class.
The JP is represented as a distribution over n-tuples where n is the number of variables.
        Variables can be over any defined alphabet. The size of each alphabet determines the
number of entries in the table (with probs that add up to 1.0) """
def __init__(self, alphas):
""" A distribution of n-tuples.
alphas: Alphabet(s) over which the distribution is defined
"""
if type(alphas) is Alphabet:
self.alphas = tuple( [alphas] )
elif type(alphas) is tuple:
self.alphas = alphas
else:
self.alphas = tuple( alphas )
self.store = TupleStore(self.alphas)
self.totalCnt = 0
def getN(self):
""" Retrieve the number of distributions/random variables. """
return len(self.alphas)
def __iter__(self):
return self.store.__iter__()
def reset(self):
""" Re-set the counts of this joint distribution. Pseudo-counts are re-applied. """
for entry in self.store:
self.store[entry] = None
self.totalCnt = 0
def observe(self, key, cnt = 1):
""" Make an observation of a tuple/key
key: tuple that is being observed
cnt: number/weight of observation (default is 1)
"""
key = _getMeTuple(self.alphas, key)
if not None in key:
score = self.store[key]
if (score == None):
score = 0
self.totalCnt += cnt
self.store[key] = score + cnt
else: # there are wildcards in the key
allkeys = [mykey for mykey in self.store.getAll(key)]
mycnt = float(cnt)/float(len(allkeys))
self.totalCnt += cnt
for mykey in allkeys:
score = self.store[mykey]
if (score == None):
score = 0
self.store[mykey] = score + mycnt
return
def count(self, key):
""" Return the absolute count that is used for the joint probability table. """
key = _getMeTuple(self.alphas, key)
score = self.store[key]
if (score == None):
score = 0.0
for match in self.store.getAll(key):
y = self.store[match]
if y != None:
score += y
return score
def __getitem__(self, key):
""" Determine and return the probability of a specified expression of the n-tuple
which can involve "wildcards"
Note that no assumptions are made regarding independence. """
key = _getMeTuple(self.alphas, key)
score = self.store[key]
if (score == None):
score = 0.0
for match in self.store.getAll(key):
y = self.store[match]
if y != None:
score += y
if self.totalCnt == 0:
return 0.0
return float(score) / float(self.totalCnt)
def __str__(self):
""" Return a textual representation of the JP. """
str = '< '
if self.totalCnt == 0.0:
return str + 'None >'
for s in self.store:
if self[s] == None:
y = 0.0
else:
y = self[s]
str += (''.join(s) + ("=%4.2f " % y))
return str + ' >'
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of probability.
Note that this function should NOT be used for big (>5 variables) tables."""
if self.totalCnt == 0.0:
return []
ret = []
for s in self.store:
if self[s] != None:
ret.append((s, self[s]))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
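# A short usage sketch (assumes DNA_Alphabet from sym): a joint distribution over
# two DNA positions, queried with and without wildcards.
def _example_joint():
    j = Joint([DNA_Alphabet, DNA_Alphabet])
    j.observe('AC')
    j.observe('AG')
    j.observe('TC', cnt = 2)
    print j['AC'] # 1 of 4 observations: 0.25
    print j['A*'] # marginal over the second position: 0.5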
class IndepJoint(Joint):
def __init__(self, alphas, pseudo = 0.0):
""" A distribution of n-tuples.
All positions are assumed to be independent.
alphas: Alphabet(s) over which the distribution is defined
"""
self.pseudo = pseudo
if type(alphas) is Alphabet:
self.alphas = tuple( [alphas] )
elif type(alphas) is tuple:
self.alphas = alphas
else:
self.alphas = tuple( alphas )
self.store = [Distrib(alpha, pseudo) for alpha in self.alphas]
def getN(self):
""" Retrieve the number of distributions/random variables. """
return len(self.alphas)
def __iter__(self):
return TupleStore(self.alphas).__iter__()
def reset(self):
""" Re-set the counts of each distribution. Pseudo-counts are re-applied. """
self.store = [Distrib(alpha, self.pseudo) for alpha in self.alphas]
def observe(self, key, cnt = 1, countGaps = True):
""" Make an observation of a tuple/key
key: tuple that is being observed
cnt: number/weight of observation (default is 1)
"""
assert len(key) == len(self.store), "Number of symbols must agree with the number of positions"
for i in range(len(self.store)):
subkey = key[i]
if subkey == '-' and countGaps == False:
continue
if subkey == '*' or subkey == '-':
for sym in self.alphas[i]:
score = self.store[i][sym]
if (score == None):
score = 0
self.store[i].observe(sym, float(cnt)/float(len(self.alphas[i])))
else:
score = self.store[i][subkey]
if (score == None):
score = 0
self.store[i].observe(subkey, cnt)
def __getitem__(self, key):
""" Determine and return the probability of a specified expression of the n-tuple
which can involve "wildcards"
Note that variables are assumed to be independent. """
assert len(key) == len(self.store), "Number of symbols must agree with the number of positions"
prob = 1.0
for i in range(len(self.store)):
mykey = key[i]
if mykey == '*' or mykey == '-':
pass # same as multiplying with 1.0 (all symbols possible)
else:
prob *= self.store[i][mykey]
return prob
def get(self, sym, pos):
""" Retrieve the probability of a specific symbol at a specified position. """
mystore = self.store[pos]
return mystore[sym]
def getColumn(self, column, count = False):
""" Retrieve all the probabilities (or counts) for a specified position.
Returns values as a dictionary, with symbol as key."""
d = {}
for a in self.alphas[column]:
if count: # absolute count
d[a] = self.store[column].count(a)
else: # probability
d[a] = self.store[column][a]
return d
def getRow(self, sym, count = False):
""" Retrieve the probabilities (or counts) for a specific symbol over all columns/positions.
Returns a list of values in the order of the variables/alphabets supplied to the constructor. """
d = []
for store in self.store:
if count: # absolute count
d.append(store.count(sym))
else: # probability
d.append(store[sym])
return d
def getMatrix(self, count = False):
""" Retrieve the full matrix of probabilities (or counts) """
d = {}
for a in self.alphas[0]:
d[a] = self.getRow(a, count)
return d
def displayMatrix(self, count = False):
""" Pretty-print matrix """
print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))
for a in self.alphas[0]:
if count:
print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))
else:
print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))
def __str__(self):
""" Text representation of the table. Note that size is an issue so big tables
will not be retrieved and displayed. """
        if len(self.alphas) > 5:
return '< ... too large to process ... >'
tstore = TupleStore(self.alphas)
str = '< '
for key in tstore:
p = 1.0
for i in range(len(self.store)):
value = self.store[i][key[i]]
if value != None and value != 0.0:
p *= value
else:
                    p = 0
                    break
str += (''.join(key) + ("=%4.2f " % p))
return str + ' >'
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of probability.
Note that this function should NOT be used for big (>5 variables) tables."""
tstore = TupleStore(self.alphas)
ret = []
for key in tstore:
p = 1.0
for i in range(len(self.store)):
value = self.store[i][key[i]]
if value != None and value != 0.0:
p *= value
else:
                    p = 0
                    break
if p > 0.0:
ret.append((key, p))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
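# A sketch of IndepJoint on column-wise data (assumes DNA_Alphabet from sym): each
# position is its own Distrib, so probabilities multiply across positions.
def _example_indepjoint():
    ij = IndepJoint([DNA_Alphabet] * 3, pseudo = 1.0)
    for seq in ['ACG', 'ACT', 'GCT']:
        ij.observe(seq)
    print ij.getColumn(0) # per-symbol probabilities at position 0
    print ij['AC*']       # product of per-position probabilities; '*' contributes 1.0
    ij.displayMatrix()    # pretty-print the full position-by-symbol matrix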
class NaiveBayes():
""" NaiveBayes implements a classifier: a model defined over a class variable
and conditional on a list of discrete feature variables.
Note that feature variables are assumed to be independent. """
def __init__(self, inputs, output, pseudo_input = 0.0, pseudo_output = 0.0):
""" Initialise a classifier.
inputs: list of alphabets that define the values that input variables can take.
output: alphabet that defines the possible values the output variable takes
pseudo_input: pseudo-count used for each input variable (default is 0.0)
pseudo_output: pseudo-count used for the output variable (default is 0.0) """
if type(inputs) is Alphabet:
self.inputs = tuple( [inputs] )
elif type(inputs) is tuple:
self.inputs = inputs
else:
self.inputs = tuple( inputs )
self.condprobs = {} # store conditional probabilities as a dictionary (class is key)
for outsym in output: # GIVEN the class
# for each input variable initialise a conditional probability
self.condprobs[outsym] = [ Distrib(input, pseudo_input) for input in self.inputs ]
self.classprob = Distrib(output, pseudo_output) # the class prior
def observe(self, inpseq, outsym):
""" Record an observation of an input sequence of feature values that belongs to a class.
inpseq: sequence/list of feature values, e.g. 'ATG'
outsym: the class assigned to these feature values. """
condprob = self.condprobs[outsym]
for i in range(len(inpseq)):
condprob[i].observe(inpseq[i])
self.classprob.observe(outsym)
def __getitem__(self, key):
""" Determine and return the class probability GIVEN a specified n-tuple of feature values
The class probability is given as an instance of Distrib. """
out = Distrib(self.classprob.alpha)
for outsym in self.classprob.getSymbols():
condprob = self.condprobs[outsym]
prob = self.classprob[outsym]
for i in range(len(key)):
prob *= condprob[i][key[i]] or 0.0
out.observe(outsym, prob)
return out
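# A compact usage sketch with toy data (Alphabet and DNA_Alphabet from sym):
# classify 3-mers into two made-up classes '+' and '-'.
def _example_naivebayes():
    output = Alphabet('+-')
    nb = NaiveBayes([DNA_Alphabet] * 3, output, pseudo_input = 1.0, pseudo_output = 1.0)
    for word in ['ATG', 'ATC', 'AAG']:
        nb.observe(word, '+')
    for word in ['CCT', 'GCT']:
        nb.observe(word, '-')
    posterior = nb['ATG'] # a Distrib over the classes, given the features
    print posterior.getmax(), posterior['+']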
import sym
class RCDict(dict):
""" Class that extends a standard dictionary to accept only fixed-length DNA symbol strings as keys.
Additionally, it maps the reverse complement to the same value. """
def __init__(self, alpha = sym.DNA_Alphabet):
""" Initialise a reverse-complement dictionary to accept strings of a given alphabet (DNA by default) """
self.alpha = alpha
self.length = None
def __setitem__(self, key, value):
""" Set the value for a key.
Checks to see that if
(a) previous keys have been used that the length is the same, and
(b) the key consists only of valid symbols. """
if self.length == None:
self.length = len(key)
elif len(key) != self.length:
raise RuntimeError("Invalid key: " + str(key))
for i in range(len(key)):
if not key[i] in sym.DNA_Alphabet:
raise RuntimeError("Invalid symbol in key: " + str(key[i]))
super(RCDict, self).__setitem__(self.canonical(key), value)
def canonical(self, key):
""" Figures out the canonical key (original or its reverse complement).
        Note that if you use an alphabet other than DNA you may need to modify this code. """
if self.length == None:
return key
alpha = self.alpha
rcindx = [0 for _ in range(self.length)]
fwindx = [alpha.index(sym) for sym in key]
undecided = True
for forw in range(self.length):
backw = self.length - forw - 1
rcindx[forw] = 3 - fwindx[backw] # formula for converting A <-> T, C <-> G
if undecided and rcindx[forw] > fwindx[forw]: # RC is "higher" than original
return key
undecided = rcindx[forw] == fwindx[forw]
return ''.join([alpha.symbols[indx] for indx in rcindx])
def __getitem__(self, key):
""" Retrieve the value associated with a specified key. """
return super(RCDict, self).__getitem__(self.canonical(key))
def getSum(self, IUPAC_key):
""" Retrieve the sum of all the entries that match the specified IUPAC key. """
# TODO
pass
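# A short usage sketch: a key and its reverse complement map to the same entry,
# so counting k-mers on one strand covers both.
def _example_rcdict():
    counts = RCDict()
    counts['AAC'] = 1
    print counts['GTT']           # 1 -- 'GTT' is the reverse complement of 'AAC'
    print counts.canonical('GTT') # 'AAC'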
"""
This python module reads in sam files from RNA-seq experiment and processes
them and RNA-seq data1
"""
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import itertools
import operator
import math
from scipy import stats
from numpy import array, empty
import scipy.cluster.hierarchy as sch
def sam_reader(filename):
"""Mandatory fields are QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL
for more info http://samtools.github.io/hts-specs/SAMv1.pdf """
data=[]
f= open(filename,'r')
for row in f:
if row.startswith('@'): # skip the header
pass
else:
info=row.strip().split('\t')
data.append(info)
return data
def base_percentages(reads):
"reports base percentage %A,%T,%C,%G "
all_seqs=[]
for read in reads:
seq=read[9]
        seq=list(seq) # split the read sequence into single bases
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
return freqs
def numberofreads(reads):
"""Incremented for every sequence-containing line in the sam file, regardless of whether it represents an alignment.
    For some files this is not actually the number of reads; indeed, this may be a poor name for this stat."""
return len(reads)
def mapped_reads(reads,paired_end=True):
"""If duplicate tracking was enabled via -D, then this attempts to recapitulate the number of unique, mapped, probe-id's in the original sam file. It is multiplied by 2 for paired-end data with duplicate read id's.
The idea is that if you divide this by the number of reads in the fastq you aligned (possibly from the output of fastq-stats),
you will get an accurate "percentage of reads aligned" statistic.
"mapped" is something with a non-negative position, and a "non-asterisk" cigar string."""
    mapped_ids=[]
    store_reads=[]
    for read in reads:
        if int(read[3])>0 and read[5]!='*': # mapped: positive POS and a real CIGAR string
            mapped_ids.append(read[0])
            store_reads.append(read)
    mapped=set(mapped_ids)
    if paired_end==True:
        mapped=2*len(mapped) # two reads share each id in paired-end data
    else:
        mapped=len(mapped)
    print "number of mapped reads",mapped
    return store_reads
def mappedBases(mapped_reads):
"""Total number of mapped bases in sam file"""
seq=""
for read in mapped_reads:
seq=seq+read[9]
return len(seq)
def forward(mapped_reads):
    """The lines in the sam file that were aligned to the "forward" strand
    (FLAG bit 0x10 not set). No accounting is done on duplicates."""
    forward=[read for read in mapped_reads if int(read[1]) & 16 == 0]
    return forward
def reverse(mapped_reads):
    """The lines in the sam file that were aligned to the "reverse" strand
    (FLAG bit 0x10 set). No accounting is done on duplicates."""
    reverse=[read for read in mapped_reads if int(read[1]) & 16 != 0]
    return reverse
########Qualities and STATS
def subgroups(mapped_reads):
"""form groups p<1e-3 one group,1e-3<=p<1e-2 one group,1e-2<=p<1 one group a total of three groups"""
group1=[]
group2=[]
group3=[]
for read in mapped_reads:
if int(read[4])>29:
group1.append(read)
elif int(read[4])<=29 and int(read[4])>17:
group2.append(read)
elif int(read[4])<=17:
group3.append(read)
else:
pass
print len(group1),"in p<1e-3 group"
print len(group2),"in 1e-3<=p<1e-2 group"
print len(group3),"in 1e-2<=p<1 group"
return group1,group2,group3
def dinuc_freq(mapped_reads):
"reports dinucleotide composition using p(Rho) statistics for overrepresentation"
all_seqs=[]
for read in mapped_reads:
seq=read[9]
        seq=list(seq) # split the read sequence into single bases
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
all_seqs=[]
for read in mapped_reads:
seq=read[9]
        seq=[seq[i:i+2] for i in range(0,len(seq)-1,2)] # non-overlapping dinucleotides; drops any odd trailing base
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
dinucs=counts.keys()
dinuc_counts={}
    for i in dinucs:
        val=float(counts[i])/sum(counts.values())
        dinuc_counts[i]=val/(freqs[i[0]]*freqs[i[1]]) # rho: observed frequency over that expected under independence
return dinuc_counts
def PercentReadsAligned(group1,group2,group3,numfastq):
"""Provide a list of mapped_reads and the number of reads in the fastq file"""
mapped_reads=group1+group2+group3
Mapped=len(mapped_reads)/float(numfastq)
Unmapped=1-float(Mapped)
## print "Mapping stats"
## print"p<1e-3", len(group1)/float(numfastq)
## print"1e-3<=p<1e-2",len(group2)/float(numfastq)
## print "1e-2<=p<1",len(group3)/float(numfastq)
## print "Unmapped",Unmapped
labels="p<1e-3","1e-3<=p<1e-2","1e-2<=p<1","Unmapped"
x=[len(group1)/float(numfastq),len(group2)/float(numfastq),len(group3)/float(numfastq),Unmapped]
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.pie(x,labels=labels,autopct='%1.1f%%', shadow=True)
plt.title('Mapping stats')
plt.show()
return Mapped
def length_stats(group1,group2,group3):
"""returns basic stats relating to the lengths of the reads
Calculations are based on the the length of the (possibly hard-clipped) sequence in the sam file."""
reads=[group1,group2,group3]
data=[]
for i in range(0,len(reads)):
lengths=[]
for read in reads[i]:
if int(read[8])<0:
length=-1*int(read[8])
else:
length=int(read[8])
lengths.append(length)
        mean_len=np.mean(lengths)
        print "group"+str(i+1)+" mean",mean_len
        max_len=np.max(lengths)
        print "group"+str(i+1)+" max length",max_len
        min_len=np.min(lengths)
        print "group"+str(i+1)+" min length",min_len
        data.append(["group"+str(i+1),mean_len,max_len,min_len])
return data
def plot_length_distrib(group,name):
"""distribution of lengths of all the sam reads"""
lengths=[]
for read in group:
if int(read[8])<0:
length=-1*int(read[8])
else:
length=int(read[8])
lengths.append(length)
##Visualize length distribution
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
n, bins, patches = plt.hist(lengths,100, normed=0, facecolor='g')
plt.xlabel("lengths")
plt.ylabel("number of mapped reads")
plt.title(name)
plt.show()
def inv_logit(p):
    """Convert a Phred-scaled quality (e.g. MAPQ) back to an error probability."""
    return 10**(-p/10.0) # float division: an integer p would otherwise floor-divide in Python 2
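# Worked example: a MAPQ of 30 corresponds to an error probability of
# inv_logit(30) = 10**(-30/10.0) = 1e-3, the boundary used by subgroups() above.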
def plot_base_composition(reads,sym):
"reports nucelotide frequencies at each position in the sam sequences"
#DNA_Alphabet=["A","C","T","G","N"]
all_nucs=[]
for read in reads:
nucs={}#dictionary to store nucleotide data
seq=read[9]
for i in range(0,len(seq)):
nucs[str(i+1)]=seq[i]
all_nucs.append(nucs)
all_items=[]
counts=[]
pos=range(1,len(seq)+1)
for dicts in all_nucs:
for item in dicts.items():
all_items.append(item)
all_items.sort(key=operator.itemgetter(0))
groups= [map(operator.itemgetter(1),list(group)) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
for group in groups:
counts.append(group.count(sym))
print counts
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(pos,counts,facecolor='g')
plt.xlabel("Position")
plt.ylabel("number of mapped reads")
plt.title(sym)
plt.show()
return counts
#####################################################
#Transcript reader
def raw_count_reader(filename):
"""Count the raw counts in the file"""
data={}
f= open(filename,'r')
for row in f:
if row.startswith('t1'): # skip the header
pass
else:
info=row.strip().split('\t')
data[info[0]]=[int(info[1]),int(info[2]),int(info[3]),int(info[4]),float(info[5])] #t1,rept1,t10,rept10,len
return data
#####Normalisation methods
def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
"""provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias"""
all_rpkms=[];final={}
for i,s,ii,ss,v in data.values():
rpkms=[]
num_mapped_reads=[num_map1,num_map2,num_map3,num_map4]
vals=[i,s,ii,ss]
lengths=[v,v,v,v]
for n in range(0,len(vals)):
if vals[n]==0:
rpkm=0
rpkms.append(rpkm)
else:
#perform RPKM calc
                rpkm= float(vals[n])/(lengths[n]*(float(num_mapped_reads[n])/10**6)) # reads per kilobase per million mapped reads (length assumed in kb)
rpkms.append(rpkm)
all_rpkms.append(rpkms)
#return gene names and rpkms
for i in range(0,len(data.keys())):
final[data.keys()[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
return final
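# Hedged worked example of the RPKM calculation above: count / (length * mappedReads/10^6).
# Whether this is strictly "per kilobase" depends on the units of the length
# column in the input; the example assumes lengths are given in kilobases.
def _example_rpkm():
    """Illustrative only: 100 reads on a 1.5 kb transcript, 2 million mapped reads."""
    count, length_kb, num_mapped = 100.0, 1.5, 2.0e6
    return count / (length_kb * (num_mapped / 10**6)) # 100 / (1.5 * 2) = 33.33...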
def write_RPKM_data(RPKM_data,filename):
"""write RPKM data to a file"""
f=open(filename,'w')
for i in range(0,len(RPKM_data)):
f.write("%s\t%d\t%d\t%d\t%d\n"%(RPKM_data.keys()[i],int(RPKM_data.values()[i][0]),int(RPKM_data.values()[i][1]),int(RPKM_data.values()[i][2]),int(RPKM_data.values()[i][3])))
f.close()
###############Visualize replicates to determine degree of biological variation
def pearson_def(x, y):
"""Pearson correlation coefficient R"""
assert len(x) == len(y)
n = len(x)
assert n > 0
avg_x = np.mean(x)
avg_y = np.mean(y)
diffprod = 0
xdiff2 = 0
ydiff2 = 0
for idx in range(n):
xdiff = x[idx] - avg_x
ydiff = y[idx] - avg_y
diffprod += xdiff * ydiff
xdiff2 += xdiff * xdiff
ydiff2 += ydiff * ydiff
return diffprod / math.sqrt(xdiff2 * ydiff2)
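# Hedged usage sketch for pearson_def: two perfectly linearly related vectors
# give r = 1, hence a coefficient of determination R^2 = 1.
def _example_pearson():
    """Illustrative only: Pearson r for a small, made-up pair of vectors."""
    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.0, 6.0, 8.0]
    r = pearson_def(x, y)
    return r, r**2 # (1.0, 1.0)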
def plotreprpkm(rpkm_data,timepoint):
"""plot showing level of agreement between technical replicates for RPKM between replicates and plots coefficient of determination"""
one=[]
two=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][0]))
two.append(int(rpkm_data.values()[i][1]))
else:
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][2]))
two.append(int(rpkm_data.values()[i][3]))
plt.plot(one,two,'o')
pcc=pearson_def(one,two)
R2=pcc**2
name="""Technical Replicates
R2="""+str(R2)
m,b= np.polyfit(one,two,1)
plt.figure(1, figsize=(8,8))
plt.plot(one, np.array(one)*m +b,'r-')
plt.text(3000, max(two)-1000,name , fontsize=12)
plt.xlabel("RPKM replicate 1")
plt.ylabel("RPKM replicate 2")
plt.title(timepoint)
plt.show()
def plotMAreprpkm(rpkm_data,timepoint):
"""MA Plot of log(RPKM) vs Average log(RPKM) of replicates"""
m=[]
a=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][0]+1)-np.log2(rpkm_data.values()[i][1]+1)
x=(np.log2(rpkm_data.values()[i][0]+1)+np.log2(rpkm_data.values()[i][1]+1))/2
m.append(y)
a.append(x)
else:
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][2]+1)-np.log2(rpkm_data.values()[i][3]+1)
x=(np.log2(rpkm_data.values()[i][2]+1)+np.log2(rpkm_data.values()[i][3]+1))/2
m.append(y)
a.append(x)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(a,m,'o')
plt.axhline(np.mean(m)+1.96*np.std(m),color="green",label="avg diff +1.96(std diff)")
plt.axhline(np.mean(m)-1.96*np.std(m),color="green",label="avg diff -1.96(std diff)")
plt.xlabel("Average log(RPKM) of replicates")
plt.ylabel("Difference in log(RPKM) of replicates")
plt.legend(loc="lower right")
plt.title(timepoint)
plt.show()
def get_cv(data1,condition):
    """Return a per-gene variability measure for the replicate pair at the given time point.
    Note: this computes (mean+1)/(std+1), the reciprocal of the conventional
    coefficient of variation, smoothed by +1 to avoid division by zero."""
    cvs=[]
if condition=="t1":
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][0],data1.values()[i][1]])
std=np.std([data1.values()[i][0],data1.values()[i][1]])
if mean==0.0 and std==0.0:
pass
else:
cv=float(mean+1)/(std+1)
cvs.append(cv)
else:
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][2],data1.values()[i][3]])
std=np.std([data1.values()[i][2],data1.values()[i][3]])
if mean==0.0 and std==0.0:
pass
else:
cv=float(mean+1)/(std+1)
cvs.append(cv)
return cvs
def get_boxplots(norm,original):
"""distribution of the coeficient of variation across samples (replicates) normalised using the methods provided"""
    bp=plt.boxplot([norm,original],notch=False, patch_artist=True)
    bp['boxes'][0].set(color="red")  # RPKM
    bp['boxes'][1].set(color="blue") # raw counts
plt.ylabel("coefficient of variation")
plt.xlabel("Methods")
my_xticks = ['RPKM','raw counts']
x=[1,2]
plt.xticks(x,my_xticks)
plt.ylim(0,400)
plt.show()
def plotavg_cv(norm,original):
"""distribution of the coeficient of variation across samples (replicates) normalised using the methods provided"""
x=[1,2]
y=[np.mean(norm),np.mean(original)]
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(x[0],y[0],color="red",label="RPKM")
plt.bar(x[1],y[1],color="blue",label="Raw counts")
plt.ylabel("Average coefficient of variation")
plt.xlabel("Methods")
ax.xaxis.set_ticklabels([])
plt.legend(loc="upper right")
plt.show()
def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
"""Produce MA plot using logfold as cutoff"""
logfc=[]
avg_rpkm=[]
sig_logfc=[]
sig_avg_rpkm=[]
logfc2=[]
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss in rpkm_data.values():
fc=np.log2(float(s+1)/(i+1))
if fc<cutoff[0] or fc>cutoff[1]:
sig_logfc.append(fc)
            sig_avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2) # parenthesised so the mean of the two log values is taken
else:
logfc.append(fc)
            avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2)
for i,ii,s,ss in rpkm_data.values():
fc2=np.log2(float(ss+1)/(ii+1))
if fc2<cutoff[0] or fc2>cutoff[1]:
sig_logfc2.append(fc2)
            sig_avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
else:
logfc2.append(fc2)
            avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(avg_rpkm,logfc,'o',color="blue",label="rep1")
plt.plot(avg_rpkm2,logfc2,'x',color="blue",label="rep2")
plt.plot(sig_avg_rpkm,sig_logfc,'o',color="red",label="sig rep1")
plt.plot(sig_avg_rpkm2,sig_logfc2,'x',color="red",label="sig rep2")
plt.axhline(cutoff[0],color="orange")
plt.axhline(cutoff[1],color="orange")
plt.ylabel("Fold Change (log2)")
plt.xlabel("Average RPKM (log2)")
plt.title("MA plot")
plt.legend(loc="upper left")
plt.show()
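# Hedged worked sketch of the M and A values plotted above: for counts 3 and 15,
# M = log2(16) - log2(4) = 2 and A = (log2(16) + log2(4)) / 2 = 3.
def _example_ma_values():
    """Illustrative only: M (log fold change) and A (average log RPKM) for one gene."""
    i, s = 3.0, 15.0
    m = np.log2(s + 1) - np.log2(i + 1)       # = 2.0
    a = (np.log2(s + 1) + np.log2(i + 1)) / 2 # = 3.0
    return m, a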
def plotMA_pval(rpkm_data,cutoff=0.05):
"""Produce MA plot using the pvalue as cutoff"""
logfc=[]
avg_rpkm=[]
sig_logfc=[]
sig_avg_rpkm=[]
logfc2=[]
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss,pval in rpkm_data.values():
fc=np.log2(float(s+1)/(i+1))
if float(pval)<cutoff:
sig_logfc.append(fc)
            sig_avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2) # parenthesised so the mean of the two log values is taken
else:
logfc.append(fc)
            avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2)
for i,ii,s,ss,pval in rpkm_data.values():
fc2=np.log2(float(ss+1)/(ii+1))
if float(pval)<cutoff:
sig_logfc2.append(fc2)
            sig_avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
else:
logfc2.append(fc2)
            avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(avg_rpkm,logfc,'o',color="blue",label="rep1")
    plt.plot(avg_rpkm2,logfc2,'x',color="blue",label="rep2") # 'x' marker for rep2, consistent with plotMA
plt.plot(sig_avg_rpkm,sig_logfc,'o',color="red",label="sig rep1")
plt.plot(sig_avg_rpkm2,sig_logfc2,'x',color="red",label="sig rep2")
plt.ylabel("Fold Change (log2)")
plt.xlabel("Average RPKM (log2)")
plt.title("MA plot")
plt.legend(loc="upper left")
plt.show()
#####DE expression statistical test (T-Test, ANOVA and FDR)
def Welcht(rpkm):
    """Perform Welch's t-test (one-tailed) on the two replicate pairs."""
    ts=[]
    result={}
    for i,ii,s,ss in rpkm.values():
        sd1=np.std([i,ii])
        sd2=np.std([s,ss])
        # Welch's statistic: difference of means over sqrt(var1/n1 + var2/n2), with n = 2 replicates per group
        t=(np.mean([s,ss])-np.mean([i,ii]))/math.sqrt((sd1**2)/2+(sd2**2)/2)
        ts.append(t)
    pvals=[]
    for t in ts:
        pval = stats.t.sf(np.abs(t), 2-1)
        if np.isnan(pval): # NaN never compares equal to itself, so test with np.isnan
            pval = 1
        pvals.append(pval)
corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg")
for i in range(0,len(rpkm.values())):
result[rpkm.keys()[i]]=[rpkm.values()[i][0],rpkm.values()[i][1],rpkm.values()[i][2],rpkm.values()[i][3],corr_pvals[i]]
return result
def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-Hochberg"):
"""
consistent with R print correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05, 0.069, 0.07, 0.071, 0.09, 0.1])
"""
pvalues = array(pvalues)
n = float(pvalues.shape[0])
new_pvalues = empty(n)
if correction_type == "Bonferroni":
new_pvalues = n * pvalues
elif correction_type == "Bonferroni-Holm":
values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
values.sort()
for rank, vals in enumerate(values):
pvalue, i = vals
new_pvalues[i] = (n-rank) * pvalue
elif correction_type == "Benjamini-Hochberg":
values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
values.sort()
values.reverse()
new_values = []
for i, vals in enumerate(values):
rank = n - i
pvalue, index = vals
new_values.append((n/rank) * pvalue)
for i in xrange(0, int(n)-1):
if new_values[i] < new_values[i+1]:
new_values[i+1] = new_values[i]
for i, vals in enumerate(values):
pvalue, index = vals
new_pvalues[index] = new_values[i]
return new_pvalues
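# Hedged worked sketch of the Benjamini-Hochberg branch above: each ascending
# p-value is scaled by n/rank, then a running minimum taken from the largest
# rank downward enforces monotonicity.
def _example_bh():
    """Illustrative only: adjust three made-up p-values."""
    raw = [0.01, 0.04, 0.03]
    # ascending: 0.01 (rank 1), 0.03 (rank 2), 0.04 (rank 3)
    # scaled by n/rank: 0.03, 0.045, 0.04; monotonicity gives 0.03, 0.04, 0.04
    return correct_pvalues_for_multiple_testing(raw, correction_type="Benjamini-Hochberg")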
#### Run hierarchical clustering on the correlation matrix (of differentially expressed genes) - coexpression
def cluster_data(data_matrix,genenames,timepoint):
"One replicates at a specific time point"
D = np.zeros([np.shape(data_matrix)[0],1])
##generate a distance matrix
for i in range(np.shape(data_matrix)[0]):
for j in range(1):
            D[i,j] = abs(data_matrix[i] - data_matrix[j])**2 #squared distance (other metrics could be used here)
labels=list('' for i in range(np.shape(data_matrix)[0]))
for i in range(np.shape(data_matrix)[0]):
labels[i]=str(i)+","+str(genenames[i])
fig=plt.figure(1, figsize=(17,8))
linked = sch.linkage(D, method='centroid')
    dend = sch.dendrogram(linked, orientation='right',labels=labels) # sets the orientation with the root at the right
plt.title(timepoint)
    fig.savefig(timepoint+'dendrogram.png')
return dend['ivl']
def heatmap_cluster(data_matrix,timepoint):
"""Produces a heatmap of the clustered count data"""
D = np.zeros([np.shape(data_matrix)[0],np.shape(data_matrix)[0]])
for i in range(np.shape(data_matrix)[0]):
for j in range(np.shape(data_matrix)[0]):
            D[i,j] = abs(data_matrix[i] - data_matrix[j])**2 #squared distance (other metrics could be used here)
fig = plt.figure()
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
linked = sch.linkage(D, method='centroid')
    dend = sch.dendrogram(linked, orientation='right') # sets the orientation with the root at the right
axdendro.set_xticks([])
axdendro.set_yticks([])
#plot distance matrix
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = dend['leaves']
D=D[index,:]
D=D[:,index]
im = axmatrix.matshow(D, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])
#plot color bar
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
fig.colorbar(im, cax=axcolor)
#display the heatmap
fig.savefig(timepoint+'heatmap.png')
'''
Module that provides methods and classes for working with genome sequence data.
For instance,
- BED files
- 2bit genome sequence files
'''
def overlap(chromLoc1, chromLoc2):
""" Check if chromosome locations described by tuples
(chrom, chromStart, chromEnd) overlap.
If so return the number of positions that overlap.
Return 0 in case of NO overlap.
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
maxWidth = max(maxWidth, 1)
centre1 = chromLoc1[1] + halfWidth1
centre2 = chromLoc2[1] + halfWidth2
diffCentres = abs(centre1 - centre2)
if diffCentres + minWidth < maxWidth: # one fragment encompasses the other
return minWidth * 2
else:
return max(0, halfWidth1 + halfWidth2 - diffCentres)
else:
return 0
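# Hedged worked example for overlap(): two fragments on chr1 sharing 50
# positions (coordinates are made up).
def _example_overlap():
    """Illustrative only: overlap of ('chr1',100,200) and ('chr1',150,250)."""
    return overlap(('chr1', 100, 200), ('chr1', 150, 250)) # 50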
def distance(chromLoc1, chromLoc2, minimum = True):
""" Check the distance between two locations described by tuples
(chrom, chromStart, chromEnd).
If chromLoc1 is BEFORE chromLoc2 then the distance is positive, else negative.
If not on same chromosome return None.
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
maxWidth = max(maxWidth, 1)
centre1 = chromLoc1[1] + halfWidth1
centre2 = chromLoc2[1] + halfWidth2
diffCentres = abs(centre1 - centre2)
if not minimum:
return centre2 - centre1
if diffCentres + minWidth < maxWidth: # one fragment encompasses the other
return 0
elif halfWidth1 + halfWidth2 - diffCentres > 0: # fragments overlap A-to-B or B-to-A
return 0
else:
loc1_is_1st = chromLoc2[1] - chromLoc1[2]
loc1_is_2nd = chromLoc1[1] - chromLoc2[2]
if loc1_is_1st > loc1_is_2nd:
return loc1_is_1st
else:
return -loc1_is_2nd
else:
return None
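# Hedged worked example for distance(): the first location ends 200 bp before
# the second begins, so the minimum distance is +200 (made-up coordinates).
def _example_distance():
    """Illustrative only: minimum distance between two non-overlapping locations."""
    return distance(('chr1', 100, 200), ('chr1', 400, 500)) # 200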
class BedEntry():
def __init__(self, chrom, chromStart, chromEnd):
self.chrom = chrom
self.chromStart = chromStart
self.chromEnd = chromEnd
self.blockCount = None
self.usestrand = False
self.name = ''
def addOption(self,
name = None,
score = None,
strand = None,
thickStart = None,
thickEnd = None,
itemRgb = None,
blockCount = None,
blockSizes = None,
blockStarts = None,
signalValue = None,
pValue = None,
qValue = None,
peak = None,
tags = None,
summit = None,
fold = None,
fdr = None,
zscore = None,
bg = None):
if name: self.name = name
if score: self.score = score
if strand:
self.strand = strand
self.usestrand = True # use reverse complement when sequence is requested from genome
if thickStart: self.thickStart = thickStart
if thickEnd: self.thickEnd = thickEnd
if itemRgb: self.itemRgb = [int(color) for color in itemRgb.split(',')]
if blockCount:
self.blockCount = max(0, blockCount)
if blockCount > 0:
self.blockSizes = [int(sizeword) for sizeword in blockSizes.split(',')]
self.blockStarts = [int(startword) for startword in blockStarts.split(',')]
if len(self.blockSizes) != blockCount or len(self.blockStarts) != blockCount:
raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self))
if signalValue: self.signalValue = signalValue
if pValue: self.pValue = pValue
if qValue: self.qValue = qValue
if peak: self.peak = peak
if tags: self.tags = tags
if summit: self.summit = summit
if fold: self.fold = fold
if fdr: self.fdr = fdr
if bg: self.bg = bg
if zscore: self.zscore = zscore
def __str__(self):
return str((self.chrom, self.chromStart, self.chromEnd))
def __getitem__(self, i):
if self.blockCount:
return (self.chrom, self.blockStarts[i], self.blockStarts[i] + self.blockSizes[i])
def __iter__(self):
if self.blockCount:
for i in range(self.blockCount):
if self.blockSizes[i] > 0:
yield (self.chrom, self.blockStarts[i], self.blockStarts[i] + self.blockSizes[i])
def __len__(self):
return self.blockCount
def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None):
""" Retrieve the genomic location for BED entry, or sequence if genome is provided
genome: a dictionary with keys for sequence names, e.g. 'chr1', 'chrX', etc, and values with indexed/sliceable strings
fixedwidth: the width of the location/sequence if the width in the BED entry is ignored, and only its centre is used
usesummit: centre a fixedwidth window around an assigned "summit"
        useshift: centre a fixedwidth window around a shifted centre point, e.g. useshift=-125 will shift the centre point 125bp upstream,
                  to, say, capture a fixedwidth=350bp window with 350/2-125=50bp downstream
"""
otherstrand = False
if (self.usestrand):
if (self.strand == '-'):
otherstrand = True
if (otherstrand == False):
end = self.chromEnd
start = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = start + (self.chromEnd - self.chromStart) / 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre + useshift
if fixedwidth: # we need to re-calculate start and end
if genome:
end = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
else:
end = mycentre + (mywidth / 2)
start = max(0, mycentre - (mywidth / 2))
else: # other strand
start = self.chromEnd
end = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre - useshift # shift is reversed on other strand
if fixedwidth: # we need to re-calculate start and end
end = max(0, mycentre - (mywidth / 2))
if genome:
start = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
else:
start = mycentre + (mywidth / 2)
if genome: # refer to the genome sequence
return genome[self.chrom][start : end]
else:
return (self.chrom, start, end)
def setwidth(self, fixedwidth = None, usesummit = False):
if fixedwidth:
if usesummit:
diff = self.summit - fixedwidth / 2
else:
diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2
            self.chromStart += diff
            self.chromEnd = self.chromStart + fixedwidth # re-centred window of the requested width
return (self.chrom, self.chromStart, self.chromEnd)
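# Hedged usage sketch for BedEntry: build an entry, attach optional fields, and
# retrieve a fixed-width window around its centre (coordinates are made up).
def _example_bedentry():
    """Illustrative only: a 100 bp window centred on a 60 bp entry."""
    entry = BedEntry('chr1', 1000, 1060)
    entry.addOption(name='peak1', score=200.0, strand='+')
    return entry.loc(fixedwidth=100) # ('chr1', 980, 1080)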
class BedFile():
""" Read BED file.
See http://genome.ucsc.edu/FAQ/FAQformat#format1
The first three required BED fields are (part of all supported sub-formats):
chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
The 9 additional optional BED fields are (part of sub-format "Optional"):
name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.
    score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray).
strand - Defines the strand - either '+' or '-'.
thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
blockCount - The number of blocks (exons) in the BED line.
blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
ENCODE also defines broadpeaks and narrowpeaks format (part of our "Peaks" sub-format):
name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.
score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were '0' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
strand - +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
signalValue - Measurement of overall (usually, average) enrichment for the region.
pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
MACS also defines a "summit" peaks format (part of our "Summit" sub-format)
    It contains the peak summit location for every peak. The 5th column in this file is the summit height of fragment pileup.
In addition to the required three, the following fields follow:
length [redundant, ignored]
summit summit height of fragment pileup
tags
pValue [-10*log10(pvalue)]
fold [enrichment]
FDR [%; optional]
"CCAT" BED-like file format:
chromosome,
peakcenter [converted to summit],
regionstart,
regionend,
tags [tagcount],
bg [bgcount],
zscore,
fdr
"""
def __init__(self, entries, format = 'Limited'):
if isinstance(entries, str): # filename
self.rows = self._read(entries, format)
else:
self.rows = entries
self.format = format
self.indices = self._createIndices()
def _read(self, filename, format = 'Limited'):
""" Read a BED file.
format: specifies the format of the file,
"Limited", e.g.
chr22 1000 5000
chr22 2000 6000
"Optional", e.g.
track name=pairedReads description="Clone Paired Reads" useScore=1
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
...
(also handles the Limited + score format)
"Peaks", e.g.
chr1 569780 569930 . 0 . 19 6.07811 -1 -1
chr1 713300 713450 . 0 . 54 49.1167 -1 -1
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
# d = 130
chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
chr1 8250 8671 422 286 46 145.84 11.68 0.51
chr1 36382 36984 603 405 46 315.23 27.05 0.24
"CCAT", e.g.
chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
"Cropped", e.g.
chr1 851602 10
chr1 921184 18
chr1 931838 9
"""
f = open(filename)
row = 0
acceptHeaderRows = 1
headerRow = None
rows = []
for line in f:
row += 1
words = line.strip().split()
if len(words) == 0:
continue # ignore empty lines
if words[0].strip().startswith('#'):
continue # comment
if words[0].strip().startswith('browser'):
continue # ignore
if words[0].strip().startswith('track'):
continue # ignore
try:
chrom = words[0]
if format.lower().startswith('ccat'):
chromStart = int(words[2])
chromEnd = int(words[3])
else: # all other standard BED formats
chromStart = int(words[1])
chromEnd = int(words[2])
entry = BedEntry(chrom, chromStart, chromEnd)
if format.lower().startswith('opt'):
if len(words) >= 12:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11])
elif len(words) >= 9:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8])
elif len(words) >= 6:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5])
elif len(words) >= 5:
entry.addOption(name = words[3], score = float(words[4]))
elif len(words) >= 4:
entry.addOption(name = words[3])
else:
entry.addOption(name = '.', score = int(words[3]), strand = '.')
elif format.lower().startswith('bed6'):
entry.addOption(name=words[3], score=float(words[4]), strand=words[5])
elif format.lower().startswith('strand'):
if len(words) >= 4: # properly formatted
entry.addOption(strand = words[3])
if len(words) >= 5:
entry.addOption(name = words[4])
elif format.lower().startswith('peak'):
if len(words) >= 10: # narrowpeaks
entry.addOption(name = words[3], score = int(words[4]), strand = words[5], signalValue = float(words[6]), pValue = float(words[7]), qValue = float(words[8]), peak = int(words[9]))
else: # broadpeaks
entry.addOption(name = words[3], score = int(words[4]), strand = words[5], signalValue = float(words[6]), pValue = float(words[7]), qValue = float(words[8]))
elif format.lower().startswith('summit'):
if len(words) >= 9:
entry.addOption(summit = int(words[4]), tags = int(words[5]), pValue = float(words[6]), fold = float(words[7]), fdr = float(words[8]))
else:
entry.addOption(summit = int(words[4]), tags = int(words[5]), pValue = float(words[6]), fold = float(words[7]))
elif format.lower().startswith('ccat'):
entry.addOption(summit = int(words[1]) - entry.chromStart, tags = int(words[4]), bg = int(words[5]), zscore = float(words[6]), fdr = float(words[7]), name = '.', score = int(words[4]), strand = '.')
elif format.lower().startswith('crop'):
entry.addOption(score = int(words[2]), name = '.', strand = '.')
entry.chromEnd = entry.chromStart + 1
rows.append(entry)
except RuntimeError as e:
if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' % (row, str(e)))
else:
headerRow = words
acceptHeaderRows -= 1 # count down the number of header rows that can occur
f.close()
return rows
def __iter__(self):
return self.rows.__iter__()
def __getslice__(self, i, j):
return self.rows.__getslice__(i, j)
def __getitem__(self, i):
return self.rows[i]
def __len__(self):
return len(self.rows)
def _createIndices(self):
index_start = {}
index_centre = {}
index_end = {}
index_name = {}
for i in range(len(self.rows)):
row = self.rows[i]
if not index_start.has_key(row.chrom): # seeing chromosome entry first time
index_start[row.chrom] = []
if not index_centre.has_key(row.chrom): # seeing chromosome entry first time
index_centre[row.chrom] = []
if not index_end.has_key(row.chrom): # seeing chromosome entry first time
index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
if row.name:
index_name[row.name] = row
for chr in index_start:
index_start[chr].sort()
index_centre[chr].sort()
index_end[chr].sort()
return (index_start, index_centre, index_end, index_name)
def __contains__(self, elem):
""" Test for containment: does the specified elem overlap with at least one of the BED entries.
The method performs a binary search. """
try:
if isinstance(elem, BedEntry):
elem = elem.loc()
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = 0
if d == 0:
return True
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
if delta == 0:
return False
except KeyError:
return False
def match(self, elem, name):
""" Test for containment: does the specified elem overlap with at least one of the BED entries
that has the nominated name (label)."""
try:
if isinstance(elem, BedEntry):
elem = elem.loc()
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = 0
if d == 0:
delta = 0
while d == 0:
if entry.name == name:
return True
delta += 1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = -1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
while d == 0:
if entry.name == name:
return True
delta -= 1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
return False
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
if delta == 0:
return False
except KeyError:
return False
def findByName(self, myname):
""" Find the unique entry with the specified name.
Note that if the name is not unique, the last entry with the name will be returned.
"""
return self.indices[3][myname]
def closest(self, myloc, minimum = True):
""" Find the closest entry in the current BedFile to a given location.
Return a tuple with the absolute distance and the entry that is closest.
If several entries are closest, then any of the closest entries are returned.
If no location is found on the same chromosome, the tuple None, None is returned.
minimum: if True, use minimum distance, if False, use centre to centre distance.
"""
mindist = None
minentry = None
try:
if isinstance(myloc, BedEntry):
myloc = myloc.loc()
if minimum:
entries = self.indices[0][myloc[0]] # use start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = True)
if mindist == None:
mindist = abs(d)
minentry = entry
elif abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# we may have missed the closest, so need to look around this point
                for i_dn in range(inspect + 1, len(entries)): # look downstream, since the binary search may have stopped just short of the closest entry
entry = self.rows[entries[i_dn][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
elif abs(d) > mindist:
break
# also need to investigate upstream, doing so by using end index
entries = self.indices[2][myloc[0]] # use end index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# we may have missed the closest, so need to look around this point
                for i_up in range(inspect - 1, 0, -1): # look upstream, since the binary search may have stopped just short of the closest entry
entry = self.rows[entries[i_up][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
elif abs(d) > mindist:
break
return (mindist, minentry)
else: # minimum == False, i.e. use centre-to-centre distance
entries = self.indices[1][myloc[0]] # use centre index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = False)
if mindist == None:
mindist = abs(d)
minentry = entry
elif abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# at bottom of search
return (mindist, minentry)
except KeyError:
return None
def merge(self, usestrand = False):
""" Collapse entries that overlap to create a new BedFile.
If usestrand is True, then strands are considered exclusively.
If usestrand is False, the strand info is ignored when overlap is checked.
When entries are merged, the options assigned to the last entry are retained, others are ignored.
"""
starts = self.indices[0]
rows = self.rows
newrows = []
for c in starts: # chromosome
earliest_start = None
latest_end = None
for e in starts[c]:
idx = e[2]
strand = rows[idx].strand
if not usestrand or strand == '+':
start = rows[idx].chromStart
end = rows[idx].chromEnd
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
earliest_start = start
latest_end = end
earliest_start = min(earliest_start, start)
latest_end = max(latest_end, end)
# the last entry on the chromosome
            if not usestrand or earliest_start: # strand info may have been overwritten, so only check that there is an entry to write
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
if usestrand:
earliest_start = None
latest_end = None
for e in starts[c]:
idx = e[2]
strand = rows[idx].strand
if strand == '-':
start = rows[idx].chromStart
end = rows[idx].chromEnd
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
earliest_start = start
latest_end = end
earliest_start = min(earliest_start, start)
latest_end = max(latest_end, end)
# the last entry on the chromosome
                if earliest_start: # strand info could've been overwritten so check only that there is an entry to be written
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
return BedFile(newrows, format = self.format)
def write(self, filename, format = 'BED6'):
""" Save the data
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
"""
f = open(filename, 'w')
for row in self.__iter__():
if self.format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif self.format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close()
def readBedFile(filename, format = 'Limited'):
""" Read a BED file.
format: specifies the format of the file,
"Limited", e.g.
chr22 1000 5000
chr22 2000 6000
"Optional", e.g.
track name=pairedReads description="Clone Paired Reads" useScore=1
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
...
(also handles the Limited + score format)
"Peaks", e.g.
chr1 569780 569930 . 0 . 19 6.07811 -1 -1
chr1 713300 713450 . 0 . 54 49.1167 -1 -1
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
# d = 130
chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
chr1 8250 8671 422 286 46 145.84 11.68 0.51
chr1 36382 36984 603 405 46 315.23 27.05 0.24
"CCAT", e.g.
chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
"Cropped", e.g.
chr1 851602 10
chr1 921184 18
chr1 931838 9
"""
return BedFile(filename, format)
def writeBedFile(entries, filename, format = 'BED6'):
""" Save the BED entries to a BED file.
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
"""
f = open(filename, 'w')
for row in entries:
if format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close()
def uniteBed(bed1, bed2):
if bed1.format != bed2.format:
raise RuntimeError('BEDs are of different formats')
rows = []
rows.extend(bed1.rows)
rows.extend(bed2.rows)
return BedFile(rows, bed1.format)
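# Hedged usage sketch: BedFile also accepts an in-memory list of entries, so
# containment and closest-entry queries can be tried without a file on disk
# (coordinates are made up).
def _example_bedfile():
    """Illustrative only: build a small BedFile and query it."""
    entries = [BedEntry('chr1', 100, 200), BedEntry('chr1', 400, 500)]
    bed = BedFile(entries, format='Limited')
    hit = ('chr1', 150, 160) in bed # True: overlaps the first entry
    dist, nearest = bed.closest(('chr1', 300, 310)) # (90, the chr1:400-500 entry)
    return hit, dist, nearest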
"""
This following code is a modified version of twobitreader (which is under Perl Artistic License 2.0).
As per license restrictions, the code below indicates what has been modified in relation to the
standard version (retrieved from https://bitbucket.org/thesylex/twobitreader on 16 May 2012).
No warranty is provided, express or implied
Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
from array import array
from bisect import bisect_right
from errno import ENOENT, EACCES
from os import R_OK, access
try:
from os import strerror
except ImportError:
strerror = lambda x: 'strerror not supported'
from os.path import exists
from itertools import izip
def true_long_type():
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
"""
for type_ in ['L', 'I']:
test_array = array(type_, [0])
long_size = test_array.itemsize
if long_size == 4: return type_
raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG")
LONG = true_long_type()
def byte_to_bases(x):
"""convert one byte to the four bases it encodes"""
c = (x >> 4) & 0xf
f = x & 0xf
cc = (c >> 2) & 0x3
cf = c & 0x3
fc = (f >> 2) & 0x3
ff = f & 0x3
return map(bits_to_base, (cc, cf, fc, ff))
def bits_to_base(x):
"""convert integer representation of two bits to correct base"""
    if x == 0: return 'T'
    if x == 1: return 'C'
    if x == 2: return 'A'
    if x == 3: return 'G'
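# Hedged sketch of the 2-bit decoding above: the byte 0x1B is 00011011 in
# binary, which decodes two bits at a time (T=00, C=01, A=10, G=11) to TCAG,
# matching the worked example in the format specification further below.
def _example_twobit_decoding():
    """Illustrative only: decode one byte into its four bases."""
    return byte_to_bases(0x1B) # ['T', 'C', 'A', 'G']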
def base_to_bin(x):
"""
provided for user convenience
convert a nucleotide to its bit representation
"""
if x == 'T': return '00'
if x == 'C': return '01'
if x == 'A': return '10'
if x == 'G': return '11'
def create_byte_table():
"""create BYTE_TABLE"""
d = {}
for x in xrange(2**8):
d[x] = byte_to_bases(x)
return d
def split16(x):
"""
split a 16-bit number into integer representation
    of its coarse and fine parts in binary representation
"""
c = (x >> 8) & 0xff
f = x & 0xff
return c, f
def create_twobyte_table():
"""create TWOBYTE_TABLE"""
d = {}
for x in xrange(2**16):
c, f = split16(x)
d[x] = byte_to_bases(c) + byte_to_bases(f)
return d
BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table()
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
"""
takes in a iterable of longs and converts them to bases in a char array
returns a ctypes string buffer
"""
longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size)
dna = array('c', 'N' * longs_len)
# translate from 32-bit blocks to bytes
    # this method ensures correct endianness (byteswap as needed)
bytes = array('B')
bytes.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i: i = array_size
dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]:
dna[i:i + 4] = array('c', BYTE_TABLE[byte])
i += 4
# last block
last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna
class TwoBitFile(dict):
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
def __init__(self, foo):
super(TwoBitFile, self).__init__()
if not exists(foo):
raise IOError(ENOENT, strerror(ENOENT), foo)
if not access(foo, R_OK):
raise IOError(EACCES, strerror(EACCES), foo)
self._filename = foo
self._file_handle = open(foo, 'rb')
self._load_header()
self._load_index()
for name, offset in self._offset_dict.iteritems():
self[name] = TwoBitSequence(self._file_handle, offset,
self._byteswapped)
return
def _load_header(self):
file_handle = self._file_handle
header = array(LONG)
header.fromfile(file_handle, 4)
# check signature -- must be 0x1A412743
# if not, swap bytes
byteswapped = False
(signature, version, sequence_count, reserved) = header
if not signature == 0x1A412743:
byteswapped = True
header.byteswap()
(signature2, version, sequence_count, reserved) = header
if not signature2 == 0x1A412743:
raise TwoBitFileError('Signature in header should be 0x1A412743'
+ ', instead found 0x%X' % signature)
if not version == 0:
raise TwoBitFileError('File version in header should be 0.')
if not reserved == 0:
raise TwoBitFileError('Reserved field in header should be 0.')
self._byteswapped = byteswapped
self._sequence_count = sequence_count
def _load_index(self):
file_handle = self._file_handle
byteswapped = self._byteswapped
remaining = self._sequence_count
sequence_offsets = []
file_handle.seek(16)
while True:
if remaining == 0: break
name_size = array('B')
name_size.fromfile(file_handle, 1)
if byteswapped: name_size.byteswap()
name = array('c')
if byteswapped: name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG)
offset.fromfile(file_handle, 1)
if byteswapped: offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0]))
remaining -= 1
self._sequence_offsets = sequence_offsets
self._offset_dict = dict(sequence_offsets)
def sequence_sizes(self):
"""returns a dictionary with the sizes of each sequence"""
d = {}
file_handle = self._file_handle
byteswapped = self._byteswapped
for name, offset in self._offset_dict.iteritems():
file_handle.seek(offset)
dna_size = array(LONG)
dna_size.fromfile(file_handle, 1)
if byteswapped: dna_size.byteswap()
d[name] = dna_size[0]
return d
class TwoBitSequence(object):
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
"""
def __init__(self, file_handle, offset, byteswapped=False):
self._file_handle = file_handle
self._original_offset = offset
self._byteswapped = byteswapped
file_handle.seek(offset)
header = array(LONG)
header.fromfile(file_handle, 2)
if byteswapped: header.byteswap()
dna_size, n_block_count = header
self._dna_size = dna_size
self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments
n_block_starts = array(LONG)
n_block_sizes = array(LONG)
n_block_starts.fromfile(file_handle, n_block_count)
if byteswapped: n_block_starts.byteswap()
n_block_sizes.fromfile(file_handle, n_block_count)
if byteswapped: n_block_sizes.byteswap()
self._n_block_starts = n_block_starts
self._n_block_sizes= n_block_sizes
mask_rawc = array(LONG)
mask_rawc.fromfile(file_handle, 1)
if byteswapped: mask_rawc.byteswap()
mask_block_count = mask_rawc[0]
mask_block_starts = array(LONG)
mask_block_starts.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_starts.byteswap()
mask_block_sizes = array(LONG)
mask_block_sizes.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_sizes.byteswap()
self._mask_block_starts = mask_block_starts
self._mask_block_sizes = mask_block_sizes
file_handle.read(4)
self._offset = file_handle.tell()
def __len__(self):
return self._dna_size
def __getslice__(self, min_, max_=None):
return self.get_slice(min_, max_)
def get_slice(self, min_, max_=None):
"""
get_slice returns only a sub-sequence
"""
        # handle missing and negative coordinates
        dna_size = self._dna_size
        if max_ is None: # no upper bound given: take the rest of the sequence
            max_ = dna_size
        if max_ < 0:
            if max_ < -dna_size: raise IndexError('index out of range')
            max_ = dna_size + 1 + max_
        if min_ < 0:
            if min_ < -dna_size: raise IndexError('index out of range')
            min_ = dna_size + 1 + min_
        # Find out if the reverse complement is sought
        reverse = False # assume not RC
        if min_ > max_:
            reverse = True
            min_, max_ = max_, min_
if max_ == 0: return ''
# load all the data
if max_ > dna_size: max_ = dna_size
file_handle = self._file_handle
byteswapped = self._byteswapped
n_block_starts = self._n_block_starts
n_block_sizes = self._n_block_sizes
mask_block_starts = self._mask_block_starts
mask_block_sizes = self._mask_block_sizes
offset = self._offset
packed_dna_size = self._packed_dna_size
# region_size is how many bases the region is
        region_size = max_ - min_
# start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0
start_block = min_ / 16
end_block = max_ / 16
# don't read past seq end
if end_block >= packed_dna_size: end_block = packed_dna_size - 1
# +1 we still need to read block
blocks_to_read = end_block - start_block + 1
# jump directly to desired file location
local_offset = offset + start_block * 4
file_handle.seek(local_offset)
# note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset = min_ % 16
last_base_offset = max_ % 16
fourbyte_dna = array(LONG)
fourbyte_dna.fromfile(file_handle, blocks_to_read)
if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size)
for start, size in izip(n_block_starts, n_block_sizes):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', 'N'*(end-start))
lower = str.lower
first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region))
for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
        if len(string_as_array) != max_ - min_:
            raise RuntimeError("Sequence was not the expected length")
if reverse:
return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring()
def reverseComplement(self, dna):
""" Return a new sequence: the reverse complement of this sequence. """
newseq=''
symbols={'A':'T','C':'G','T':'A','G':'C','a':'t','c':'g','t':'a','g':'c','n':'n','N':'N'} # reverse complement dictionary
for symbol in dna[::-1]:
newsymbol=symbols[symbol] # uses the reverse complement symbols in dictionary
newseq+=newsymbol
return newseq # returns RC sequences
def __str__(self):
"""
returns the entire chromosome
"""
return self.__getslice__(0, None)
class TwoBitFileError(StandardError):
"""
Base exception for TwoBit module
"""
def __init__(self, msg):
errtext = 'Invalid 2-bit file. ' + msg
        super(TwoBitFileError, self).__init__(errtext)
def print_specification():
"""
    Returns the twoBit file format specification (sourced from the Internet).
    This is only here for reference.
"""
return """
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
randomly accessible format. The two bit files contain masking information as \
well as the DNA itself. The file begins with a 16 byte header containing the \
following fields:
signature - the number 0x1A412743 in the architecture of the machine that \
created the file.
version - zero for now. Readers should abort if they see a version number \
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given, \
the reader program should byte swap the signature and see if the swapped \
version matches. If so all multiple-byte entities in the file will need to be \
byte-swapped. This enables these binary files to be used unchanged on \
different architectures.
The header is followed by a file index. There is one entry in the index for \
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length \
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown \
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A, \
11 - G. The first base is in the most significant 2 bits byte, and the last \
base in the least significant 2 bits, so that the sequence TCAG would be \
represented as 00011011. The packedDna field will be padded with 0 bits as \
necessary so that it takes an even multiple of 32 bit in the file, as this \
improves i/o performance on some machines.
.nib files
"""
"""
Module *** sequence ***
This module depends on the following modules
sym -- defines an alphabet
prob -- defines structures to hold probabilities (prob also depends on sym)
This module incorporates classes for
Sequence -- names and defines a sequence of symbols; computes various transformations and pairwise alignments
Alignment -- defines a multiple sequence alignment; computes stats for use in substitution matrices
SubstMatrix -- substitution matrix class to support alignment methods
Regexp -- defines patterns as regular expressions for textual pattern matching in sequences
PWM -- defines a weight matrix that can score any site in actual sequences
Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services
This code has gone through many updates and has benefitted from kind contributions of course participants.
Please keep suggestions coming!
Email: m.boden@uq.edu.au
"""
import string, sys, re, math, os, array
import numpy
from webservice import *
from sym import *
from prob import *
# Sequence ------------------****
class Sequence(object):
""" A biological sequence. Stores the sequence itself (as a compact array),
the alphabet (i.e., type of sequence it is), and optionally a name and further
information. """
sequence = None # The array of symbols that make up the sequence
alphabet = None # The alphabet from which symbols come
name = None # The name (identifier) of a sequence
info = None # Other information (free text; e.g. annotations)
length = None # The number of symbols that the sequence is composed of
gappy = None # True if the sequence has "gaps", i.e. positions that represent deletions relative another sequence
def __init__(self, sequence, alphabet = None, name = '', info = '', gappy = False):
""" Create a sequence with the sequence data. Specifying the alphabet,
name and other information about the sequence are all optional.
The sequence data is immutable (stored as a string).
Example:
>>> myseq = Sequence('MVSAKKVPAIAMSFGVSF')
will create a sequence with no name, and assign one of the predefined
alphabets on the basis of what symbols were used.
>>> myseq.alphabet.symbols
will output the standard protein alphabet:
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
try: # convert sequence data into a compact array representation
self.sequence = array.array('c', ''.join([s.upper() for s in sequence]))
except TypeError:
raise RuntimeError('Sequence data is not specified correctly: must be iterable')
# Assign an alphabet
self.alphabet = None
if not alphabet is None:
for sym in self.sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: %c in sequence %s' % (sym, name))
self.alphabet = alphabet
else:
for alphaName in preferredOrder:
alpha = predefAlphabets[alphaName]
valid = True
for sym in self.sequence:
if not sym in alpha and (sym != '-' or not gappy):
valid = False
break
if valid:
self.alphabet = alpha
break
if self.alphabet is None:
raise RuntimeError('Could not identify alphabet from sequence: %s' % name)
# Store other information
self.name = name
self.info = info
self.length = len(self.sequence)
self.gappy = gappy
def __len__(self):
""" Defines what the "len" operator returns for an instance of Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print len(seq)
9
"""
return len(self.sequence)
def __str__(self):
""" Defines what should be printed when the print statement is used on a Sequence instance """
        text = self.name + ': ' # use a local name that does not shadow the builtin str
        for sym in self:
            text += sym
        return text
def __iter__(self):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq:
print sym
will print A, G, G, A, T (each on a separate row)
"""
tsyms = tuple(self.sequence)
return tsyms.__iter__()
def __contains__(self, item):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print 'T' in seq
True
which is equivalent to
>>> print seq.__contains__('T')
True
>>> print 'X' in seq
False
"""
for sym in self.sequence:
if sym == item:
return True
return False
def __getitem__(self, ndx):
""" Retrieve a specified index (or a "slice" of indices) of the sequence data.
Calling self.__getitem__(3) is equivalent to self[3]
"""
if type(ndx) is slice:
return self.sequence[ndx].tostring()
else:
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n'
data = self.sequence.tostring()
nlines = (len(self.sequence) - 1) / 60 + 1
for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
return fasta
def count(self, findme = None):
""" Get the number of occurrences of specified symbol findme OR
if findme = None, return a dictionary of counts of all symbols in alphabet """
if findme != None:
cnt = 0
for sym in self.sequence:
if findme == sym:
cnt = cnt + 1
return cnt
else:
symbolCounts = {}
for symbol in self.alphabet:
symbolCounts[symbol] = self.count(symbol)
return symbolCounts
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.tostring().find(findme)
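# Hedged usage sketch for Sequence: the alphabet is guessed from the symbols
# (assuming the predefined DNA alphabet from the sym module is available).
def _example_sequence():
    """Illustrative only: basic Sequence operations on a made-up DNA string."""
    seq = Sequence('ACGGTAGGA', name='demo')
    return len(seq), seq.count('G'), seq.find('GTA') # (9, 4, 3)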
"""
Below are some useful methods for loading data from strings and files.
Recognize the FASTA format (nothing fancy).
"""
def readFasta(string, alphabet = None, ignore = False, gappy = False):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it.
If alphabet is specified, use it, if None (default) then guess it.
If ignore is False, errors cause the method to fail.
If ignore is True, the offending sequence is disregarded.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
seqlist = [] # list of sequences contained in the string
seqname = None # name of *current* sequence
seqinfo = None
seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines
continue
if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current
try:
current = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
seqlist.append(current)
except RuntimeError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
# now collect data about the new sequence
seqinfo = line[1:].split() # skip first char
if len(seqinfo) > 0:
try:
parsed = parseDefline(seqinfo[0])
seqname = parsed[0]
seqinfo = line[1:]
except IndexError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
else:
seqname = ''
seqinfo = ''
seqdata = []
else: # we assume this is (more) data for current
cleanline = line.split()
for thisline in cleanline:
seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains
if seqname:
try:
lastseq = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
seqlist.append(lastseq)
except RuntimeError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
return seqlist
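# Usage sketch (illustrative): parse a small FASTA-formatted string.
# >>> seqs = readFasta('>seq1\nMKVL\n>seq2\nACDE\n')
# >>> [s.name for s in seqs]
# ['seq1', 'seq2']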
def parseDefline(string):
""" Parse the FASTA defline (see http://en.wikipedia.org/wiki/FASTA_format)
GenBank, EMBL, etc gi|gi-number|gb|accession|locus
SWISS-PROT, TrEMBL sp|accession|name
...
Return a tuple with
[0] primary search key, e.g. UniProt accession, Genbank GI
[1] secondary search key, e.g. UniProt name, Genbank accession
[2] source, e.g. 'sp' (SwissProt/UniProt), 'tr' (TrEMBL), 'gb' (Genbank)
[3] the database tag for 'gi' entries (e.g. 'gb'), otherwise ''
"""
if len(string) == 0: return ('', '', '', '')
s = string.split()[0]
if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^tr\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2])
elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^refseq\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
else: return (s, '', '', '')
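# Example (illustrative): a UniProt/SwissProt defline
# >>> parseDefline('sp|P63166|SUMO1_HUMAN')
# ('P63166', 'SUMO1_HUMAN', 'sp', '')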
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. Note that if alphabet is NOT specified, it will take a
separate guess for each sequence.
If ignore is False, errors cause the method to fail.
If ignore is True, the offending sequence is disregarded.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
fh = open(filename)
seqlist = []
batch = '' # a batch of rows including one or more complete FASTA entries
rowcnt = 0
for row in fh:
row = row.strip()
if len(row) > 0:
if row.startswith('>') and rowcnt > 0:
more = readFasta(batch, alphabet, ignore, gappy)
if len(more) > 0:
seqlist.extend(more)
batch = ''
rowcnt = 0
batch += row + '\n'
rowcnt += 1
if len(batch) > 0:
more = readFasta(batch, alphabet, ignore, gappy)
if len(more) > 0:
seqlist.extend(more)
fh.close()
return seqlist
def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """
fh = open(filename, 'w')
for seq in seqs:
fh.write(seq.writeFasta())
fh.close()
def getMarkov(seqs, order = 0):
""" Retrieve the Markov stats for a set of sequences. """
myseqs = seqs
if isinstance(seqs, Sequence): # a single sequence was given; wrap it in a list
myseqs = [seqs]
myalpha = None
for seq in myseqs:
if myalpha == None:
myalpha = seq.alphabet
else:
if seq.alphabet != myalpha:
raise RuntimeError('Sequence ' + seq.name + ' uses a different alphabet from the rest')
jp = Joint([myalpha for _ in range(order + 1)])
for seq in myseqs:
for i in range(len(seq) - order):
sub = seq[i:i + order + 1]
jp.observe(sub)
return jp
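# Usage sketch (illustrative; assumes all sequences share one alphabet, e.g. DNA):
# >>> jp = getMarkov(seqs, order = 1) # Joint over all consecutive symbol pairs
# order = 0 gives plain symbol frequencies; order = k counts all (k+1)-mers.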
def getCount(seqs, findme = None):
""" Get the total number of occurrences of findme across all sequences OR,
if findme is None, a dictionary of such counts for every symbol in the alphabet. """
if findme != None:
cnt = 0
for seq in seqs:
cnt += seq.count(findme)
return cnt
else:
if len(seqs) > 0:
alpha = seqs[0].alphabet
patcnt = {}
for a in alpha:
patcnt[a] = getCount(seqs, a)
return patcnt
# Alignment ------------------
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
THIS-LI-NE-
--ISALIGNED
"""
alignlen = None
seqs = None
alphabet = None
def __init__(self, seqs):
self.alignlen = -1
self.seqs = seqs
self.alphabet = None
for s in seqs:
if self.alignlen == -1:
self.alignlen = len(s)
elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: different lengths")
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: different alphabets")
self.alphabet = s.alphabet
def getnamelen(self):
namelen = 0
for seq in self.seqs:
namelen = max(len(seq.name), namelen)
return namelen
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment, e.g.
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> aln = Alignment(seqs)
>>> print len(aln)
2
"""
return len(self.seqs)
def getSize(self):
""" Returns the size of an alignment in terms of number of columns """
return self.alignlen
def __str__(self):
string = ''
namelen = self.getnamelen()
for seq in self.seqs:
string += seq.name.ljust(namelen+1)
for sym in seq:
string += sym
string += '\n'
return string
def __getitem__(self, ndx):
return self.seqs[ndx]
def writeClustal(self, filename = None):
""" Write the alignment to a string or file using the Clustal file format. """
symbolsPerLine = 60
maxNameLength = self.getnamelen() + 1
string = ''
wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows):
for j in range(len(self.seqs)):
string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
string += '\n'
# Possible last row
lastRowLength = self.alignlen - wholeRows*symbolsPerLine
if lastRowLength > 0:
for j in range(len(self.seqs)):
if maxNameLength > 0:
string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][-lastRowLength:] + '\n'
if filename != None:
fh = open(filename, 'w')
fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
fh.write(string)
fh.close()
return
return string
def getProfile(self, pseudo = 0.0, countGaps = True):
""" Determine the probability matrix from the alignment, assuming
that each position is independent of all others. """
p = IndepJoint([self.alphabet for _ in range(self.alignlen)], pseudo)
for seq in self.seqs:
p.observe(seq, 1, countGaps = countGaps)
return p
def getConsensus(self):
""" Construct a consensus sequence. """
syms = []
for col in range(self.alignlen):
d = Distrib(self.alphabet)
for seq in self.seqs:
if seq[col] in self.alphabet:
d.observe(seq[col])
syms.append(d.getmax())
return Sequence(syms, self.alphabet)
def getConsensusForColumn(self, colidx):
symcnt = {}
for seq in self.seqs:
mysym = seq[colidx]
try:
symcnt[mysym] += 1
except KeyError:
symcnt[mysym] = 1
consensus = None
maxcnt = 0
for mysym in symcnt:
if symcnt[mysym] > maxcnt:
maxcnt = symcnt[mysym]
consensus = mysym
return consensus
def displayConsensus(self, theta1 = 0.2, theta2 = 0.05, lowercase = True):
""" Display a table with rows for each alignment column, showing
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)
print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)
for col in range(self.alignlen):
d = Distrib(self.alphabet)
gaps = 0
for seq in self.seqs:
if seq[col] in self.alphabet:
d.observe(seq[col])
else:
gaps += 1
print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,
symprobs = d.getProbsort()
(_, maxprob) = symprobs[0]
if maxprob >= theta1:
print "%d\tTRUE\t" % int(maxprob * 100),
else:
print "%d\t\t" % int(maxprob * 100),
for (sym, prob) in symprobs:
if prob >= theta1:
print sym, "%d%%" % int(prob * 100),
elif prob >= theta2 and lowercase:
print sym.lower(), "%d%%" % int(prob * 100),
elif prob >= theta2:
print sym, "%d%%" % int(prob * 100),
print
def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
""" Display a table with rows for each alignment column, showing
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
f = open(filename, 'w')
f.write("Alignment of %d sequences, with %d columns\n" % (len(self.seqs), self.alignlen))
if compact:
f.write("Column\tConserv\tVariab\tAll (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
else:
f.write("Column\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
countrow = 0
for col in range(self.alignlen):
countrow += 1
if myseq[col] == '-':
continue
alist = list(self.alphabet)
alist.append('-')
gapalphabet = Alphabet(alist)
d_gap = Distrib(gapalphabet)
d_nogap = Distrib(self.alphabet)
for seq in self.seqs:
if seq[col] in gapalphabet:
d_gap.observe(seq[col])
if seq[col] in self.alphabet:
d_nogap.observe(seq[col])
f.write("%d\t" % (col + 1))
symprobs_nogap = d_nogap.getProbsort()
symprobs_gap = d_gap.getProbsort()
(maxsym, maxprob) = symprobs_nogap[0]
if compact:
if maxprob >= theta1:
f.write("%c\t" % maxsym)
else:
f.write("\t")
for (sym, prob) in symprobs_gap:
if prob >= theta2 and lowercase:
f.write("%c" % sym.lower())
elif prob >= theta2:
f.write("%c" % sym)
f.write("\t")
else:
if maxprob >= theta1:
f.write("%d\t" % int(maxprob * 100))
else:
f.write("%d\t\t" % int(maxprob * 100))
for (sym, prob) in symprobs_gap:
if prob >= theta1:
f.write("%c %d%% " % (sym, int(prob * 100)))
elif prob >= theta2 and lowercase:
f.write("%c %d%% " % (sym.lower(), int(prob * 100)))
elif prob >= theta2:
f.write("%c %d%% " % (sym, int(prob * 100)))
f.write('\n')
f.close()
def calcBackground(self):
""" Count the proportion of each amino acid's occurrence in the
alignment, and return as a probability distribution. """
p = Distrib(self.alphabet)
for seq in self.seqs:
for sym in seq:
if sym in self.alphabet: # ignore "gaps"
p.observe(sym)
return p
def calcSubstMatrix(self, background = None):
""" Return a SubstMatrix whose scores are estimated from this (un-gapped)
multiple sequence alignment. Scores are given in half-bits. """
# Get a list of the amino acids
aminoAcids = self.alphabet.symbols
columns = self.alignlen # Length of sequences in alignment
numSeqs = len(self.seqs) # Number of sequences in alignment
seqPairs = (numSeqs* (numSeqs - 1) ) / 2 # Number of pairs of sequences in ungapped alignment
aaPairs = seqPairs * columns # Number of pairs of amino acids in ungapped alignment
# For each pair of amino acids, calculate the proportion of all aligned
# amino acids in this alignment which are made up of that pair
# (i.e., q[ab] = fab / aaPairs, where fab is the number of times
# a and b are aligned in this alignment)
# See page 122 in Understanding Bioinformatics.
q = {}
for i in range( len(aminoAcids) ):
a = aminoAcids[i]
for j in range(i, len(aminoAcids)):
b = aminoAcids[j]
# Count the number of times each pair of amino acids is aligned
fab = 0
for column in range(columns):
# Count number of each amino acid in each column
col = [seq[column] for seq in self.seqs]
if a == b:
# Number of ways of pairing up n occurrences of amino
# acid a is n*(n-1)/2
cnt = col.count(a)
fab += cnt * (cnt-1)/2
else:
# Number of ways of pairing up n & m occurrences of
# amino acids a & b is n*m
fab += col.count(a)*col.count(b)
# Calculate proportion of all aligned pairs of amino acids
q[a+b] = q[b+a] = float(fab) / aaPairs
if q[a+b] == 0: # This is so we don't end up doing log(0)
q[a+b] = q[b+a] = 0.001
# Background frequency calculation if required
p = background or self.calcBackground()
# Calculate log-odds ratio for each pair of amino acids
s = SubstMatrix(self.alphabet)
for a in aminoAcids:
for b in aminoAcids:
# Calculate the random chance probability (eab)
if a == b:
eab = p[a]**2
else:
eab = 2*p[a]*p[b]
if eab == 0:
eab = 0.001
# Calculate final score to be set in the substitution matrix
odds = q[a+b] / eab
sab = math.log(odds, 2) # log_2 transform
sab = sab * 2 # units in half bits
s.set(a, b, int(round(sab)))
return s
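# Worked example of the scoring above (illustrative numbers): if a pair is
# observed with q[ab] = 0.02 while chance predicts eab = 0.005, the log-odds
# is log2(0.02 / 0.005) = 2 bits, stored as 4 in half-bit units.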
def calcDistances(self, measure, a=1.0):
""" Calculate the evolutionary distance between all pairs of sequences
in this alignment, using the given measure. Measure can be one of
'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
given, then the parameter a must also be specified (or else it will use
the default value of 1.0).
Definitions of each distance metric are found in Zvelebil and Baum p268-276.
These are mostly intended for DNA, but adapted for protein (as below).
Note however that there are alternative distance matrices for proteins (p276).
"""
measure = measure.lower()
if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
a = float(a)
if len(self.alphabet) == 4:
oneless = 3
alphalen = 4
elif len(self.alphabet) == 20:
oneless = 19
alphalen = 20
else:
raise RuntimeError('Invalid sequence alphabet: %s' % str(self.alphabet))
distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
# Loop through each pair of sequences
for i in range(len(self.seqs)):
for j in range(i + 1, len(self.seqs)):
seqA = self.seqs[i]
seqB = self.seqs[j]
# Calculate the fractional distance (p) first
# The two sequences of interest are in seqA and seqB
L = 0
D = 0
for k in range(self.alignlen):
# For every non-gapped column, put to L
# For every non-gapped column where the sequences are
# different, put to D
if seqA[k] != '-' and seqB[k] != '-':
L += 1
if seqA[k] != seqB[k]:
D += 1
p = float(D)/L
# Now calculate the specified measure based on p
if measure == 'fractional':
dist = p
elif measure == 'poisson':
dist = -math.log(1-p)
elif measure == 'jc':
dist = -(float(oneless)/alphalen)*math.log(1 - (float(alphalen)/oneless)*p)
elif measure == 'k2p':
dist = (float(oneless)/alphalen)*a*((1 - (float(alphalen)/oneless)*p)**(-1/a) - 1)
else: # measure == 'gamma'
dist = a*((1-p)**(-1/a) - 1)
distmat[i, j] = distmat[j, i] = dist
return distmat
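# Worked example (Jukes-Cantor on DNA, illustrative): with a fraction
# p = 0.2 of differing (non-gapped) sites, alphalen = 4 and oneless = 3 give
# d = -(3/4) * ln(1 - (4/3) * 0.2), approximately 0.233 substitutions per site.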
def writeHTML(self, filename):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
fh = open(filename, 'w')
fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
maxNameLength = self.getnamelen()
html = ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += str(i/10+1)[-1]
else:
html += ' '
html += '%s\n' % (self.alignlen)
fh.write(html)
if self.alignlen > 10:
html = ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0'
else:
html += ' '
html += '\n'
fh.write(html)
for seq in self.seqs:
html = seq.name.ljust(maxNameLength) + ' '
for sym in seq:
color = self.alphabet.getAnnotation('html-color', sym)
if not color:
color = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
html += '\n'
fh.write(html)
fh.write('</pre></body></html>\n')
fh.close()
def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
""" Write a row for each alignment column, showing the column index and the
symbols that meet the inclusion threshold, in order of decreasing probability.
theta1 is the probability threshold for printing the consensus symbol
(it is always printed if consensus is true),
theta2 is the probability threshold for inclusion (symbols below are ignored).
countgaps, if true, count gaps (default false).
filename is the name of the file to save the output to (default stdout)."""
if filename == None:
f = sys.stdout
else:
filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
f = open(filename, 'w')
if consensus:
f.write("Alignment of %d sequences, with %d columns\n" % (len(aln.seqs), aln.alignlen))
f.write("Consensus>=%.2f;Inclusion>=%.2f)\n" % (theta1, theta2))
for col in range(aln.alignlen):
# collect probabilities for column, with or without gap
myalpha = aln.alphabet
if countgaps:
alist = list(aln.alphabet)
alist.append('-')
myalpha = Alphabet(alist)
d = Distrib(myalpha)
for seq in aln.seqs:
if seq[col] in myalpha:
d.observe(seq[col])
symprobs = d.getProbsort() # the symbols sorted by probability
ninclusions = 0
for (s, p) in symprobs:
if p >= theta2:
ninclusions += 1
else:
break
if consensus or ninclusions > 1:
f.write("%d " % (col + 1))
(maxs, maxp) = symprobs[0]
if maxp >= theta1 or consensus:
f.write("%c" % maxs)
for (s, p) in symprobs[1:]:
if p >= theta2:
f.write("%c" % s)
f.write("; ")
f.write('\n')
if f != sys.stdout: f.close()
def alignGlobal(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Needleman-Wunsch
(global) algorithm. subsMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S)
S = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = i * gap
for j in range(lenB + 1):
S[0, j] = j * gap
# Calculate the optimum score at each location in the matrix S
# (where the score represents the best possible score for an alignment
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
S[i, j] = max([match, delete, insert])
# Traceback the optimal alignment
alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
# Start at the end (bottom-right corner of S)
i = lenA
j = lenB
# Stop when we hit the beginning of at least one sequence
while i > 0 and j > 0:
if S[i, j] == S[i-1, j] + gap:
# Got here by a gap in sequence B (go up)
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif S[i, j] == S[i, j-1] + gap:
# Got here by a gap in sequence A (go left)
alignA = '-' + alignA
alignB = seqB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., traceback all the way to S[0, 0])
while i > 0:
# Go up
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = seqB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
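# Usage sketch (illustrative; assumes a protein substitution matrix such as
# the BLOSUM62 instance loaded via readSubstMatrix further below):
# >>> aln = alignGlobal(Sequence('THISLINE'), Sequence('ISALIGNED'), BLOSUM62, gap = -8)
# >>> print aln
# alignLocal below takes the same arguments but returns the best local match.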
def alignLocal(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Smith-Waterman
(local) algorithm. subsMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S)
S = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = 0 # Local: init 0
for j in range(lenB + 1):
S[0, j] = 0 # Local: init 0
# Calculate the optimum score at each location in the matrix S
# (where the score represents the best possible score for an alignment
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
S[i, j] = max([match, delete, insert, 0]) # Local: add option that we re-start alignment from "0"
# Trace back the optimal alignment
alignA = ''
alignB = ''
# Local: start at the cell which has the highest score; find it
i = 0
j = 0
for ii in range(1, lenA + 1):
for jj in range(1, lenB + 1):
if S[ii, jj] > S[i, j]:
i = ii
j = jj
# Stop when we hit the end of a sequence
# Local: also stop when we hit a score 0
while i > 0 and j > 0 and S[i, j] > 0:
if S[i, j] == S[i-1, j] + gap:
# Got here by a gap in sequence B (go up)
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif S[i, j] == S[i, j-1] + gap:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
i -= 1
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
""" Triplet-wise align this sequence with sequences seqB and seqC,
using the Needleman-Wunsch (global) algorithm. subsMatrix is the
substitution matrix to use and gap is the linear gap penalty to use. """
lenA, lenB, lenC = [s.length for s in [seqA, seqB, seqC]]
# Create the 3D scoring matrix
traceback = numpy.zeros((lenA+1, lenB+1, lenC+1))
# Fill the first row (in each dimension) with multiples of the gap penalty
S = numpy.zeros((lenA+1, lenB+1, lenC+1))
for i in range(lenA+1):
S[i,0,0] = i * gap
for j in range(lenB+1):
S[0,j,0] = j * gap
for k in range(lenC+1):
S[0,0,k] = k * gap
# Calculate the optimum score at each location in the matrix
for i in range(1, lenA+1):
for j in range(1, lenB+1):
for k in range(1, lenC+1):
# Scored using sum-of-pairs
matchABC = S[i-1, j-1, k-1] + subsMatrix.get(seqA[i-1], seqB[j-1]) \
+ subsMatrix.get(seqA[i-1], seqC[k-1]) \
+ subsMatrix.get(seqB[j-1], seqC[k-1])
matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.get(seqA[i-1], seqB[j-1])
matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.get(seqB[j-1], seqC[k-1])
matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.get(seqA[i-1], seqC[k-1])
gapAB = S[i, j, k-1] + 3*gap
gapBC = S[i-1, j, k] + 3*gap
gapAC = S[i, j-1, k] + 3*gap
# Use maximum of the 7 options for this location
S[i, j, k] = max([matchABC, matchAB, matchBC, matchAC, gapAB, gapBC, gapAC])
# Remember which one was max., for the traceback
if S[i, j, k] == matchABC:
traceback[i, j, k] = 0 #"matchABC"
elif S[i, j, k] == matchBC:
traceback[i, j, k] = 1 #"matchBC"
elif S[i, j, k] == matchAC:
traceback[i, j, k] = 2 #"matchAC"
elif S[i, j, k] == matchAB:
traceback[i, j, k] = 3 #"matchAB"
elif S[i, j, k] == gapAB:
traceback[i, j, k] = 4 #"gapAB"
elif S[i, j, k] == gapBC:
traceback[i, j, k] = 5 #"gapBC"
elif S[i, j, k] == gapAC:
traceback[i, j, k] = 6 #"gapAC"
# Traceback the optimal alignment
alignA = ""
alignB = ""
alignC = ""
# Start at the end
i = lenA
j = lenB
k = lenC
# Stop when we hit the end of all but one sequence
while (i>0 and j>0) or (j>0 and k>0) or (i>0 and k>0):
if traceback[i, j, k] == 0: #"matchABC":
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
alignC = seqC[k-1] + alignC
i -= 1
j -= 1
k -= 1
elif traceback[i, j, k] == 3: #"matchAB":
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
i -= 1
j -= 1
elif traceback[i, j, k] == 2: #"matchAC":
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
i -= 1
k -= 1
elif traceback[i, j, k] == 1: #"matchBC":
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = seqC[k-1] + alignC
j -= 1
k -= 1
elif traceback[i, j, k] == 4: #"gapAB":
alignA = "-" + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
k -= 1
elif traceback[i, j, k] == 6: #"gapAC":
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
j -= 1
elif traceback[i, j, k] == 5: #"gapBC":
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = "-" + alignC
i -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., traceback all the way to S[0, 0, 0])
while i > 0:
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = "-" + alignC
i -= 1
while j > 0:
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
j -= 1
while k > 0:
alignA = "-" + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
k -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True),
Sequence(alignB, seqB.alphabet, seqB.name, gappy = True),
Sequence(alignC, seqC.alphabet, seqC.name, gappy = True)])
def readClustal(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an
Alignment object. """
seqs = {} # sequence data
for line in string.splitlines():
if line.startswith('CLUSTAL') or line.startswith('STOCKHOLM') \
or line.startswith('#'):
continue
if len(line.strip()) == 0:
continue
if line[0] == ' ' or '*' in line or ':' in line:
continue
sections = line.split()
name, seqstr = sections[0:2]
index = name.find('/')
if index >= 0:
name = name[0:index]
if seqs.has_key(name):
seqs[name] += seqstr
else:
seqs[name] = seqstr
sequences = []
for name, seqstr in seqs.items():
sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
return Alignment(sequences)
def readClustalFile(filename, alphabet):
""" Read a ClustalW2 alignment file and return an Alignment object
containing the alignment. """
fh = open(filename)
data = fh.read()
fh.close()
aln = readClustal(data, alphabet)
return aln
# Substitution Matrix ------------------
class SubstMatrix():
scoremat = None
alphabet = None
def __init__(self, alphabet):
self.alphabet = alphabet
self.scoremat = {}
def setScores(self, scoremat):
""" Set all scores in one go.
scoremat is a (sym1, sym2)-keyed dictionary of scores. """
self.scoremat = scoremat
def _getkey(self, sym1, sym2):
""" Construct canonical (unordered) key for two symbols """
if sym1 <= sym2:
return tuple([sym1, sym2])
else:
return tuple([sym2, sym1])
def set(self, sym1, sym2, score):
""" Add a score to the substitution matrix """
self.scoremat[self._getkey(sym1, sym2)] = score
def get(self, sym1, sym2):
return self.scoremat[self._getkey(sym1, sym2)]
def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols)
string = ''
for a in symbols:
string += a + ' '
for b in symbols[:len(symbols)-i+1]:
score = self.scoremat.get(self._getkey(a, b))
if score != None:
string += str(score).rjust(3) + ' '
else:
string += "?".rjust(3) + ' '
string += '\n'
i -= 1
string += ' ' + ' '.join(self.alphabet.symbols)
return string
def writeFile(self, filename):
""" Write this substitution matrix to the given file. """
fh = open(filename, 'w')
contents = '' # avoid shadowing the built-in file
for key in self.scoremat:
contents += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
fh.write(contents)
fh.close()
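# Usage sketch (illustrative): scores are stored under a canonical key,
# so lookups are symmetric.
# >>> sm = SubstMatrix(Protein_Alphabet)
# >>> sm.set('A', 'R', -1)
# >>> sm.get('R', 'A')
# -1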
def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet)
fh = open(filename, 'r')
data = fh.read()
fh.close()
lines = data.splitlines()
for line in lines:
if len(line.strip()) == 0:
continue
symbols, score = line.split(':')
score = int(score)
mat.set(symbols[0], symbols[1], score)
return mat
#import os
#os.chdir('/Users/mikael/workspace/binf/data/') # set to the directory where you keep your files
#BLOSUM62 = readSubstMatrix('blosum62.matrix', Protein_Alphabet)
# Motifs -------------------
class Regexp(object):
""" A class that defines a sequence pattern in terms of a
given regular expression, with . indicating any symbol and square brackets
indicating a selection. See standard regexp definitions for more. """
def __init__(self, pattern):
""" Create a new consensus sequence with the given pattern. """
try:
self.pattern = pattern
self.regex = re.compile(pattern)
except re.error:
raise RuntimeError('invalid consensus sequence given: %s' % pattern)
def __str__(self):
return self.pattern
def search(self, sequence):
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a consensus sequence either matches
or doesn't. """
if not type(sequence) is Sequence:
sequence = Sequence(sequence)
sequenceString = sequence[:]
results = []
for match in self.regex.finditer(sequenceString):
results.append((match.start(), match.group(), 1.0))
return results
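# Example (illustrative):
# >>> motif = Regexp('Y.N.[DE]')
# >>> motif.search('AAYANADAA')
# [(2, 'YANAD', 1.0)]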
class PWM(object):
""" A position weight matrix. """
def __init__(self, foreground, background = None, start = 0, end = None, pseudo = 0.0):
""" Create a new PWM from the given probability matrix/ces.
foreground: can be either an Alignment, a list of Distrib's or an instance of IndepJoint.
background: must be a Distrib instance or None (in which case a uniform background will be used)
Specify only a section of the matrix to use with start and end. """
if isinstance(foreground, Alignment):
foreground = foreground.getProfile(pseudo = pseudo)
if isinstance(foreground, IndepJoint):
foreground = foreground.store
self.start = start
self.end = end or len(foreground)
self.length = self.end - self.start
self.alphabet = foreground[self.start].alpha
if False in [ col.alpha == self.alphabet for col in foreground[self.start + 1 : self.end] ]:
raise RuntimeError("All positions need to be based on the same alphabet")
self.symbols = self.alphabet.symbols
# Set foreground probabilities from given alignment
self.m = numpy.zeros((len(self.symbols), self.length))
self.fg = foreground[self.start:self.end]
self.bg = background or Distrib(self.alphabet, 1.0) # specified background or uniform
if not self.alphabet == self.bg.alpha:
raise RuntimeError("Background needs to use the same alphabet as the foreground")
p = self.bg.prob()
for i in range(self.length):
q = self.fg[i].prob()
for j in range(len(self.alphabet)):
self.m[j][i] = self.logme(q[j], p[j])
def __len__(self):
return self.length
def getRC(self, swap = [('A', 'T'), ('C', 'G')] ):
""" Get the reverse complement of the current PWM.
Use for DNA sequences with default params.
"""
new_fg = self.fg[::-1] # backwards
for s in swap:
new_fg = [d.swapxcopy(s[0], s[1]) for d in new_fg]
return PWM(new_fg, self.bg)
MIN_VALUE = 0.00000000001
def logme(self, fg, bg):
if fg > self.MIN_VALUE and bg > self.MIN_VALUE:
ratio = fg / bg
return math.log(ratio)
# if not, one of fg and bg is practically zero
if fg > self.MIN_VALUE: # bg is zero
return math.log(fg / self.MIN_VALUE)
else: # fg is zero
return math.log(self.MIN_VALUE)
def getMatrix(self):
return self.m
def __str__(self):
outstr = '' # avoid shadowing the built-in str
for j in range(len(self.alphabet)):
outstr += "%s\t%s\n" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
return outstr
def display(self, format = 'COLUMN'):
if format == 'COLUMN':
print " \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))
for j in range(len(self.alphabet)):
print "%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
elif format == 'JASPAR':
for j in range(len(self.alphabet)):
print "%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
def search(self, sequence, lowerBound=0):
""" Find matches to the motif in a specified sequence. Returns a list
of results as triples: (position, matched string, score).
The optional argument lowerBound specifies a lower bound on reported
scores. """
results = []
for i in range(len(sequence)-self.length+1):
subseq = sequence[i:i + self.length]
ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
score = 0.0
for w in range(len(ndxseq)):
score += self.m[ ndxseq[w] ][ w ]
if score > lowerBound:
results.append((i, subseq, score))
return results
def maxscore(self, sequence):
""" Find matches to the motif in a specified sequence.
Returns the maximum score found in the sequence and its index as a tuple:
(maxscore, maxindex) """
maxscore = None
maxindex = None
for i in range(len(sequence)-self.length+1):
subseq = sequence[i:i + self.length]
ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
score = 0.0
for w in range(len(ndxseq)):
score += self.m[ ndxseq[w] ][ w ]
if maxscore == None:
maxscore = score
maxindex = i
elif maxscore < score:
maxscore = score
maxindex = i
return (maxscore, maxindex)
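# Usage sketch (illustrative; assumes a small gap-free DNA alignment):
# >>> aln = Alignment([Sequence('TATAAT', DNA_Alphabet), Sequence('TATTAT', DNA_Alphabet)])
# >>> pwm = PWM(aln, pseudo = 0.1)
# >>> pwm.maxscore(Sequence('GGTATAATGG', DNA_Alphabet))
# returns the best log-odds score in the scanned sequence and its position.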
# Web Service Functions -------------------
def getSequence(id, database = 'uniprotkb', start=None, end=None):
""" Get the sequence identified by the given ID from the given database
(e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
object. An error is caused if the sequence ID is not found. If start and
end are given, then only that section of the sequence is returned.
Note: more flexible search options are supported by using webservice.fetch
directly."""
MAX_TRY = 5
for i in range(MAX_TRY):
try:
fastaData = fetch(id, database)
seq = readFasta(fastaData)[0]
break
except Exception:
from time import sleep
print 'Failed attempt %d of %d for id %s' % (i + 1, MAX_TRY, id)
sleep(0.1)
try:
return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
except Exception:
raise RuntimeError('An error occurred while retrieving the specified sequence: %s (maybe the ID doesn\'t exist)' % id)
def searchSequences(query, database='uniprot'):
""" Search for sequences matching the given query in the given database
(must be 'uniprot'), and return a list of sequence IDs. """
ids = search(query, limit = None)
return ids
def runClustal(sequences, method='slow'):
""" Run a ClustalOmega alignment of the given list of Sequence objects.
Return an Alignment object. Method should be one of 'fast' or 'slow'. """
alpha = None
for seq in sequences:
if alpha == None:
alpha = seq.alphabet
elif alpha != seq.alphabet:
raise RuntimeError("Invalid alphabet: " + str(seq.alphabet) + ". Not compatible with " + str(alpha))
serviceName = 'clustalo'
resultType = 'aln-clustal'
fastaSeqs = ''.join([seq.writeFasta() for seq in sequences])
params = {'alignment': method.lower(), 'sequence': fastaSeqs}
service = EBI(serviceName)
result = service.submit(params, resultType)
alignment = readClustal(result, alpha)
return alignment
def createTree(alignment, type):
""" Run a ClustalW 2 phylogeny tree creation of either a 'Neighbour-joining'
or 'UPGMA' type tree from the given multiple sequence Alignment object. """
if not type in ['Neighbour-joining', 'UPGMA']:
raise RuntimeError('type must be either \'Neighbour-joining\' or \'UPGMA\'.')
serviceName = 'clustalw2_phylogeny'
resultType = 'tree'
output = 'dist'
clustalAln = alignment.writeClustal()
params = {'tree': output, 'sequence': clustalAln, 'clustering': type, 'tossgaps': 'true'}
service = EBI(serviceName)
tree = service.submit(params, resultType)
return tree
def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
""" Run a BLAST search of nucleotide mouse databases using the given
sequence as a query. Return a list of matched sequence IDs, in descending
order of similarity to query sequence.
program: either blastn (nucleotide) or blastp (protein)
database: many available, e.g. uniprotkb, pdb (protein); em_rel, nrnl1 (EMBL nucleotide, non-redundant resp)
(for protein see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-protein.html#database)
(for nucleotide see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-nucleotide.html#database)
exp: E-value threshold (select only hits that have a better E-value than this)
"""
if sequence.alphabet == predefAlphabets['DNA']:
stype = 'dna'
elif sequence.alphabet == predefAlphabets['RNA']:
stype = 'rna'
else:
stype = 'protein'
serviceName = 'ncbiblast'
resultTypes = ['ids', 'out'] # request
fastaSeq = sequence.writeFasta()
databases = [database]
params = {'program': program, 'database': databases, 'sequence': fastaSeq,
'stype': stype, 'exp': exp}
service = EBI(serviceName)
idsData, output = service.submit(params, resultTypes)
ids=[]
for id in idsData.splitlines():
if len(id) > 0:
ids.append(id.split(':')[1])
return ids
if __name__ == '__main__':
seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
print 'Read', len(seqs), 'sequences'
'''
A module to enable experimentation with various methods for predicting properties
assigned to sequence elements, e.g. secondary structure of proteins.
A neural net wrapper class is provided.
A couple of example applications are found at the end of this module.
'''
import numpy
import sym
import prob
import sequence
import ml
def slidewin(seq, winsize):
""" Produce a list of sub-sequences of a given length from a complete sequence """
subseqs = []
for i in range(len(seq) - winsize + 1):
subseqs.append(seq[i : i + winsize])
return subseqs
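# e.g. slidewin('ABCDE', 3) gives ['ABC', 'BCD', 'CDE'] (works for strings
# and Sequence objects alike, since both support len and slicing).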
def _onehotIndex(alpha, sym):
""" Create array with "one-hot" bit codes (only adding "ones" to an all-"zero" array) """
symlen = len(sym)
alphalen = len(alpha)
indices = [ alpha.index(sym[i]) + (i * alphalen) for i in range(symlen) ]
return indices
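# Example (illustrative): with a 4-symbol DNA alphabet (A, C, G, T) and the
# window 'AG', the indices are [0, 6]: A is bit 0 of the first block of 4,
# G is bit 2 of the second block (2 + 1*4 = 6).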
class SeqNN():
""" A neural net wrapper for multinomial classification of sequence input """
def __init__(self, inp_len, inp_alpha, outp_alpha, nhidden, cascade = 0):
""" Construct a neural net with numeric inputs and outputs
depending on alphabets used for inputs and outputs.
inp_len: number of symbols to use as input
inp_alpha: input alphabet
outp_alpha: output alphabet (defines number of classes)
nhidden: number of "hidden" nodes in the net
cascade: if non-zero, number of positions to feed into a cascaded structure-to-structure NN (also the number of hidden nodes of this NN)
"""
self.nn1 = ml.NN(inp_len * len(inp_alpha), nhidden, len(outp_alpha)) # neural net
self.nn2 = None
self.cascade = cascade
if cascade > 0:
self.nn2 = ml.NN(cascade * len(outp_alpha), cascade, len(outp_alpha)) # cascaded neural net
self.inp_len = inp_len
self.inp_alpha = inp_alpha
self.outp_alpha = outp_alpha
def _encodeseq(self, seqs, targets = None):
""" Convert a list of sequences into numeric input suitable as input to NN. """
try:
len(seqs[0]) # if this does not throw error, it is a multi-input already
except TypeError: # a single sequence (and target) was given; wrap in lists
seqs = [ seqs ]
targets = [ targets ] if targets else None
totlen = 0
alpha = None
for seq in seqs:
if not alpha:
alpha = seq.alphabet
totlen += len(seq) - self.inp_len + 1
im = numpy.zeros((totlen, self.inp_len * len(alpha)))
if targets:
om = numpy.zeros((totlen, len(self.outp_alpha)))
row = 0
for i in range(len(seqs)):
subseqs = slidewin(seqs[i], self.inp_len)
if targets:
# Note how we remove the targets at the ends of the sequence
subtarg = targets[i][self.inp_len/2:-self.inp_len/2+1]
for k in range(len(subseqs)):
im[row, _onehotIndex(alpha, subseqs[k])] = 1
if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
row += 1
print "There are", row, "entries in data set"
if targets:
return im, om
else:
return im, None
def observeAll(self, seqs, targets, eta = 0.1, niter = 1):
""" Train a classifier to map from all possible windows to the target symbols.
Decompose each sequence to all full-width sub-sequences. Map each sub-sequence
to the target symbol for the symbol in the centre of the sub-sequence. """
assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
im, om = self._encodeseq(seqs, targets)
for i in range(niter): # train first NN
rmse = self.nn1.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
if not self.cascade: # if there's no cascaded NN, finish here
return rmse
nn1seqs = [] # a list of new SS sequences ...
for seq in seqs: # ... based on AA sequences
nn1seq = self.predict(seq, useCascade = False) # construct a new sequence which consists of SS predictions
nn1seqs.append(nn1seq)
im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
for i in range(niter): # train cascaded NN
rmse = self.nn2.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
return rmse
def testAll(self, seqs, targets):
""" Test the neural network on the specified sequences and target sequences.
Returns a confusion matrix with the predictions. """
assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
if not self.cascade:
im, om = self._encodeseq(seqs, targets)
cm = self.nn1.test(im, om)
return cm
else:
nn1seqs = []
for seq in seqs:
nn1seq = self.predict(seq, useCascade = False)
nn1seqs.append(nn1seq)
im, om = self._encodeseq(nn1seqs, targets)
cm = self.nn2.test(im, om)
return cm
def predict(self, inpseq, useCascade = True):
""" Classify each symbol in a sequence.
Return the predictions as a list of symbols. """
W = self.nn1.ninput / len(self.inp_alpha)
if useCascade and self.cascade:
nn1seq = self.predict(inpseq, useCascade = False)
subseqs = slidewin(nn1seq, self.cascade)
predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
for i in range(len(subseqs)): # for each input sub-sequence of the primary NN
invec = numpy.zeros(self.cascade * len(self.outp_alpha)) # avoid shadowing the built-in input
invec[_onehotIndex(self.outp_alpha, subseqs[i])] = 1
outvec = self.nn2.feedforward(invec)
d = prob.Distrib(self.outp_alpha)
for k in range(len(outvec)):
d.observe(self.outp_alpha[k], outvec[k])
predsyms[i + self.cascade / 2] = d.getmax() # use the symbol with the highest probability
return sequence.Sequence(predsyms, self.outp_alpha)
else: # only predict using the first NN
subseqs = slidewin(inpseq, W)
predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
for i in range(len(subseqs)): # for each input sub-sequence of the primary NN
invec = numpy.zeros(self.inp_len * len(self.inp_alpha)) # avoid shadowing the built-in input
invec[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
outvec = self.nn1.feedforward(invec)
d = prob.Distrib(self.outp_alpha)
for k in range(len(outvec)):
d.observe(self.outp_alpha[k], outvec[k])
predsyms[i + W / 2] = d.getmax() # use the symbol with the highest probability
return sequence.Sequence(predsyms, self.outp_alpha)
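# Usage sketch (illustrative; alphabets as in the sstruct module below;
# train_seqs/train_targets are hypothetical, paired lists of Sequence objects):
# >>> nn = SeqNN(inp_len = 7, inp_alpha = sym.Protein_Alphabet,
# ... outp_alpha = sym.DSSP3_Alphabet, nhidden = 30)
# >>> rmse = nn.observeAll(train_seqs, train_targets, eta = 0.1, niter = 20)
# >>> pred = nn.predict(test_seqs[0])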
'''
Module sstruct -- methods for protein secondary structure
'''
import sequence
import sym
cf_dict = { # Chou-Fasman table
# P(a), P(b), P(t), f(i), f(i+1), f(i+2), f(i+3)
'A': ( 142, 83, 66, 0.060, 0.076, 0.035, 0.058 ), # Alanine
'R': ( 98, 93, 95, 0.070, 0.106, 0.099, 0.085 ), # Arginine
'N': ( 67, 89, 156, 0.161, 0.083, 0.191, 0.091 ), # Asparagine
'D': ( 101, 54, 146, 0.147, 0.110, 0.179, 0.081 ), # Aspartic Acid
'C': ( 70, 119, 119, 0.149, 0.050, 0.117, 0.128 ), # Cysteine
'E': ( 151, 37, 74, 0.056, 0.060, 0.077, 0.064 ), # Glutamic Acid
'Q': ( 111, 110, 98, 0.074, 0.098, 0.037, 0.098 ), # Glutamine
'G': ( 57, 75, 156, 0.102, 0.085, 0.190, 0.152 ), # Glycine
'H': ( 100, 87, 95, 0.140, 0.047, 0.093, 0.054 ), # Histidine
'I': ( 108, 160, 47, 0.043, 0.034, 0.013, 0.056 ), # Isoleucine
'L': ( 121, 130, 59, 0.061, 0.025, 0.036, 0.070 ), # Leucine
'K': ( 114, 74, 101, 0.055, 0.115, 0.072, 0.095 ), # Lysine
'M': ( 145, 105, 60, 0.068, 0.082, 0.014, 0.055 ), # Methionine
'F': ( 113, 138, 60, 0.059, 0.041, 0.065, 0.065 ), # Phenylalanine
'P': ( 57, 55, 152, 0.102, 0.301, 0.034, 0.068 ), # Proline
'S': ( 77, 75, 143, 0.120, 0.139, 0.125, 0.106 ), # Serine
'T': ( 83, 119, 96, 0.086, 0.108, 0.065, 0.079 ), # Threonine
'W': ( 108, 137, 96, 0.077, 0.013, 0.064, 0.167 ), # Tryptophan
'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 )} # Valine
prot_alpha = sym.Protein_Alphabet
sstr_alpha = sym.DSSP3_Alphabet
def makesstr(seq, sym = '*', gap = '-'):
""" Create a string from a list of booleans (seq) that indicate with sym what elements are true.
gap is used for elements that are false.
"""
sstr = ''
for yes in seq:
if yes:
sstr += sym
else:
sstr += gap
return sstr
def markCountAbove(scores, width = 6, call_cnt = 4):
""" Create a list of booleans that mark all positions within a window
of specified width that have scores above 100.
scores: a list of scores (one for each position in sequence)
width: width of window
call_cnt: required number of positions within the window with score above 100
return: list of "calls" (positions in windows with at least call_cnt)
"""
above = [False for _ in range(len(scores))]
cnt = 0 # keep track of how many in the current window that are > 100
for i in range(len(scores)):
if scores[i] > 100: cnt += 1
if i >= width:
if scores[i - width] > 100: cnt -= 1
if cnt >= call_cnt:
for j in range(max(0, i - width + 1), i + 1):
above[j] = True
return above
def markAvgAbove(scores, width = 4, call_avg = 100.0):
""" Create a list of booleans that mark all positions within a window of specified width
that have an average score above specified call_avg.
"""
above = [False for _ in range(len(scores))]
sum = 0.0 # running total over the current window
for i in range(len(scores)):
sum += scores[i]
if i >= width: # window is full; drop the score leaving the window
sum -= scores[i - width]
if sum >= call_avg * width:
for j in range(max(0, i - width + 1), i + 1):
above[j] = True
return above
def extendDownstream(scores, calls, width = 4):
""" Create a list of booleans that mark all positions that are contained
in supplied calls list AND extend this list downstream containing a
specified width average of 100.
"""
sum = 0.0
order = range(0, len(calls) - 1, +1) # we are extending calls downstream
cnt = 0
for i in order: # extend to the right
if calls[i]: # to extend a call is required in the first place
cnt += 1
sum += scores[i] # keep a sum to be able to average
if cnt >= width: # only average over a width
sum -= scores[i - width + 1]
if not calls[i + 1] and sum + scores[i + 1] > width * 100: # check
calls[i + 1] = True
else: # no call, reset sum
cnt = 0
sum = 0.0
return calls
def extendUpstream(scores, calls, width = 4):
""" Create a list of booleans that mark all positions that are contained in supplied calls list
AND extend this list upstream containing a specified width average of 100.
"""
sum = 0.0
order = range(len(calls) - 1, 0, -1) # we are extending calls upstream/to-the-left
cnt = 0
for i in order: # extend to the left
if calls[i]: # a requirement to extend is to have a call in the first place
cnt += 1
sum += scores[i] # keep a sum to be able to average
if cnt >= width: # only average over a width
sum -= scores[i + width - 1]
if not calls[i - 1] and sum + scores[i - 1] > width * 100: # check average
calls[i - 1] = True
else: # no call, reset sum
cnt = 0
sum = 0.0
return calls
def calcRegionAverage(scores, calls):
""" Determine for each position in a calls list the average score over the region
in which it is contained.
"""
region_avg = []
sum = 0.0
cnt = 0
# First determine the average for each region
for i in range(len(scores)): # go through each position
if calls[i]: # position is part of a "called" region
sum += scores[i] # add the score of that position to the average
cnt += 1 # keep track of the number of positions in the region
else: # we are outside a "called" region
if cnt > 0: # if it is the first AFTER a called region
region_avg.append(sum/cnt) # save the average
sum = 0.0 # reset average
cnt = 0
if cnt > 0: # flush a called region that runs to the end of the sequence
region_avg.append(sum/cnt) # save the average
# with all averages known, we'll populate the sequence of "averages"
region = 0
pos_avg = []
cnt = 0
for i in range(len(scores)):
if calls[i]:
pos_avg.append(region_avg[region])
cnt += 1
else:
pos_avg.append(0)
if cnt > 0:
region += 1
cnt = 0
return pos_avg
def checkSupport(calls, diff):
""" Create a list of booleans indicating if each true position is supported
by a positive score """
supported = []
for i in range(len(calls)): # go through each position
supported.append(calls[i] and diff[i] > 0)
return supported
def getScores(seq, index = 0):
""" Create a score list for a sequence by referencing the Chou-Fasman table.
"""
return [cf_dict[s.upper()][index] for s in seq]
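# Pipeline sketch (illustrative) for calling helix regions with the helpers above;
# myprotein is a hypothetical protein Sequence (or string of amino acids):
# >>> scores = getScores(myprotein, 0) # P(a) column
# >>> calls = markCountAbove(scores, width = 6, call_cnt = 4)
# >>> calls = extendDownstream(scores, calls, width = 4)
# >>> calls = extendUpstream(scores, calls, width = 4)
# >>> print makesstr(calls, 'H')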
import math
'''
Module with methods for doing some statistics.
'''
# Fisher's Exact Test
def getFETpval(a1, a2, b1, b2, left=True):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the p-value of b's outcomes 1 and 2.
The default setting is to use the "left" side of the density
to determine the p-value.
Returns p-value."""
(prob, sless, sright, sleft, slarg)=getFETprob(a1, a2, b1, b2)
if left:
return sless
else:
return slarg
def getFET2tail(a1, a2, b1, b2):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the two-tailed p-value of b's outcomes 1 and 2.
Returns p-value."""
(prob, sless, sright, sleft, slarg)=getFETprob(a1, a2, b1, b2)
return min(1.0, sleft + sright)
def getFETprob(a1, a2, b1, b2):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the probability of b's outcomes 1 and 2.
Returns an immutable list consisting of the exact
probability, and assorted p-values (sless, sright, sleft,
slarg) based on the density."""
sless = 0.0
sright = 0.0
sleft = 0.0
slarg = 0.0
n = a1 + a2 + b1 + b2
row1 = a1 + a2 # the row containing the null hypothesis
col1 = a1 + b1 # the column containing samples for outcome 1
maxval = row1 # avoid shadowing the built-in max
if col1 < maxval:
maxval = col1
minval = row1 + col1 - n # avoid shadowing the built-in min
if minval < 0:
minval = 0
if minval == maxval:
rt = (prob, sless, sright, sleft, slarg) = (1.0, 1.0, 1.0, 1.0, 1.0)
return rt
prob = hyper0(a1, row1, col1, n)
sleft = 0.0
p = hyper(minval)
i = minval + 1
while p < (0.99999999 * prob):
sleft = sleft + p
p = hyper(i)
i = i + 1
i = i - 1
if p < (1.00000001 * prob):
sleft = sleft + p
else:
i = i - 1
sright = 0.0
p = hyper(maxval)
j = maxval - 1
while p < (0.99999999 * prob):
sright = sright + p
p = hyper(j)
j = j - 1
j = j + 1
if p < (1.00000001 * prob):
sright = sright + p
else:
j = j + 1
if abs(i - a1) < abs(j - a1):
sless = sleft
slarg = 1.0 - sleft + prob
else:
sless = 1.0 - sright + prob
slarg = sright
return (prob, sless, sright, sleft, slarg)
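# Usage sketch (illustrative): for the 2x2 contingency table [[a1, a2], [b1, b2]],
# >>> pval = getFETpval(8, 2, 1, 5) # left-tail p-value
# >>> pval2 = getFET2tail(8, 2, 1, 5) # two-tailed p-value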
def lngamm(z):
# Reference: "Lanczos, C. 'A precision approximation
# of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
# Translation of Alan Miller's FORTRAN-implementation
# See http://lib.stat.cmu.edu/apstat/245
x = 0.0
x = x + 0.1659470187408462e-06/(z+7.0)
x = x + 0.9934937113930748e-05/(z+6.0)
x = x - 0.1385710331296526 /(z+5.0)
x = x + 12.50734324009056 /(z+4.0)
x = x - 176.6150291498386 /(z+3.0)
x = x + 771.3234287757674 /(z+2.0)
x = x - 1259.139216722289 /(z+1.0)
x = x + 676.5203681218835 /(z)
x = x + 0.9999999999995183
return math.log(x)-5.58106146679532777-z+(z-0.5)*math.log(z+6.5)
def lnfact(n):
if n<=1:
return 0.0
return lngamm(n+1.0)
def lnbico(n, k):
return lnfact(n)-lnfact(k)-lnfact(n-k)
def hyper_323(n11, n1_, n_1, n):
return math.exp(lnbico(n1_,n11)+lnbico(n-n1_,n_1-n11)-lnbico(n,n_1))
(_sn11, _sn1_, _sn_1, _sn, _sprob) = (0,0,0,0,0.0) # global variables used by hyper0
def hyper0(n11i, n1_i, n_1i, ni):
""" Hypergeometric probability, memoised via the module-level globals above.
Call with all four table margins to (re)initialise; subsequent calls with
only n11i (the other arguments 0) reuse the stored margins, updating
incrementally from the previous value where possible. """
global _sn11, _sn1_, _sn_1, _sn, _sprob
if not ((n1_i | n_1i | ni) != 0): # only n11i given: reuse the stored margins
if not (n11i % 10 == 0):
if n11i == _sn11 + 1: # one step up from the last n11: incremental update
_sprob = _sprob * ((_sn1_-_sn11)/float(n11i))*((_sn_1-_sn11)/float(n11i+_sn-_sn1_-_sn_1))
_sn11 = n11i
return _sprob
if n11i == _sn11 - 1: # one step down from the last n11: incremental update
_sprob = _sprob * ((_sn11)/float(_sn1_-n11i))*((_sn11+_sn-_sn1_-_sn_1)/float(_sn_1-n11i))
_sn11 = n11i
return _sprob
_sn11 = n11i
else: # all margins given: store them
_sn11 = n11i
_sn1_ = n1_i
_sn_1 = n_1i
_sn = ni
_sprob = hyper_323(_sn11, _sn1_, _sn_1, _sn) # compute from scratch
return _sprob
def hyper(n11):
return hyper0(n11,0,0,0)
def mean(X):
""" The arithmetic mean of the sample. """
total = 0.0 # accumulate as float so integer input does not truncate the result
for x in X:
total += x
return total / len(X)
def meanvar(X):
""" The mean and variance of the sample. """
mu = mean(X)
dev = 0
for x in X:
dev += (x - mu) * (x - mu)
return (mu, dev / len(X))
def getZScore(X, sample):
(mu, var) = meanvar(X)
return (sample - mu) / math.sqrt(var)
def getZScores(X):
(mu, var) = meanvar(X)
Y = [((x - mu) / math.sqrt(var)) for x in X]
return Y
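# Worked example: for X = [1.0, 2.0, 3.0], mean = 2.0 and variance = 2/3,
# so getZScores(X) gives approximately [-1.2247, 0.0, 1.2247].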
def getPearson(X, Y):
""" Pearson correlation coefficient (r). Note that we are using the standard deviation of the sample, NOT the sample standard deviation (see http://en.wikipedia.org/wiki/Standard_deviation).
"""
(Xmu, Xvar) = meanvar(X)
(Ymu, Yvar) = meanvar(Y)
if len(X) != len(Y):
raise RuntimeError('Vectors are of uneven length')
n = len(X)
sum = 0
for i in range(n):
sum += (X[i] * Y[i])
if n == 0 or Xvar == 0 or Yvar == 0:
return 0
return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
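# Worked example: getPearson([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) returns 1.0,
# since the second vector is an exact positive scaling of the first.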
# normal distribution
def error(x):
"""
Error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
xsq = 0.0
s = 0.0
p = 0.0
q = 0.0
s = +1
if x<0:
s = -1
x = abs(x)
if x<0.5:
xsq = x*x
p = 0.007547728033418631287834
p = 0.288805137207594084924010+xsq*p
p = 14.3383842191748205576712+xsq*p
p = 38.0140318123903008244444+xsq*p
p = 3017.82788536507577809226+xsq*p
p = 7404.07142710151470082064+xsq*p
p = 80437.3630960840172832162+xsq*p
q = 0.0
q = 1.00000000000000000000000+xsq*q
q = 38.0190713951939403753468+xsq*q
q = 658.070155459240506326937+xsq*q
q = 6379.60017324428279487120+xsq*q
q = 34216.5257924628539769006+xsq*q
q = 80437.3630960840172826266+xsq*q
result = s*1.1283791670955125738961589031*x*p/q
return result
elif x>=10:
result = s
return result
result = s*(1-errorComplement(x))
return result
def errorComplement(x):
"""
Complementary error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
p = 0.0
q = 0.0
if x<0.0:
result = 2.0-errorComplement(-x)
return result
elif x<0.5:
result = 1.0-errorComplement(x)
return result
elif x>=10:
result = 0
return result
p = 0.0
p = 0.5641877825507397413087057563+x*p
p = 9.675807882987265400604202961+x*p
p = 77.08161730368428609781633646+x*p
p = 368.5196154710010637133875746+x*p
p = 1143.262070703886173606073338+x*p
p = 2320.439590251635247384768711+x*p
p = 2898.0293292167655611275846+x*p
p = 1826.3348842295112592168999+x*p
q = 1.0
q = 17.14980943627607849376131193+x*q
q = 137.1255960500622202878443578+x*q
q = 661.7361207107653469211984771+x*q
q = 2094.384367789539593790281779+x*q
q = 4429.612803883682726711528526+x*q
q = 6089.5424232724435504633068+x*q
q = 4958.82756472114071495438422+x*q
q = 1826.3348842295112595576438+x*q
result = math.exp(-(x*x))*p/q
return result
def f(x):
"""
Normal distribution function
Returns the area under the Gaussian probability density
function, integrated from minus infinity to x
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
result = 0.5*(error(x/1.41421356237309504880)+1)
return result
def inverseError(e):
"""
Inverse of the error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
result = inverse(0.5*(e+1))/math.sqrt(2)
return result
def inverse(y0):
"""
Inverse of Normal distribution function
Returns the argument, x, for which the area under the
Gaussian probability density function (integrated from
minus infinity to x) is equal to y.
For small arguments 0 < y < exp(-2), the program computes
z = sqrt( -2.0 * log(y) ); then the approximation is
x = z - log(z)/z - (1/z) P(1/z) / Q(1/z).
There are two rational functions P/Q, one for 0 < y < exp(-32)
and the other for y up to exp(-2). For larger arguments,
w = y - 0.5, and x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
expm2 = 0.0
s2pi = 0.0
x = 0.0
y = 0.0
z = 0.0
y2 = 0.0
x0 = 0.0
x1 = 0.0
code = 0 # int
p0 = 0.0
q0 = 0.0
p1 = 0.0
q1 = 0.0
p2 = 0.0
q2 = 0.0
MAX_VALUE = 1.e23
expm2 = 0.13533528323661269189
s2pi = 2.50662827463100050242
if y0<=0:
result = -MAX_VALUE
return result
elif y0>=1:
result = MAX_VALUE
return result
code = 1
y = y0
if y>1.0-expm2:
y = 1.0-y
code = 0
if y>expm2:
y = y-0.5
y2 = y*y
p0 = -59.9633501014107895267
p0 = 98.0010754185999661536+y2*p0
p0 = -56.6762857469070293439+y2*p0
p0 = 13.9312609387279679503+y2*p0
p0 = -1.23916583867381258016+y2*p0
q0 = 1.0
q0 = 1.95448858338141759834+y2*q0
q0 = 4.67627912898881538453+y2*q0
q0 = 86.3602421390890590575+y2*q0
q0 = -225.462687854119370527+y2*q0
q0 = 200.260212380060660359+y2*q0
q0 = -82.0372256168333339912+y2*q0
q0 = 15.9056225126211695515+y2*q0
q0 = -1.18331621121330003142+y2*q0
x = y+y*y2*p0/q0
x = x*s2pi
result = x
return result
x = math.sqrt(-(2.0*math.log(y)))
x0 = x-math.log(x)/x
z = 1.0/x
if x<8.0:
p1 = 4.05544892305962419923
p1 = 31.5251094599893866154+z*p1
p1 = 57.1628192246421288162+z*p1
p1 = 44.0805073893200834700+z*p1
p1 = 14.6849561928858024014+z*p1
p1 = 2.18663306850790267539+z*p1
p1 = -(1.40256079171354495875*0.1)+z*p1
p1 = -(3.50424626827848203418*0.01)+z*p1
p1 = -(8.57456785154685413611*0.0001)+z*p1
q1 = 1.0
q1 = 15.7799883256466749731+z*q1
q1 = 45.3907635128879210584+z*q1
q1 = 41.3172038254672030440+z*q1
q1 = 15.0425385692907503408+z*q1
q1 = 2.50464946208309415979+z*q1
q1 = -(1.42182922854787788574*0.1)+z*q1
q1 = -(3.80806407691578277194*0.01)+z*q1
q1 = -(9.33259480895457427372*0.0001)+z*q1
x1 = z*p1/q1
else:
p2 = 3.23774891776946035970
p2 = 6.91522889068984211695+z*p2
p2 = 3.93881025292474443415+z*p2
p2 = 1.33303460815807542389+z*p2
p2 = 2.01485389549179081538*0.1+z*p2
p2 = 1.23716634817820021358*0.01+z*p2
p2 = 3.01581553508235416007*0.0001+z*p2
p2 = 2.65806974686737550832*0.000001+z*p2
p2 = 6.23974539184983293730*0.000000001+z*p2
q2 = 1.0
q2 = 6.02427039364742014255+z*q2
q2 = 3.67983563856160859403+z*q2
q2 = 1.37702099489081330271+z*q2
q2 = 2.16236993594496635890*0.1+z*q2
q2 = 1.34204006088543189037*0.01+z*q2
q2 = 3.28014464682127739104*0.0001+z*q2
q2 = 2.89247864745380683936*0.000001+z*q2
q2 = 6.79019408009981274425*0.000000001+z*q2
x1 = z*p2/q2
x = x0-x1
if code!=0:
x = -x
result = x
return result
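# A round-trip check (a sketch; the helper name is illustrative): inverse() should
# invert the Normal CDF f() to well within the tolerance used here, since the
# rational approximations above are accurate to many more digits.
def _example_inverse():
    for y in [0.001, 0.1, 0.5, 0.9, 0.999]:
        assert abs(f(inverse(y)) - y) < 1e-6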
def getRSpval(a, b):
"""
Compute the Wilcoxon rank sum test (aka the Mann-Whitney U-test), return the p-value
The approximation is based on the normal distribution and is reliable
when sample sets are of size 5 or larger.
The default is based on the area of the left side of the Gaussian, relative to the
estimated z-value.
NULL hypothesis: a==b. ONE-SIDED alternative: a<b (default, left tail); for the ONE-SIDED alternative b<a (right tail) use 1 minus the returned value.
For a two-tailed test, double the p-value.
Implemented by Mikael Boden
"""
# create a new list consisting of the two sample sets that can be sorted
lst=[]
for elem in a:
lst.append([elem, +1, 0])
for elem in b:
lst.append([elem, -1, 0])
# ok sort it
lst.sort(key=lambda p: p[0]) # sort by the measurement only
# let's go through it and edit each rank
rank=0
na=0
nb=0 # the number of points in each set (A & B)
same=[] # a dynamic list to keep track of elements with same measurement
measurement=lst[0][0]
for row in lst:
if row[1]==+1: # belongs to class 'a'
na=na+1
else:
nb=nb+1
if (measurement!=row[0]): # here's an entry that differed from the previous...
# before moving on to handling the new element we need to sort out the "old" same list
firstInGroup=rank+1-len(same)
lastInGroup=rank
average=float(lastInGroup-firstInGroup)/2.0
for srow in same:
srow[2]=firstInGroup+average
same=[]
measurement=row[0]
same.append(row)
rank=rank+1
# the last batch of entries is handled outside the loop...
firstInGroup=rank+1-len(same)
lastInGroup=rank
average=float(lastInGroup-firstInGroup)/2.0
for srow in same:
srow[2]=firstInGroup+average
n=na+nb # the total number of measurements
ta_obs=0 # sum of na ranks in group A
tb_obs=0 # sum of nb ranks in group B
# sum the ranks (replace the measurements)
for entry in lst:
if entry[1]==+1: # class 'a'
ta_obs+=entry[2]
else:
tb_obs+=entry[2]
tab=ta_obs+tb_obs # sum of n ranks in groups A and B combined
sd=math.sqrt((na*nb*(n+1.0))/12.0) # the standard deviation is the same in both sets
ta_null=na*(n+1.0)/2.0 # the sum of the "null" case
tb_null=nb*(n+1.0)/2.0 # the sum of the "null" case
ta_max=na*nb+(na*(na+1.0))/2.0 # the max sum set A can take
tb_max=na*nb+(nb*(nb+1.0))/2.0 # the max sum set B can take
ua=ta_max-ta_obs # the "U" value for A which is the mirror of ...
ub=tb_max-tb_obs # the "U" value for B (we only need one)
ua_null=ta_max-ta_null # the U value for the null case
ub_null=tb_max-tb_null
if ta_obs>ta_null: # a "continuity correction" for A
da=-0.5
else:
da=+0.5
if tb_obs>tb_null: # a "continuity correction" for B
db=-0.5
else:
db=+0.5
za=((ta_obs-ta_null)+da)/sd # the z value for A which is the mirror of ...
zb=((tb_obs-tb_null)+db)/sd # the z value for B (we only need one)
p=f(za) # figure out the area of the normal distribution
u = ua # remember one of the U values (not used below, kept for reference)
return p # the p-value: null is that a==b, one-sided (a has lower values)
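# Example usage (a sketch; the data are made up). With five or more values per
# sample the normal approximation used above is considered reliable; the returned
# p-value tests the one-sided alternative that a has lower values than b.
def _example_getRSpval():
    a = [1.2, 2.3, 2.5, 3.1, 3.3]
    b = [2.9, 3.5, 4.0, 4.4, 5.1]
    p = getRSpval(a, b)
    print 'one-sided p = %.4f (two-tailed: %.4f)' % (p, 2 * p)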
def getPointBiserialCorr(group1, group2):
"""
The point biserial correlation coefficient (rpb) is a correlation coefficient used when one variable (e.g. Y) is dichotomous,
with continuous data divided into two groups (group1 and group2 here).
group1 corresponds to "greater", group2 to "lesser", i.e. 1 and 0 respectively.
See https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
"""
n1 = len(group1)
n0 = len(group2)
if n1 < 1 or n0 < 1:
raise RuntimeError('At least one group is empty')
n = n1 + n0
M1 = sum(group1) / float(n1)
M0 = sum(group2) / float(n0)
M = (M1 * n1 + M0 * n0) / float(n)
combined = [] # pool both groups (avoid shadowing the built-in all())
combined.extend(group1)
combined.extend(group2)
sn = math.sqrt(sum([(x_i - M)**2 for x_i in combined]) / float(n))
return (M1 - M0) / sn * math.sqrt((n1 * n0) / float(n**2))
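# Example usage (a sketch; the values are made up): rpb near +1 indicates that
# group1 scores are consistently higher than group2 scores.
def _example_getPointBiserialCorr():
    rpb = getPointBiserialCorr([4.0, 5.0, 6.0], [1.0, 2.0, 3.0])
    print 'point-biserial r = %.3f' % rpb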
"""
Module symbol is for defining alphabets (of symbols), and
for storing and operating on symbols and tuples (ordered or
unordered).
"""
import os
# ------------------ Alphabet ------------------
class Alphabet(object):
""" Defines an immutable biological alphabet (e.g. the alphabet for DNA is AGCT)
that can be used to create sequences (see sequence.py).
We use alphabets to define "tuple" tables, where entries are keyed by combinations
of symbols of an alphabet (see class TupleStore below).
Alphabets are used to define probability distributions for stochastic events
(see prob.py). """
def __init__(self, symbolString):
""" Construct an alphabet from a string of symbols. Lower case characters
will be converted to upper case, repeated characters are ignored.
Example of constructing the DNA alphabet:
>>> alpha = Alphabet('ACGTttga')
>>> alpha.symbols
('A', 'C', 'G', 'T') """
# Add each symbol to the symbols list, one at a time, and ignore doubles (could use "set" here...)
_symbols = [] # create a temporary list
for s in symbolString:
if not str(s).upper()[0] in _symbols:
_symbols.append(str(s).upper()[0])
_symbols.sort() # we put them in alphabetical (one canonical) order
# OK done extracting, put them in place
self.symbols = tuple(_symbols) # create the immutable tuple from the extracted list
self.length = len(self.symbols)
self.annotations = {}
def __str__(self):
return str(self.symbols)
def __len__(self):
return len(self.symbols)
def __iter__(self):
return self.symbols.__iter__()
def __getitem__(self, ndx):
""" Retrieve the symbol(s) at the specified index (or slice of indices) """
return self.symbols[ndx]
def __contains__(self, sym):
""" Check if the given symbol is a member of the alphabet. """
return sym in self.symbols
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
# If the symbol is valid, use the tuple's index function
if sym in self.symbols:
syms = self.symbols
return syms.index(sym)
else:
raise RuntimeError('Symbol %s is not indexed by alphabet %s' % (sym, str(self.symbols)))
def __eq__(self, rhs):
""" Test if the rhs alphabet is equal to ours. """
if rhs == None:
return False
if len(rhs) != len(self):
return False
# OK we know they're same size...
for sym in self.symbols:
if not sym in rhs:
return False
return True
def isSubsetOf(self, alpha2):
""" Test if this alphabet is a subset of alpha2. """
for sym in self.symbols:
if not sym in alpha2: # membership test via Alphabet.__contains__ (there is no isValidSymbol method)
return False
return True
def isSupersetOf(self, alpha2):
""" Test if this alphabet is a superset of alpha2. """
return alpha2.isSubsetOf(self)
def annotateSym(self, label, sym, value):
try:
lookup = self.annotations[label]
except KeyError:
lookup = self.annotations[label] = {}
lookup[sym] = value
def annotateAll(self, label, symdictOrFilename):
if isinstance(symdictOrFilename, str): # we assume it is a filename
fh = open(symdictOrFilename)
string = fh.read()
d = {}
for line in string.splitlines():
if len(line.strip()) == 0:
continue
sections = line.split()
symstr, value = sections[0:2]
for sym in symstr:
d[sym] = value
fh.close()
else: # we assume it is a dictionary
d = symdictOrFilename
for sym in d:
self.annotateSym(label, sym, d[sym])
def getAnnotation(self, label, sym):
try:
lookup = self.annotations[label]
return lookup[sym]
except KeyError:
return None
""" Below we declare alphabets that are going to be available when
this module is imported """
Bool_Alphabet = Alphabet('TF')
DNA_Alphabet = Alphabet('ACGT')
DNA_Alphabet_wN = Alphabet('ACGTN')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Protein_Alphabet_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC')
predefAlphabets = {'DNA': DNA_Alphabet,
'RNA': RNA_Alphabet,
'DNAwN': Alphabet('ACGTN'),
'RNAwN': Alphabet('ACGUN'),
'Protein': Protein_Alphabet,
'ProteinwX': Protein_wX}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder = ['DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX']
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
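# Example usage (a sketch; the helper name is illustrative): symbol lookup and
# annotation retrieval on the predefined DNA alphabet.
def _example_alphabet():
    print DNA_Alphabet.index('G')                        # 2
    print DNA_Alphabet.getAnnotation('html-color', 'T')  # '#66bbff'
    print 'N' in DNA_Alphabet                            # False (use DNA_Alphabet_wN for that)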
# ------------------ Substitution Matrix ------------------
class TupleStore(dict):
""" Internal utility class that can be used for associating
a value with ordered n-tuples (n=1..N).
Read/write functions are defined for instances of this class.
"""
def __init__(self, alphas=None, entries=None, sparse=True):
"""
Manage entries keyed by symbol-tuples with values of arbitrary type.
If alphas is None, the alphabet(s) are inferred from the provided entries.
If entries is None, all entries are defined by possible combinations of symbols from specified alphabets,
and are assumed to be None until specified. Either alphas or entries must be supplied.
If sparse is True, a sparse memory-saving encoding is used, if false, a time-saving, more flexible encoding is used.
>>> matrix = TupleStore(entries = {'AA': 2, 'AW': -3, 'WW': 4, 'AR': -1})
>>> matrix['AW']
-3
>>> matrix['AR']
-1
"""
assert sparse, "Currently only sparse encoding is implemented."
assert alphas or entries, "Either alphabets or entries (from which alphabets can be inferred) must be supplied."
self.sparse = sparse # sparse encoding if true
if alphas == None:
self.alphas = None # need to figure out alphabet from supplied entries
self.keylen = None # tuple length not known yet
elif type(alphas) is Alphabet:
self.alphas = tuple ([ alphas ]) # make it into a tuple
self.keylen = 1 # tuple length 1
else:
self.alphas = alphas # alphabets are supplied
self.keylen = len(alphas) # length of tuples is the same as the number of alphabets
# Check if entries are supplied to the constructor
if entries == None:
self.entries = entries = {}
elif type(entries) is not dict:
raise RuntimeError("When specified, entries must be a dictionary")
else:
self.entries = {} # supplied entries are validated and copied in below
# Check length of tuples, must be the same for all
for entry in entries:
if self.keylen == None:
self.keylen = len(entry)
elif self.keylen != len(entry):
raise RuntimeError("All entries must have the same number of symbols")
# go through each position in tuples, to check what alphabet is right
myalphas = [] # my suggestions from entries (need to be subsets of specified)
for idx in range(self.keylen):
symset = set() # we collect all symbols in position idx here
for key in entries:
symset.add(key[idx])
myalpha = Alphabet(symset)
myalphas.append(myalpha)
if self.alphas != None: # if specified it needs to be a superset of that we constructed
if not self.alphas[idx].isSupersetOf(myalpha):
raise RuntimeError("Specified alphabet is not compatible with specified entries")
if self.alphas == None: # if not specified to constructor use those we found
self.alphas = tuple(myalphas)
for key in entries:
self[key] = entries[key]
def _isValid(self, symkey):
for idx in range(self.keylen):
if not symkey[idx] in self.alphas[idx]:
return False
return True
def __setitem__(self, symkey, value):
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
self.entries[symkey] = value
def __getitem__(self, symkey):
""" Return the score matching the given symbols together."""
assert self.keylen == len(symkey), "Entries must be of the same length"
try:
return self.entries[symkey]
except KeyError:
return None
def __iadd__(self, symkey, ivalue):
""" Add ivalue to the entry keyed by symkey. Note: takes two arguments, so it
must be called explicitly (the += operator cannot supply both). """
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
try:
self.entries[symkey] += ivalue
except KeyError:
self.entries[symkey] = ivalue
def __isub__(self, symkey, ivalue):
""" Subtract ivalue from the entry keyed by symkey; called explicitly, like __iadd__ above. """
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
try:
self.entries[symkey] -= ivalue
except KeyError:
self.entries[symkey] = -ivalue
def getAll(self, symkey=None):
""" Return the values matching the given symbols together.
symkey: tuple (or list) of symbols or None (symcount symbol); if tuple is None, all entries are iterated over.
"""
if symkey == None:
symkey = []
for idx in range(self.keylen):
symkey.append(None)
else:
assert self.keylen == len(symkey), "Entries must be of the same length"
for idx in range(self.keylen):
if symkey[idx] != None:
if not symkey[idx] in self.alphas[idx]:
raise RuntimeError("Invalid entry: must be symbols from specified alphabet or None")
return TupleEntries(self, symkey)
def __iter__(self):
return TupleEntries(self, tuple([None for _ in range(self.keylen)]))
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of value.
Note that this function should NOT be used for big (>5 variables) tables."""
ret = []
for s in self.entries:
if self[s] != None:
ret.append((s, self[s]))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
class TupleEntries(object):
""" Iterator class for multiple entries in a tuple store.
"""
def __init__(self, tuplestore, symkey):
self.tuplestore = tuplestore
self.symkey = symkey
self.symcount = []
self.indices = []
for ndx in range(tuplestore.keylen):
if symkey[ndx] == None:
self.indices.append(ndx)
self.symcount.append(0) # start at this index to alter symbol
else:
self.symcount.append(None) # do not alter this symbol
self.nextIsLast = False
def __iter__(self):
return self
def next(self):
""" Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations."""
if self.nextIsLast:
raise StopIteration
mykey = [] # construct current combination from known and unspecified symbols
for ndx in range(self.tuplestore.keylen):
if (self.symkey[ndx] == None):
sym = self.tuplestore.alphas[ndx][self.symcount[ndx]]
mykey.append(sym)
else:
mykey.append(self.symkey[ndx])
# decide which ndx that should be increased (only one)
self.nextIsLast = True # assume this is the last round (all counters are re-set)
for ndx in self.indices:
if self.symcount[ndx] == len(self.tuplestore.alphas[ndx]) - 1: # if we just entered the last symbol of this alphabet
self.symcount[ndx] = 0 # reset count here
else:
self.symcount[ndx] = self.symcount[ndx] + 1
self.nextIsLast = False
break
return tuple(mykey)
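# Example usage (a sketch; the helper name is illustrative): getAll() with None as
# a wildcard iterates over every combination in that position. Iteration yields
# tuples, while entries set with string keys are retrieved with string keys, hence
# the join below; combinations without a value come back as None.
def _example_tuplestore():
    matrix = TupleStore(entries={'AA': 2, 'AW': -3, 'WW': 4, 'AR': -1})
    for key in matrix.getAll(('A', None)):   # first symbol fixed to 'A'
        print key, matrix[''.join(key)]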
import urllib, urllib2
import os
from time import sleep
import stats
from StringIO import StringIO
import gzip
""" This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
The class EBI takes precautions so as not to send too many requests when
performing BLAST and ClustalW queries.
See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
"""
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO/' # Use UQ mirror when available
__uniprotUrl__ = 'http://www.uniprot.org/' #
def fetch(entryId, dbName='uniprotkb', format='fasta'):
"""
Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
"""
# Construct URL
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry
try:
data = urllib2.urlopen(url).read()
if data.startswith('ERROR'):
raise RuntimeError(data)
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
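# Example usage (a sketch; performs a live HTTP request against EBI dbfetch, so
# it requires network access). P63166 is the mouse SUMO1 accession mentioned above.
def _example_fetch():
    print fetch('P63166', dbName='uniprotkb', format='fasta')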
def search(query, dbName='uniprot', format='list', limit=100):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
format: file format e.g. 'list', 'fasta' or 'txt'
limit: max number of results (specify None for all results)
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
if limit == None: # no limit to number of results returned
url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query
else:
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
if format == 'list':
return data.splitlines()
else:
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries
try:
data = urllib2.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list':
return words
elif format == 'fasta' and len(words) > 0:
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words:
url += w + ","
data = urllib2.urlopen(url).read()
return data
else:
return ''
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
return
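# Example usage (a sketch; performs a live HTTP request): list the first ten
# UniProtKB identifiers matching the query from the docstring above.
def _example_search():
    for entry_id in search('organism:9606+AND+antigen', dbName='uniprot', format='list', limit=10):
        print entry_id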
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
3702: ['Arabidopsis thaliana', 'TAIR_ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
"""
Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the tag/abbreviation for the identifier FROM which to idmap
to: the tag/abbreviation for the identifier TO which to idmap
format: the results format to use
reverse: reverse the returned mapping key (to) -> value (from)
Returns a dictionary with key (from) -> value (to)
Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
"""
url = __uniprotUrl__ + 'mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : format,
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
if not reverse:
d[pair[0]] = pair[1]
else:
d[pair[1]] = pair[0]
return d
else:
return dict()
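# Example usage (a sketch; performs a live HTTP request; the accessions are
# illustrative): map UniProtKB accessions to RefSeq protein identifiers.
def _example_idmap():
    d = idmap(['P63166', 'P63165'], frm='ACC', to='P_REFSEQ_AC')
    for acc in d:
        print acc, '->', d[acc]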
"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def getGOReport(positives, background = None, database = 'UniProtKB'):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = getGOTerms(pos, database)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = getGOTerms(neg, database)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = getGODef(t[0])
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin['name'])) # (term, E-value, fg count, bg count, name)
else:
ret.append((t[0], t[1], defin['name']))
return ret
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces and quotation marks
if field in entry.keys(): # check if we need this field
if entry[field] == None: # check if not yet assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
"""
Retrieve all GO terms for a given set of genes (or single gene).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If completeAnnot is True (default is False) then the above "terms" is the first element
in a tuple with (gene-terms-map, gene-taxon-id).
"""
if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes]
termsmap = dict()
taxonmap = dict()
uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
# build queries (batches of genes)
queryLength = 2000
queries = []
query = None
for gene in genes:
if query == None:
query = gene
elif len(query) < queryLength:
query += ','+gene
else:
queries.append(query)
query = gene
if query != None:
queries.append(query)
# execute queries, each involving a number of genes
for query in queries:
# Construct URL
url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below
try:
urlreq = urllib2.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
else:
data = response.read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
key = values[1]
if termsmap.has_key(key):
termsmap[key].add(values[6])
else:
termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4])
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if completeAnnot:
if len(genes) == 1:
if len(termsmap) == 1:
return (termsmap[genes[0]], taxonmap[genes[0]])
else:
return (set(), None)
else:
return (termsmap, taxonmap)
else:
if len(genes) == 1:
if len(termsmap) == 1:
return termsmap[genes[0]]
else:
return set()
else:
return termsmap
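# Example usage (a sketch; performs a live HTTP request and can be slow): GO terms
# annotated to a single UniProtKB entry are returned as a set of term identifiers.
def _example_getGOTerms():
    for term in getGOTerms('P63166'):
        print term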
def getGenes(goterms, database='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=GO term, value=list of genes) OR
in the case of a single GO term as a list of genes.
"""
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
goterms = [goterms]
map = dict()
if taxo == None:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
else:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
for goterm in goterms:
genes = set()
# Construct URL
url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below
try:
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1])
map[goterm] = list(genes)
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
else:
return map
class EBI(object):
__email__ = 'anon@uq.edu.au' # to whom emails about jobs should go
__ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
__checkInterval__ = 2 # how long to wait between checking job status
def __init__(self, service=None):
""" Initialise service session.
service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
"""
self.service = service
self.lockFile = '%s.lock' % service
def createLock(self):
""" Create a lock file to prevent submission of more than 1 job
at a time by a single user. """
fh = open(self.lockFile, 'w')
fh.write(self.jobId)
fh.close()
def removeLock(self):
""" Remove the lock file. """
os.remove(self.lockFile)
def isLocked(self):
""" Check if there is a lock on this service. If there is, check if
the job is complete, and if so remove the lock. Return True if still
locked and False if not. """
if os.path.exists(self.lockFile):
fh = open(self.lockFile, 'r')
jobId = fh.read()
fh.close()
status = self.status(jobId)
if status == 'RUNNING':
self.jobId = jobId
return True
else:
self.removeLock()
return False
else:
return False
"""
BLAST and CLUSTALW services
"""
def run(self, params):
""" Submit a job to the given service with the given parameters, given
as a dictionary. Return the jobId. """
if self.service == None:
raise RuntimeError('No service specified')
if self.isLocked():
raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
%sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
url = self.__ebiServiceUrl__ + self.service + '/run/'
# ncbiblast database parameter needs special handling
if self.service == 'ncbiblast':
databaseList = params['database']
del params['database']
databaseData = ''
for db in databaseList:
databaseData += '&database=' + db
encodedParams = urllib.urlencode(params)
encodedParams += databaseData
else:
encodedParams = urllib.urlencode(params)
print url
self.jobId = urllib2.urlopen(url, encodedParams).read()
self.createLock()
return self.jobId
def status(self, jobId=None):
""" Check the status of the given job (or the current job if none is
specified), and return the result. """
if jobId is None:
jobId = self.jobId
url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
status = urllib2.urlopen(url).read()
return status
def resultTypes(self):
""" Get the available result types. Will only work on a finished job. """
url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
resultTypes = urllib2.urlopen(url).read()
return resultTypes
def result(self, resultType):
""" Get the result of the given job of the specified type. """
url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
try:
result = urllib2.urlopen(url).read()
if resultType == 'error':
raise RuntimeError('An error occurred: %s' % result)
except urllib2.HTTPError:
if resultType == 'error':
raise RuntimeError('An unknown error occurred while processing the job (check your input)')
else:
self.result('error')
return result
def submit(self, params, resultTypes):
""" Submit a new job to the service with the given parameters.
Return the output in the specified format. """
params['email'] = self.__email__
self.run(params)
print 'Submitted new', self.service, 'job, jobId:', self.jobId
print 'Please be patient while the job is completed'
status = 'RUNNING'
observe = 0
while status == 'RUNNING':
observe = observe + 1
status = self.status()
sleep(self.__checkInterval__)
if status != 'FINISHED':
raise RuntimeError('An error occurred and the job could not be completed')
print 'Job complete.'
self.removeLock()
if type(resultTypes) != list:
resultTypes = [resultTypes]
results = []
for resultType in resultTypes:
results.append(self.result(resultType))
if len(results) == 1:
return results[0]
else:
return results
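# Example usage (a sketch; submits a live job and blocks until it finishes).
# The parameter name 'sequence' and result type 'aln-clustalw' follow the EBI REST
# documentation of the time and may need adjusting; the sequences are made up.
def _example_ebi_clustalw():
    ebi = EBI('clustalw2')
    aln = ebi.submit({'sequence': '>a\nACDEFGHIKL\n>b\nACDEFGHIKV\n'}, 'aln-clustalw')
    print aln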
#!/usr/bin/python
import sys, math, random, getopt
import numpy as np
import matplotlib.pyplot as plt
import prob as prb
import sequence
import sym # for sym.DNA_Alphabet etc. used below
import stats
from rcdict import *
import operator # for use with key= in max() function
import binomial
def slidewin(seq, winsize):
""" Produce a list of sub-sequences of a given length from a complete sequence """
subseqs = []
for i in range(len(seq) - winsize + 1):
subseqs.append(seq[i : i + winsize])
return subseqs
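# A worked example (a sketch; the helper name is illustrative): windows of width 3
# over a short string.
def _example_slidewin():
    assert slidewin('GATTACA', 3) == ['GAT', 'ATT', 'TTA', 'TAC', 'ACA']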
def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
""" Produce a report of enriched words of specified length.
seqs: DNA sequence data
WordWidth: length of sought words
PeakWidth: width of window around centre of sequence
PeakMargin: the width of the margin on each side of the centre window
(which delineates the positives around peak from negatives away from peak). """
pos = RCDict() # reverse complement-aware dictionary for DNA
neg = RCDict() # reverse complement-aware dictionary for DNA
for seq in seqs:
centre = len(seq)/2 # find peak
""" Construct all words around peak (positives) and count their presence """
words = set(slidewin(seq[centre-PeakWidth/2:centre+PeakWidth/2], WordWidth))
for word in words:
try:
pos[word] += 1
except KeyError:
pos[word] = 1
""" Construct all words away from peak (negatives) and count """
words = set(slidewin(seq[:centre-PeakWidth/2-PeakMargin], WordWidth))
words = words.union(slidewin(seq[centre+PeakWidth/2+PeakMargin:], WordWidth)) # union returns a new set, so re-assign
for word in words:
try:
neg[word] += 1
except KeyError:
neg[word] = 1
logratio = RCDict() # DNA dictionary for storing the log-ratio between pos and neg counts
for (word, cnt_pos) in pos.items():
cnt_neg = 0.0001
try:
cnt_neg = neg[word]
except KeyError:
pass
logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
allpos = logratio.items() # extract all pairs of words:log-ratio
sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
print "Enriched words (sorted by ln pos/neg)"
print "Word \tln pos/neg\tE-value"
for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
cnt_pos = int(pos[word])
try: cnt_neg = int(neg[word])
except KeyError: cnt_neg = 0
# Compute p-value using Fisher's Exact test
pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
# Correct for multiple testing (very conservatively)
e_value = pval * len(allpos) # avoid shadowing the built-in eval()
print "%s\t%6.3f \t%e" % (word, lgr, e_value)
def getReverse(distribs):
""" Construct a new list of probability distributions of DNA, by
1. swapping their order, and
2. swapping A's and T's, and C's and G's """
return [d.swapxcopy('A','T').swapxcopy('C','G') for d in distribs[::-1]] # backwards
def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
""" Produce a plot for a scan of the specified motif.
The plot has as its x-axis position of sequence, and
the y-axis the cumulative, non-negative PWM score over all sequences. """
# check that all sequences are the same length and set sequence length
seq_len = len(seqs[0])
for seq in seqs:
if len(seq) != seq_len:
usage(sys.argv[0], "All sequences must have same length")
return
# create the motif and its reverse complement
bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
d = prb.readMultiCounts(jaspar)
try:
fg1 = d[motif]
fg2 = getReverse(d[motif])
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
# initialize things to zero
avg_motif_score = np.zeros(seq_len)
# compute average score at each position (on both strands) in sequences
i_seq = 0
motif_width = pwm1.length
for seq in seqs:
i_seq += 1
# print >> sys.stderr, "Scoring seq: %4d\r" % (i_seq),
# positive strand
hits = pwm1.search(seq, threshold)
pos_scores = seq_len * [0]
for hit in hits:
# mark hit at *center* of site (hence motif_width/2)
pos_scores[hit[0]+(motif_width/2)] = hit[2]
# negative strand
hits = pwm2.search(seq, threshold)
neg_scores = seq_len * [0]
for hit in hits:
neg_scores[hit[0]+(motif_width/2)] = hit[2]
# use maximum score on two strands
for i in range(seq_len):
score = max(pos_scores[i], neg_scores[i])
if (score > threshold):
avg_motif_score[i] += score
# compute average score
for i in range(seq_len):
avg_motif_score[i] /= len(seqs)
# hw = 5 # window width is 2*hw + 1
# smoothed_avg_motif_score = np.zeros(seq_len)
# for i in range(hw, seq_len-motif_width+1-hw):
# smoothed_avg_motif_score[i]=sum(avg_motif_score[i-hw:i+hw+1])/(2*hw+1)
# plot the average score curve
# print >> sys.stderr, ""
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
lbl = "%s" % (motif)
plt.plot(x, avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
plt.axhline(color='black', linestyle='dotted')
plt.legend(loc='lower center')
plt.xlabel('position')
plt.ylabel('average motif score')
plt.title(motif)
plt.show()
def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices.txt', seed=0):
""" Produce a plot for a scan of the specified motif.
The plot has as its x-axis position of sequence, and
the y-axis the number of sequences with a best hit at position x.
Sequences with no hit above 'threshold' are ignored.
Ties for best hit are broken randomly.
The p-value of the central region that is most "centrally enriched"
and the width of the best central region is printed in the label
of the plot.
"""
# set the random seed for repeatability
random.seed(seed)
# Copy the code from your "improved" version of scanMotifReport()
# to here, and follow the instructions in the Prac to develop this
# new function.
# check that all sequences are the same length and set sequence length
seq_len = len(seqs[0])
for seq in seqs:
if len(seq) != seq_len:
usage(sys.argv[0], "All sequences must have same length")
return
# create the motif and its reverse complement
bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
d = prb.readMultiCounts(jaspar)
try:
fg1 = d[motif]
fg2 = getReverse(d[motif])
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
# initialize things to zero
hit_count = np.zeros(seq_len)
n_seqs_with_hits = 0.0
# Scan each sequence for all hits on both strands and record
# the number of "best hits" at each sequence position.
#
motif_width = pwm1.length
i_seq = 0
for seq in seqs:
i_seq += 1
# print >> sys.stderr, "Scoring seq: %4d\r" % (i_seq),
# scan with both motifs
hits = pwm1.search(seq, threshold) + pwm2.search(seq, threshold)
# Record position of best hit
if (hits):
n_seqs_with_hits += 1
# find best hit score
best_score = max(hits, key=operator.itemgetter(2))[2] # hit[2] holds the score
# find ties
best_hits = [ hit for hit in hits if hit[2] == best_score ]
# break ties at random
best_hit = random.choice(best_hits)
# mark hit at *center* of site (hence pwm1.length/2)
hit_count[best_hit[0] + pwm1.length/2] += 1
# convert counts to probabilities: divide each position's best-hit count by the number of sequences with a hit
site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits)
# STATISTICS
# Get the cumulative hit counts in concentric windows
# and perform the Binomial Test. Report best region and its p-value.
#
best_r = 0
best_log_pvalue = 1
center = seq_len/2 # center of sequence
cum_hit_count = np.zeros(seq_len) # total hits in the central window of half-width i
for i in range(1, (seq_len - pwm1.length/2 + 1)/2):
cum_hit_count[i] = cum_hit_count[i-1] + hit_count[center-i] + hit_count[center+i]
# Compute probability of observed or more best hits in central window
# assuming uniform probability distribution in each sequence.
# successes = cum_hit_count[i]
# trials = n_seqs_with_hits
# p_success = ?
# log_pvalue = ?
# if (log_pvalue < best_log_pvalue):
# best_log_pvalue = log_pvalue
# best_r = 2*i
# End STATISTICS
hw = 5
smoothed_site_probability = np.zeros(seq_len)
for i in range(hw, seq_len-motif_width+1-hw):
smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
lbl = "%s, t=%.2f" % (motif, threshold)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt.plot(x, smoothed_site_probability, label=lbl)
plt.axhline(color='black', linestyle='dotted')
plt.legend(loc='lower center')
plt.xlabel('Position of best site')
plt.ylabel('Smoothed probability')
plt.title(motif)
plt.show()
def usage(name, errmsg = None):
if errmsg != None:
print "Error: %s" % errmsg
print """Usage: %s [options]
-f <fasta-filename> (required)
-d discover enriched words
-w <word width, default 8>
-p <peak width, default 100>
-m <peak margin, default 100>
-s <JASPAR-ID> scan for JASPAR motif
-h print this help""" % name
if __name__ == '__main__':
try:
optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
except getopt.GetoptError, err:
usage(sys.argv[0], str(err))
sys.exit(2)
FILENAME = None
DISCOVER_MODE = False
SCAN_MODE = False
WORD_WIDTH = 8
PEAK_WIDTH = 100
PEAK_MARGIN = 100
MOTIF_ID = 'MA0112.2'
JASPAR_FILE = 'JASPAR_matrices.txt'
for o, a in optlst:
if o == '-h': usage(sys.argv[0]); sys.exit(0)
elif o == '-f': FILENAME = a
elif o == '-d': DISCOVER_MODE = True
elif o == '-w': WORD_WIDTH = int(a)
elif o == '-p': PEAK_WIDTH = int(a)
elif o == '-m': PEAK_MARGIN = int(a)
elif o == '-s': SCAN_MODE = True; MOTIF_ID = a
elif o == '-j': JASPAR_FILE = a
if FILENAME == None:
usage(sys.argv[0], "Filename not specified")
sys.exit(3)
seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
if DISCOVER_MODE:
print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE:
scanMotifReport(seqs, MOTIF_ID)
else:
usage(sys.argv[0], "No run mode selected")