python3_5

ac6c5d6b · Mikael Boden · 934c2bff · ac6c5d6b · ac6c5d6b · ac6c5d6b
Commit ac6c5d6b authored Feb 14, 2017 by Mikael Boden
16 changed files
--- a/binomial.py
+++ b/binomial.py
@@ -95,8 +95,8 @@ def betacf(a, b, x):
        h *= delta
        if (abs(delta-1.0) < EPS): break
-    if (m > MAXIT):  print >> sys.stderr, ("a or b too big or MAXIT too small "
+    if (m > MAXIT):  print(("a or b too big or MAXIT too small "
-                                           "in betacf")
+                                           "in betacf"), file=sys.stderr)
    return h
@@ -118,5 +118,5 @@ def gammaln(x):
 def die(string):
-    print >> sys.stderr, string
+    print(string, file=sys.stderr)
--- a/genome.py
+++ b/genome.py
@@ -105,7 +105,7 @@ class GeneExpression:
            {'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
            """
        if names == None:
-            return self.genes.keys()
+            return list(self.genes.keys())
        elif isinstance(names, str):
            return self.matrix[self.genes[names],:]
        else:
@@ -148,7 +148,7 @@ class GeneExpression:
        except:
            index = samples
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = self.matrix[ndx, index]
        return mygenes
@@ -165,7 +165,7 @@ class GeneExpression:
            sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
        except:
            sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
-        name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
+        name_tuples = sorted(list(self.genes.items()), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
        names = []
        if descending:
            for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order 
@@ -199,7 +199,7 @@ class GeneExpression:
            Creates and returns a gene dictionary with the corresponding ratios. """
        mygenes = {}
        mdiv = self.matrix[:, index1] / self.matrix[:, index2]
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = mdiv[ndx]
        return mygenes
@@ -208,7 +208,7 @@ class GeneExpression:
            Creates and returns a gene dictionary with the corresponding log-ratios. """
        mygenes = {}
        mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = mlr[ndx]
        return mygenes
@@ -218,7 +218,7 @@ class GeneExpression:
        index = self.genes[probeID]
        profile = self.matrix[index, :]
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            other = self.matrix[ndx, :]
            mygenes[name] = pearson(profile, other)
        return mygenes
@@ -252,7 +252,7 @@ class GeneExpression:
        # Calculate Z-score for the given column for each gene
        zscore = (self.matrix[:, index] - mu) / sd
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            try:
                mygenes[name] = zscore[ndx, :]
            except IndexError:
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
                    genes[name] = values
    if len(genes) == 0:
        raise RuntimeError('No data in file')
-    print 'Data set %s contains %d entries' % (dataset, len(genes))
+    print('Data set %s contains %d genes' % (dataset, len(genes)))
    if cnt_null > 0:
-        print 'Data set has %d null-values' % (cnt_null)
+        print('Data set has %d null-values' % (cnt_null))
    return GeneExpression(dataset, headers[2:], genes)
@@ -357,40 +357,29 @@ def pearson(X, Y):
        return 0
    return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
-# ------------------- Example ---------------------
+# ------------------- Example (basically exercise 7 in prac 9)---------------------
-ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft')
+if __name__=='__main__':
-ratio = GeneExpression('GDS3716_ratio')
+    g = readGEOFile('GDS3198.soft', id_column = 1)
-ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33,  0))
+    meanfold = {}
-ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34,  1))
+    for gene in g.genes:
-ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35,  2))
+        profile = g.getGenes(gene)
-ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36,  3))
+        meanfold[gene] = (np.log2(profile[0] / profile[3]) + np.log2(profile[1] / profile[4]) + np.log2(profile[2] / profile[5])) / 3
-ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37,  4))
-ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38,  5))
+    import matplotlib.pyplot as plt
-ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39,  6))
+    scores = [y for y in list(meanfold.values()) if not np.isnan(y)]
-ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40,  7))
+    hist, bins = np.histogram(scores, bins=50)
-ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41,  8))
+    width = 0.7 * (bins[1] - bins[0])
-ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24,  9))
+    center = (bins[:-1] + bins[1:]) / 2
-ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
+    plt.bar(center, hist, align='center', width=width)
-ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
+    plt.show()
-ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
-ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
-ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
-ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
-ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
-ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
-ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
-print ge3716.getHeaders()
-z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
-ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
-cln2_profile = ge38.getGenes('CLN2')
-pcorr = ge38.getPearson('CLN2')
-gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
-sorted = gp.sort('PC_CLN2', True)
-print sorted[0], ge38.getGenes(sorted[0])
-print sorted[1], ge38.getGenes(sorted[1])
+    result = sorted(list(meanfold.items()), key=lambda v: v[1])
+    print('========== Wildtype may down-regulate ==========')
+    for r in result[0:100]:
+        print(r[0], r[1])
+    print('========== Wildtype may up-regulate ==========')
+    for r in result[-1:-100:-1]:
+        print(r[0], r[1])
--- a/gibbs.py
+++ b/gibbs.py
@@ -138,7 +138,7 @@ class GibbsMotif():
                        LL += math.log(Qk / Pk)
                    except ZeroDivisionError:
                        pass
-                print "LL @ %5d=\t%5.2f" % (round, LL)
+                print("LL @ %5d=\t%5.2f" % (round, LL))
        # end main for-loop
        self.q = q
@@ -312,7 +312,7 @@ class GibbsAlign():
                        LL += math.log(Qk / Pk)
                    except ZeroDivisionError:
                        pass
-                print "LL @ %5d=\t%5.2f" % (round, LL)
+                print("LL @ %5d=\t%5.2f" % (round, LL))
        # end main for-loop
        self.q = q

--- a/godata.py
+++ b/godata.py
@@ -92,7 +92,7 @@ class GO():
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
        """
-        print "Started at", time.asctime()
+        print(("Started at", time.asctime()))
        # Get GO definitions
        terms = readOBOFile(obofile)
        for term in terms:
@@ -108,7 +108,7 @@ class GO():
                    cset.add((term, prel))
                except KeyError:
                    pass
-        print "Read %d GO definitions" % len(terms)
+        print(("Read %d GO definitions" % len(terms)))
        # open annotation file to analyse and index data
        src = open(annotFile, 'r')
        gene_cnt = 0
@@ -126,7 +126,7 @@ class GO():
                terms_map = {term: (evid, qual != 'NOT')}
                self.annots[gene] = (taxa, terms_map)
        src.close()
-        print "Read annotations for %d genes" % gene_cnt
+        print(("Read annotations for %d genes" % gene_cnt))
    def _makeIntoList(self, id_or_ids):
        if type(id_or_ids) != list and type(id_or_ids) != set and type(id_or_ids) != tuple:
@@ -283,7 +283,7 @@ class GO():
    def getAllAnnots(self):
        """ Retrieve all annotated gene products """
-        return self.annots.keys()
+        return list(self.annots.keys())
    def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
        """ Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
@@ -328,12 +328,12 @@ class GO():
            cnt = fg_list.count(t)
            if cnt >= threshold:
                term_cnt[t] = cnt
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
-                print 'Could not find definition of %s' % t[0]
+                print(('Could not find definition of %s' % t[0]))
            else:
                ret.append((t[0], t[1], defin[2], defin[0]))
        return ret
@@ -360,7 +360,7 @@ class GO():
        # Process background: find terms of genes
        bg_list = []
        if background == None: # need to use the full set
-            background = self.annots.keys()
+            background = list(self.annots.keys())
        negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
        nNeg = len(negatives)
        bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
@@ -383,12 +383,12 @@ class GO():
            evalue = pvalue * len(term_set) # Bonferroni correction
            if evalue <= threshold: # check if significance req is fulfilled
                term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
-                print 'Could not find definition of %s' % t[0]
+                print(('Could not find definition of %s' % t[0]))
            else:
                ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
        return ret
@@ -448,17 +448,17 @@ class BinGO():
        found = set()
        try:
            (_, closure, _) = self.termdefs[term]
-            for (t, r) in closure.items():
+            for (t, r) in list(closure.items()):
                if (not rel) or r == rel:
                    found.add(t)
                    found.update(self._getSuperTerms(t, rel))
        except KeyError:
-            print 'Could not find GO:%s' % (''.join(decode(term, self.term_code)))
+            print(('Could not find GO:%s' % (''.join(decode(term, self.term_code)))))
        return found
    def _getChildTerms(self, term, rel = None):
        found = set()
-        for (child, termdef) in self.termdefs.items():
+        for (child, termdef) in list(self.termdefs.items()):
            (_, parents_dict, _) = termdef
            try:
                myrel = parents_dict[term]
@@ -491,7 +491,7 @@ class BinGO():
            mymap[gene_name] = set()
            try:
                (taxa, terms) = self._getGeneEntry(i)
-                for (term, evid_and_qual) in terms.items():
+                for (term, evid_and_qual) in list(terms.items()):
                    if evid_and_qual[1] and not evid: # if True and no evidence is specified
                        direct.add(term)
                        mymap[gene_name].add(term)
@@ -505,7 +505,7 @@ class BinGO():
            # STEP 2: Find the transitive closure of each term identified, store as a dictionary
            indirect = {}
            for t in direct:
-                if not indirect.has_key(t):
+                if not t in indirect:
                    indirect[t] = set(self._getSuperTerms(t))
        # STEP 3: compile and return results
        for gene in mymap:
@@ -521,7 +521,7 @@ class BinGO():
    def getAllGenes(self):
        names = []
-        for g in self._decodeGeneIDs(self.annot_index.keys()):
+        for g in self._decodeGeneIDs(list(self.annot_index.keys())):
            names.append(''.join(g))
        return names
@@ -545,7 +545,7 @@ class BinGO():
            gene_name = decode(g, self.gene_code)
            (mytaxa, tdict) = self._getGeneEntry(g)
            if not taxa or taxa == mytaxa:
-                for annot_term in tdict.keys():
+                for annot_term in list(tdict.keys()):
                    if tdict[annot_term] == evid:
                        if annot_term in terms:
                            try:
@@ -642,10 +642,10 @@ class BinGO():
                        (annot_offset, obo_offset) = unpack('II', buf)
                        break
                except error as inst:
-                    print "Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()
+                    print(("Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()))
                    exit(3)
-        print "Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)
+        print(("Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)))
-        print "Annotations start at", annot_offset, "\nDefinitions start at", obo_offset
+        print(("Annotations start at", annot_offset, "\nDefinitions start at", obo_offset))
        return f
    #FIXME: write code to perform test of taxa enrichment
@@ -655,7 +655,7 @@ class BinGO():
            Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
            indicating the enrichment of term to "top" genes in descending order by score (by default).
        """
-        fg_map = self.getTerms(gene_score_map.keys(), include_more_general = include_more_general)
+        fg_map = self.getTerms(list(gene_score_map.keys()), include_more_general = include_more_general)
        fg_list = []
        for id in fg_map:
            for t in fg_map[id]:
@@ -663,7 +663,7 @@ class BinGO():
        term_set = set(fg_list)
        term_pval = {}
        if len(negatives_score_map) > 0:
-            bg_map = self.getTerms(negatives_score_map.keys(), include_more_general = include_more_general)
+            bg_map = self.getTerms(list(negatives_score_map.keys()), include_more_general = include_more_general)
        for t in term_set:
            pos = []
            neg = []
@@ -698,13 +698,13 @@ class BinGO():
            else:
                term_pval[t] = (p, 1.0)
-        sorted_pval = sorted(term_pval.items(), key=lambda v: v[1][0], reverse=False)
+        sorted_pval = sorted(list(term_pval.items()), key=lambda v: v[1][0], reverse=False)
        ret = []
        for t in sorted_pval:
            defin = self.getTermdef(t[0])
            if defin == None:
-                print 'Could not find definition of %s' % t[0]
+                print(('Could not find definition of %s' % t[0]))
            else:
                ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
        return ret
@@ -739,7 +739,7 @@ class BinGO():
        if background == None:
            for t in term_set:
                term_cnt[t] = fg_list.count(t)
-            sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
+            sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
        else: # a background is provided
            for t in term_set:
                fg_hit = fg_list.count(t)
@@ -747,13 +747,13 @@ class BinGO():
                fg_nohit = nPos - fg_hit
                bg_nohit = nNeg - bg_hit
                term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
-            sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
+            sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
-                print 'Could not find definition of %s' % t[0]
+                print(('Could not find definition of %s' % t[0]))
            else:
                if background != None:
                    ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
@@ -773,7 +773,7 @@ def encode(code_me, encode_strings):
                    accum *= codelen
                    break
    except IndexError as e:
-        print e, code_me
+        print((e, code_me))
    return code
 def decode(code, encode_strings):
@@ -787,7 +787,7 @@ def decode(code, encode_strings):
            code -= accum[pos] * indices[pos]
        string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
    except IndexError as e:
-        print e, code
+        print((e, code))
    return string
 def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
@@ -837,10 +837,10 @@ def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
        term = None
        raise "No GO term on line: " + line
    evid = fields[columns[4]]
-    if not evid_codes.has_key(evid):
+    if not evid in evid_codes:
        evid = None
    onto = fields[columns[5]]
-    if not onto_codes.has_key(onto):
+    if not onto in onto_codes:
        onto = None
    taxa_idx = line.find('taxon:')
    if taxa_idx == -1:
@@ -902,7 +902,7 @@ def readOBOFile(obofile):
    return terms
 def writeBitFile(annotFile, obofile, destFile, taxas = None):
-    print "Started at", time.asctime()
+    print(("Started at", time.asctime()))
    # open annotation file to analyse and index data
    src = open(annotFile, 'r')
    gene_index = [{} for _ in range(6)]  # count different characters in different positions
@@ -942,27 +942,27 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
            pos += 1
        prev_gene = gene
    src.close()
-    print "Read annotations for %d genes" % gene_cnt
+    print(("Read annotations for %d genes" % gene_cnt))
    gene_code = ['' for _ in range(6)]
    term_code = ['' for _ in range(7)]
    for d in range(len(gene_index)):
        arr = ['?' for _ in gene_index[d]]
-        for (ch, index) in gene_index[d].items():
+        for (ch, index) in list(gene_index[d].items()):
            arr[index] = ch
        gene_code[d] = ''.join(arr)
    for d in range(len(term_index)):
        arr = ['?' for _ in term_index[d]]
-        for (ch, index) in term_index[d].iteritems():
+        for (ch, index) in term_index[d].items():
            arr[index] = ch
        term_code[d] = ''.join(arr)
    evid_code = ['' for _ in range(len(evid_index))]
-    for (e, ndx) in evid_index.items():
+    for (e, ndx) in list(evid_index.items()):
        evid_code[ndx] = e
    # Get GO definitions
    terms = readOBOFile(obofile)
-    print "Read %d GO definitions" % len(terms)
+    print(("Read %d GO definitions" % len(terms)))
    # re-open, now with the aim of copying info
    src = open(annotFile, 'r')
@@ -975,7 +975,7 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
        dst.write(code_str+"\n")
    for e_str in evid_code:
        dst.write(e_str+'\n')
-    print "Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())
+    print(("Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())))
    # STEP 2: write annotations
    annot_offset = dst.tell()
@@ -1012,11 +1012,11 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
            (o, q, e) = concat_terms[t]
            s = pack('?BI', q, evid_index[e], encode(t, term_code))
            dst.write(s)
-    print "Wrote GO annotations, now at @%d" % dst.tell()
+    print(("Wrote GO annotations, now at @%d" % dst.tell()))
    # Next, the ontology definition...
    obo_offset = dst.tell() # remember the position where the OBO starts
-    sorted_terms = sorted(terms.iteritems(), key=operator.itemgetter(0))
+    sorted_terms = sorted(iter(terms.items()), key=operator.itemgetter(0))
    for [t, _] in sorted_terms:
        (term_name, term_onto, term_is) = terms[t]
        s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
@@ -1029,12 +1029,10 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
            s = pack('BI', index, encode(sup_term[3:], term_code))
            dst.write(s)
        dst.write(term_name + '\n')
-    print "Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())
+    print(("Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())))
    # Finally, write the offsets to quickly access annotations and definitions, resp
    dst.write(pack('II', annot_offset, obo_offset))
    # done, close
    dst.close()
-    print "Completed at", time.asctime()
+    print(("Completed at", time.asctime()))
--- a/guide.py
+++ b/guide.py
 ###################################################
 # This module is a supplement to the Python guide #
-# Version 2.2016.1 (8/3/2016)                    #
+# Version 1.20160  (19/12/2016)                     #
 ###################################################
-''' 
+'''
-This module contains code for that can help solving bioinformatics problems.
+This module contains code that can help solve bioinformatics problems.
-See the accompanying Python guide for more explanations and examples. 
+See the accompanying Python guide document for more explanations and examples.
-Alphabet is a class that defines valid symbols that we then use to make up valid 
+Alphabet is a class that defines valid symbols that we then use to make up valid
-biological sequences. Note that we also define variables corresponding to 
+biological sequences. Note that we also define variables corresponding to
 DNA, RNA and Protein sequences that can be used directly.
-Sequence defines basic parts and operations on biological sequences. 
+Sequence is a class that defines basic parts and operations on biological sequences.
-Alignment defines an alignment of sequences (how symbols in different sequences line 
+Alignment is a class that defines an alignment of sequences (how symbols in different sequences line
-up when placed on-top). Alignment methods should generate instances of this class.
+up when placed on-top of one another). Alignment methods should generate instances of this class.
-SubstMatrix defines a substitution matrix, i.e. a scoring system for performing 
+SubstMatrix is a class that defines a substitution matrix, i.e. a scoring system for performing
 alignments. You can read these from files or construct them manually.
-GeneProfile defines parts and operations for gene expression profiles. Essentially, 
+GeneProfile is a class that defines parts and operations for gene expression profiles. Essentially,
 the class will help to index expression data by gene name (rows) and by sample name (columns).
-There are several methods not tied to a particular class because they construct new instances, 
+There are several methods not tied to a particular class because they construct new instances,
 e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc.
-You need to have numpy installed (see http://www.numpy.org/). 
+You need to have numpy installed (see http://www.numpy.org/).
-Should work with Python v2.6-2.7 (see http://www.python.org/). 
+Should work with Python v2.6-3.5 (see http://www.python.org/).
-Has not been written to work with Python v3 and later--but this should be easy to do.
 The code may contain bugs--please report to m.boden@uq.edu.au
 '''
-import math, numpy, urllib, urllib2
+import math, numpy, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse
-###############################################################################
-# Alphabet                                                                    #
-###############################################################################
 class Alphabet():
-    """ A minimal class for alphabets """
+    """ A minimal class for alphabets
+        Alphabets include DNA, RNA and Protein """
    def __init__(self, symbolString):
        self.symbols = symbolString
-    def __len__(self):              # implements the "len" operator, e.g. "len(Alphabet('XYZ')" results in 3
+    def __len__(self):              # implements the "len" operator, e.g. "len(Alphabet('XYZ'))" results in 3
-        return len(self.symbols)
+        return len(self.symbols)    # will tell you the length of the symbols in an Alphabet instance
-    def __contains__(self, sym):    # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True 
+    def __contains__(self, sym):    # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
-        return sym in self.symbols
+        return sym in self.symbols  # will tell you if 'A' is in the symbols in an Alphabet instance
    def __iter__(self):             # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print sym" prints A, C, G and T on separate lines
        tsyms = tuple(self.symbols)
        return tsyms.__iter__()
-    def __getitem__(self, ndx):
+""" Below we declare alphabet variables that are going to be available when
-        """ Retrieve the symbol(s) at the specified index (or slice of indices) """
+this module (this .py file) is imported """
-        return self.symbols[ndx]
-    def index(self, sym):
-        """ Retrieve the index of the given symbol in the alphabet. """
-        return self.symbols.index(sym)
-    def __str__(self):
-        return self.symbols
-""" Below we declare alphabet variables that are going to be available when 
-this module is imported """ 
 DNA_Alphabet = Alphabet('ACGT')
 RNA_Alphabet = Alphabet('ACGU')
 Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
 Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
-###############################################################################
-# Sequence                                                                    #
-###############################################################################
 class Sequence():
-    """ A biological sequence class. Stores the sequence itself, 
+    """ A biological sequence class. Stores the sequence itself,
-        the alphabet and a name. 
+        the alphabet and a name.
        Usage:
+        Create an instance of Sequence - have to pass in sequence, alphabet and name - gappy is an optional argument
        >>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC')
-        >>> print seq1
+        >>> print(seq1)
        ABC: ACGGGAGAGG
        >>> 'C' in seq1
        True
        >>> for sym in seq1:
-        ...     print sym
+        ...     print(sym)
        """
-    def __init__(self, sequence, alphabet, name = '', gappy = False, annot = ''):
+    def __init__(self, sequence, alphabet, name = '', gappy = False):
        """ Construct a sequence from a string, an alphabet (gappy or not) and a name.
-            The parameter gappy is for sequences when used in alignments, which means that '-' is allowed. """
+            The parameter gappy is for sequences when used in alignments. """
        for sym in sequence:
            if not sym in alphabet and (sym != '-' or not gappy):  # error check: bail out
                raise RuntimeError('Invalid symbol: ' + sym)
-        self.sequence = sequence
+        self.sequence = sequence # Store sequence
-        self.alphabet = alphabet
+        self.alphabet = alphabet # Store alphabet
-        self.name = name
+        self.name = name         # Store name
        self.gappy = gappy
-        self.annot = annot # some annotation, e.g. species
    def __len__(self):      # the "len" operator
        return len(self.sequence)
    def __iter__(self):     # method that allows us to iterate over a sequence
@@ -104,7 +87,7 @@ class Sequence():
        return self.sequence[ndx]
    def writeFasta(self):
        """ Write one sequence in FASTA format to a string and return it. """
-        fasta = '>' + self.name + ' ' + self.annot + '\n'
+        fasta = '>' + self.name + '\n'
        data = self.sequence
        nlines = (len(self.sequence) - 1) / 60 + 1
        for i in range(nlines):
@@ -126,244 +109,96 @@ class Sequence():
    def find(self, findme):
        """ Find the position of the specified symbol or sub-sequence """
        return self.sequence.find(findme)
+        #Challege - who can tell me what find() does in under 30 seconds
+        #demo this and teach them how to read pydocs
+        #Can we use it if we want to find multiple occurrences of a sequence?
+        #What can we use instead?
+    def findAll(self, findme):
+        """ Find all occurrences of the specified symbol or sub-sequence.
+            Uses a sliding window approach - very common when searching for motifs
+            or contamination.
+            Then, calculate the percentage of sequence made up of 'findme'.
+            If the percentage is above 20%, replace all occurrences of 'findme' with Ns.
+        """
+        word = len(findme)
+        store = []
+        count = 0
+        for s in range(0, len(self.sequence)):
+            cur = self.sequence[s:s+word]
+            if cur == findme:
+                store.append(s)
+                count += 1
+                print(cur, "Match at position ", s)
+            else:
+                print(cur)
+        print("---------------")
+        percentage = float(count*word) / len(self.sequence)
+        print("Percentage of findme in sequence: ", percentage)
+        if percentage > 20:
+            print("Removing contamination")
+            newSeq = self.sequence.replace(findme, "N")
+        return store, percentage, newSeq
-###############################################################################
-# Alignment                                                                   #
-###############################################################################
 class Alignment():
    """ A sequence alignment class. Stores two or more sequences of equal length where
-    one symbol is gap '-'. The number of columns in the alignment is given by alignlen. 
+    one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
    Example usage:
    >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
-    >>> print Alignment(seqs)
+    >>> print(Alignment(seqs))
     THIS-LI-NE-
     --ISALIGNED """
    def __init__(self, seqs):
-        self.alphabet = None
        self.alignlen = -1
        self.seqs = seqs
-        self.namelen = 0
        for s in seqs:
-            if self.alphabet == None:
-                self.alphabet = s.alphabet
-            elif self.alphabet != s.alphabet:
-                raise RuntimeError("Alignment invalid: contains a mix of alphabets")
            if self.alignlen == -1:
                self.alignlen = len(s)
            elif self.alignlen != len(s):
-                raise RuntimeError("Alignment invalid: lengths vary")
+                raise RuntimeError("Alignment invalid")
-            self.namelen = max(len(s.name), self.namelen)
+    def __str__(self):
-    def __str__(self):    
+        string = u''
-        string = ''
+        namelen = 0
+        for seq in self.seqs:
+            namelen = max(len(seq.name), namelen)
        for seq in self.seqs:
-            string += seq.name.ljust(self.namelen+1)
+            string += seq.name.ljust(namelen+1)
            for sym in seq:
                string += sym
            string += '\n'
        return string
-    def __len__(self):
-        """ Defines what the "len" operator returns for an instance of Alignment: the number of sequences. """
-        return len(self.seqs)
-    def __getitem__(self, ndx):
-        return self.seqs[ndx]
-    def calcDistances(self, measure, a=1.0):
-        """ Calculate the evolutionary distance between all pairs of sequences
-        in this alignment, using the given measure. Measure can be one of
-        'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
-        given, then the parameter a must also be specified (or else it will use
-        the default value of 1.0).
-        Definitions of each distance metric are found in Zvelebil and Baum p268-276.
-        These are mostly intended for DNA, but adapted for protein (as below).
-        Note however that there are alternative distance matrices for proteins (p276).
-        """
-        measure = measure.lower()
-        if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
-            raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
-        a = float(a)
-        distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
-        # Loop through each pair of sequences
-        for i in range(len(self.seqs)):
-            for j in range(i + 1, len(self.seqs)):
-                seqA = self.seqs[i]
-                seqB = self.seqs[j]
-                # Calculate the fractional distance (p) first
-                # The two sequences of interest are in seqA and seqB
-                L = 0
-                D = 0
-                for k in range(self.alignlen):
-                    # For every non-gapped column, put to L
-                    # For every non-gapped column where the sequences are
-                    # different, put to D
-                    if seqA[k] != '-' and seqB[k] != '-':
-                        L += 1
-                        if seqA[k] != seqB[k]:
-                            D += 1
-                p = float(D)/L
-                # Now calculate the specified measure based on p
-                if measure == 'fractional':
-                    dist = p
-                else:
-                    raise RuntimeError('Not implemented: %s' % measure)
-                distmat[i, j] = distmat[j, i] = dist
-        return distmat
-    def writeClustal(self):
-        """ Write the alignment to a string using the Clustal file format. """
-        symbolsPerLine = 60
-        maxNameLength =  self.namelen + 1
-        mystring = ''
-        wholeRows = self.alignlen / symbolsPerLine
-        for i in range(wholeRows):
-            for j in range(len(self.seqs)):
-                mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
-                mystring += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
-            mystring += '\n'
-        # Possible last row
-        lastRowLength = self.alignlen - wholeRows*symbolsPerLine
-        if lastRowLength > 0:
-            for j in range(len(self.seqs)):
-                if maxNameLength > 0:
-                    mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
-                mystring += self.seqs[j][-lastRowLength:] + '\n'
-        return mystring
-    def writeHTML(self, filename):
-        """ Generate HTML that displays the alignment in colour. 
-        """
-        fh = open(filename, 'w')
-        fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
-        html = ''.ljust(self.namelen) + ' '
-        for i in range(self.alignlen - 1):
-            if (i+1) % 10 == 0:
-                html += str(i/10+1)[-1]
-            else:
-                html += ' '
-        html += '%s\n' % (self.alignlen)
-        fh.write(html)
-        if self.alignlen > 10:
-            html = ''.ljust(self.namelen) + ' '
-            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0:
-                    html += '0'
-                else:
-                    html += ' '
-            html += '\n'
-            fh.write(html)
-        if len(self.alphabet) <= 5: # DNA or RNA
-            colours = {'A':'green','C':'orange','G':'red','T':'#66bbff','U':'#66bbff'}
-        else: # amino acids
-            colours = {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'}
-        for seq in self.seqs:
-            html = seq.name.ljust(self.namelen) + ' '
-            for sym in seq:
-                try:
-                    colour = colours[sym]
-                except KeyError:
-                    colour = 'white'
-                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (colour, sym)
-            html += '\n'
-            fh.write(html)
-        fh.write('</pre></body></html>\n')
-        fh.close()
-    def scoreAlignment(self, substmat = None, gap = -1):
-        """Score the alignment using a substitution matrix (substmat).
-           If the alignment consists of more than two sequences, the minimum
-           score of each column is used.
-           If substmat is not specified (None), the count of matches is returned.
-        """
-        nseqs = len(self.seqs)
-        total = 0
-        for pos in range(self.alignlen):
-            min = None
-            for i in range(nseqs):
-                for j in range(i+1, nseqs):
-                    gap_here = self.seqs[i][pos] == '-' or self.seqs[j][pos] == '-'
-                    score = 0
-                    if substmat == None:
-                        if self.seqs[i][pos] == self.seqs[j][pos]:
-                            score = 1
-                    else: # we have a substitution matrix 
-                        if gap_here:
-                            score = gap
-                        else:
-                            score = substmat.get(self.seqs[i][pos], self.seqs[j][pos])
-                    if min == None:
-                        min = score
-                    elif min > score:
-                        min = score
-            total += min
-        return total
-###############################################################################
+def scoreAlignment(aln, substmat = None, gap = -1):
-# Methods to create instances of Alignment                                    #
+    """Score an alignment (aln) using a substitution matrix (substmat).
-###############################################################################
+       If the alignment consists of more than two sequences, the minimum
+       score of each column is used.
-def align(seqA, seqB, substMatrix, gap = -1):
+       If substmat is not specified (None), the count of matches is returned.
-    """ Align seqA with seqB using the Needleman-Wunsch
+    """
-    (global) algorithm. substMatrix is the substitution matrix to use and
+    nseqs = len(aln.seqs)
-    gap is the linear gap penalty to use. """
+    total = 0
-    stringA, stringB = seqA.sequence, seqB.sequence
+    for pos in range(aln.alignlen):
-    lenA, lenB = len(seqA), len(seqB)
+        min = None
-    # Create the scoring matrix (S) and a matrix for traceback
+        for i in range(nseqs):
-    S = numpy.zeros((lenA + 1, lenB + 1))
+            for j in range(i+1, nseqs):
-    Traceback = numpy.zeros((lenA + 1, lenB + 1))
+                gap_here = aln.seqs[i][pos] == '-' or aln.seqs[j][pos] == '-'
-    # Fill the first row and column of S with multiples of the gap penalty
+                score = 0
-    for i in range(lenA + 1):
+                if substmat == None:
-        S[i, 0] = i * gap
+                    if aln.seqs[i][pos] == aln.seqs[j][pos]:
-    for j in range(lenB + 1):
+                        score = 1
-        S[0, j] = j * gap
+                else: # we have a substitution matrix
-    # Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
+                    if gap_here:
-    for i in range(1, lenA + 1):
+                        score = gap
-        for j in range(1, lenB + 1):
+                    else:
-            match  = S[i-1, j-1] + substMatrix.get(stringA[i-1], stringB[j-1])
+                        score = substmat.get(aln.seqs[i][pos], aln.seqs[j][pos])
-            delete = S[i-1, j  ] + gap
+                if min == None:
-            insert = S[i  , j-1] + gap
+                    min = score
-            Traceback[i, j] = numpy.argmax([match, delete, insert])
+                elif min > score:
-            S[i, j] = max([match, delete, insert])
+                    min = score
-    # Trace back the optimal alignment
+        total += min
-    alignA = ''
+    return total
-    alignB = ''
-    # Start at the end
-    i = lenA
-    j = lenB
-    # Stop when we hit the end of a sequence
-    while i > 0 and j > 0:
-        if Traceback[i, j] == 1: 
-            # Got here by a gap in sequence B (go up)
-            alignA = stringA[i-1] + alignA
-            alignB = '-' + alignB
-            i -= 1
-        elif Traceback[i, j] == 2: 
-            # Got here by a gap in sequence A (go left)
-            alignA = "-" + alignA
-            alignB = stringB[j-1] + alignB
-            j -= 1
-        else:
-            # Got here by aligning the bases (go diagonally)
-            alignA = stringA[i-1] + alignA
-            alignB = stringB[j-1] + alignB
-            i -= 1
-            j -= 1
-    # Fill in the rest of the alignment if it begins with gaps
-    # (i.e., trace back all the way to S[0, 0])
-    while i > 0:
-        # Go up
-        alignA = stringA[i-1] + alignA
-        alignB = '-' + alignB
-        i -= 1
-    while j > 0:
-        # Go left
-        alignA = '-' + alignA
-        alignB = stringB[j-1] + alignB
-        j -= 1
-    return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
-###############################################################################
-# SubstMatrix                                                                 #
-###############################################################################
 class SubstMatrix():
    """ Create a substitution matrix for an alphabet.
    Example usage:
@@ -375,11 +210,11 @@ class SubstMatrix():
    ...         elif a == b:
    ...             sm.set(a, b, +1)
    ...
-    >>> print sm
+    >>> print(sm)
-    A   1 
+    A   1
-    C  -1   1 
+    C  -1   1
-    G  -1  -1   1 
+    G  -1  -1   1
-    T  -1  -1  -1   1 
+    T  -1  -1  -1   1
        A   C   G   T
    >>> sm.get('C', 'T')
    -1
@@ -401,7 +236,7 @@ class SubstMatrix():
    def __str__(self):
        symbols = self.alphabet.symbols # what symbols are in the alphabet
        i = len(symbols)
-        string = ''
+        string = u''
        for a in symbols:
            string += a + ' '
            for b in symbols[:len(symbols)-i+1]:
@@ -417,17 +252,12 @@ class SubstMatrix():
    def writeFile(self, filename):
        """ Write this substitution matrix to the given file. """
        fh = open(filename, 'w')
-        file = ''
+        file = u''
        for key in self.scoremat:
            file += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
        fh.write(file)
        fh.close()
-###############################################################################
-# Below are some useful methods for loading data from strings and files.      #
-# They recognize the FASTA and Clustal formats (nothing fancy).               # 
-###############################################################################
 def readSubstMatrix(filename, alphabet):
    """ Read in the substitution matrix stored in the given file. """
    mat = SubstMatrix(alphabet)
@@ -443,26 +273,25 @@ def readSubstMatrix(filename, alphabet):
        mat.set(symbols[0], symbols[1], score)
    return mat
-def readFastaString(string, alphabet, gappy = False):
+"""
+Below are some useful methods for loading data from strings and files.
+Recognize the FASTA and Clustal formats (nothing fancy).
+"""
+def readFastaString(string, alphabet):
    """ Read the given string as FASTA formatted data and return the list of
    sequences contained within it. """
-    seqlist = []    # list of sequences contained in the string 
+    seqlist = []    # list of sequences contained in the string
-    seqname = ''  # name of *current* sequence 
+    seqname = None  # name of *current* sequence
-    seqannot = '' # annotation of *current* sequence 
    seqdata = []    # sequence data for *current* sequence
    for line in string.splitlines():    # read every line
        if len(line) == 0:              # ignore empty lines
            continue
-        if line[0] == '>':  # start of new sequence            
+        if line[0] == '>':  # start of new sequence
            if seqname:     # check if we've got one current
-                current = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
+                current = Sequence(''.join(seqdata), alphabet, seqname)
                seqlist.append(current)
            # now collect data about the new sequence
-            parts = line[1:].split() # skip first char
+            seqname = line[1:].split()[0] # skip first char
-            seqname = ''  # name of *current* sequence 
-            seqannot = '' # annotation of *current* sequence 
-            if len(parts) > 0: seqname = parts[0] 
-            if len(parts) > 1: seqannot = line[len(seqname) + 2:] # the rest of the line
            seqdata = []
        else:               # we assume this is (more) data for current
            cleanline = line.split()
@@ -470,26 +299,26 @@ def readFastaString(string, alphabet, gappy = False):
                seqdata.extend(tuple(thisline.strip('*')))
    # we're done reading the file, but the last sequence remains
    if seqname:
-        lastseq = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
+        lastseq = Sequence(''.join(seqdata), alphabet, seqname)
        seqlist.append(lastseq)
    return seqlist
-def readFastaFile(filename, alphabet, gappy = False):
+def readFastaFile(filename, alphabet):
-    """ Read the given FASTA formatted file and return the list of sequences 
+    """ Read the given FASTA formatted file and return the list of sequences
    contained within it. """
    fh = open(filename)
    data = fh.read()
    fh.close()
-    seqlist = readFastaString(data, alphabet, gappy)
+    seqlist = readFastaString(data, alphabet)
    return seqlist
 def writeFastaFile(filename, seqs):
    """ Write the specified sequences to a FASTA file. """
    fh = open(filename, 'w')
    for seq in seqs:
-        fh.write(seq.writeFasta())
+        fh.write(str(seq))
    fh.close()
 def readClustalString(string, alphabet):
    """ Read a ClustalW2 alignment in the given string and return as an
    Alignment object. """
@@ -504,12 +333,12 @@ def readClustalString(string, alphabet):
            continue
        sections = line.split()
        name, seq = sections[0:2]
-        if seqs.has_key(name):
+        if name in seqs:
            seqs[name] += seq
        else:
            seqs[name] = seq
    sequences = []
-    for name, seq in seqs.items():
+    for name, seq in list(seqs.items()):
        sequences.append(Sequence(seq, alphabet, name, gappy = True))
    return Alignment(sequences)
@@ -522,17 +351,6 @@ def readClustalFile(filename, alphabet):
    aln = readClustalString(data, alphabet)
    return aln
-def writeClustalFile(filename, aln):
-    """ Write the specified alignment to a Clustal file. """
-    fh = open(filename, 'w')
-    fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
-    fh.write(aln.writeClustal())
-    fh.close()
-###############################################################################
-# GeneProfile                                                                 #
-###############################################################################
 class GeneProfile():
    """ A class for gene expression data.
    Example usage:
@@ -557,7 +375,7 @@ class GeneProfile():
    def getSorted(self, index, descending=True):
        """Get a list of (gene, value) tuples in descending order by value"""
        key_fn = lambda v: v[1][index]
-        return sorted(self.genes.items(), key=key_fn, reverse=descending)
+        return sorted(list(self.genes.items()), key=key_fn, reverse=descending)
    def addSample(self, sample_name, sample_dict):
        """Add a sample to the current data set.
           sample_dict is a dictionary with the same keys as the current gene set.
@@ -579,7 +397,7 @@ class GeneProfile():
        if isinstance(sample_name, str):    # a single sample-name
            mysamples = [sample_name]
        else:                               # a list of sample-names
-            mysamples = sample_name         
+            mysamples = sample_name
        for gene in self.genes:
            mygenes[gene] = []
            for name in mysamples:
@@ -629,7 +447,7 @@ def readGeoFile(filename, id_column = 0):
    manylines = fh.read()
    fh.close()
    data_rows = False  # Indicates whether we're reading the data section or metadata
-    name = 'Unknown'
+    name = u'Unknown'
    cnt_data = 0
    for line in manylines.splitlines():
        if line.startswith('^DATASET'):
@@ -662,28 +480,28 @@ def readGeoFile(filename, id_column = 0):
                    continue
            if not ignore:
                dataset[id] = tuple(values)
-    print 'Data set %s contains %d genes' % (name, len(dataset.genes))
+    print('Data set %s contains %d genes' % (name, len(dataset.genes)))
    return dataset
-###############################################################################
+"""
-# Web service methods that find data in online databases. 
+Web service methods that find data in online databases.
-# Our implementations are mainly serviced by EBI.
+Our implementations are mainly serviced by EBI.
-###############################################################################
+"""
 def getSequence(entryId, dbName, alphabet):
    """ Retrieve a single entry from a database
-    entryId: ID for entry e.g. 'P63166' (Uniprot Accession) or 'SUMO1_MOUSE' (Uniprot Identifier) 
+    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
    dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'.
    See: http://www.uniprot.org/faq/28. """
-    url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' +\
+    if not isinstance(entryId, str):
-    dbName + '&format=fasta&id=' + entryId
+        entryId = entryId.decode("utf-8")
+    url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=fasta&id=' + entryId
    try:
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
-        return readFastaString(data, alphabet)[0]
+        return readFastaString(data.decode("utf-8"), alphabet)[0]
-    except urllib2.HTTPError, ex:
+    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
-def searchSequences(query, dbName = 'uniprot'):
+def searchSequences(query, dbName):
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
    query: search term(s) e.g. 'organism:9606+AND+antigen'
@@ -696,42 +514,40 @@ def searchSequences(query, dbName = 'uniprot'):
        url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            return data.splitlines()
-        except urllib2.HTTPError, ex:
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
-        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+        base = u'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            words = data.split("</Id>")
            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
            return words
-        except urllib2.HTTPError, ex:
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    return
 def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
    """
-    Map identifiers between databases (based on UniProtKB; 
+    Map identifiers between databases (based on UniProtKB;
    see http://www.uniprot.org/faq/28)
    identifiers: a list of identifiers (list of strings)
    frm: the abbreviation for the identifier FROM which to idmap
    to: the abbreviation for the identifier TO which to idmap
-    Returns a dictionary with key (from) -> value (to).
+    Returns a dictionary with key (from) -> value (to) """
-    ACC is Uniprot Accession (e.g. 'P42813').
+    url = u'http://www.uniprot.org/mapping/'
-    """
-    url = 'http://www.uniprot.org/mapping/'
    # construct query by concatenating the list of identifiers
    if isinstance(identifiers, str):
        query = identifiers.strip()
    else: # assume it is a list of strings
-        query = ''
+        query = u''
        for id in identifiers:
            query = query + id.strip() + ' '
        query = query.strip() # remove trailing spaces
@@ -742,8 +558,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
        'query' : query
    }
    if len(query) > 0:
-        request = urllib2.Request(url, urllib.urlencode(params))
+        request = urllib.request.Request(url, urllib.parse.urlencode(params))
-        response = urllib2.urlopen(request).read()
+        response = urllib.request.urlopen(request).read()
        d = dict()
        for row in response.splitlines()[1:]:
            pair = row.split('\t')
@@ -752,43 +568,42 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
    else:
        return dict()
-###############################################################################
+"""
-# Gene Ontology services.
+Gene Ontology services.
-# See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
+See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
-###############################################################################
+"""
 def getGODef(goterm):
    """
    Retrieve information about a GO term
    goterm: the identifier, e.g. 'GO:0002080'
    """
    # Construct URL
-    url = 'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
+    url = u'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
    # Get the entry: fill in the fields specified below
    try:
        entry={'id': None, 'name': None, 'def': None}
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
        for row in data.splitlines():
            index = row.find(':')
            if index > 0 and len(row[index:]) > 1:
                field = row[0:index].strip()
                value = row[index+1:].strip(' "') # remove spaces
-                if field in entry.keys():         # check if we need field
+                if field in list(entry.keys()):         # check if we need field
                    if entry[field] == None:      # check if assigned
                        entry[field] = value
        return entry
-    except urllib2.HTTPError, ex:
+    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
 def getGOTerms(genes, db='UniProtKB'):
    """
    Retrieve all GO terms for a given set of genes (or single gene).
-    db: use specified database, e.g. 'UniProtKB', 'UniGene', 
+    db: use specified database, e.g. 'UniProtKB', 'UniGene',
    or 'Ensembl'.
-    The result is given as a map (key=gene name, value=list of unique 
+    The result is given as a map (key=gene name, value=list of unique
    terms) OR in the case of a single gene as a list of unique terms.
    """
-    if type(genes) != list and type(genes) != set and type(genes) != tuple:   
+    if type(genes) != list and type(genes) != set and type(genes) != tuple:
        genes = [genes]  # if 'genes' is a single gene, we make a single item list
    map = dict()
    uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein='
@@ -796,13 +611,13 @@ def getGOTerms(genes, db='UniProtKB'):
        terms = set()  # empty result set
        url = uri + gene.strip() # Construct URL
        try: # Get the entry: fill in the fields specified below
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            for row in data.splitlines()[1:]:  # we ignore header row
                values = row.split('\t')
                if len(values) >= 7:
                    terms.add(values[6]) # add term to result set
            map[gene] = list(terms)      # make a list of the set
-        except urllib2.HTTPError, ex:
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    if len(genes) == 1:
        return map[genes[0]]
@@ -811,12 +626,12 @@ def getGOTerms(genes, db='UniProtKB'):
 def getGenes(goterms, db='UniProtKB', taxo=None):
    """
-    Retrieve all genes/proteins for a given set of GO terms 
+    Retrieve all genes/proteins for a given set of GO terms
    (or single GO term).
-    db: use specified database, e.g. 'UniProtKB', 'UniGene', 
+    db: use specified database, e.g. 'UniProtKB', 'UniGene',
    or 'Ensembl'
    taxo: use specific taxonomic identifier, e.g. 9606 (human)
-    The result is given as a map (key=gene name, value=list of unique 
+    The result is given as a map (key=gene name, value=list of unique
    terms) OR in the case of a single gene as a list of unique terms.
    """
    if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
@@ -831,502 +646,15 @@ def getGenes(goterms, db='UniProtKB', taxo=None):
        genes = set()   # start with empty result set
        url = uri + goterm.strip() # Construct URL
        try: # Get the entry: fill in the fields specified below
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
                    genes.add(values[1])  # add gene name to result set
            map[goterm] = list(genes)
-        except urllib2.HTTPError, ex:
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return map[goterms[0]]
    else:
        return map
-###############################################################################
-# PhyloTree                                                                   #
-###############################################################################
-class PhyloTree:
-    """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
-        Functionality includes labelling and traversing nodes; reading and writing to Newick format;
-        association with sequence alignment; maximum parsimony inference of ancestral sequence;
-        generation of single, bifurcating rooted tree by UPGMA.
-        Known issues: Binary only; Parsimony does not handle gaps in alignment.
-        Programmers should note that almost all functionality is implemented through recursion. """
-    def __init__(self, root):
-        """ Create a tree from a node that is "root" in the tree."""
-        self.root = root
-    def putAlignment(self, aln):
-        """ Associate the tree with a set of sequences/alignment.
-            Involves assigning the sequence to the leaf nodes. """
-        self.aln = aln
-        self.root._assignAlignment(aln)
-    def __str__(self):
-        """ Produce a printable representation of the tree, specifically the root of the tree. """
-        return str(self.root)
-    def strSequences(self, start = None, end = None):
-        """ Produce a sequence representation of the tree, specifically the root of the tree.
-            Specify the start and end positions in the alignment for the sequence to be printed
-            (if None the min and max positions will be used). """
-        if self.aln != None:
-            my_start = start or 0
-            my_end = end or self.aln.alignlen
-            return self.root._printSequences(my_start, my_end)
-    def findLabel(self, label):
-        """ Retrieve/return the node with the specified label.
-            Returns None if not found."""
-        return self.root._findLabel(label)
-    def getDescendantsOf(self, node, transitive = False):
-        """ Retrieve and return the (list of) descendants (children) of a specified node.
-            Node can be the label or the instance.
-            transitive indicates if only the direct descendants (False) or if all descendants
-            should be returned.
-            If node does not exist, None is returned.
-            If node has no descendants, an empty list will be returned."""
-        if not isinstance(node, PhyloNode):
-            node = self.root.findLabel(node)
-        if node:
-            return node.getDescendants(transitive)
-        return None
-    def getAncestorsOf(self, node, transitive = False):
-        """ Retrieve and return the ancestor (transitive=False) or
-            ancestors (transitive=True) of a specified node.
-            Node can be the label or the instance.
-            If node does not exist, None is returned.
-            If node is the root of the tree, None is returned."""
-        if not isinstance(node, PhyloNode):
-            node = self.root.findLabel(node)
-        if node:
-            myroot = self.root
-            found = False
-            branching = []
-            while not found and myroot != None:
-                branching.append(myroot)
-                if myroot.left == node or myroot.right == node:
-                    found = True
-                    break
-                if myroot.left:
-                    if myroot.left.isAncestorOf(node, transitive = True):
-                        myroot = myroot.left
-                    else: # must be right branch then...
-                        myroot = myroot.right
-                else: # must be right branch then...
-                    myroot = myroot.right
-            if found and transitive:
-                return branching
-            elif found and len(branching) > 0:
-                return branching[len(branching)-1]
-            return None
-    def parsimony(self):
-        """ Solve the "small parsimony problem",
-            i.e. find the sequences on each of the internal nodes.
-            See Jones and Pevzner, p. 368 and onwards, for details. """
-        self.root._forwardParsimony(self.aln)  # setup and compute scores for all nodes
-        self.root._backwardParsimony(self.aln) # use scores to determine sequences
-        return self.root.getSequence() # return the sequence found at the root
-###############################################################################
-# PhyloNode                                                                   #
-###############################################################################
-class PhyloNode:
-    """ A class for a node in a rooted, binary (bifurcating) tree.
-        Contains pointers to descendants/daughters (left and right),
-        optional fields include data, label, sequence and dist.
-        If parsimony is used scores and traceback pointers are available.
-        A number of methods are named with a _ prefix. These can be, but
-        are not intended to be used from outside the class. """
-    def __init__(self, label = ''):
-        """ Initialise an initially unlinked node.
-            Populate fields left and right to link it with other nodes.
-            Set label to name it.
-            Use field data for any type of information associated with node.
-            Use dist to indicate the distance to its parent (if any).
-            Other fields are used internally, including sequence for associated alignment,
-            seqscores, backleft and backright for maximum parsimony. """
-        self.left = None
-        self.right = None
-        self.data = None
-        self.label = label
-        self.dist = None
-        self.sequence = None # The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
-        self.seqscores = None # The scores propagated from leaves via children
-        self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
-        self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
-    def __str__(self):
-        """ Returns string with node (incl descendants) in a Newick style. """
-        left = right = label = dist = ''
-        if self.left:
-            left = str(self.left)
-        if self.right:
-            right = str(self.right)
-        if self.dist or self.dist == 0.0:
-            dist = ':' + str(self.dist)
-        if self.label != None:
-            label = str(self.label)
-            if not self.left and not self.right:
-                return label + dist
-            else:
-                return '(' + left + ',' + right + ')' + label + dist
-        else: # there is no label
-            if not self.left and self.right:
-                return ','+right
-            elif self.left and not self.right:
-                return left+','
-            elif self.left and self.right:
-                return '(' + left + ',' + right + ')' + dist
-    def _printSequences(self, start, end):
-        """ Returns string with node (incl descendants) in a Newick style. """
-        left = right = label = dist = ''
-        if self.left:
-            left = self.left._printSequences(start, end)
-        if self.right:
-            right = self.right._printSequences(start, end)
-        if self.dist:
-            dist = ':' + str(self.dist)
-        if self.sequence != None:
-            label = "".join(self.sequence[start:end]) + ""
-            if not self.left and not self.right:
-                return label + dist
-            else:
-                return '(' + left + ',' + right + ')' + label + dist
-        else: # there is no label
-            if not self.left and self.right:
-                return ','+right
-            elif self.left and not self.right:
-                return left+','
-            elif self.left and self.right:
-                return '(' + left + ',' + right + ')' + dist
-    def _findLabel(self, label):
-        """ Find a node by label at this node or in any descendants (recursively). """
-        if self.label == label:
-            return self
-        else:
-            if self.left:
-                foundLeft = self.left._findLabel(label)
-                if foundLeft:
-                    return foundLeft
-            if self.right:
-                return self.right._findLabel(label)
-            return None
-    def _propagateDistance(self, parent_dist):
-        """ Convert absolute distances to relative.
-            The only parameter is the absolute distance to the parent of this node. """
-        travelled = self.dist               # absolute distance to this node
-        self.dist = parent_dist - self.dist # relative distance to this node
-        if self.left != None:               # if there is a child node...
-            self.left._propagateDistance(travelled) # pass absolute distance to this node
-        if self.right != None:
-            self.right._propagateDistance(travelled)
-    def _assignAlignment(self, aln):
-        """ Assign an alignment to the node, which implies assigning a sequence to it if one is
-            available in the alignment. """
-        self.sequence = None
-        if self.left != None:
-            self.left._assignAlignment(aln)
-        if self.right != None:
-            self.right._assignAlignment(aln)
-        for seq in aln.seqs:
-            if seq.name == self.label:
-                self.sequence = seq
-                break
-    def _forwardParsimony(self, aln):
-        """ Internal function that operates recursively to first initialise each node (forward),
-            stopping only once a sequence has been assigned to the node,
-            then to propagate scores from sequence assigned nodes to root (backward). """
-        if self.sequence == None: # no sequence has been assigned
-            if self.left == None and self.right == None:    # no children, so terminal, cannot propagate scores
-                raise RuntimeError("No sequence assigned to leaf node:", self.label)
-            scoresleft = scoresright = None
-            if self.left != None:
-                scoresleft = self.left._forwardParsimony(aln)
-            if self.right != None:
-                scoresright = self.right._forwardParsimony(aln)
-            # for each position in the alignment,
-            # introduce (initially zero) score for each symbol in alphabet
-            self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
-            # for each position in the alignment,
-            # allocate a position to put the left child symbol from which each current node symbol score was determined
-            self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
-            # allocate a position to put the right child symbol from which each current node symbol score was determined
-            self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
-            for col in range(aln.alignlen):
-                for a_parent in range(len(aln.alphabet)):
-                    best_score_left = +9999999
-                    best_score_right = +9999999
-                    best_symb_left = 0
-                    best_symb_right = 0
-                    for a_left in range(len(aln.alphabet)):
-                        score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
-                        if score < best_score_left:
-                            best_symb_left = a_left
-                            best_score_left = score
-                    for a_right in range(len(aln.alphabet)):
-                        score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
-                        if score < best_score_right:
-                            best_symb_right = a_right
-                            best_score_right = score
-                    self.seqscores[col][a_parent] = best_score_left + best_score_right
-                    self.backleft[col][a_parent] = best_symb_left
-                    self.backright[col][a_parent] = best_symb_right
-        else:
-            self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
-        return self.seqscores
-    def _backwardParsimony(self, aln, seq = None):
-        """ Internal function that operates recursively to inspect scores to determine
-            most parsimonious sequence, from root to leaves. """
-        if self.sequence == None: # no sequence has been assigned
-            leftbuf = []
-            rightbuf = []
-            if self.left == None and self.right == None:    # no children, so terminal, cannot propagate scores
-                raise RuntimeError("No sequence assigned to leaf node:", self.label)
-            if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
-                currbuf = []
-                for col in range(aln.alignlen):
-                    min_score = 999999
-                    min_symb = None
-                    left_symb = None
-                    right_symb = None
-                    for a_parent in range(len(aln.alphabet)):
-                        if self.seqscores[col][a_parent] < min_score:
-                            min_score = self.seqscores[col][a_parent]
-                            min_symb = a_parent
-                            left_symb = self.backleft[col][a_parent]
-                            right_symb = self.backright[col][a_parent]
-                    currbuf.append(aln.alphabet[min_symb])
-                    leftbuf.append(aln.alphabet[left_symb])
-                    rightbuf.append(aln.alphabet[right_symb])
-                self.sequence = Sequence(currbuf, aln.alphabet, self.label, gappy = True)
-            else: # Non-root, but not leaf
-                self.sequence = seq
-                col = 0
-                for sym_parent in self.sequence:
-                    a_parent = aln.alphabet.index(sym_parent)
-                    left_symb = self.backleft[col][a_parent]
-                    right_symb = self.backright[col][a_parent]
-                    leftbuf.append(aln.alphabet[left_symb])
-                    rightbuf.append(aln.alphabet[right_symb])
-                    col += 1
-            self.left._backwardParsimony(aln, Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
-            self.right._backwardParsimony(aln, Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
-        return self.sequence
-    def getSequence(self):
-        """ Get the sequence for the node. Return None if no sequence is assigned.
-            Requires that an alignment is associated with the tree, and that sequence names match node labels.
-            If the explored node is not a leaf, the sequence can be determined by parsimony. """
-        if self.sequence != None: # a sequence has been assigned
-            return self.sequence
-        elif self.seqscores != None: # inferred by parsimony but not yet assigned
-            return None # determine most parsimonous sequence, not yet implemented
-    def isAncestorOf(self, node, transitive = True):
-        """ Decide if this node is the ancestor of specified node.
-            If transitive is True (default), all descendants are included.
-            If transitive is False, only direct descendants are included. """
-        if node == self.left or node == self.right:
-            return True
-        elif transitive:
-            if self.left:
-                statusLeft = self.left.isAncestorOf(node, transitive)
-                if statusLeft: return True
-            if self.right:
-                return self.right.isAncestorOf(node, transitive)
-        else:
-            return False
-    def getDescendants(self, transitive = False):
-        """ Retrieve and return (list of) nodes descendant of this.
-            If transitive is False (default), only direct descendants are included.
-            If transitive is True, all descendants are (recursively) included. """
-        children = []
-        if self.left:
-            children.append(self.left)
-        if self.right:
-            children.append(self.right)
-        if not transitive:
-            return children
-        else:
-            grandchildren = []
-            for c in children:
-                d = c.getDescendants(transitive)
-                if d:
-                    grandchildren.extend(d)
-            children.extend(grandchildren)
-            return children
-###############################################################################
-# Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
-# Methods for processing files of trees on the Newick format
-###############################################################################
-def runUPGMA(aln, measure, absoluteDistances = False):
-    """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
-        Use specified distance metric (see sequence.calcDistances).
-        If absoluteDistances is True, the tree will be assigned the total distance from provided species.
-        Otherwise, the relative addition at each path will be assigned."""
-    D = {}
-    N = {} # The number of sequences in each node
-    M = aln.calcDistances(measure) # determine all pairwise distances
-    nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
-    """ For each node-pair, assign the distance between them. """
-    for i in range(len(nodes)):
-        nodes[i].sequence = aln.seqs[i]
-        nodes[i].dist = 0.0
-        N[nodes[i]] = 1 # each cluster contains a single sequence
-        for j in range(0, i):
-            D[_getkey(nodes[i], nodes[j])] = M[i, j]
-    """ Now: treat each node as a cluster,
-        until there is only one cluster left,
-        find the *closest* pair of clusters, and
-        merge that pair into a new cluster (to replace the two that merged).
-        In each case, the new cluster is represented by the (phylo)node that is formed. """
-    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
-        closest_pair = (None, None)    # The two nodes that are closest to one another according to supplied metric
-        closest_dist = None            # The distance between them
-        for pair in D:                 # check all pairs which should be merged
-            dist = D[pair]
-            if dist < closest_dist or closest_dist == None:
-                closest_dist = dist
-                closest_pair = pair
-        # So we know the closest, now we need to merge...
-        x = closest_pair[0]            # See Zvelebil and Baum p. 278 for notation
-        y = closest_pair[1]
-        z = PhyloNode()                # create a new node for the cluster z
-        z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
-        Nx = N.pop(x)                  # find number of sequences in x, remove the cluster from list N
-        Ny = N.pop(y)                  # find number of sequences in y, remove the cluster from list N
-        dz = {}                        # new distances to cluster z
-        for w in N:                    # for each node w ...
-            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
-            dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
-            dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
-            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
-        N[z] = Nx + Ny                 # total number of sequences in new cluster, insert new cluster in list N
-        for w in dz:                   # we have to run through the nodes again, now not including the removed x and y
-            D[_getkey(z, w)] = dz[w]   # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
-        z.left = x                     # link the phylogenetic tree
-        z.right = y
-        nodes.append(z)
-    if not absoluteDistances:
-        x._propagateDistance(z.dist)   # convert absolute distances to relative by recursing down left path
-        y._propagateDistance(z.dist)   # convert absolute distances to relative by recursing down right path
-        z.dist = 0.0                   # root z is at distance 0 from merged x and y
-    return PhyloTree(z)                # make it to tree, return
-def _getkey(node1, node2):
-    """ Construct canonical (unordered) key for two symbols """
-    if node1 <= node2:
-        return tuple([node1, node2])
-    else:
-        return tuple([node2, node1])
-def _findComma(string, level = 0):
-    """ Find first comma at specified level of embedding """
-    mylevel = 0
-    for i in range(len(string)):
-        if string[i] == '(':
-            mylevel += 1
-        elif string[i] == ')':
-            mylevel -= 1
-        elif string[i] == ',' and mylevel == level:
-            return i
-    return -1
-def parseNewickNode(string):
-    """ Utility function that recursively parses embedded string using Newick format. """
-    first = string.find('(')
-    last = string[::-1].find(')') # look from the back
-    if first == -1 and last == -1: # we are at leaf
-        y = string.split(':')
-        node = PhyloNode(y[0])
-        if len(y) >= 2:
-            node.dist = float(y[1])
-        return node
-    elif first >= 0 and last >= 0:
-        # remove parentheses
-        last = len(string) - last - 1 # correct index to refer from start instead of end of string
-        embed = string[first + 1:last]
-        tail = string[last + 1:]
-        # find where corresp comma is
-        comma = _findComma(embed)
-        if comma == -1:
-            raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
-        left = embed[0:comma].strip()
-        right = embed[comma + 1:].strip()
-        y = tail.split(':')
-        node = PhyloNode(y[0])
-        if len(y) >= 2:
-            node.dist = float(y[1])
-        node.left = parseNewickNode(left)
-        node.right = parseNewickNode(right)
-        return node
-    else:
-        raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
-def parseNewick(string):
-    """ Main method for parsing a Newick string into a (phylogenetic) tree.
-        Handles labels (on both leaves and internal nodes), and includes distances (if provided).
-        Returns an instance of a PhyloTree. """
-    if string.find(';') != -1:
-        string = string[:string.find(';')]
-    return PhyloTree(parseNewickNode(string))
-def readNewickFile(filename):
-    """ Read file on Newick format.
-        Returns an instance of a PhyloTree."""
-    f = open(filename)
-    string = ''.join(f)
-    return parseNewick(string)
-def writeNewickFile(filename, tree):
-    """ Write the specified tree to a Newick file. """
-    fh = open(filename, 'w')
-    fh.write(tree.__str__())
-    fh.close()
-###############################################################################
-# Below is code that will be run if the module is "run", and not just "imported".
-###############################################################################
-if __name__=='__main__':
-    x = Sequence('ACTGA', DNA_Alphabet, 'x')
-    print "Sequence", x, "is constructed from the symbols", x.alphabet.symbols
-    print "( There are", x.count('A'), "occurrences of the symbol 'A' in", x.sequence, ")" 
-    y = Sequence('TACGA', DNA_Alphabet, 'y')
-    print "Sequence", y, "is constructed from the symbols", y.alphabet.symbols
-    print 
-    print "( The sub-sequence 'CG' starts at index", y.find('CG'), "of", y.sequence, ")"
-    print
-    sm = SubstMatrix(DNA_Alphabet)
-    for a in DNA_Alphabet:
-        for b in DNA_Alphabet:
-            if a==b: 
-                sm.set(a, b, +2) # match
-            else: 
-                sm.set(a, b, -1) # mismatch
-    print "Below is a substitution matrix for the alphabet", DNA_Alphabet.symbols
-    print sm
-    print
-    aln = align(x, y, sm, -2)
-    print "Below is the alignment between x and y"
-    print aln
\ No newline at end of file
--- a/ml.py
+++ b/ml.py
@@ -21,7 +21,7 @@ class NN():
        self.b_hid  = numpy.random.randn(nHidden)           # biases hidden layer
        self.w_out  = numpy.random.randn(nOutput, nHidden)  # weights hid -> out
        self.b_out  = numpy.random.randn(nOutput)           # biases output layer
-        print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output))
+        print("Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)))
    def writeFile(self, filename):
        """ Save NN to a file. """
@@ -110,7 +110,7 @@ class NN():
            multi_targ  = [ target ]
        for i in range(niter):
            mse = 0.0
-            entries = range(len(multi_input))
+            entries = list(range(len(multi_input)))
            if shuffle:
                random.shuffle(entries)
            for p in entries:

--- a/phylo.py
+++ b/phylo.py
@@ -2,7 +2,7 @@
 Module with methods and classes for phylogeny.
 @author: mikael
 '''
-##import sequence
+import sequence
 class PhyloTree:
    """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
@@ -140,7 +140,19 @@ class PhyloNode:
                return left+','
            elif self.left and self.right:
                return '(' + left + ',' + right + ')' + dist
+    def __le__(self, other):
+        """ Returns indication of less than other node. """
+        return other and self.__hash__() <= other.__hash__()
+    def __eq__(self, other):
+        """ Returns indication of equivalence to other node. """
+        return other and self.__hash__() == other.__hash__()
+    def __hash__(self):
+        """ Returns hash of object. """
+        return hash((self.label, self.dist, self.sequence))
    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style. """
        left = right = label = dist = ''
@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
        find the *closest* pair of clusters, and
        merge that pair into a new cluster (to replace the two that merged).
        In each case, the new cluster is represented by the (phylo)node that is formed. """
-    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
+    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
        closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
        closest_dist = None         # The distance between them
        for pair in D:              # check all pairs which should be merged
            dist = D[pair]
-            if dist < closest_dist or closest_dist == None:
+            if closest_dist == None or dist < closest_dist:
                closest_dist = dist
                closest_pair = pair
        # So we know the closest, now we need to merge...
@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
        y = closest_pair[1]
        z = PhyloNode()             # create a new node for the cluster z
        z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
-        Nx = N.pop(x)               # find number of sequences in x, remove the cluster from list N
+        Nx = N.pop(x, None)         # find number of sequences in x, remove the cluster from list N
-        Ny = N.pop(y)               # find number of sequences in y, remove the cluster from list N
+        Ny = N.pop(y, None)         # find number of sequences in y, remove the cluster from list N
+        if Nx == None or Ny == None:
+        	continue
        dz = {}                     # new distances to cluster z
        for w in N:                 # for each node w ...
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)

--- a/prob.py
+++ b/prob.py
@@ -277,7 +277,7 @@ def _readDistrib(linelist):
    if len(d) == 0:
        return None
    alpha = Alphabet(symstr)
-    if '*' in d.keys(): # tot provided
+    if '*' in list(d.keys()): # tot provided
        for sym in d:
            if sym != '*':
                d[sym] = d[sym] * d['*']
@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
                ncol = len(counts)
                if len(name) == 1: # proper symbol
                    symcount[name] = counts
-        alpha = Alphabet(''.join(symcount.keys()))
+        alpha = Alphabet(''.join(list(symcount.keys())))
        distribs = []
        for col in range(ncol):
            d = dict([(sym, symcount[sym][col]) for sym in symcount])
@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
    """
    d = readMultiCounts(filename, format=format)
    if len(d) > 0:
-        return d.values()[0]
+        return list(d.values())[0]
 #################################################################################################
 # Joint class
@@ -628,12 +628,12 @@ class IndepJoint(Joint):
    def displayMatrix(self, count = False):
        """ Pretty-print matrix """
-        print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))
+        print((" \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))))
        for a in self.alphas[0]:
            if count:
-                print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))
+                print(("%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))))
            else:
-                print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))
+                print(("%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))))
    def __str__(self):
        """ Text representation of the table. Note that size is an issue so big tables
@@ -718,5 +718,3 @@ class NaiveBayes():
                prob *= condprob[i][key[i]] or 0.0
            out.observe(outsym, prob)
        return out
--- a/sam.py
+++ b/sam.py
@@ -37,7 +37,7 @@ def base_percentages(reads):
        for nuc in seq:
            all_seqs.append(nuc)
    counts=dict(Counter(all_seqs))
-    nucs=counts.keys()
+    nucs=list(counts.keys())
    freqs={}
    for nuc in nucs:
        freqs[nuc]=float(counts[nuc])/sum(counts.values())
@@ -67,7 +67,7 @@ you will get an accurate "percentage of reads aligned" statistic.
        mapped=len(mapped)+len(mapped)
    else:
        mapped=len(mapped)
-    print "number of mapped reads",mapped
+    print("number of mapped reads",mapped)
    return store_reads
@@ -108,9 +108,9 @@ def subgroups(mapped_reads):
            group3.append(read)
        else:
            pass
-    print len(group1),"in p<1e-3 group"
+    print(len(group1),"in p<1e-3 group")
-    print len(group2),"in 1e-3<=p<1e-2 group"
+    print(len(group2),"in 1e-3<=p<1e-2 group")
-    print len(group3),"in 1e-2<=p<1 group"
+    print(len(group3),"in 1e-2<=p<1 group")
    return group1,group2,group3
@@ -124,7 +124,7 @@ def dinuc_freq(mapped_reads):
        for nuc in seq:
            all_seqs.append(nuc)
    counts=dict(Counter(all_seqs))
-    nucs=counts.keys()
+    nucs=list(counts.keys())
    freqs={}
    for nuc in nucs:
        freqs[nuc]=float(counts[nuc])/sum(counts.values())
@@ -135,7 +135,7 @@ def dinuc_freq(mapped_reads):
        for nuc in seq:
            all_seqs.append(nuc)
    counts=dict(Counter(all_seqs))
-    dinucs=counts.keys()
+    dinucs=list(counts.keys())
    dinuc_counts={}
    for i in dinucs:
        val=float(counts[i])/sum(counts.values())
@@ -178,11 +178,11 @@ Calculations are based on the the length of the (possibly hard-clipped) sequence
                length=int(read[8])
            lengths.append(length)
        mean_len=np.mean(lengths)
-        print "group"+str(i+1)+"mean",mean_len
+        print("group"+str(i+1)+"mean",mean_len)
        max_len=np.max(lengths)
-        print "group"+str(i+1)+"max length",max_len
+        print("group"+str(i+1)+"max length",max_len)
        min_len=np.min(lengths)
-        print "group"+str(i+1)+"min length",min_len
+        print("group"+str(i+1)+"min length",min_len)
        data.append(["group"+str(i+1),mean_len,max_len,min_len])
    return data
@@ -221,15 +221,15 @@ def plot_base_composition(reads,sym):
        all_nucs.append(nucs)
    all_items=[]
    counts=[]
-    pos=range(1,len(seq)+1)
+    pos=list(range(1,len(seq)+1))
    for dicts in all_nucs:
-        for item in dicts.items():
+        for item in list(dicts.items()):
            all_items.append(item)
    all_items.sort(key=operator.itemgetter(0))
-    groups= [map(operator.itemgetter(1),list(group)) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
+    groups= [list(map(operator.itemgetter(1),list(group))) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
    for group in groups:
        counts.append(group.count(sym))
-    print counts
+    print(counts)
    plt.figure(1, figsize=(8,8))
    ax = plt.axes([0.1, 0.1, 0.8, 0.8])
    plt.bar(pos,counts,facecolor='g')
@@ -261,7 +261,7 @@ def raw_count_reader(filename):
 def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
    """provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias"""
    all_rpkms=[];final={}
-    for i,s,ii,ss,v in data.values():
+    for i,s,ii,ss,v in list(data.values()):
        rpkms=[]
        num_mapped_reads=[num_map1,num_map2,num_map3,num_map4]
        vals=[i,s,ii,ss]
@@ -276,15 +276,15 @@ def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
                rpkms.append(rpkm)
        all_rpkms.append(rpkms)
    #return gene names and rpkms
-    for i in range(0,len(data.keys())):
+    for i in range(0,len(list(data.keys()))):
-        final[data.keys()[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
+        final[list(data.keys())[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
    return final
 def write_RPKM_data(RPKM_data,filename):
    """write RPKM data to a file"""
    f=open(filename,'w')
    for i in range(0,len(RPKM_data)):
-        f.write("%s\t%d\t%d\t%d\t%d\n"%(RPKM_data.keys()[i],int(RPKM_data.values()[i][0]),int(RPKM_data.values()[i][1]),int(RPKM_data.values()[i][2]),int(RPKM_data.values()[i][3])))
+        f.write("%s\t%d\t%d\t%d\t%d\n"%(list(RPKM_data.keys())[i],int(list(RPKM_data.values())[i][0]),int(list(RPKM_data.values())[i][1]),int(list(RPKM_data.values())[i][2]),int(list(RPKM_data.values())[i][3])))
    f.close()
@@ -316,13 +316,13 @@ def plotreprpkm(rpkm_data,timepoint):
    one=[]
    two=[]
    if timepoint=="t1":
-        for i in range(0,len(rpkm_data.values())):
+        for i in range(0,len(list(rpkm_data.values()))):
-            one.append(int(rpkm_data.values()[i][0]))
+            one.append(int(list(rpkm_data.values())[i][0]))
-            two.append(int(rpkm_data.values()[i][1]))
+            two.append(int(list(rpkm_data.values())[i][1]))
    else:
-        for i in range(0,len(rpkm_data.values())):
+        for i in range(0,len(list(rpkm_data.values()))):
-            one.append(int(rpkm_data.values()[i][2]))
+            one.append(int(list(rpkm_data.values())[i][2]))
-            two.append(int(rpkm_data.values()[i][3]))
+            two.append(int(list(rpkm_data.values())[i][3]))
    plt.plot(one,two,'o')
    pcc=pearson_def(one,two)
    R2=pcc**2
@@ -343,15 +343,15 @@ def plotMAreprpkm(rpkm_data,timepoint):
    m=[]
    a=[]
    if timepoint=="t1":
-        for i in range(0,len(rpkm_data.values())):
+        for i in range(0,len(list(rpkm_data.values()))):
-            y=np.log2(rpkm_data.values()[i][0]+1)-np.log2(rpkm_data.values()[i][1]+1)
+            y=np.log2(list(rpkm_data.values())[i][0]+1)-np.log2(list(rpkm_data.values())[i][1]+1)
-            x=(np.log2(rpkm_data.values()[i][0]+1)+np.log2(rpkm_data.values()[i][1]+1))/2
+            x=(np.log2(list(rpkm_data.values())[i][0]+1)+np.log2(list(rpkm_data.values())[i][1]+1))/2
            m.append(y)
            a.append(x)
    else:
-        for i in range(0,len(rpkm_data.values())):
+        for i in range(0,len(list(rpkm_data.values()))):
-            y=np.log2(rpkm_data.values()[i][2]+1)-np.log2(rpkm_data.values()[i][3]+1)
+            y=np.log2(list(rpkm_data.values())[i][2]+1)-np.log2(list(rpkm_data.values())[i][3]+1)
-            x=(np.log2(rpkm_data.values()[i][2]+1)+np.log2(rpkm_data.values()[i][3]+1))/2
+            x=(np.log2(list(rpkm_data.values())[i][2]+1)+np.log2(list(rpkm_data.values())[i][3]+1))/2
            m.append(y)
            a.append(x)
    plt.figure(1, figsize=(8,8))
@@ -370,18 +370,18 @@ def plotMAreprpkm(rpkm_data,timepoint):
 def get_cv(data1,condition):
    cvs=[]
    if condition=="t1":
-        for i in range(0,len(data1.values())):
+        for i in range(0,len(list(data1.values()))):
-            mean = np.mean([data1.values()[i][0],data1.values()[i][1]])
+            mean = np.mean([list(data1.values())[i][0],list(data1.values())[i][1]])
-            std=np.std([data1.values()[i][0],data1.values()[i][1]])
+            std=np.std([list(data1.values())[i][0],list(data1.values())[i][1]])
            if mean==0.0 and std==0.0:
                pass
            else:
                cv=float(mean+1)/(std+1)
                cvs.append(cv)
    else:
-        for i in range(0,len(data1.values())):
+        for i in range(0,len(list(data1.values()))):
-            mean = np.mean([data1.values()[i][2],data1.values()[i][3]])
+            mean = np.mean([list(data1.values())[i][2],list(data1.values())[i][3]])
-            std=np.std([data1.values()[i][2],data1.values()[i][3]])
+            std=np.std([list(data1.values())[i][2],list(data1.values())[i][3]])
            if mean==0.0 and std==0.0:
                pass
            else:
@@ -430,7 +430,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
    avg_rpkm2=[]
    sig_logfc2=[]
    sig_avg_rpkm2=[]
-    for i,ii,s,ss in rpkm_data.values():
+    for i,ii,s,ss in list(rpkm_data.values()):
        fc=np.log2(float(s+1)/(i+1))
        if fc<cutoff[0] or fc>cutoff[1]:
            sig_logfc.append(fc)
@@ -438,7 +438,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
        else:
            logfc.append(fc)
            avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
-    for i,ii,s,ss in rpkm_data.values():
+    for i,ii,s,ss in list(rpkm_data.values()):
        fc2=np.log2(float(ss+1)/(ii+1))
        if fc2<cutoff[0] or fc2>cutoff[1]:
            sig_logfc2.append(fc2)
@@ -470,7 +470,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
    avg_rpkm2=[]
    sig_logfc2=[]
    sig_avg_rpkm2=[]
-    for i,ii,s,ss,pval in rpkm_data.values():
+    for i,ii,s,ss,pval in list(rpkm_data.values()):
        fc=np.log2(float(s+1)/(i+1))
        if float(pval)<cutoff:
            sig_logfc.append(fc)
@@ -478,7 +478,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
        else:
            logfc.append(fc)
            avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
-    for i,ii,s,ss,pval in rpkm_data.values():
+    for i,ii,s,ss,pval in list(rpkm_data.values()):
        fc2=np.log2(float(ss+1)/(ii+1))
        if float(pval)<cutoff:
            sig_logfc2.append(fc2)
@@ -506,7 +506,7 @@ def Welcht(rpkm):
    """Performs Welchs T-statistic (one-tailed)"""
    ts=[]
    result={}
-    for i,ii,s,ss in rpkm.values():
+    for i,ii,s,ss in list(rpkm.values()):
        sd1=np.std([i,ii])
        sd2=np.std([s,ss])
        t=(np.mean([s,ss])-np.mean([i,ii]))/(math.sqrt(((float(sd2)/2)+(float(sd1)/2))))
@@ -521,8 +521,8 @@ def Welcht(rpkm):
            pval=pval
            pvals.append(pval)
    corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg")
-    for i in range(0,len(rpkm.values())):
+    for i in range(0,len(list(rpkm.values()))):
-        result[rpkm.keys()[i]]=[rpkm.values()[i][0],rpkm.values()[i][1],rpkm.values()[i][2],rpkm.values()[i][3],corr_pvals[i]]
+        result[list(rpkm.keys())[i]]=[list(rpkm.values())[i][0],list(rpkm.values())[i][1],list(rpkm.values())[i][2],list(rpkm.values())[i][3],corr_pvals[i]]
    return result
@@ -551,7 +551,7 @@ def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-H
            rank = n - i
            pvalue, index = vals
            new_values.append((n/rank) * pvalue)
-        for i in xrange(0, int(n)-1):
+        for i in range(0, int(n)-1):
            if new_values[i] < new_values[i+1]:
                new_values[i+1] = new_values[i]
        for i, vals in enumerate(values):

--- a/seqdata.py
+++ b/seqdata.py
@@ -257,7 +257,7 @@ class BedFile():
            self.rows = entries
        self.format = format
        self.indices = self._createIndices()
    def _read(self, filename, format = 'Limited'):
        """ Read a BED file.
            format: specifies the format of the file,
@@ -276,7 +276,7 @@ class BedFile():
            "Strand", e.g.
                chr4    185772359    185772424    -
                chr18    20513381    20513401    +
-            also supports a 5th label field 
+            also supports a 5th label field
                chr5    20611949        20611949        +       ENSG00000251629_20611949
                chr3    42187863        42187863        -       ENSG00000234562_42187863
            "Summit", e.g.
@@ -361,7 +361,7 @@ class BedFile():
                    acceptHeaderRows -= 1 # count down the number of header rows that can occur
        f.close()
        return rows
    def __iter__(self):
        return self.rows.__iter__()
@@ -381,11 +381,11 @@ class BedFile():
        index_name = {}
        for i in range(len(self.rows)):
            row = self.rows[i]
-            if not index_start.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_start: # seeing chromosome entry first time
                index_start[row.chrom] = []
-            if not index_centre.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_centre: # seeing chromosome entry first time
                index_centre[row.chrom] = []
-            if not index_end.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_end: # seeing chromosome entry first time
                index_end[row.chrom] = []
            index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
            index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
@@ -477,7 +477,7 @@ class BedFile():
            Note that if the name is not unique, the last entry with the name will be returned.
        """
        return self.indices[3][myname]
    def closest(self, myloc, minimum = True):
        """ Find the closest entry in the current BedFile to a given location.
            Return a tuple with the absolute distance and the entry that is closest.
@@ -607,7 +607,7 @@ class BedFile():
                    if not earliest_start: # not yet initialised
                        earliest_start = start
                        latest_end = end
-                    else: 
+                    else:
                        if start > latest_end: # new entry
                            entry = BedEntry(c, earliest_start, latest_end)
                            if self.format == 'Peaks':
@@ -639,7 +639,7 @@ class BedFile():
                        if not earliest_start: # not yet initialised
                            earliest_start = start
                            latest_end = end
-                        else: 
+                        else:
                            if start > latest_end: # new entry
                                entry = BedEntry(c, earliest_start, latest_end)
                                if self.format == 'Peaks':
@@ -660,7 +660,7 @@ class BedFile():
                        entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
                    newrows.append(entry)
        return BedFile(newrows, format = self.format)
    def write(self, filename, format = 'BED6', header = None):
        """ Save the data
            format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
@@ -697,7 +697,7 @@ def readBedFile(filename, format = 'Limited'):
        "Strand", e.g.
            chr4    185772359    185772424    -
            chr18    20513381    20513401    +
-        also supports a 5th label field 
+        also supports a 5th label field
            chr5    20611949        20611949        +       ENSG00000251629_20611949
            chr3    42187863        42187863        -       ENSG00000234562_42187863
        "Summit", e.g.
@@ -714,7 +714,7 @@ def readBedFile(filename, format = 'Limited'):
            chr1    931838    9
    """
    return BedFile(filename, format)
 def writeBedFile(entries, filename, format = 'BED6', header = None):
    """ Save the BED entries to a BED file.
        format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
    for row in entries:
        if format == 'Peaks':
            #f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
-            f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
+            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
        elif format == 'Limited':
-            f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
+            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
        else:
-            f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
+            f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
        f.write("\n")
    f.close()
@@ -760,7 +760,7 @@ try:
 except ImportError:
    strerror = lambda x: 'strerror not supported'
 from os.path import exists
-from itertools import izip
+from itertools import chain
 def true_long_type():
    """
@@ -805,7 +805,7 @@ def base_to_bin(x):
 def create_byte_table():
    """create BYTE_TABLE"""
    d = {}
-    for x in xrange(2**8):
+    for x in range(2**8):
        d[x] = byte_to_bases(x)
    return d
@@ -821,9 +821,9 @@ def split16(x):
 def create_twobyte_table():
    """create TWOBYTE_TABLE"""
    d = {}
-    for x in xrange(2**16):
+    for x in range(2**16):
        c, f = split16(x)
-        d[x] = byte_to_bases(c) + byte_to_bases(f)
+        d[x] = chain(byte_to_bases(c), byte_to_bases(f))
    return d
 BYTE_TABLE = create_byte_table()
@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
    """
    longs_len = len(longs)
    # dna = ctypes.create_string_buffer(array_size)
-    dna = array('c', 'N' * longs_len)
+    dna = array('b', 'N' * longs_len)
    # translate from 32-bit blocks to bytes
    # this method ensures correct endianess (byteswap as neeed)
    bytes = array('B')
@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
    first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
    i = 16 - first_base_offset
    if array_size < i: i = array_size
-    dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
+    dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
    if longs_len == 1: return dna
    # middle blocks (implicitly skipped if they don't exist)
    for byte in bytes[4:-4]:
-        dna[i:i + 4] = array('c', BYTE_TABLE[byte])
+        dna[i:i + 4] = array('b', BYTE_TABLE[byte])
        i += 4
    # last block
-    last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
+    last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
    dna[i:i + last_base_offset] = last_block[0:last_base_offset]
    return dna
@@ -889,7 +889,7 @@ class TwoBitFile(dict):
        self._file_handle = open(foo, 'rb')
        self._load_header()
        self._load_index()
-        for name, offset in self._offset_dict.iteritems():
+        for name, offset in self._offset_dict.items():
            self[name] = TwoBitSequence(self._file_handle, offset,
                                        self._byteswapped)
        return
@@ -926,13 +926,16 @@ class TwoBitFile(dict):
            if remaining == 0: break
            name_size = array('B')
            name_size.fromfile(file_handle, 1)
-            if byteswapped: name_size.byteswap()
+            if byteswapped:
-            name = array('c')
+                name_size.byteswap()
-            if byteswapped: name.byteswap()
+            name = array('b')
+            if byteswapped:
+                name.byteswap()
            name.fromfile(file_handle, name_size[0])
            offset = array(LONG)
            offset.fromfile(file_handle, 1)
-            if byteswapped: offset.byteswap()
+            if byteswapped:
+                offset.byteswap()
            sequence_offsets.append((name.tostring(), offset[0]))
            remaining -= 1
        self._sequence_offsets = sequence_offsets
@@ -943,7 +946,7 @@ class TwoBitFile(dict):
        d = {}
        file_handle = self._file_handle
        byteswapped = self._byteswapped
-        for name, offset in self._offset_dict.iteritems():
+        for name, offset in self._offset_dict.items():
            file_handle.seek(offset)
            dna_size = array(LONG)
            dna_size.fromfile(file_handle, 1)
@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
        if byteswapped: fourbyte_dna.byteswap()
        string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
                                              last_base_offset, region_size)
-        for start, size in izip(n_block_starts, n_block_sizes):
+        for start, size in zip(n_block_starts, n_block_sizes):
            end = start + size
            if end <= min_: continue
            if start > max_: break
@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
            if end > max_: end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('c', 'N'*(end-start))
+            string_as_array[start:end] = array('b', 'N'*(end-start))
        lower = str.lower
        first_masked_region = max(0,
                                  bisect_right(mask_block_starts, min_) - 1)
        last_masked_region = min(len(mask_block_starts),
                                 1 + bisect_right(mask_block_starts, max_,
                                                  lo=first_masked_region))
-        for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
+        for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
                                mask_block_sizes[first_masked_region:last_masked_region]):
            end = start + size
            if end <= min_: continue
@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
            if end > max_: end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
+            string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
        if not len(string_as_array) == max_ - min_:
-            raise RuntimeError, "Sequence was longer than it should be"
+            raise RuntimeError("Sequence was longer than it should be")
        if reverse:
            return self.reverseComplement(string_as_array.tostring())
        return string_as_array.tostring()
@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
        """
        return self.__getslice__(0, None)
-class TwoBitFileError(StandardError):
+class TwoBitFileError(Exception):
    """
    Base exception for TwoBit module
    """

--- a/sequence.py
+++ b/sequence.py
@@ -55,10 +55,11 @@ class Sequence(object):
        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'] """
-        try: # convert sequence data into a compact array representation
+        #try: # convert sequence data into a compact array representation
-            self.sequence = array.array('c', ''.join([s.upper() for s in sequence]))
+        #    self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
-        except TypeError:
+        #except TypeError:
-            raise RuntimeError('Sequence data is not specified correctly: must be iterable')
+        #    raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
+        self.sequence = sequence
        # Assign an alphabet
        self.alphabet = None
@@ -133,15 +134,15 @@ class Sequence(object):
            Calling self.__getitem__(3) is equivalent to self[3]
        """
        if type(ndx) is slice:
-            return self.sequence[ndx].tostring()
+            return ''.join(self.sequence[ndx])
        else:
            return self.sequence[ndx]
    def writeFasta(self):
        """ Write one sequence in FASTA format to a string and return it. """
        fasta = '>' + self.name + ' ' + self.info + '\n'
-        data = self.sequence.tostring()
+        data = ''.join(self.sequence)
-        nlines = (len(self.sequence) - 1) / 60 + 1
+        nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1))
        for i in range(nlines):
            lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
            fasta += lineofseq
@@ -164,7 +165,7 @@ class Sequence(object):
    def find(self, findme):
        """ Find the position of the specified symbol or sub-sequence """
-        return self.sequence.tostring().find(findme)
+        return ''.join(self.sequence).find(findme)
 """
 Below are some useful methods for loading data from strings and files.
@@ -438,8 +439,8 @@ class Alignment():
            column index, entropy, number of gaps, and symbols in order of decreasing probability.
            theta1 is the threshold for displaying symbols in upper case,
            theta2 is the threshold for showing symbols at all, and in lower case. """
-        print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)
+        print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
-        print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)
+        print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
        for col in range(self.alignlen):
            d = Distrib(self.alphabet)
            gaps = 0
@@ -448,21 +449,21 @@ class Alignment():
                    d.observe(seq[col])
                else:
                    gaps += 1
-            print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,
+            print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
            symprobs = d.getProbsort()
            (_, maxprob) = symprobs[0]
            if maxprob >= theta1:
-                print "%d\tTRUE\t" % int(maxprob * 100),
+                print(("%d\tTRUE\t" % int(maxprob * 100),))
            else:
-                print "%d\t\t" % int(maxprob * 100),
+                print(("%d\t\t" % int(maxprob * 100),))
            for (sym, prob) in symprobs:
                if prob >= theta1:
-                    print sym, "%d%%" % int(prob * 100),
+                    print((sym, "%d%%" % int(prob * 100),))
                elif prob >= theta2 and lowercase:
-                    print sym.lower(), "%d%%" % int(prob * 100),
+                    print((sym.lower(), "%d%%" % int(prob * 100),))
                elif prob >= theta2:
-                    print sym, "%d%%" % int(prob * 100),
+                    print((sym, "%d%%" % int(prob * 100),))
-            print
+            print()
    def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
        """ Display a table with rows for each alignment column, showing
@@ -644,7 +645,7 @@ class Alignment():
        return distmat
    def writeHTML(self, filename=None):
-        """ Generate HTML that displays the alignment in color. 
+        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
@@ -681,10 +682,9 @@ class Alignment():
            htmlstr += html
        htmlstr += '<pre>'
        if filename:
-            fh = open(filename, 'w')
+            with open(filename, 'w+') as fh:
-            fh.write(htmlstr)
+                fh.write(htmlstr)
-            fh.write('</body></html>\n')
+                fh.write('</body></html>\n')
-            fh.close()
        else:
            return htmlstr
@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
        index = name.find('/')
        if index >= 0:
            name = name[0:index]
-        if seqs.has_key(name):
+        if name in seqs:
            seqs[name] += seqstr
        else:
            seqs[name] = seqstr
    sequences = []
-    for name, seqstr in seqs.items():
+    for name, seqstr in list(seqs.items()):
        sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
    return Alignment(sequences)
@@ -1180,12 +1180,12 @@ class PWM(object):
    def display(self, format = 'COLUMN'):
        if format == 'COLUMN':
-            print " \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))
+            print((" \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))))
            for j in range(len(self.alphabet)):
-                print "%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
+                print(("%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
        elif format == 'JASPAR':
            for j in range(len(self.alphabet)):
-                print "%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
+                print(("%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
    def search(self, sequence, lowerBound=0):
        """ Find matches to the motif in a specified sequence. Returns a list
@@ -1229,7 +1229,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
    """ Get the sequence identified by the given ID from the given database
    (e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
    object. An error is caused if the sequence ID is not found. If start and
-    end are given, then only that section of the sequence is returned. 
+    end are given, then only that section of the sequence is returned.
    Note: more flexible search options are supported by using webservice.fetch
    directly."""
@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
    for i in range(MAX_TRY):
        try:
-            fastaData = fetch(id, database)
+            fastaData = fetch(id, database).decode("utf-8")
            seq = readFasta(fastaData)[0]
            break
        except:
            from time import sleep
-            print 'Failed on {i}th try for id {id}'.format(i=i, id=id)
+            print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
            sleep(0.1)
    try:
        return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
 if __name__ == '__main__':
    seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
-    print 'Read', len(seqs), 'sequences'
+    print(('Read', len(seqs), 'sequences'))
--- a/spred.py
+++ b/spred.py
@@ -71,7 +71,7 @@ class SeqNN():
                im[row, _onehotIndex(alpha,  subseqs[k])] = 1
                if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
                row += 1
-        print "There are", row, "entries in data set"
+        print("There are", row, "entries in data set")
        if targets:
            return im, om
        else:
@@ -85,7 +85,7 @@ class SeqNN():
        im, om = self._encodeseq(seqs, targets)
        for i in range(niter):  # train first NN
            rmse = self.nn1.train(im, om, eta = eta, niter = 1)
-            print i, ":", rmse
+            print(i, ":", rmse)
        if not self.cascade:    # if there's no cascaded NN, finish here
            return rmse
        nn1seqs = []            # a list of new SS sequences ...
@@ -95,7 +95,7 @@ class SeqNN():
        im, om = self._encodeseq(nn1seqs, targets)  # construct input/output patterns from SS sequences
        for i in range(niter):  # train cascaded NN
            rmse = self.nn2.train(im, om, eta = eta, niter = 1)
-            print i, ":", rmse
+            print(i, ":", rmse)
        return rmse
    def testAll(self, seqs, targets):

--- a/sstruct.py
+++ b/sstruct.py
@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
        specified width average of 100.
    """
    sum = 0.0
-    order = range(0, len(calls) - 1, +1)  # we are extending calls downstream
+    order = list(range(0, len(calls) - 1, +1))  # we are extending calls downstream
    cnt = 0
    for i in order:  # extend to the right
        if calls[i]: # to extend a call is required in the first place
@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
        AND extend this list upstream containing a specified width average of 100.
    """
    sum = 0.0
-    order = range(len(calls) - 1, 0, -1)  # we are extending calls upstream/to-the-left
+    order = list(range(len(calls) - 1, 0, -1))  # we are extending calls upstream/to-the-left
    cnt = 0
    for i in order:  # extend to the right
        if calls[i]: # a requirement to extend is to have a call in the first place

--- a/sym.py
+++ b/sym.py
@@ -291,7 +291,7 @@ class TupleEntries(object):
    def __iter__(self):
        return self
-    def next(self):
+    def __next__(self):
        """ Step through sequence of entries, either
        (if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
        (if sparse) with calls to tuple store based on all possible symbol combinations."""

--- a/webservice.py
+++ b/webservice.py
-import urllib, urllib2
+import urllib.request
 import os
 from time import sleep
 import stats
-from StringIO import StringIO
+from io import StringIO
 import gzip
 """ This module is collection of functions for accessing the EBI REST web services,
    including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
    The class EBI takes precautions taken as to not send too many requests when
    performing BLAST and ClustalW queries.
    See
    http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
    http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
    http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
    """
 __ebiUrl__ =        'http://www.ebi.ac.uk/Tools/'               # Use UQ mirror when available
 __ebiGOUrl__ =      'http://www.ebi.ac.uk/QuickGO/'             # Use UQ mirror when available
 __uniprotUrl__ =    'http://www.uniprot.org/'                   #
 def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database
    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
    dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
    format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
    See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
    """
     # Construct URL
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
    # Get the entry
    try:
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
-        if data.startswith('ERROR'):
+        if data.startswith(b'ERROR'):
            raise RuntimeError(data)
        return data
-    except urllib2.HTTPError, ex:
+    except(urllib.error.HTTPError, ex):
        raise RuntimeError(ex.read())
 def search(query, dbName='uniprot', format='list', limit=100):
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
    query: search term(s) e.g. 'organism:9606+AND+antigen'
    dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
    format: file format e.g. 'list', 'fasta' or 'txt'
    limit: max number of results (specify None for all results)
    See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
    See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
    """
    if dbName.startswith('uniprot'):
        # Construct URL
        if limit == None: # no limit to number of results returned
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query
        else:
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            if format == 'list':
                return data.splitlines()
            else:
                return data
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            words = data.split("</Id>")
            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
                for w in words:
                    url += w + ","
-                data = urllib2.urlopen(url).read()
+                data = urllib.request.urlopen(url).read()
                return data
            else:
                return ''
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    return
 authorised_database_tag = {9606:  ['Homo sapiens', 'ACC', 'ID'],
                           3702:  ['Arabidopsis thaliana', 'TAIR_ID'],
                           4932:  ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
                           10090: ['Mus musculus', 'MGI_ID']}
 def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
    """
    Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
    identifiers: a list of identifiers (list of strings)
    frm: the tag/abbreviation for the identifier FROM which to idmap
    to: the tag/abbreviation for the identifier TO which to idmap
    format: the results format to use
    reverse: reverse the returned mapping key (to) -> value (from)
    Returns a dictionary with key (from) -> value (to)
    Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
    """
    url = __uniprotUrl__ + 'mapping/'
    # construct query by concatenating the list of identifiers
    if isinstance(identifiers, str):
        query = identifiers.strip()
    else: # assume it is a list of strings
        query = ''
        for id in identifiers:
            query = query + id.strip() + ' '
        query = query.strip() # remove trailing spaces
    params = {
        'from' : frm,
        'to' : to,
        'format' : format,
        'query' : query
    }
    if len(query) > 0:
-        request = urllib2.Request(url, urllib.urlencode(params))
+        request = urllib.request.Request(url, urllib.parse.urlencode(params))
-        response = urllib2.urlopen(request).read()
+        response = urllib.request.urlopen(request).read()
        d = dict()
        for row in response.splitlines()[1:]:
            pair = row.split('\t')
            if not reverse:
                d[pair[0]] = pair[1]
            else:
                d[pair[1]] = pair[0]
        return d
    else:
        return dict()
 """
 Gene Ontology service (QuickGO)
 http://www.ebi.ac.uk/QuickGO/WebServices.html
 Note that this service can be slow for queries involving a large number of entries.
 """
 def getGOReport(positives, background = None, database = 'UniProtKB'):
    """ Generate a complete GO term report for a set of genes (positives).
        Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
        Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
        E-value is a Bonferroni-corrected p-value.
        """
    pos = set(positives)
    fg_map = getGOTerms(pos, database)
    fg_list = []
    for id in fg_map:
        for t in fg_map[id]:
            fg_list.append(t)
    bg_map = {}
    bg_list = []
    neg = set()
    if background != None:
        neg = set(background).difference(pos)
        bg_map = getGOTerms(neg, database)
        for id in bg_map:
            for t in bg_map[id]:
                bg_list.append(t)
    term_set = set(fg_list)
    term_cnt = {}
    nPos = len(pos)
    nNeg = len(neg)
    if background == None:
        for t in term_set:
            term_cnt[t] = fg_list.count(t)
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
    else: # a background is provided
        for t in term_set:
            fg_hit = fg_list.count(t)
            bg_hit = bg_list.count(t)
            fg_nohit = nPos - fg_hit
            bg_nohit = nNeg - bg_hit
            term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
    ret = []
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background != None:
            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name']))
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret
 def getGODef(goterm):
    """
    Retrieve information about a GO term
    goterm: the identifier, e.g. 'GO:0002080'
    """
     # Construct URL
    url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
    # Get the entry: fill in the fields specified below
    try:
        entry={'id': None, 'name': None, 'def': None}
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
        for row in data.splitlines():
            index = row.find(':')
            if index > 0 and len(row[index:]) > 1:
                field = row[0:index].strip()
                value = row[index+1:].strip(' "') # remove spaces and quotation marks
-                if field in entry.keys():         # check if we need this field
+                if field in list(entry.keys()):         # check if we need this field
                    if entry[field] == None:      # check if not yet assigned
                        entry[field] = value
        return entry
-    except urllib2.HTTPError, ex:
+    except(urllib.error.HTTPError, ex):
        raise RuntimeError(ex.read())
 def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
    """
    Retrieve all GO terms for a given set of genes (or single gene).
    database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
    The result is given as a map (key=gene name, value=list of unique terms) OR
    in the case of a single gene as a list of unique terms.
    If completeAnnot is True (default is False) then the above "terms" is the first element
    in a tuple with (gene-terms-map, gene-taxon-id).
    """
    if type(genes) != list and type(genes) != set and type(genes) != tuple:
        genes = [genes]
    termsmap = dict()
    taxonmap = dict()
    uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
    # build queries (batches of genes)
    queryLength = 2000
    queries = []
    query = None
    for gene in genes:
        if query == None:
            query = gene
        elif len(query) < queryLength:
            query += ','+gene
        else:
            queries.append(query)
            query = gene
    if query != None:
        queries.append(query)
    # execute queries, each involving a number of genes
    for query in queries:
        # Construct URL
        url = __ebiGOUrl__ + uri_string + query
        # Get the entry: fill in the fields specified below
        try:
-            urlreq = urllib2.Request(url)
+            urlreq = urllib.request.Request(url)
            urlreq.add_header('Accept-encoding', 'gzip')
-            response = urllib2.urlopen(urlreq)
+            response = urllib.request.urlopen(urlreq)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = response.read()
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
                    key = values[1]
-                    if termsmap.has_key(key):
+                    if key in termsmap:
                        termsmap[key].add(values[6])
                    else:
                        termsmap[key] = set([values[6]])
                        taxonmap[key] = int(values[4])
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    if completeAnnot:
        if len(genes) == 1:
            if len(termsmap) == 1:
                return (termsmap[genes[0]], taxonmap[genes[0]])
            else:
                return (set(), None)
        else:
            return (termsmap, taxonmap)
    else:
        if len(genes) == 1:
            if len(termsmap) == 1:
                return termsmap[genes[0]]
            else:
                return set()
        else:
            return termsmap
 def getGenes(goterms, database='UniProtKB', taxo=None):
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
    database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
    taxo: use specific taxonomic identifier, e.g. 9606 (human)
    The result is given as a map (key=gene name, value=list of unique terms) OR
    in the case of a single gene as a list of unique terms.
    """
    if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
        goterms = [goterms]
    map = dict()
    if taxo == None:
        uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
    else:
        uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
    for goterm in goterms:
        genes = set()
        # Construct URL
        url = __ebiGOUrl__ + uri_string + goterm.strip()
        # Get the entry: fill in the fields specified below
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
                    genes.add(values[1])
            map[goterm] = list(genes)
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return map[goterms[0]]
    else:
        return map
 class EBI(object):
    __email__ =         'anon@uq.edu.au'                            # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
    __checkInterval__ = 2                                           # how long to wait between checking job status
    def __init__(self, service=None):
        """ Initialise service session.
        service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
        """
        self.service = service
        self.lockFile = '%s.lock' % service
    def createLock(self):
        """ Create a lock file to prevent submission of more than 1 job
        at a time by a single user. """
        fh = open(self.lockFile, 'w')
        fh.write(self.jobId)
        fh.close()
    def removeLock(self):
        """ Remove the lock file. """
        os.remove(self.lockFile)
    def isLocked(self):
        """ Check if there is a lock on this service. If there is, check if
        the job is complete, and if so remove the lock. Return True if still
        locked and False if not. """
        if os.path.exists(self.lockFile):
            fh = open(self.lockFile, 'r')
            jobId = fh.read()
            fh.close()
            status = self.status(jobId)
            if status == 'RUNNING':
                self.jobId = jobId
                return True
            else:
                self.removeLock()
                return False
        else:
            return False
    """
    BLAST and CLUSTALW services
    """
    def run(self, params):
        """ Submit a job to the given service with the given parameters, given
        as a dictionary. Return the jobId. """
        if self.service == None:
            raise RuntimeError('No service specified')
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
                                  wait until it is complete before submitting another job. Go to
                                  %sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
            databaseList = params['database']
            del params['database']
            databaseData = ''
            for db in databaseList:
                databaseData += '&database=' + db
-            encodedParams = urllib.urlencode(params)
+            encodedParams = urllib.parse.urlencode(params)
            encodedParams += databaseData
        else:
-            encodedParams = urllib.urlencode(params)
+            encodedParams = urllib.parse.urlencode(params)
-        print url
+        print(url)
-        self.jobId = urllib2.urlopen(url, encodedParams).read()
+        self.jobId = urllib.request.urlopen(url, encodedParams).read()
        self.createLock()
        return self.jobId
    def status(self, jobId=None):
        """ Check the status of the given job (or the current job if none is
        specified), and return the result. """
        if jobId is None:
            jobId = self.jobId
        url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
-        status = urllib2.urlopen(url).read()
+        status = urllib.request.urlopen(url).read()
        return status
    def resultTypes(self):
        """ Get the available result types. Will only work on a finished job. """
        url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
-        resultTypes = urllib2.urlopen(url).read()
+        resultTypes = urllib.request.urlopen(url).read()
        return resultTypes
    def result(self, resultType):
        """ Get the result of the given job of the specified type. """
        url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
        try:
-            result = urllib2.urlopen(url).read()
+            result = urllib.request.urlopen(url).read()
            if resultType == 'error':
                raise RuntimeError('An error occurred: %s' % result)
-        except urllib2.HTTPError:
+        except(urllib.error.HTTPError):
            if resultType == 'error':
                raise RuntimeError('An unknown error occurred while processing the job (check your input)')
            else:
                self.result('error')
        return result
    def submit(self, params, resultTypes):
        """ Submit a new job to the service with the given parameters.
        Return the output in the specified format. """
        params['email'] = self.__email__
        self.run(params)
-        print 'Submitted new', self.service, 'job, jobId:', self.jobId
+        print(('Submitted new', self.service, 'job, jobId:', self.jobId))
-        print 'Please be patient while the job is completed'
+        print('Please be patient while the job is completed')
        status = 'RUNNING'
        observe = 0
        while status == 'RUNNING':
            observe = observe + 1
            status = self.status()
            sleep(self.__checkInterval__)
        if status != 'FINISHED':
            raise RuntimeError('An error occurred and the job could not be completed')
-        print 'Job complete.'
+        print('Job complete.')
        self.removeLock()
        if type(resultTypes) != list:
            resultTypes = [resultTypes]
        results = []
        for resultType in resultTypes:
            results.append(self.result(resultType))
        if len(results) == 1:
            return results[0]
        else:
            return results
--- a/wordcount.py
+++ b/wordcount.py
@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
                neg[word] = 1
    logratio = RCDict() # DNA dictionary for storing the log-ration between pos and neg
-    for (word, cnt_pos) in pos.items():
+    for (word, cnt_pos) in list(pos.items()):
        cnt_neg = 0.0001
        try:
            cnt_neg = neg[word]
@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
            pass
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
-    allpos = logratio.items() # extract all pairs of words:log-ratio
+    allpos = list(logratio.items()) # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
-    print "Enriched words (sorted by ln pos/neg)"
+    print("Enriched words (sorted by ln pos/neg)")
-    print "Word    \tln pos/neg\tE-value"
+    print("Word    \tln pos/neg\tE-value")
    for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
        cnt_pos = int(pos[word])
        try: cnt_neg = int(neg[word])
@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
        pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
        # Correct for multiple testing (very conservatively)
        eval = pval * len(allpos)
-        print "%s\t%6.3f  \t%e" % (word, lgr, eval)
+        print("%s\t%6.3f  \t%e" % (word, lgr, eval))
 def getReverse(distribs):
    """ Construct a new list of probability distributions of DNA, by
@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
-    print "Motif %s:" % motif
+    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
-    print "Motif %s (reverse complement):" % motif
+    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    # plot the average score curve
    # print >> sys.stderr, ""
-    x = range(-(seq_len/2), (seq_len/2))    # call center of sequence X=0
+    x = list(range(-(seq_len/2), (seq_len/2)))    # call center of sequence X=0
    lbl = "%s" % (motif)
    plt.plot(x, avg_motif_score, label=lbl)
    #plt.plot(x, smoothed_avg_motif_score, label=lbl)
@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
-    print "Motif %s:" % motif
+    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
-    print "Motif %s (reverse complement):" % motif
+    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    # divide number of sequences with hit by total number of hits
    site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
-    print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits)
+    print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
    # STATISTICS
    # Get the cumulative hit counts in concentric windows
@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    for i in range(hw, seq_len-motif_width+1-hw):
        smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
-    x = range(-(seq_len/2), (seq_len/2))        # call center of sequence X=0
+    x = list(range(-(seq_len/2), (seq_len/2)))        # call center of sequence X=0
    lbl = "%s, t=%.2f" % (motif, threshold)
    #lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
    plt.plot(x, smoothed_site_probability, label=lbl)
@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
 def usage(name, errmsg = None):
    if errmsg != None:
-        print "Error: %s" % errmsg
+        print("Error: %s" % errmsg)
-    print """Usage: %s [options]
+    print("""Usage: %s [options]
                -f <fasta-filename> (required)
                -d discover enriched words
                -w <word width, default 8>
                -p <peak width, default 100>
                -m <peak margin, default 100>
                -s <JASPAR-ID> scan for JASPAR motif
-                -h print this help""" % name
+                -h print this help""" % name)
 if __name__ == '__main__':
    try:
        optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
-    except getopt.GetoptError, err:
+    except getopt.GetoptError as err:
        usage(sys.argv[0], str(err))
        sys.exit(2)
    FILENAME =      None
@@ -301,7 +301,7 @@ if __name__ == '__main__':
        sys.exit(3)
    seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
    if DISCOVER_MODE:
-        print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
+        print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
        countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
    elif SCAN_MODE:
        scanMotifReport(seqs, MOTIF_ID)