python3_5

ac6c5d6b · Mikael Boden · 934c2bff · ac6c5d6b · ac6c5d6b · ac6c5d6b
Commit ac6c5d6b authored Feb 14, 2017 by Mikael Boden
16 changed files
--- a/binomial.py
+++ b/binomial.py
@@ -95,8 +95,8 @@ def betacf(a, b, x):
        h *= delta
        if (abs(delta-1.0) < EPS): break
-    if (m > MAXIT):  print >> sys.stderr, ("a or b too big or MAXIT too small "
+    if (m > MAXIT):  print(("a or b too big or MAXIT too small "
-                                           "in betacf")
+                                           "in betacf"), file=sys.stderr)
    return h
@@ -118,5 +118,5 @@ def gammaln(x):
 def die(string):
-    print >> sys.stderr, string
+    print(string, file=sys.stderr)
--- a/genome.py
+++ b/genome.py
@@ -105,7 +105,7 @@ class GeneExpression:
            {'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
            """
        if names == None:
-            return self.genes.keys()
+            return list(self.genes.keys())
        elif isinstance(names, str):
            return self.matrix[self.genes[names],:]
        else:
@@ -148,7 +148,7 @@ class GeneExpression:
        except:
            index = samples
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = self.matrix[ndx, index]
        return mygenes
@@ -165,7 +165,7 @@ class GeneExpression:
            sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
        except:
            sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
-        name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
+        name_tuples = sorted(list(self.genes.items()), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
        names = []
        if descending:
            for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order 
@@ -199,7 +199,7 @@ class GeneExpression:
            Creates and returns a gene dictionary with the corresponding ratios. """
        mygenes = {}
        mdiv = self.matrix[:, index1] / self.matrix[:, index2]
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = mdiv[ndx]
        return mygenes
@@ -208,7 +208,7 @@ class GeneExpression:
            Creates and returns a gene dictionary with the corresponding log-ratios. """
        mygenes = {}
        mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            mygenes[name] = mlr[ndx]
        return mygenes
@@ -218,7 +218,7 @@ class GeneExpression:
        index = self.genes[probeID]
        profile = self.matrix[index, :]
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            other = self.matrix[ndx, :]
            mygenes[name] = pearson(profile, other)
        return mygenes
@@ -252,7 +252,7 @@ class GeneExpression:
        # Calculate Z-score for the given column for each gene
        zscore = (self.matrix[:, index] - mu) / sd
        mygenes = {}
-        for (name, ndx) in self.genes.items():
+        for (name, ndx) in list(self.genes.items()):
            try:
                mygenes[name] = zscore[ndx, :]
            except IndexError:
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
                    genes[name] = values
    if len(genes) == 0:
        raise RuntimeError('No data in file')
-    print 'Data set %s contains %d entries' % (dataset, len(genes))
+    print('Data set %s contains %d genes' % (dataset, len(genes)))
    if cnt_null > 0:
-        print 'Data set has %d null-values' % (cnt_null)
+        print('Data set has %d null-values' % (cnt_null))
    return GeneExpression(dataset, headers[2:], genes)
@@ -357,40 +357,29 @@ def pearson(X, Y):
        return 0
    return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
-# ------------------- Example ---------------------
+# ------------------- Example (basically exercise 7 in prac 9)---------------------
-ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft')
+if __name__=='__main__':
-ratio = GeneExpression('GDS3716_ratio')
+    g = readGEOFile('GDS3198.soft', id_column = 1)
-ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33,  0))
+    meanfold = {}
-ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34,  1))
+    for gene in g.genes:
-ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35,  2))
+        profile = g.getGenes(gene)
-ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36,  3))
+        meanfold[gene] = (np.log2(profile[0] / profile[3]) + np.log2(profile[1] / profile[4]) + np.log2(profile[2] / profile[5])) / 3
-ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37,  4))
-ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38,  5))
-ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39,  6))
-ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40,  7))
-ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41,  8))
-ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24,  9))
-ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
-ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
-ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
-ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
-ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
-ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
-ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
-ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
-ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
-print ge3716.getHeaders()
+    import matplotlib.pyplot as plt
+    scores = [y for y in list(meanfold.values()) if not np.isnan(y)]
+    hist, bins = np.histogram(scores, bins=50)
+    width = 0.7 * (bins[1] - bins[0])
+    center = (bins[:-1] + bins[1:]) / 2
+    plt.bar(center, hist, align='center', width=width)
+    plt.show()
-z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
+    result = sorted(list(meanfold.items()), key=lambda v: v[1])
+    print('========== Wildtype may down-regulate ==========')
-ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
+    for r in result[0:100]:
-cln2_profile = ge38.getGenes('CLN2')
+        print(r[0], r[1])
-pcorr = ge38.getPearson('CLN2')
+    print('========== Wildtype may up-regulate ==========')
-gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
+    for r in result[-1:-100:-1]:
-sorted = gp.sort('PC_CLN2', True)
+        print(r[0], r[1])
-print sorted[0], ge38.getGenes(sorted[0])
-print sorted[1], ge38.getGenes(sorted[1])
--- a/gibbs.py
+++ b/gibbs.py
@@ -138,7 +138,7 @@ class GibbsMotif():
                        LL += math.log(Qk / Pk)
                    except ZeroDivisionError:
                        pass
-                print "LL @ %5d=\t%5.2f" % (round, LL)
+                print("LL @ %5d=\t%5.2f" % (round, LL))
        # end main for-loop
        self.q = q
@@ -312,7 +312,7 @@ class GibbsAlign():
                        LL += math.log(Qk / Pk)
                    except ZeroDivisionError:
                        pass
-                print "LL @ %5d=\t%5.2f" % (round, LL)
+                print("LL @ %5d=\t%5.2f" % (round, LL))
        # end main for-loop
        self.q = q

--- a/godata.py
+++ b/godata.py
--- a/guide.py
+++ b/guide.py
--- a/ml.py
+++ b/ml.py
@@ -21,7 +21,7 @@ class NN():
        self.b_hid  = numpy.random.randn(nHidden)           # biases hidden layer
        self.w_out  = numpy.random.randn(nOutput, nHidden)  # weights hid -> out
        self.b_out  = numpy.random.randn(nOutput)           # biases output layer
-        print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output))
+        print("Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)))
    def writeFile(self, filename):
        """ Save NN to a file. """
@@ -110,7 +110,7 @@ class NN():
            multi_targ  = [ target ]
        for i in range(niter):
            mse = 0.0
-            entries = range(len(multi_input))
+            entries = list(range(len(multi_input)))
            if shuffle:
                random.shuffle(entries)
            for p in entries:

--- a/phylo.py
+++ b/phylo.py
@@ -2,7 +2,7 @@
 Module with methods and classes for phylogeny.
 @author: mikael
 '''
-##import sequence
+import sequence
 class PhyloTree:
    """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
@@ -141,6 +141,18 @@ class PhyloNode:
            elif self.left and self.right:
                return '(' + left + ',' + right + ')' + dist
+    def __le__(self, other):
+        """ Returns True if the hash of this node is less than or equal to the hash of other.
+            The ordering is by hash value only. A falsy other (e.g. None) gives False;
+            the result is always a bool rather than the falsy operand itself. """
+        return bool(other) and hash(self) <= hash(other)
+    def __eq__(self, other):
+        """ Returns True if this node and other have the same hash value.
+            A falsy other (e.g. None) gives False; the result is always a bool,
+            so that node == None evaluates to False and not to None. """
+        return bool(other) and hash(self) == hash(other)
+    def __hash__(self):
+        """ Returns the hash of the (label, dist, sequence) triple of this node. """
+        identity = (self.label, self.dist, self.sequence)
+        return hash(identity)
    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style. """
        left = right = label = dist = ''
@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
        find the *closest* pair of clusters, and
        merge that pair into a new cluster (to replace the two that merged).
        In each case, the new cluster is represented by the (phylo)node that is formed. """
-    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
+    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
        closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
        closest_dist = None         # The distance between them
        for pair in D:              # check all pairs which should be merged
            dist = D[pair]
-            if dist < closest_dist or closest_dist == None:
+            if closest_dist == None or dist < closest_dist:
                closest_dist = dist
                closest_pair = pair
        # So we know the closest, now we need to merge...
@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
        y = closest_pair[1]
        z = PhyloNode()             # create a new node for the cluster z
        z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
-        Nx = N.pop(x)               # find number of sequences in x, remove the cluster from list N
+        Nx = N.pop(x, None)         # find number of sequences in x, remove the cluster from list N
-        Ny = N.pop(y)               # find number of sequences in y, remove the cluster from list N
+        Ny = N.pop(y, None)         # find number of sequences in y, remove the cluster from list N
+        if Nx == None or Ny == None:
+        	continue
        dz = {}                     # new distances to cluster z
        for w in N:                 # for each node w ...
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)

--- a/prob.py
+++ b/prob.py
@@ -277,7 +277,7 @@ def _readDistrib(linelist):
    if len(d) == 0:
        return None
    alpha = Alphabet(symstr)
-    if '*' in d.keys(): # tot provided
+    if '*' in list(d.keys()): # tot provided
        for sym in d:
            if sym != '*':
                d[sym] = d[sym] * d['*']
@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
                ncol = len(counts)
                if len(name) == 1: # proper symbol
                    symcount[name] = counts
-        alpha = Alphabet(''.join(symcount.keys()))
+        alpha = Alphabet(''.join(list(symcount.keys())))
        distribs = []
        for col in range(ncol):
            d = dict([(sym, symcount[sym][col]) for sym in symcount])
@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
    """
    d = readMultiCounts(filename, format=format)
    if len(d) > 0:
-        return d.values()[0]
+        return list(d.values())[0]
 #################################################################################################
 # Joint class
@@ -628,12 +628,12 @@ class IndepJoint(Joint):
    def displayMatrix(self, count = False):
-        """ Pretty-print matrix """
-        print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))
-        for a in self.alphas[0]:
-            if count:
-                print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))
-            else:
-                print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))
+        """ Print the table to standard output, tab-separated.
+            The header numbers the entries of self.alphas from 1; one row follows for
+            each symbol in self.alphas[0]. If count is set the cells come from
+            getRow(sym, True) and are formatted with %5d, otherwise they come from
+            getRow(sym) and are formatted with %5.3f. """
+        header = ''.join("\t%5d" % col for col in range(1, len(self.alphas) + 1))
+        print(" \t%s" % header)
+        for sym in self.alphas[0]:
+            if count:
+                cells = ''.join("\t%5d" % val for val in self.getRow(sym, True))
+            else:
+                cells = ''.join("\t%5.3f" % val for val in self.getRow(sym))
+            print("%s\t%s" % (sym, cells))
    def __str__(self):
        """ Text representation of the table. Note that size is an issue so big tables
@@ -718,5 +718,3 @@ class NaiveBayes():
                prob *= condprob[i][key[i]] or 0.0
            out.observe(outsym, prob)
        return out
--- a/sam.py
+++ b/sam.py
--- a/seqdata.py
+++ b/seqdata.py
@@ -381,11 +381,11 @@ class BedFile():
        index_name = {}
        for i in range(len(self.rows)):
            row = self.rows[i]
-            if not index_start.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_start: # seeing chromosome entry first time
                index_start[row.chrom] = []
-            if not index_centre.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_centre: # seeing chromosome entry first time
                index_centre[row.chrom] = []
-            if not index_end.has_key(row.chrom): # seeing chromosome entry first time
+            if not row.chrom in index_end: # seeing chromosome entry first time
                index_end[row.chrom] = []
            index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
            index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
    for row in entries:
        if format == 'Peaks':
            #f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
-            f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
+            f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
        elif format == 'Limited':
-            f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
+            f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
        else:
-            f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
+            f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
        f.write("\n")
    f.close()
@@ -760,7 +760,7 @@ try:
 except ImportError:
    strerror = lambda x: 'strerror not supported'
 from os.path import exists
-from itertools import izip
+from itertools import chain
 def true_long_type():
    """
@@ -805,7 +805,7 @@ def base_to_bin(x):
 def create_byte_table():
-    """create BYTE_TABLE"""
-    d = {}
-    for x in xrange(2**8):
-        d[x] = byte_to_bases(x)
-    return d
+    """Build BYTE_TABLE: a dict mapping each value in range(2**8) to byte_to_bases(value)."""
+    return {value: byte_to_bases(value) for value in range(2**8)}
@@ -821,9 +821,9 @@ def split16(x):
 def create_twobyte_table():
-    """create TWOBYTE_TABLE"""
+    """Build TWOBYTE_TABLE: a dict mapping each value in range(2**16) to the bases
+    of the two parts returned by split16(value), in that order.
+
+    Every entry is materialised as a list. A bare itertools.chain object is a
+    one-shot iterator: it is empty after the first lookup that walks it and it
+    supports neither len() nor indexing, so it cannot serve as a table entry.
+    """
    d = {}
-    for x in xrange(2**16):
+    for x in range(2**16):
        c, f = split16(x)
-        d[x] = byte_to_bases(c) + byte_to_bases(f)
+        # NOTE(review): the Python 2 code produced whatever type byte_to_bases returns;
+        # a list is assumed to suit callers that iterate over or join the entry - confirm.
+        d[x] = list(chain(byte_to_bases(c), byte_to_bases(f)))
    return d
 BYTE_TABLE = create_byte_table()
@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
    """
    longs_len = len(longs)
    # dna = ctypes.create_string_buffer(array_size)
-    dna = array('c', 'N' * longs_len)
+    dna = array('b', 'N' * longs_len)
    # translate from 32-bit blocks to bytes
    # this method ensures correct endianess (byteswap as neeed)
    bytes = array('B')
@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
    first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
    i = 16 - first_base_offset
    if array_size < i: i = array_size
-    dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
+    dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
    if longs_len == 1: return dna
    # middle blocks (implicitly skipped if they don't exist)
    for byte in bytes[4:-4]:
-        dna[i:i + 4] = array('c', BYTE_TABLE[byte])
+        dna[i:i + 4] = array('b', BYTE_TABLE[byte])
        i += 4
    # last block
-    last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
+    last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
    dna[i:i + last_base_offset] = last_block[0:last_base_offset]
    return dna
@@ -889,7 +889,7 @@ class TwoBitFile(dict):
        self._file_handle = open(foo, 'rb')
        self._load_header()
        self._load_index()
-        for name, offset in self._offset_dict.iteritems():
+        for name, offset in self._offset_dict.items():
            self[name] = TwoBitSequence(self._file_handle, offset,
                                        self._byteswapped)
        return
@@ -926,13 +926,16 @@ class TwoBitFile(dict):
            if remaining == 0: break
            name_size = array('B')
            name_size.fromfile(file_handle, 1)
-            if byteswapped: name_size.byteswap()
+            if byteswapped:
-            name = array('c')
+                name_size.byteswap()
-            if byteswapped: name.byteswap()
+            name = array('b')
+            if byteswapped:
+                name.byteswap()
            name.fromfile(file_handle, name_size[0])
            offset = array(LONG)
            offset.fromfile(file_handle, 1)
-            if byteswapped: offset.byteswap()
+            if byteswapped:
+                offset.byteswap()
            sequence_offsets.append((name.tostring(), offset[0]))
            remaining -= 1
        self._sequence_offsets = sequence_offsets
@@ -943,7 +946,7 @@ class TwoBitFile(dict):
        d = {}
        file_handle = self._file_handle
        byteswapped = self._byteswapped
-        for name, offset in self._offset_dict.iteritems():
+        for name, offset in self._offset_dict.items():
            file_handle.seek(offset)
            dna_size = array(LONG)
            dna_size.fromfile(file_handle, 1)
@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
        if byteswapped: fourbyte_dna.byteswap()
        string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
                                              last_base_offset, region_size)
-        for start, size in izip(n_block_starts, n_block_sizes):
+        for start, size in zip(n_block_starts, n_block_sizes):
            end = start + size
            if end <= min_: continue
            if start > max_: break
@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
            if end > max_: end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('c', 'N'*(end-start))
+            string_as_array[start:end] = array('b', 'N'*(end-start))
        lower = str.lower
        first_masked_region = max(0,
                                  bisect_right(mask_block_starts, min_) - 1)
        last_masked_region = min(len(mask_block_starts),
                                 1 + bisect_right(mask_block_starts, max_,
                                                  lo=first_masked_region))
-        for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
+        for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
                                mask_block_sizes[first_masked_region:last_masked_region]):
            end = start + size
            if end <= min_: continue
@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
            if end > max_: end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
+            string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
        if not len(string_as_array) == max_ - min_:
-            raise RuntimeError, "Sequence was longer than it should be"
+            raise RuntimeError("Sequence was longer than it should be")
        if reverse:
            return self.reverseComplement(string_as_array.tostring())
        return string_as_array.tostring()
@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
        """
        return self.__getslice__(0, None)
-class TwoBitFileError(StandardError):
+class TwoBitFileError(Exception):
    """
    Base exception for TwoBit module
    """

--- a/sequence.py
+++ b/sequence.py
@@ -55,10 +55,11 @@ class Sequence(object):
        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'] """
-        try: # convert sequence data into a compact array representation
+        #try: # convert sequence data into a compact array representation
-            self.sequence = array.array('c', ''.join([s.upper() for s in sequence]))
+        #    self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
-        except TypeError:
+        #except TypeError:
-            raise RuntimeError('Sequence data is not specified correctly: must be iterable')
+        #    raise RuntimeError('Sequence data is not specified correctly: must be iterable')
+        self.sequence = sequence
        # Assign an alphabet
        self.alphabet = None
@@ -133,15 +134,15 @@ class Sequence(object):
            Calling self.__getitem__(3) is equivalent to self[3]
        """
        if type(ndx) is slice:
-            return self.sequence[ndx].tostring()
+            return ''.join(self.sequence[ndx])
        else:
            return self.sequence[ndx]
    def writeFasta(self):
        """ Write one sequence in FASTA format to a string and return it. """
        fasta = '>' + self.name + ' ' + self.info + '\n'
-        data = self.sequence.tostring()
+        data = ''.join(self.sequence)
-        nlines = (len(self.sequence) - 1) / 60 + 1
+        nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1))
        for i in range(nlines):
            lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
            fasta += lineofseq
@@ -164,7 +165,7 @@ class Sequence(object):
    def find(self, findme):
-        """ Find the position of the specified symbol or sub-sequence """
-        return self.sequence.tostring().find(findme)
+        """ Return the index of the first occurrence of findme (a symbol or
+            sub-sequence) in the joined sequence data, or -1 if it does not occur. """
+        text = ''.join(self.sequence)
+        return text.find(findme)
 """
 Below are some useful methods for loading data from strings and files.
@@ -438,8 +439,8 @@ class Alignment():
            column index, entropy, number of gaps, and symbols in order of decreasing probability.
            theta1 is the threshold for displaying symbols in upper case,
            theta2 is the threshold for showing symbols at all, and in lower case. """
-        print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)
+        print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
-        print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)
+        print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
        for col in range(self.alignlen):
            d = Distrib(self.alphabet)
            gaps = 0
@@ -448,21 +449,21 @@ class Alignment():
                    d.observe(seq[col])
                else:
                    gaps += 1
-            print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,
+            print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
            symprobs = d.getProbsort()
            (_, maxprob) = symprobs[0]
            if maxprob >= theta1:
-                print "%d\tTRUE\t" % int(maxprob * 100),
+                print(("%d\tTRUE\t" % int(maxprob * 100),))
            else:
-                print "%d\t\t" % int(maxprob * 100),
+                print(("%d\t\t" % int(maxprob * 100),))
            for (sym, prob) in symprobs:
                if prob >= theta1:
-                    print sym, "%d%%" % int(prob * 100),
+                    print((sym, "%d%%" % int(prob * 100),))
                elif prob >= theta2 and lowercase:
-                    print sym.lower(), "%d%%" % int(prob * 100),
+                    print((sym.lower(), "%d%%" % int(prob * 100),))
                elif prob >= theta2:
-                    print sym, "%d%%" % int(prob * 100),
+                    print((sym, "%d%%" % int(prob * 100),))
-            print
+            print()
    def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
        """ Display a table with rows for each alignment column, showing
@@ -681,10 +682,9 @@ class Alignment():
            htmlstr += html
        htmlstr += '<pre>'
        if filename:
-            fh = open(filename, 'w')
+            with open(filename, 'w+') as fh:
                fh.write(htmlstr)
                fh.write('</body></html>\n')
-            fh.close()
        else:
            return htmlstr
@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
        index = name.find('/')
        if index >= 0:
            name = name[0:index]
-        if seqs.has_key(name):
+        if name in seqs:
            seqs[name] += seqstr
        else:
            seqs[name] = seqstr
    sequences = []
-    for name, seqstr in seqs.items():
+    for name, seqstr in list(seqs.items()):
        sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
    return Alignment(sequences)
@@ -1180,12 +1180,12 @@ class PWM(object):
    def display(self, format = 'COLUMN'):
-        if format == 'COLUMN':
-            print " \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))
-            for j in range(len(self.alphabet)):
-                print "%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
-        elif format == 'JASPAR':
-            for j in range(len(self.alphabet)):
-                print "%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
+        """ Print the matrix to standard output, one row per alphabet symbol.
+            'COLUMN' prints a header of 1-based column numbers followed by the rows;
+            'JASPAR' prints no header and encloses each row of values in brackets.
+            Any other format prints nothing. """
+        if format not in ('COLUMN', 'JASPAR'):
+            return
+        if format == 'COLUMN':
+            header = ' '.join(" %5d" % col for col in range(1, self.length + 1))
+            print(" \t%s" % header)
+            template = "%s\t%s"
+        else:
+            template = "%s\t[%s]"
+        for j in range(len(self.alphabet)):
+            sym = self.alphabet[j]
+            scores = ' '.join("%+6.2f" % val for val in self.m[j])
+            print(template % (sym, scores))
    def search(self, sequence, lowerBound=0):
        """ Find matches to the motif in a specified sequence. Returns a list
@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
    for i in range(MAX_TRY):
        try:
-            fastaData = fetch(id, database)
+            fastaData = fetch(id, database).decode("utf-8")
            seq = readFasta(fastaData)[0]
            break
        except:
            from time import sleep
-            print 'Failed on {i}th try for id {id}'.format(i=i, id=id)
+            print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
            sleep(0.1)
    try:
        return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
 if __name__ == '__main__':
    seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
-    print 'Read', len(seqs), 'sequences'
+    print(('Read', len(seqs), 'sequences'))
--- a/spred.py
+++ b/spred.py
@@ -71,7 +71,7 @@ class SeqNN():
                im[row, _onehotIndex(alpha,  subseqs[k])] = 1
                if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
                row += 1
-        print "There are", row, "entries in data set"
+        print("There are", row, "entries in data set")
        if targets:
            return im, om
        else:
@@ -85,7 +85,7 @@ class SeqNN():
        im, om = self._encodeseq(seqs, targets)
        for i in range(niter):  # train first NN
            rmse = self.nn1.train(im, om, eta = eta, niter = 1)
-            print i, ":", rmse
+            print(i, ":", rmse)
        if not self.cascade:    # if there's no cascaded NN, finish here
            return rmse
        nn1seqs = []            # a list of new SS sequences ...
@@ -95,7 +95,7 @@ class SeqNN():
        im, om = self._encodeseq(nn1seqs, targets)  # construct input/output patterns from SS sequences
        for i in range(niter):  # train cascaded NN
            rmse = self.nn2.train(im, om, eta = eta, niter = 1)
-            print i, ":", rmse
+            print(i, ":", rmse)
        return rmse
    def testAll(self, seqs, targets):

--- a/sstruct.py
+++ b/sstruct.py
@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
        specified width average of 100.
    """
    sum = 0.0
-    order = range(0, len(calls) - 1, +1)  # we are extending calls downstream
+    order = list(range(0, len(calls) - 1, +1))  # we are extending calls downstream
    cnt = 0
    for i in order:  # extend to the right
        if calls[i]: # to extend a call is required in the first place
@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
        AND extend this list upstream containing a specified width average of 100.
    """
    sum = 0.0
-    order = range(len(calls) - 1, 0, -1)  # we are extending calls upstream/to-the-left
+    order = list(range(len(calls) - 1, 0, -1))  # we are extending calls upstream/to-the-left
    cnt = 0
    for i in order:  # extend to the right
        if calls[i]: # a requirement to extend is to have a call in the first place

--- a/sym.py
+++ b/sym.py
@@ -291,7 +291,7 @@ class TupleEntries(object):
    def __iter__(self):
        return self
-    def next(self):
+    def __next__(self):
        """ Step through sequence of entries, either
        (if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
        (if sparse) with calls to tuple store based on all possible symbol combinations."""

--- a/webservice.py
+++ b/webservice.py
-import urllib, urllib2
+import urllib.request
 import os
 from time import sleep
 import stats
-from StringIO import StringIO
+from io import StringIO
 import gzip
 """ This module is collection of functions for accessing the EBI REST web services,
@@ -32,11 +32,11 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
    # Get the entry
    try:
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
-        if data.startswith('ERROR'):
+        if data.startswith(b'ERROR'):
            raise RuntimeError(data)
        return data
-    except urllib2.HTTPError, ex:
+    except(urllib.error.HTTPError, ex):
        raise RuntimeError(ex.read())
 def search(query, dbName='uniprot', format='list', limit=100):
@@ -57,12 +57,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            if format == 'list':
                return data.splitlines()
            else:
                return data
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
@@ -72,7 +72,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
        # Get the entries
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            words = data.split("</Id>")
            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
            if format == 'list':
@@ -81,11 +81,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
                for w in words:
                    url += w + ","
-                data = urllib2.urlopen(url).read()
+                data = urllib.request.urlopen(url).read()
                return data
            else:
                return ''
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    return
@@ -121,8 +121,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False)
        'query' : query
    }
    if len(query) > 0:
-        request = urllib2.Request(url, urllib.urlencode(params))
+        request = urllib.request.Request(url, urllib.parse.urlencode(params))
-        response = urllib2.urlopen(request).read()
+        response = urllib.request.urlopen(request).read()
        d = dict()
        for row in response.splitlines()[1:]:
            pair = row.split('\t')
@@ -170,7 +170,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
    if background == None:
        for t in term_set:
            term_cnt[t] = fg_list.count(t)
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
    else: # a background is provided
        for t in term_set:
            fg_hit = fg_list.count(t)
@@ -178,7 +178,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
            fg_nohit = nPos - fg_hit
            bg_nohit = nNeg - bg_hit
            term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
-        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
+        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
    ret = []
    for t in sorted_cnt:
@@ -199,17 +199,17 @@ def getGODef(goterm):
    # Get the entry: fill in the fields specified below
    try:
        entry={'id': None, 'name': None, 'def': None}
-        data = urllib2.urlopen(url).read()
+        data = urllib.request.urlopen(url).read()
        for row in data.splitlines():
            index = row.find(':')
            if index > 0 and len(row[index:]) > 1:
                field = row[0:index].strip()
                value = row[index+1:].strip(' "') # remove spaces and quotation marks
-                if field in entry.keys():         # check if we need this field
+                if field in list(entry.keys()):         # check if we need this field
                    if entry[field] == None:      # check if not yet assigned
                        entry[field] = value
        return entry
-    except urllib2.HTTPError, ex:
+    except(urllib.error.HTTPError, ex):
        raise RuntimeError(ex.read())
 def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
@@ -246,9 +246,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
        url = __ebiGOUrl__ + uri_string + query
        # Get the entry: fill in the fields specified below
        try:
-            urlreq = urllib2.Request(url)
+            urlreq = urllib.request.Request(url)
            urlreq.add_header('Accept-encoding', 'gzip')
-            response = urllib2.urlopen(urlreq)
+            response = urllib.request.urlopen(urlreq)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
@@ -259,12 +259,12 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
                values = row.split('\t')
                if len(values) >= 7:
                    key = values[1]
-                    if termsmap.has_key(key):
+                    if key in termsmap:
                        termsmap[key].add(values[6])
                    else:
                        termsmap[key] = set([values[6]])
                        taxonmap[key] = int(values[4])
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    if completeAnnot:
        if len(genes) == 1:
@@ -304,13 +304,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
        url = __ebiGOUrl__ + uri_string + goterm.strip()
        # Get the entry: fill in the fields specified below
        try:
-            data = urllib2.urlopen(url).read()
+            data = urllib.request.urlopen(url).read()
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
                    genes.add(values[1])
            map[goterm] = list(genes)
-        except urllib2.HTTPError, ex:
+        except(urllib.error.HTTPError, ex):
            raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return map[goterms[0]]
@@ -381,12 +381,12 @@ class EBI(object):
            databaseData = ''
            for db in databaseList:
                databaseData += '&database=' + db
-            encodedParams = urllib.urlencode(params)
+            encodedParams = urllib.parse.urlencode(params)
            encodedParams += databaseData
        else:
-            encodedParams = urllib.urlencode(params)
+            encodedParams = urllib.parse.urlencode(params)
-        print url
+        print(url)
-        self.jobId = urllib2.urlopen(url, encodedParams).read()
+        self.jobId = urllib.request.urlopen(url, encodedParams).read()
        self.createLock()
        return self.jobId
@@ -396,23 +396,23 @@ class EBI(object):
        if jobId is None:
            jobId = self.jobId
        url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
-        status = urllib2.urlopen(url).read()
+        status = urllib.request.urlopen(url).read()
        return status
    def resultTypes(self):
        """ Get the available result types. Will only work on a finished job. """
        url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
-        resultTypes = urllib2.urlopen(url).read()
+        resultTypes = urllib.request.urlopen(url).read()
        return resultTypes
    def result(self, resultType):
        """ Get the result of the given job of the specified type. """
        url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
        try:
-            result = urllib2.urlopen(url).read()
+            result = urllib.request.urlopen(url).read()
            if resultType == 'error':
                raise RuntimeError('An error occurred: %s' % result)
-        except urllib2.HTTPError:
+        except(urllib.error.HTTPError):
            if resultType == 'error':
                raise RuntimeError('An unknown error occurred while processing the job (check your input)')
            else:
@@ -424,8 +424,8 @@ class EBI(object):
        Return the output in the specified format. """
        params['email'] = self.__email__
        self.run(params)
-        print 'Submitted new', self.service, 'job, jobId:', self.jobId
+        print(('Submitted new', self.service, 'job, jobId:', self.jobId))
-        print 'Please be patient while the job is completed'
+        print('Please be patient while the job is completed')
        status = 'RUNNING'
        observe = 0
        while status == 'RUNNING':
@@ -434,7 +434,7 @@ class EBI(object):
            sleep(self.__checkInterval__)
        if status != 'FINISHED':
            raise RuntimeError('An error occurred and the job could not be completed')
-        print 'Job complete.'
+        print('Job complete.')
        self.removeLock()
        if type(resultTypes) != list:
            resultTypes = [resultTypes]
@@ -445,5 +445,3 @@ class EBI(object):
            return results[0]
        else:
            return results
--- a/wordcount.py
+++ b/wordcount.py
@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
                neg[word] = 1
    logratio = RCDict() # DNA dictionary for storing the log-ratio between pos and neg
-    for (word, cnt_pos) in pos.items():
+    for (word, cnt_pos) in list(pos.items()):
        cnt_neg = 0.0001
        try:
            cnt_neg = neg[word]
@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
            pass
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
-    allpos = logratio.items() # extract all pairs of words:log-ratio
+    allpos = list(logratio.items()) # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
-    print "Enriched words (sorted by ln pos/neg)"
+    print("Enriched words (sorted by ln pos/neg)")
-    print "Word    \tln pos/neg\tE-value"
+    print("Word    \tln pos/neg\tE-value")
    for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
        cnt_pos = int(pos[word])
        try: cnt_neg = int(neg[word])
@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
        pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
        # Correct for multiple testing (very conservatively)
        eval = pval * len(allpos)
-        print "%s\t%6.3f  \t%e" % (word, lgr, eval)
+        print("%s\t%6.3f  \t%e" % (word, lgr, eval))
 def getReverse(distribs):
    """ Construct a new list of probability distributions of DNA, by
@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
-    print "Motif %s:" % motif
+    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
-    print "Motif %s (reverse complement):" % motif
+    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    # plot the average score curve
    # print >> sys.stderr, ""
-    x = range(-(seq_len/2), (seq_len/2))    # call center of sequence X=0
+    x = list(range(-(seq_len/2), (seq_len/2)))    # call center of sequence X=0
    lbl = "%s" % (motif)
    plt.plot(x, avg_motif_score, label=lbl)
    #plt.plot(x, smoothed_avg_motif_score, label=lbl)
@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
-    print "Motif %s:" % motif
+    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
-    print "Motif %s (reverse complement):" % motif
+    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    # divide number of sequences with hit by total number of hits
    site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
-    print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits)
+    print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
    # STATISTICS
    # Get the cumulative hit counts in concentric windows
@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
    for i in range(hw, seq_len-motif_width+1-hw):
        smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
-    x = range(-(seq_len/2), (seq_len/2))        # call center of sequence X=0
+    x = list(range(-(seq_len/2), (seq_len/2)))        # call center of sequence X=0
    lbl = "%s, t=%.2f" % (motif, threshold)
    #lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
    plt.plot(x, smoothed_site_probability, label=lbl)
@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
 def usage(name, errmsg = None):
    if errmsg != None:
-        print "Error: %s" % errmsg
+        print("Error: %s" % errmsg)
-    print """Usage: %s [options]
+    print("""Usage: %s [options]
                -f <fasta-filename> (required)
                -d discover enriched words
                -w <word width, default 8>
                -p <peak width, default 100>
                -m <peak margin, default 100>
                -s <JASPAR-ID> scan for JASPAR motif
-                -h print this help""" % name
+                -h print this help""" % name)
 if __name__ == '__main__':
    try:
        optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
-    except getopt.GetoptError, err:
+    except getopt.GetoptError as err:
        usage(sys.argv[0], str(err))
        sys.exit(2)
    FILENAME =      None
@@ -301,7 +301,7 @@ if __name__ == '__main__':
        sys.exit(3)
    seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
    if DISCOVER_MODE:
-        print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
+        print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
        countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
    elif SCAN_MODE:
        scanMotifReport(seqs, MOTIF_ID)