Commit ac6c5d6b authored by Mikael Boden

python3_5

parent 934c2bff
...@@ -95,8 +95,8 @@ def betacf(a, b, x): ...@@ -95,8 +95,8 @@ def betacf(a, b, x):
h *= delta h *= delta
if (abs(delta-1.0) < EPS): break if (abs(delta-1.0) < EPS): break
if (m > MAXIT): print >> sys.stderr, ("a or b too big or MAXIT too small " if (m > MAXIT): print(("a or b too big or MAXIT too small "
"in betacf") "in betacf"), file=sys.stderr)
return h return h
...@@ -118,5 +118,5 @@ def gammaln(x): ...@@ -118,5 +118,5 @@ def gammaln(x):
def die(string): def die(string):
print >> sys.stderr, string print(string, file=sys.stderr)
...@@ -105,7 +105,7 @@ class GeneExpression: ...@@ -105,7 +105,7 @@ class GeneExpression:
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])} {'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
""" """
if names == None: if names == None:
return self.genes.keys() return list(self.genes.keys())
elif isinstance(names, str): elif isinstance(names, str):
return self.matrix[self.genes[names],:] return self.matrix[self.genes[names],:]
else: else:
...@@ -148,7 +148,7 @@ class GeneExpression: ...@@ -148,7 +148,7 @@ class GeneExpression:
except: except:
index = samples index = samples
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = self.matrix[ndx, index] mygenes[name] = self.matrix[ndx, index]
return mygenes return mygenes
...@@ -165,7 +165,7 @@ class GeneExpression: ...@@ -165,7 +165,7 @@ class GeneExpression:
sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort() sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
except: except:
sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort() sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles name_tuples = sorted(list(self.genes.items()), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
names = [] names = []
if descending: if descending:
for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order
...@@ -199,7 +199,7 @@ class GeneExpression: ...@@ -199,7 +199,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding ratios. """ Creates and returns a gene dictionary with the corresponding ratios. """
mygenes = {} mygenes = {}
mdiv = self.matrix[:, index1] / self.matrix[:, index2] mdiv = self.matrix[:, index1] / self.matrix[:, index2]
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = mdiv[ndx] mygenes[name] = mdiv[ndx]
return mygenes return mygenes
...@@ -208,7 +208,7 @@ class GeneExpression: ...@@ -208,7 +208,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding log-ratios. """ Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes = {} mygenes = {}
mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2]) mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = mlr[ndx] mygenes[name] = mlr[ndx]
return mygenes return mygenes
...@@ -218,7 +218,7 @@ class GeneExpression: ...@@ -218,7 +218,7 @@ class GeneExpression:
index = self.genes[probeID] index = self.genes[probeID]
profile = self.matrix[index, :] profile = self.matrix[index, :]
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
other = self.matrix[ndx, :] other = self.matrix[ndx, :]
mygenes[name] = pearson(profile, other) mygenes[name] = pearson(profile, other)
return mygenes return mygenes
...@@ -252,7 +252,7 @@ class GeneExpression: ...@@ -252,7 +252,7 @@ class GeneExpression:
# Calculate Z-score for the given column for each gene # Calculate Z-score for the given column for each gene
zscore = (self.matrix[:, index] - mu) / sd zscore = (self.matrix[:, index] - mu) / sd
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
try: try:
mygenes[name] = zscore[ndx, :] mygenes[name] = zscore[ndx, :]
except IndexError: except IndexError:
...@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0): ...@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
genes[name] = values genes[name] = values
if len(genes) == 0: if len(genes) == 0:
raise RuntimeError('No data in file') raise RuntimeError('No data in file')
print 'Data set %s contains %d entries' % (dataset, len(genes)) print('Data set %s contains %d genes' % (dataset, len(genes)))
if cnt_null > 0: if cnt_null > 0:
print 'Data set has %d null-values' % (cnt_null) print('Data set has %d null-values' % (cnt_null))
return GeneExpression(dataset, headers[2:], genes) return GeneExpression(dataset, headers[2:], genes)
...@@ -357,40 +357,29 @@ def pearson(X, Y): ...@@ -357,40 +357,29 @@ def pearson(X, Y):
return 0 return 0
return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar)) return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
# ------------------- Example --------------------- # ------------------- Example (basically exercise 7 in prac 9)---------------------
ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft') if __name__=='__main__':
ratio = GeneExpression('GDS3716_ratio') g = readGEOFile('GDS3198.soft', id_column = 1)
ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33, 0)) meanfold = {}
ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34, 1)) for gene in g.genes:
ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35, 2)) profile = g.getGenes(gene)
ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36, 3)) meanfold[gene] = (np.log2(profile[0] / profile[3]) + np.log2(profile[1] / profile[4]) + np.log2(profile[2] / profile[5])) / 3
ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37, 4))
ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38, 5))
ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39, 6))
ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40, 7))
ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41, 8))
ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24, 9))
ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
print ge3716.getHeaders()
import matplotlib.pyplot as plt
scores = [y for y in list(meanfold.values()) if not np.isnan(y)]
hist, bins = np.histogram(scores, bins=50)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.show()
z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead. result = sorted(list(meanfold.items()), key=lambda v: v[1])
print('========== Wildtype may down-regulate ==========')
ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1) for r in result[0:100]:
cln2_profile = ge38.getGenes('CLN2') print(r[0], r[1])
pcorr = ge38.getPearson('CLN2') print('========== Wildtype may up-regulate ==========')
gp = GeneExpression('Ex3', 'PC_CLN2', pcorr) for r in result[-1:-100:-1]:
sorted = gp.sort('PC_CLN2', True) print(r[0], r[1])
print sorted[0], ge38.getGenes(sorted[0])
print sorted[1], ge38.getGenes(sorted[1])
...@@ -138,7 +138,7 @@ class GibbsMotif(): ...@@ -138,7 +138,7 @@ class GibbsMotif():
LL += math.log(Qk / Pk) LL += math.log(Qk / Pk)
except ZeroDivisionError: except ZeroDivisionError:
pass pass
print "LL @ %5d=\t%5.2f" % (round, LL) print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop # end main for-loop
self.q = q self.q = q
...@@ -312,7 +312,7 @@ class GibbsAlign(): ...@@ -312,7 +312,7 @@ class GibbsAlign():
LL += math.log(Qk / Pk) LL += math.log(Qk / Pk)
except ZeroDivisionError: except ZeroDivisionError:
pass pass
print "LL @ %5d=\t%5.2f" % (round, LL) print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop # end main for-loop
self.q = q self.q = q
......
This diff is collapsed.
This diff is collapsed.
...@@ -21,7 +21,7 @@ class NN(): ...@@ -21,7 +21,7 @@ class NN():
self.b_hid = numpy.random.randn(nHidden) # biases hidden layer self.b_hid = numpy.random.randn(nHidden) # biases hidden layer
self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out
self.b_out = numpy.random.randn(nOutput) # biases output layer self.b_out = numpy.random.randn(nOutput) # biases output layer
print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)) print("Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)))
def writeFile(self, filename): def writeFile(self, filename):
""" Save NN to a file. """ """ Save NN to a file. """
...@@ -110,7 +110,7 @@ class NN(): ...@@ -110,7 +110,7 @@ class NN():
multi_targ = [ target ] multi_targ = [ target ]
for i in range(niter): for i in range(niter):
mse = 0.0 mse = 0.0
entries = range(len(multi_input)) entries = list(range(len(multi_input)))
if shuffle: if shuffle:
random.shuffle(entries) random.shuffle(entries)
for p in entries: for p in entries:
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
Module with methods and classes for phylogeny. Module with methods and classes for phylogeny.
@author: mikael @author: mikael
''' '''
##import sequence import sequence
class PhyloTree: class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships. """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
...@@ -141,6 +141,18 @@ class PhyloNode: ...@@ -141,6 +141,18 @@ class PhyloNode:
elif self.left and self.right: elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist return '(' + left + ',' + right + ')' + dist
def __le__(self, other):
    """ Returns True if this node orders at or before the other node, else False.
        The ordering is arbitrary but consistent with __eq__: it compares the
        hash values of the two nodes (see __hash__), not any biological property.
        A falsy other (e.g. None) always yields False. """
    if not other:
        return False  # always answer with a bool, never echo back None/0/''
    return hash(self) <= hash(other)

def __eq__(self, other):
    """ Returns True if the other node is equivalent to this node, else False.
        Equivalence is decided by comparing hash values (see __hash__), i.e. two
        nodes with the same label, dist and sequence are considered equal.
        A falsy other (e.g. None) always yields False. """
    if not other:
        return False  # always answer with a bool, never echo back None/0/''
    # Hash-based equality: two objects whose hashes collide compare as equal.
    return hash(self) == hash(other)

def __hash__(self):
    """ Returns hash of object, computed from the (label, dist, sequence) tuple.
        All three attributes must therefore be hashable. """
    # NOTE(review): the hash depends on mutable attributes (dist in particular).
    # If a node is used as a dict key or set member and one of these attributes
    # is changed afterwards, lookups for that node will miss -- verify callers
    # that key dictionaries on nodes.
    return hash((self.label, self.dist, self.sequence))
def _printSequences(self, start, end): def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """ """ Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = '' left = right = label = dist = ''
...@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False): ...@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
find the *closest* pair of clusters, and find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged). merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """ In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged for pair in D: # check all pairs which should be merged
dist = D[pair] dist = D[pair]
if dist < closest_dist or closest_dist == None: if closest_dist == None or dist < closest_dist:
closest_dist = dist closest_dist = dist
closest_pair = pair closest_pair = pair
# So we know the closest, now we need to merge... # So we know the closest, now we need to merge...
...@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False): ...@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
y = closest_pair[1] y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N Nx = N.pop(x, None) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N Ny = N.pop(y, None) # find number of sequences in y, remove the cluster from list N
if Nx == None or Ny == None:
continue
dz = {} # new distances to cluster z dz = {} # new distances to cluster z
for w in N: # for each node w ... for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y) # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
......
...@@ -277,7 +277,7 @@ def _readDistrib(linelist): ...@@ -277,7 +277,7 @@ def _readDistrib(linelist):
if len(d) == 0: if len(d) == 0:
return None return None
alpha = Alphabet(symstr) alpha = Alphabet(symstr)
if '*' in d.keys(): # tot provided if '*' in list(d.keys()): # tot provided
for sym in d: for sym in d:
if sym != '*': if sym != '*':
d[sym] = d[sym] * d['*'] d[sym] = d[sym] * d['*']
...@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'): ...@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
ncol = len(counts) ncol = len(counts)
if len(name) == 1: # proper symbol if len(name) == 1: # proper symbol
symcount[name] = counts symcount[name] = counts
alpha = Alphabet(''.join(symcount.keys())) alpha = Alphabet(''.join(list(symcount.keys())))
distribs = [] distribs = []
for col in range(ncol): for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount]) d = dict([(sym, symcount[sym][col]) for sym in symcount])
...@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'): ...@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
""" """
d = readMultiCounts(filename, format=format) d = readMultiCounts(filename, format=format)
if len(d) > 0: if len(d) > 0:
return d.values()[0] return list(d.values())[0]
################################################################################################# #################################################################################################
# Joint class # Joint class
...@@ -628,12 +628,12 @@ class IndepJoint(Joint): ...@@ -628,12 +628,12 @@ class IndepJoint(Joint):
def displayMatrix(self, count = False): def displayMatrix(self, count = False):
""" Pretty-print matrix """ """ Pretty-print matrix """
print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas)))) print((" \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))))
for a in self.alphas[0]: for a in self.alphas[0]:
if count: if count:
print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True))) print(("%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))))
else: else:
print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a))) print(("%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))))
def __str__(self): def __str__(self):
""" Text representation of the table. Note that size is an issue so big tables """ Text representation of the table. Note that size is an issue so big tables
...@@ -718,5 +718,3 @@ class NaiveBayes(): ...@@ -718,5 +718,3 @@ class NaiveBayes():
prob *= condprob[i][key[i]] or 0.0 prob *= condprob[i][key[i]] or 0.0
out.observe(outsym, prob) out.observe(outsym, prob)
return out return out
This diff is collapsed.
...@@ -381,11 +381,11 @@ class BedFile(): ...@@ -381,11 +381,11 @@ class BedFile():
index_name = {} index_name = {}
for i in range(len(self.rows)): for i in range(len(self.rows)):
row = self.rows[i] row = self.rows[i]
if not index_start.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_start: # seeing chromosome entry first time
index_start[row.chrom] = [] index_start[row.chrom] = []
if not index_centre.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_centre: # seeing chromosome entry first time
index_centre[row.chrom] = [] index_centre[row.chrom] = []
if not index_end.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = [] index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i)) index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i)) index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
...@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None): ...@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
for row in entries: for row in entries:
if format == 'Peaks': if format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser #f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue)) f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited': elif format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd)) f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
else: else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand)) f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n") f.write("\n")
f.close() f.close()
...@@ -760,7 +760,7 @@ try: ...@@ -760,7 +760,7 @@ try:
except ImportError: except ImportError:
strerror = lambda x: 'strerror not supported' strerror = lambda x: 'strerror not supported'
from os.path import exists from os.path import exists
from itertools import izip from itertools import chain
def true_long_type(): def true_long_type():
""" """
...@@ -805,7 +805,7 @@ def base_to_bin(x): ...@@ -805,7 +805,7 @@ def base_to_bin(x):
def create_byte_table(): def create_byte_table():
"""create BYTE_TABLE""" """create BYTE_TABLE"""
d = {} d = {}
for x in xrange(2**8): for x in range(2**8):
d[x] = byte_to_bases(x) d[x] = byte_to_bases(x)
return d return d
...@@ -821,9 +821,9 @@ def split16(x): ...@@ -821,9 +821,9 @@ def split16(x):
def create_twobyte_table(): def create_twobyte_table():
"""create TWOBYTE_TABLE""" """create TWOBYTE_TABLE"""
d = {} d = {}
for x in xrange(2**16): for x in range(2**16):
c, f = split16(x) c, f = split16(x)
d[x] = byte_to_bases(c) + byte_to_bases(f) d[x] = chain(byte_to_bases(c), byte_to_bases(f))
return d return d
BYTE_TABLE = create_byte_table() BYTE_TABLE = create_byte_table()
...@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size): ...@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
""" """
longs_len = len(longs) longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size) # dna = ctypes.create_string_buffer(array_size)
dna = array('c', 'N' * longs_len) dna = array('b', 'N' * longs_len)
# translate from 32-bit blocks to bytes # translate from 32-bit blocks to bytes
# this method ensures correct endianness (byteswap as needed) # this method ensures correct endianness (byteswap as needed)
bytes = array('B') bytes = array('B')
...@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size): ...@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)]) first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset i = 16 - first_base_offset
if array_size < i: i = array_size if array_size < i: i = array_size
dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i]) dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist) # middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]: for byte in bytes[4:-4]:
dna[i:i + 4] = array('c', BYTE_TABLE[byte]) dna[i:i + 4] = array('b', BYTE_TABLE[byte])
i += 4 i += 4
# last block # last block
last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)])) last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset] dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna return dna
...@@ -889,7 +889,7 @@ class TwoBitFile(dict): ...@@ -889,7 +889,7 @@ class TwoBitFile(dict):
self._file_handle = open(foo, 'rb') self._file_handle = open(foo, 'rb')
self._load_header() self._load_header()
self._load_index() self._load_index()
for name, offset in self._offset_dict.iteritems(): for name, offset in self._offset_dict.items():
self[name] = TwoBitSequence(self._file_handle, offset, self[name] = TwoBitSequence(self._file_handle, offset,
self._byteswapped) self._byteswapped)
return return
...@@ -926,13 +926,16 @@ class TwoBitFile(dict): ...@@ -926,13 +926,16 @@ class TwoBitFile(dict):
if remaining == 0: break if remaining == 0: break
name_size = array('B') name_size = array('B')
name_size.fromfile(file_handle, 1) name_size.fromfile(file_handle, 1)
if byteswapped: name_size.byteswap() if byteswapped:
name = array('c') name_size.byteswap()
if byteswapped: name.byteswap() name = array('b')
if byteswapped:
name.byteswap()
name.fromfile(file_handle, name_size[0]) name.fromfile(file_handle, name_size[0])
offset = array(LONG) offset = array(LONG)
offset.fromfile(file_handle, 1) offset.fromfile(file_handle, 1)
if byteswapped: offset.byteswap() if byteswapped:
offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0])) sequence_offsets.append((name.tostring(), offset[0]))
remaining -= 1 remaining -= 1
self._sequence_offsets = sequence_offsets self._sequence_offsets = sequence_offsets
...@@ -943,7 +946,7 @@ class TwoBitFile(dict): ...@@ -943,7 +946,7 @@ class TwoBitFile(dict):
d = {} d = {}
file_handle = self._file_handle file_handle = self._file_handle
byteswapped = self._byteswapped byteswapped = self._byteswapped
for name, offset in self._offset_dict.iteritems(): for name, offset in self._offset_dict.items():
file_handle.seek(offset) file_handle.seek(offset)
dna_size = array(LONG) dna_size = array(LONG)
dna_size.fromfile(file_handle, 1) dna_size.fromfile(file_handle, 1)
...@@ -1078,7 +1081,7 @@ class TwoBitSequence(object): ...@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
if byteswapped: fourbyte_dna.byteswap() if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset, string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size) last_base_offset, region_size)
for start, size in izip(n_block_starts, n_block_sizes): for start, size in zip(n_block_starts, n_block_sizes):
end = start + size end = start + size
if end <= min_: continue if end <= min_: continue
if start > max_: break if start > max_: break
...@@ -1086,14 +1089,14 @@ class TwoBitSequence(object): ...@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
if end > max_: end = max_ if end > max_: end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('c', 'N'*(end-start)) string_as_array[start:end] = array('b', 'N'*(end-start))
lower = str.lower lower = str.lower
first_masked_region = max(0, first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1) bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts), last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_, 1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region)) lo=first_masked_region))
for start, size in izip(mask_block_starts[first_masked_region:last_masked_region], for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]): mask_block_sizes[first_masked_region:last_masked_region]):
end = start + size end = start + size
if end <= min_: continue if end <= min_: continue
...@@ -1102,9 +1105,9 @@ class TwoBitSequence(object): ...@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
if end > max_: end = max_ if end > max_: end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring())) string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
if not len(string_as_array) == max_ - min_: if not len(string_as_array) == max_ - min_:
raise RuntimeError, "Sequence was longer than it should be" raise RuntimeError("Sequence was longer than it should be")
if reverse: if reverse:
return self.reverseComplement(string_as_array.tostring()) return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring() return string_as_array.tostring()
...@@ -1124,7 +1127,7 @@ class TwoBitSequence(object): ...@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
""" """
return self.__getslice__(0, None) return self.__getslice__(0, None)
class TwoBitFileError(StandardError): class TwoBitFileError(Exception):
""" """
Base exception for TwoBit module Base exception for TwoBit module
""" """
......
...@@ -55,10 +55,11 @@ class Sequence(object): ...@@ -55,10 +55,11 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """ 'R', 'S', 'T', 'V', 'W', 'Y'] """
try: # convert sequence data into a compact array representation #try: # convert sequence data into a compact array representation
self.sequence = array.array('c', ''.join([s.upper() for s in sequence])) # self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
except TypeError: #except TypeError:
raise RuntimeError('Sequence data is not specified correctly: must be iterable') # raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self.sequence = sequence
# Assign an alphabet # Assign an alphabet
self.alphabet = None self.alphabet = None
...@@ -133,15 +134,15 @@ class Sequence(object): ...@@ -133,15 +134,15 @@ class Sequence(object):
Calling self.__getitem__(3) is equivalent to self[3] Calling self.__getitem__(3) is equivalent to self[3]
""" """
if type(ndx) is slice: if type(ndx) is slice:
return self.sequence[ndx].tostring() return ''.join(self.sequence[ndx])
else: else:
return self.sequence[ndx] return self.sequence[ndx]
def writeFasta(self): def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """ """ Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n' fasta = '>' + self.name + ' ' + self.info + '\n'
data = self.sequence.tostring() data = ''.join(self.sequence)
nlines = (len(self.sequence) - 1) / 60 + 1 nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1))
for i in range(nlines): for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n' lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq fasta += lineofseq
...@@ -164,7 +165,7 @@ class Sequence(object): ...@@ -164,7 +165,7 @@ class Sequence(object):
def find(self, findme): def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """ """ Find the position of the specified symbol or sub-sequence """
return self.sequence.tostring().find(findme) return ''.join(self.sequence).find(findme)
""" """
Below are some useful methods for loading data from strings and files. Below are some useful methods for loading data from strings and files.
...@@ -438,8 +439,8 @@ class Alignment(): ...@@ -438,8 +439,8 @@ class Alignment():
column index, entropy, number of gaps, and symbols in order of decreasing probability. column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case, theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """ theta2 is the threshold for showing symbols at all, and in lower case. """
print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen) print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2) print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
for col in range(self.alignlen): for col in range(self.alignlen):
d = Distrib(self.alphabet) d = Distrib(self.alphabet)
gaps = 0 gaps = 0
...@@ -448,21 +449,21 @@ class Alignment(): ...@@ -448,21 +449,21 @@ class Alignment():
d.observe(seq[col]) d.observe(seq[col])
else: else:
gaps += 1 gaps += 1
print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps, print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
symprobs = d.getProbsort() symprobs = d.getProbsort()
(_, maxprob) = symprobs[0] (_, maxprob) = symprobs[0]
if maxprob >= theta1: if maxprob >= theta1:
print "%d\tTRUE\t" % int(maxprob * 100), print(("%d\tTRUE\t" % int(maxprob * 100),))
else: else:
print "%d\t\t" % int(maxprob * 100), print(("%d\t\t" % int(maxprob * 100),))
for (sym, prob) in symprobs: for (sym, prob) in symprobs:
if prob >= theta1: if prob >= theta1:
print sym, "%d%%" % int(prob * 100), print((sym, "%d%%" % int(prob * 100),))
elif prob >= theta2 and lowercase: elif prob >= theta2 and lowercase:
print sym.lower(), "%d%%" % int(prob * 100), print((sym.lower(), "%d%%" % int(prob * 100),))
elif prob >= theta2: elif prob >= theta2:
print sym, "%d%%" % int(prob * 100), print((sym, "%d%%" % int(prob * 100),))
print print()
def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False): def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
""" Display a table with rows for each alignment column, showing """ Display a table with rows for each alignment column, showing
...@@ -681,10 +682,9 @@ class Alignment(): ...@@ -681,10 +682,9 @@ class Alignment():
htmlstr += html htmlstr += html
htmlstr += '<pre>' htmlstr += '<pre>'
if filename: if filename:
fh = open(filename, 'w') with open(filename, 'w+') as fh:
fh.write(htmlstr) fh.write(htmlstr)
fh.write('</body></html>\n') fh.write('</body></html>\n')
fh.close()
else: else:
return htmlstr return htmlstr
...@@ -985,12 +985,12 @@ def readClustal(string, alphabet): ...@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
index = name.find('/') index = name.find('/')
if index >= 0: if index >= 0:
name = name[0:index] name = name[0:index]
if seqs.has_key(name): if name in seqs:
seqs[name] += seqstr seqs[name] += seqstr
else: else:
seqs[name] = seqstr seqs[name] = seqstr
sequences = [] sequences = []
for name, seqstr in seqs.items(): for name, seqstr in list(seqs.items()):
sequences.append(Sequence(seqstr, alphabet, name, gappy = True)) sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
return Alignment(sequences) return Alignment(sequences)
...@@ -1180,12 +1180,12 @@ class PWM(object): ...@@ -1180,12 +1180,12 @@ class PWM(object):
def display(self, format = 'COLUMN'):
    """ Print the weight matrix to standard output, one row per alphabet symbol.
    format: 'COLUMN' prints a header row of 1-based column numbers, then each
            symbol followed by a tab and its scores;
            'JASPAR' prints no header and encloses each row of scores in
            square brackets.
    Any other format value prints nothing. """
    if format == 'COLUMN':
        # Header: column numbers, each padded to line up with the "%+6.2f" scores below
        print(" \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length))))
        row_template = "%s\t%s"
    elif format == 'JASPAR':
        row_template = "%s\t[%s]"
    else:
        return
    # Both formats share the same row layout; only the bracket decoration differs
    for j in range(len(self.alphabet)):
        scores = ' '.join("%+6.2f" % (y) for y in self.m[j])
        print(row_template % (self.alphabet[j], scores))
def search(self, sequence, lowerBound=0): def search(self, sequence, lowerBound=0):
""" Find matches to the motif in a specified sequence. Returns a list """ Find matches to the motif in a specified sequence. Returns a list
...@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None): ...@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for i in range(MAX_TRY): for i in range(MAX_TRY):
try: try:
fastaData = fetch(id, database) fastaData = fetch(id, database).decode("utf-8")
seq = readFasta(fastaData)[0] seq = readFasta(fastaData)[0]
break break
except: except:
from time import sleep from time import sleep
print 'Failed on {i}th try for id {id}'.format(i=i, id=id) print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
sleep(0.1) sleep(0.1)
try: try:
return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info) return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
...@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'): ...@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
if __name__ == '__main__':
    # Ad-hoc check: load a gapped protein alignment and report how many sequences were read
    seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
    # Pass the pieces as separate arguments: wrapping them in a tuple makes
    # print() emit the tuple's repr, e.g. "('Read', 3, 'sequences')"
    print('Read', len(seqs), 'sequences')
...@@ -71,7 +71,7 @@ class SeqNN(): ...@@ -71,7 +71,7 @@ class SeqNN():
im[row, _onehotIndex(alpha, subseqs[k])] = 1 im[row, _onehotIndex(alpha, subseqs[k])] = 1
if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1 if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
row += 1 row += 1
print "There are", row, "entries in data set" print("There are", row, "entries in data set")
if targets: if targets:
return im, om return im, om
else: else:
...@@ -85,7 +85,7 @@ class SeqNN(): ...@@ -85,7 +85,7 @@ class SeqNN():
im, om = self._encodeseq(seqs, targets) im, om = self._encodeseq(seqs, targets)
for i in range(niter): # train first NN for i in range(niter): # train first NN
rmse = self.nn1.train(im, om, eta = eta, niter = 1) rmse = self.nn1.train(im, om, eta = eta, niter = 1)
print i, ":", rmse print(i, ":", rmse)
if not self.cascade: # if there's no cascaded NN, finish here if not self.cascade: # if there's no cascaded NN, finish here
return rmse return rmse
nn1seqs = [] # a list of new SS sequences ... nn1seqs = [] # a list of new SS sequences ...
...@@ -95,7 +95,7 @@ class SeqNN(): ...@@ -95,7 +95,7 @@ class SeqNN():
im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
for i in range(niter): # train cascaded NN for i in range(niter): # train cascaded NN
rmse = self.nn2.train(im, om, eta = eta, niter = 1) rmse = self.nn2.train(im, om, eta = eta, niter = 1)
print i, ":", rmse print(i, ":", rmse)
return rmse return rmse
def testAll(self, seqs, targets): def testAll(self, seqs, targets):
......
...@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4): ...@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
specified width average of 100. specified width average of 100.
""" """
sum = 0.0 sum = 0.0
order = range(0, len(calls) - 1, +1) # we are extending calls downstream order = list(range(0, len(calls) - 1, +1)) # we are extending calls downstream
cnt = 0 cnt = 0
for i in order: # extend to the right for i in order: # extend to the right
if calls[i]: # to extend a call is required in the first place if calls[i]: # to extend a call is required in the first place
...@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4): ...@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
AND extend this list upstream containing a specified width average of 100. AND extend this list upstream containing a specified width average of 100.
""" """
sum = 0.0 sum = 0.0
order = range(len(calls) - 1, 0, -1) # we are extending calls upstream/to-the-left order = list(range(len(calls) - 1, 0, -1)) # we are extending calls upstream/to-the-left
cnt = 0 cnt = 0
for i in order: # extend to the right for i in order: # extend to the right
if calls[i]: # a requirement to extend is to have a call in the first place if calls[i]: # a requirement to extend is to have a call in the first place
......
...@@ -291,7 +291,7 @@ class TupleEntries(object): ...@@ -291,7 +291,7 @@ class TupleEntries(object):
def __iter__(self): def __iter__(self):
return self return self
def next(self): def __next__(self):
""" Step through sequence of entries, either """ Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or (if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations.""" (if sparse) with calls to tuple store based on all possible symbol combinations."""
......
import urllib, urllib2 import urllib.request
import os import os
from time import sleep from time import sleep
import stats import stats
from StringIO import StringIO from io import StringIO
import gzip import gzip
""" This module is collection of functions for accessing the EBI REST web services, """ This module is collection of functions for accessing the EBI REST web services,
...@@ -32,11 +32,11 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'): ...@@ -32,11 +32,11 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry # Get the entry
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
if data.startswith('ERROR'): if data.startswith(b'ERROR'):
raise RuntimeError(data) raise RuntimeError(data)
return data return data
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def search(query, dbName='uniprot', format='list', limit=100): def search(query, dbName='uniprot', format='list', limit=100):
...@@ -57,12 +57,12 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -57,12 +57,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries # Get the entries
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
if format == 'list': if format == 'list':
return data.splitlines() return data.splitlines()
else: else:
return data return data
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
elif dbName.startswith('refseq'): elif dbName.startswith('refseq'):
dbs = dbName.split(":") dbs = dbName.split(":")
...@@ -72,7 +72,7 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -72,7 +72,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit) url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries # Get the entries
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
words = data.split("</Id>") words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]] words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list': if format == 'list':
...@@ -81,11 +81,11 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -81,11 +81,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id=" url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words: for w in words:
url += w + "," url += w + ","
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
return data return data
else: else:
return '' return ''
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
return return
...@@ -121,8 +121,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False) ...@@ -121,8 +121,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False)
'query' : query 'query' : query
} }
if len(query) > 0: if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params)) request = urllib.request.Request(url, urllib.parse.urlencode(params))
response = urllib2.urlopen(request).read() response = urllib.request.urlopen(request).read()
d = dict() d = dict()
for row in response.splitlines()[1:]: for row in response.splitlines()[1:]:
pair = row.split('\t') pair = row.split('\t')
...@@ -170,7 +170,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'): ...@@ -170,7 +170,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
if background == None: if background == None:
for t in term_set: for t in term_set:
term_cnt[t] = fg_list.count(t) term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
else: # a background is provided else: # a background is provided
for t in term_set: for t in term_set:
fg_hit = fg_list.count(t) fg_hit = fg_list.count(t)
...@@ -178,7 +178,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'): ...@@ -178,7 +178,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
fg_nohit = nPos - fg_hit fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False)) term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = [] ret = []
for t in sorted_cnt: for t in sorted_cnt:
...@@ -199,17 +199,17 @@ def getGODef(goterm): ...@@ -199,17 +199,17 @@ def getGODef(goterm):
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
entry={'id': None, 'name': None, 'def': None} entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
for row in data.splitlines(): for row in data.splitlines():
index = row.find(':') index = row.find(':')
if index > 0 and len(row[index:]) > 1: if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip() field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces and quotation marks value = row[index+1:].strip(' "') # remove spaces and quotation marks
if field in entry.keys(): # check if we need this field if field in list(entry.keys()): # check if we need this field
if entry[field] == None: # check if not yet assigned if entry[field] == None: # check if not yet assigned
entry[field] = value entry[field] = value
return entry return entry
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False): def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
...@@ -246,9 +246,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False): ...@@ -246,9 +246,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
url = __ebiGOUrl__ + uri_string + query url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
urlreq = urllib2.Request(url) urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip') urlreq.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(urlreq) response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip': if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read()) buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf) f = gzip.GzipFile(fileobj=buf)
...@@ -259,12 +259,12 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False): ...@@ -259,12 +259,12 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
key = values[1] key = values[1]
if termsmap.has_key(key): if key in termsmap:
termsmap[key].add(values[6]) termsmap[key].add(values[6])
else: else:
termsmap[key] = set([values[6]]) termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4]) taxonmap[key] = int(values[4])
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if completeAnnot: if completeAnnot:
if len(genes) == 1: if len(genes) == 1:
...@@ -304,13 +304,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None): ...@@ -304,13 +304,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
url = __ebiGOUrl__ + uri_string + goterm.strip() url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
genes.add(values[1]) genes.add(values[1])
map[goterm] = list(genes) map[goterm] = list(genes)
except urllib2.HTTPError, ex: except(urllib.error.HTTPError, ex):
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if len(goterms) == 1: if len(goterms) == 1:
return map[goterms[0]] return map[goterms[0]]
...@@ -381,12 +381,12 @@ class EBI(object): ...@@ -381,12 +381,12 @@ class EBI(object):
databaseData = '' databaseData = ''
for db in databaseList: for db in databaseList:
databaseData += '&database=' + db databaseData += '&database=' + db
encodedParams = urllib.urlencode(params) encodedParams = urllib.parse.urlencode(params)
encodedParams += databaseData encodedParams += databaseData
else: else:
encodedParams = urllib.urlencode(params) encodedParams = urllib.parse.urlencode(params)
print url print(url)
self.jobId = urllib2.urlopen(url, encodedParams).read() self.jobId = urllib.request.urlopen(url, encodedParams).read()
self.createLock() self.createLock()
return self.jobId return self.jobId
...@@ -396,23 +396,23 @@ class EBI(object): ...@@ -396,23 +396,23 @@ class EBI(object):
if jobId is None: if jobId is None:
jobId = self.jobId jobId = self.jobId
url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
status = urllib2.urlopen(url).read() status = urllib.request.urlopen(url).read()
return status return status
def resultTypes(self):
    """ Get the available result types. Will only work on a finished job.
    Returns the raw response body exactly as read from the service (bytes under Python 3). """
    url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
    # Close the connection as soon as the body has been read instead of leaving it open
    with urllib.request.urlopen(url) as response:
        return response.read()
def result(self, resultType): def result(self, resultType):
""" Get the result of the given job of the specified type. """ """ Get the result of the given job of the specified type. """
url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType) url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
try: try:
result = urllib2.urlopen(url).read() result = urllib.request.urlopen(url).read()
if resultType == 'error': if resultType == 'error':
raise RuntimeError('An error occurred: %s' % result) raise RuntimeError('An error occurred: %s' % result)
except urllib2.HTTPError: except(urllib.error.HTTPError):
if resultType == 'error': if resultType == 'error':
raise RuntimeError('An unknown error occurred while processing the job (check your input)') raise RuntimeError('An unknown error occurred while processing the job (check your input)')
else: else:
...@@ -424,8 +424,8 @@ class EBI(object): ...@@ -424,8 +424,8 @@ class EBI(object):
Return the output in the specified format. """ Return the output in the specified format. """
params['email'] = self.__email__ params['email'] = self.__email__
self.run(params) self.run(params)
print 'Submitted new', self.service, 'job, jobId:', self.jobId print(('Submitted new', self.service, 'job, jobId:', self.jobId))
print 'Please be patient while the job is completed' print('Please be patient while the job is completed')
status = 'RUNNING' status = 'RUNNING'
observe = 0 observe = 0
while status == 'RUNNING': while status == 'RUNNING':
...@@ -434,7 +434,7 @@ class EBI(object): ...@@ -434,7 +434,7 @@ class EBI(object):
sleep(self.__checkInterval__) sleep(self.__checkInterval__)
if status != 'FINISHED': if status != 'FINISHED':
raise RuntimeError('An error occurred and the job could not be completed') raise RuntimeError('An error occurred and the job could not be completed')
print 'Job complete.' print('Job complete.')
self.removeLock() self.removeLock()
if type(resultTypes) != list: if type(resultTypes) != list:
resultTypes = [resultTypes] resultTypes = [resultTypes]
...@@ -445,5 +445,3 @@ class EBI(object): ...@@ -445,5 +445,3 @@ class EBI(object):
return results[0] return results[0]
else: else:
return results return results
...@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
neg[word] = 1 neg[word] = 1
logratio = RCDict() # DNA dictionary for storing the log-ration between pos and neg logratio = RCDict() # DNA dictionary for storing the log-ration between pos and neg
for (word, cnt_pos) in pos.items(): for (word, cnt_pos) in list(pos.items()):
cnt_neg = 0.0001 cnt_neg = 0.0001
try: try:
cnt_neg = neg[word] cnt_neg = neg[word]
...@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pass pass
logratio[word] = math.log(float(cnt_pos) / float(cnt_neg)) logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
allpos = logratio.items() # extract all pairs of words:log-ratio allpos = list(logratio.items()) # extract all pairs of words:log-ratio
sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
print "Enriched words (sorted by ln pos/neg)" print("Enriched words (sorted by ln pos/neg)")
print "Word \tln pos/neg\tE-value" print("Word \tln pos/neg\tE-value")
for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
cnt_pos = int(pos[word]) cnt_pos = int(pos[word])
try: cnt_neg = int(neg[word]) try: cnt_neg = int(neg[word])
...@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False) pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
# Correct for multiple testing (very conservatively) # Correct for multiple testing (very conservatively)
eval = pval * len(allpos) eval = pval * len(allpos)
print "%s\t%6.3f \t%e" % (word, lgr, eval) print("%s\t%6.3f \t%e" % (word, lgr, eval))
def getReverse(distribs): def getReverse(distribs):
""" Construct a new list of probability distributions of DNA, by """ Construct a new list of probability distributions of DNA, by
...@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'): ...@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
except KeyError: except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif) usage(sys.argv[0], "Unknown motif %s" % motif)
return return
print "Motif %s:" % motif print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg) pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR') pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg) pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR') pwm2.display(format='JASPAR')
...@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'): ...@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
# plot the average score curve # plot the average score curve
# print >> sys.stderr, "" # print >> sys.stderr, ""
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0 x = list(range(-(seq_len/2), (seq_len/2))) # call center of sequence X=0
lbl = "%s" % (motif) lbl = "%s" % (motif)
plt.plot(x, avg_motif_score, label=lbl) plt.plot(x, avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl) #plt.plot(x, smoothed_avg_motif_score, label=lbl)
...@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
except KeyError: except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif) usage(sys.argv[0], "Unknown motif %s" % motif)
return return
print "Motif %s:" % motif print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg) pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR') pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg) pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR') pwm2.display(format='JASPAR')
...@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
# divide number of sequences with hit by total number of hits # divide number of sequences with hit by total number of hits
site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ] site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits) print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
# STATISTICS # STATISTICS
# Get the cumulative hit counts in concentric windows # Get the cumulative hit counts in concentric windows
...@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
for i in range(hw, seq_len-motif_width+1-hw): for i in range(hw, seq_len-motif_width+1-hw):
smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1) smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0 x = list(range(-(seq_len/2), (seq_len/2))) # call center of sequence X=0
lbl = "%s, t=%.2f" % (motif, threshold) lbl = "%s, t=%.2f" % (motif, threshold)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue)) #lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt.plot(x, smoothed_site_probability, label=lbl) plt.plot(x, smoothed_site_probability, label=lbl)
...@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
def usage(name, errmsg = None):
    """ Print command-line help to standard output.
    name: program name, substituted into the 'Usage:' line
    errmsg: if not None, printed first as 'Error: <errmsg>' """
    if errmsg is not None:
        print("Error: %s" % errmsg)
    print("""Usage: %s [options]
    -f <fasta-filename> (required)
    -d discover enriched words
    -w <word width, default 8>
    -p <peak width, default 100>
    -m <peak margin, default 100>
    -s <JASPAR-ID> scan for JASPAR motif
    -h print this help""" % name)
if __name__ == '__main__': if __name__ == '__main__':
try: try:
optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:') optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
except getopt.GetoptError, err: except getopt.GetoptError as err:
usage(sys.argv[0], str(err)) usage(sys.argv[0], str(err))
sys.exit(2) sys.exit(2)
FILENAME = None FILENAME = None
...@@ -301,7 +301,7 @@ if __name__ == '__main__': ...@@ -301,7 +301,7 @@ if __name__ == '__main__':
sys.exit(3) sys.exit(3)
seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN) seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
if DISCOVER_MODE: if DISCOVER_MODE:
print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN) print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN) countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE: elif SCAN_MODE:
scanMotifReport(seqs, MOTIF_ID) scanMotifReport(seqs, MOTIF_ID)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment