Commit ac6c5d6b authored by Mikael Boden

python3_5

parent 934c2bff
...@@ -95,8 +95,8 @@ def betacf(a, b, x): ...@@ -95,8 +95,8 @@ def betacf(a, b, x):
h *= delta h *= delta
if (abs(delta-1.0) < EPS): break if (abs(delta-1.0) < EPS): break
if (m > MAXIT): print >> sys.stderr, ("a or b too big or MAXIT too small " if (m > MAXIT): print(("a or b too big or MAXIT too small "
"in betacf") "in betacf"), file=sys.stderr)
return h return h
...@@ -118,5 +118,5 @@ def gammaln(x): ...@@ -118,5 +118,5 @@ def gammaln(x):
def die(string): def die(string):
print >> sys.stderr, string print(string, file=sys.stderr)
...@@ -105,7 +105,7 @@ class GeneExpression: ...@@ -105,7 +105,7 @@ class GeneExpression:
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])} {'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
""" """
if names == None: if names == None:
return self.genes.keys() return list(self.genes.keys())
elif isinstance(names, str): elif isinstance(names, str):
return self.matrix[self.genes[names],:] return self.matrix[self.genes[names],:]
else: else:
...@@ -148,7 +148,7 @@ class GeneExpression: ...@@ -148,7 +148,7 @@ class GeneExpression:
except: except:
index = samples index = samples
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = self.matrix[ndx, index] mygenes[name] = self.matrix[ndx, index]
return mygenes return mygenes
...@@ -165,7 +165,7 @@ class GeneExpression: ...@@ -165,7 +165,7 @@ class GeneExpression:
sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort() sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
except: except:
sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort() sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles name_tuples = sorted(list(self.genes.items()), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
names = [] names = []
if descending: if descending:
for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order
...@@ -199,7 +199,7 @@ class GeneExpression: ...@@ -199,7 +199,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding ratios. """ Creates and returns a gene dictionary with the corresponding ratios. """
mygenes = {} mygenes = {}
mdiv = self.matrix[:, index1] / self.matrix[:, index2] mdiv = self.matrix[:, index1] / self.matrix[:, index2]
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = mdiv[ndx] mygenes[name] = mdiv[ndx]
return mygenes return mygenes
...@@ -208,7 +208,7 @@ class GeneExpression: ...@@ -208,7 +208,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding log-ratios. """ Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes = {} mygenes = {}
mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2]) mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
mygenes[name] = mlr[ndx] mygenes[name] = mlr[ndx]
return mygenes return mygenes
...@@ -218,7 +218,7 @@ class GeneExpression: ...@@ -218,7 +218,7 @@ class GeneExpression:
index = self.genes[probeID] index = self.genes[probeID]
profile = self.matrix[index, :] profile = self.matrix[index, :]
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
other = self.matrix[ndx, :] other = self.matrix[ndx, :]
mygenes[name] = pearson(profile, other) mygenes[name] = pearson(profile, other)
return mygenes return mygenes
...@@ -252,7 +252,7 @@ class GeneExpression: ...@@ -252,7 +252,7 @@ class GeneExpression:
# Calculate Z-score for the given column for each gene # Calculate Z-score for the given column for each gene
zscore = (self.matrix[:, index] - mu) / sd zscore = (self.matrix[:, index] - mu) / sd
mygenes = {} mygenes = {}
for (name, ndx) in self.genes.items(): for (name, ndx) in list(self.genes.items()):
try: try:
mygenes[name] = zscore[ndx, :] mygenes[name] = zscore[ndx, :]
except IndexError: except IndexError:
...@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0): ...@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
genes[name] = values genes[name] = values
if len(genes) == 0: if len(genes) == 0:
raise RuntimeError('No data in file') raise RuntimeError('No data in file')
print 'Data set %s contains %d entries' % (dataset, len(genes)) print('Data set %s contains %d genes' % (dataset, len(genes)))
if cnt_null > 0: if cnt_null > 0:
print 'Data set has %d null-values' % (cnt_null) print('Data set has %d null-values' % (cnt_null))
return GeneExpression(dataset, headers[2:], genes) return GeneExpression(dataset, headers[2:], genes)
...@@ -357,40 +357,29 @@ def pearson(X, Y): ...@@ -357,40 +357,29 @@ def pearson(X, Y):
return 0 return 0
return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar)) return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
# ------------------- Example --------------------- # ------------------- Example (basically exercise 7 in prac 9)---------------------
ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft') if __name__=='__main__':
ratio = GeneExpression('GDS3716_ratio') g = readGEOFile('GDS3198.soft', id_column = 1)
ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33, 0)) meanfold = {}
ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34, 1)) for gene in g.genes:
ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35, 2)) profile = g.getGenes(gene)
ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36, 3)) meanfold[gene] = (np.log2(profile[0] / profile[3]) + np.log2(profile[1] / profile[4]) + np.log2(profile[2] / profile[5])) / 3
ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37, 4))
ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38, 5)) import matplotlib.pyplot as plt
ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39, 6)) scores = [y for y in list(meanfold.values()) if not np.isnan(y)]
ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40, 7)) hist, bins = np.histogram(scores, bins=50)
ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41, 8)) width = 0.7 * (bins[1] - bins[0])
ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24, 9)) center = (bins[:-1] + bins[1:]) / 2
ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10)) plt.bar(center, hist, align='center', width=width)
ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11)) plt.show()
ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
print ge3716.getHeaders()
z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
cln2_profile = ge38.getGenes('CLN2')
pcorr = ge38.getPearson('CLN2')
gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
sorted = gp.sort('PC_CLN2', True)
print sorted[0], ge38.getGenes(sorted[0])
print sorted[1], ge38.getGenes(sorted[1])
result = sorted(list(meanfold.items()), key=lambda v: v[1])
print('========== Wildtype may down-regulate ==========')
for r in result[0:100]:
print(r[0], r[1])
print('========== Wildtype may up-regulate ==========')
for r in result[-1:-100:-1]:
print(r[0], r[1])
...@@ -138,7 +138,7 @@ class GibbsMotif(): ...@@ -138,7 +138,7 @@ class GibbsMotif():
LL += math.log(Qk / Pk) LL += math.log(Qk / Pk)
except ZeroDivisionError: except ZeroDivisionError:
pass pass
print "LL @ %5d=\t%5.2f" % (round, LL) print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop # end main for-loop
self.q = q self.q = q
...@@ -312,7 +312,7 @@ class GibbsAlign(): ...@@ -312,7 +312,7 @@ class GibbsAlign():
LL += math.log(Qk / Pk) LL += math.log(Qk / Pk)
except ZeroDivisionError: except ZeroDivisionError:
pass pass
print "LL @ %5d=\t%5.2f" % (round, LL) print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop # end main for-loop
self.q = q self.q = q
......
...@@ -92,7 +92,7 @@ class GO(): ...@@ -92,7 +92,7 @@ class GO():
say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations: say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8)) go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
""" """
print "Started at", time.asctime() print(("Started at", time.asctime()))
# Get GO definitions # Get GO definitions
terms = readOBOFile(obofile) terms = readOBOFile(obofile)
for term in terms: for term in terms:
...@@ -108,7 +108,7 @@ class GO(): ...@@ -108,7 +108,7 @@ class GO():
cset.add((term, prel)) cset.add((term, prel))
except KeyError: except KeyError:
pass pass
print "Read %d GO definitions" % len(terms) print(("Read %d GO definitions" % len(terms)))
# open annotation file to analyse and index data # open annotation file to analyse and index data
src = open(annotFile, 'r') src = open(annotFile, 'r')
gene_cnt = 0 gene_cnt = 0
...@@ -126,7 +126,7 @@ class GO(): ...@@ -126,7 +126,7 @@ class GO():
terms_map = {term: (evid, qual != 'NOT')} terms_map = {term: (evid, qual != 'NOT')}
self.annots[gene] = (taxa, terms_map) self.annots[gene] = (taxa, terms_map)
src.close() src.close()
print "Read annotations for %d genes" % gene_cnt print(("Read annotations for %d genes" % gene_cnt))
def _makeIntoList(self, id_or_ids): def _makeIntoList(self, id_or_ids):
if type(id_or_ids) != list and type(id_or_ids) != set and type(id_or_ids) != tuple: if type(id_or_ids) != list and type(id_or_ids) != set and type(id_or_ids) != tuple:
...@@ -283,7 +283,7 @@ class GO(): ...@@ -283,7 +283,7 @@ class GO():
def getAllAnnots(self): def getAllAnnots(self):
""" Retrieve all annotated gene products """ """ Retrieve all annotated gene products """
return self.annots.keys() return list(self.annots.keys())
def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False): def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
""" Retrieve all genes and terms that are annotated but not in a list of positives (gene products). """ Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
...@@ -328,12 +328,12 @@ class GO(): ...@@ -328,12 +328,12 @@ class GO():
cnt = fg_list.count(t) cnt = fg_list.count(t)
if cnt >= threshold: if cnt >= threshold:
term_cnt[t] = cnt term_cnt[t] = cnt
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
ret = [] ret = []
for t in sorted_cnt: for t in sorted_cnt:
defin = self.getTermdef(t[0]) defin = self.getTermdef(t[0])
if defin == None: if defin == None:
print 'Could not find definition of %s' % t[0] print(('Could not find definition of %s' % t[0]))
else: else:
ret.append((t[0], t[1], defin[2], defin[0])) ret.append((t[0], t[1], defin[2], defin[0]))
return ret return ret
...@@ -360,7 +360,7 @@ class GO(): ...@@ -360,7 +360,7 @@ class GO():
# Process background: find terms of genes # Process background: find terms of genes
bg_list = [] bg_list = []
if background == None: # need to use the full set if background == None: # need to use the full set
background = self.annots.keys() background = list(self.annots.keys())
negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
nNeg = len(negatives) nNeg = len(negatives)
bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general) bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
...@@ -383,12 +383,12 @@ class GO(): ...@@ -383,12 +383,12 @@ class GO():
evalue = pvalue * len(term_set) # Bonferroni correction evalue = pvalue * len(term_set) # Bonferroni correction
if evalue <= threshold: # check if significance req is fulfilled if evalue <= threshold: # check if significance req is fulfilled
term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue) term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = [] ret = []
for t in sorted_cnt: for t in sorted_cnt:
defin = self.getTermdef(t[0]) defin = self.getTermdef(t[0])
if defin == None: if defin == None:
print 'Could not find definition of %s' % t[0] print(('Could not find definition of %s' % t[0]))
else: else:
ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0])) ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
return ret return ret
...@@ -448,17 +448,17 @@ class BinGO(): ...@@ -448,17 +448,17 @@ class BinGO():
found = set() found = set()
try: try:
(_, closure, _) = self.termdefs[term] (_, closure, _) = self.termdefs[term]
for (t, r) in closure.items(): for (t, r) in list(closure.items()):
if (not rel) or r == rel: if (not rel) or r == rel:
found.add(t) found.add(t)
found.update(self._getSuperTerms(t, rel)) found.update(self._getSuperTerms(t, rel))
except KeyError: except KeyError:
print 'Could not find GO:%s' % (''.join(decode(term, self.term_code))) print(('Could not find GO:%s' % (''.join(decode(term, self.term_code)))))
return found return found
def _getChildTerms(self, term, rel = None): def _getChildTerms(self, term, rel = None):
found = set() found = set()
for (child, termdef) in self.termdefs.items(): for (child, termdef) in list(self.termdefs.items()):
(_, parents_dict, _) = termdef (_, parents_dict, _) = termdef
try: try:
myrel = parents_dict[term] myrel = parents_dict[term]
...@@ -491,7 +491,7 @@ class BinGO(): ...@@ -491,7 +491,7 @@ class BinGO():
mymap[gene_name] = set() mymap[gene_name] = set()
try: try:
(taxa, terms) = self._getGeneEntry(i) (taxa, terms) = self._getGeneEntry(i)
for (term, evid_and_qual) in terms.items(): for (term, evid_and_qual) in list(terms.items()):
if evid_and_qual[1] and not evid: # if True and no evidence is specified if evid_and_qual[1] and not evid: # if True and no evidence is specified
direct.add(term) direct.add(term)
mymap[gene_name].add(term) mymap[gene_name].add(term)
...@@ -505,7 +505,7 @@ class BinGO(): ...@@ -505,7 +505,7 @@ class BinGO():
# STEP 2: Find the transitive closure of each term identified, store as a dictionary # STEP 2: Find the transitive closure of each term identified, store as a dictionary
indirect = {} indirect = {}
for t in direct: for t in direct:
if not indirect.has_key(t): if not t in indirect:
indirect[t] = set(self._getSuperTerms(t)) indirect[t] = set(self._getSuperTerms(t))
# STEP 3: compile and return results # STEP 3: compile and return results
for gene in mymap: for gene in mymap:
...@@ -521,7 +521,7 @@ class BinGO(): ...@@ -521,7 +521,7 @@ class BinGO():
def getAllGenes(self): def getAllGenes(self):
names = [] names = []
for g in self._decodeGeneIDs(self.annot_index.keys()): for g in self._decodeGeneIDs(list(self.annot_index.keys())):
names.append(''.join(g)) names.append(''.join(g))
return names return names
...@@ -545,7 +545,7 @@ class BinGO(): ...@@ -545,7 +545,7 @@ class BinGO():
gene_name = decode(g, self.gene_code) gene_name = decode(g, self.gene_code)
(mytaxa, tdict) = self._getGeneEntry(g) (mytaxa, tdict) = self._getGeneEntry(g)
if not taxa or taxa == mytaxa: if not taxa or taxa == mytaxa:
for annot_term in tdict.keys(): for annot_term in list(tdict.keys()):
if tdict[annot_term] == evid: if tdict[annot_term] == evid:
if annot_term in terms: if annot_term in terms:
try: try:
...@@ -642,10 +642,10 @@ class BinGO(): ...@@ -642,10 +642,10 @@ class BinGO():
(annot_offset, obo_offset) = unpack('II', buf) (annot_offset, obo_offset) = unpack('II', buf)
break break
except error as inst: except error as inst:
print "Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell() print(("Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()))
exit(3) exit(3)
print "Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt) print(("Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)))
print "Annotations start at", annot_offset, "\nDefinitions start at", obo_offset print(("Annotations start at", annot_offset, "\nDefinitions start at", obo_offset))
return f return f
#FIXME: write code to perform test of taxa enrichment #FIXME: write code to perform test of taxa enrichment
...@@ -655,7 +655,7 @@ class BinGO(): ...@@ -655,7 +655,7 @@ class BinGO():
Uses the Wilcoxon Ranksum test for each GO term to assign a p-value, Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
indicating the enrichment of term to "top" genes in descending order by score (by default). indicating the enrichment of term to "top" genes in descending order by score (by default).
""" """
fg_map = self.getTerms(gene_score_map.keys(), include_more_general = include_more_general) fg_map = self.getTerms(list(gene_score_map.keys()), include_more_general = include_more_general)
fg_list = [] fg_list = []
for id in fg_map: for id in fg_map:
for t in fg_map[id]: for t in fg_map[id]:
...@@ -663,7 +663,7 @@ class BinGO(): ...@@ -663,7 +663,7 @@ class BinGO():
term_set = set(fg_list) term_set = set(fg_list)
term_pval = {} term_pval = {}
if len(negatives_score_map) > 0: if len(negatives_score_map) > 0:
bg_map = self.getTerms(negatives_score_map.keys(), include_more_general = include_more_general) bg_map = self.getTerms(list(negatives_score_map.keys()), include_more_general = include_more_general)
for t in term_set: for t in term_set:
pos = [] pos = []
neg = [] neg = []
...@@ -698,13 +698,13 @@ class BinGO(): ...@@ -698,13 +698,13 @@ class BinGO():
else: else:
term_pval[t] = (p, 1.0) term_pval[t] = (p, 1.0)
sorted_pval = sorted(term_pval.items(), key=lambda v: v[1][0], reverse=False) sorted_pval = sorted(list(term_pval.items()), key=lambda v: v[1][0], reverse=False)
ret = [] ret = []
for t in sorted_pval: for t in sorted_pval:
defin = self.getTermdef(t[0]) defin = self.getTermdef(t[0])
if defin == None: if defin == None:
print 'Could not find definition of %s' % t[0] print(('Could not find definition of %s' % t[0]))
else: else:
ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0])) ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
return ret return ret
...@@ -739,7 +739,7 @@ class BinGO(): ...@@ -739,7 +739,7 @@ class BinGO():
if background == None: if background == None:
for t in term_set: for t in term_set:
term_cnt[t] = fg_list.count(t) term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
else: # a background is provided else: # a background is provided
for t in term_set: for t in term_set:
fg_hit = fg_list.count(t) fg_hit = fg_list.count(t)
...@@ -747,13 +747,13 @@ class BinGO(): ...@@ -747,13 +747,13 @@ class BinGO():
fg_nohit = nPos - fg_hit fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False)) term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False) sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = [] ret = []
for t in sorted_cnt: for t in sorted_cnt:
defin = self.getTermdef(t[0]) defin = self.getTermdef(t[0])
if defin == None: if defin == None:
print 'Could not find definition of %s' % t[0] print(('Could not find definition of %s' % t[0]))
else: else:
if background != None: if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0])) ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
...@@ -773,7 +773,7 @@ def encode(code_me, encode_strings): ...@@ -773,7 +773,7 @@ def encode(code_me, encode_strings):
accum *= codelen accum *= codelen
break break
except IndexError as e: except IndexError as e:
print e, code_me print((e, code_me))
return code return code
def decode(code, encode_strings): def decode(code, encode_strings):
...@@ -787,7 +787,7 @@ def decode(code, encode_strings): ...@@ -787,7 +787,7 @@ def decode(code, encode_strings):
code -= accum[pos] * indices[pos] code -= accum[pos] * indices[pos]
string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))] string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
except IndexError as e: except IndexError as e:
print e, code print((e, code))
return string return string
def _extractAnnotFields(line, columns = (1,2,3,4,6,8)): def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
...@@ -837,10 +837,10 @@ def _extractAnnotFields(line, columns = (1,2,3,4,6,8)): ...@@ -837,10 +837,10 @@ def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
term = None term = None
raise "No GO term on line: " + line raise "No GO term on line: " + line
evid = fields[columns[4]] evid = fields[columns[4]]
if not evid_codes.has_key(evid): if not evid in evid_codes:
evid = None evid = None
onto = fields[columns[5]] onto = fields[columns[5]]
if not onto_codes.has_key(onto): if not onto in onto_codes:
onto = None onto = None
taxa_idx = line.find('taxon:') taxa_idx = line.find('taxon:')
if taxa_idx == -1: if taxa_idx == -1:
...@@ -902,7 +902,7 @@ def readOBOFile(obofile): ...@@ -902,7 +902,7 @@ def readOBOFile(obofile):
return terms return terms
def writeBitFile(annotFile, obofile, destFile, taxas = None): def writeBitFile(annotFile, obofile, destFile, taxas = None):
print "Started at", time.asctime() print(("Started at", time.asctime()))
# open annotation file to analyse and index data # open annotation file to analyse and index data
src = open(annotFile, 'r') src = open(annotFile, 'r')
gene_index = [{} for _ in range(6)] # count different characters in different positions gene_index = [{} for _ in range(6)] # count different characters in different positions
...@@ -942,27 +942,27 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None): ...@@ -942,27 +942,27 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
pos += 1 pos += 1
prev_gene = gene prev_gene = gene
src.close() src.close()
print "Read annotations for %d genes" % gene_cnt print(("Read annotations for %d genes" % gene_cnt))
gene_code = ['' for _ in range(6)] gene_code = ['' for _ in range(6)]
term_code = ['' for _ in range(7)] term_code = ['' for _ in range(7)]
for d in range(len(gene_index)): for d in range(len(gene_index)):
arr = ['?' for _ in gene_index[d]] arr = ['?' for _ in gene_index[d]]
for (ch, index) in gene_index[d].items(): for (ch, index) in list(gene_index[d].items()):
arr[index] = ch arr[index] = ch
gene_code[d] = ''.join(arr) gene_code[d] = ''.join(arr)
for d in range(len(term_index)): for d in range(len(term_index)):
arr = ['?' for _ in term_index[d]] arr = ['?' for _ in term_index[d]]
for (ch, index) in term_index[d].iteritems(): for (ch, index) in term_index[d].items():
arr[index] = ch arr[index] = ch
term_code[d] = ''.join(arr) term_code[d] = ''.join(arr)
evid_code = ['' for _ in range(len(evid_index))] evid_code = ['' for _ in range(len(evid_index))]
for (e, ndx) in evid_index.items(): for (e, ndx) in list(evid_index.items()):
evid_code[ndx] = e evid_code[ndx] = e
# Get GO definitions # Get GO definitions
terms = readOBOFile(obofile) terms = readOBOFile(obofile)
print "Read %d GO definitions" % len(terms) print(("Read %d GO definitions" % len(terms)))
# re-open, now with the aim of copying info # re-open, now with the aim of copying info
src = open(annotFile, 'r') src = open(annotFile, 'r')
...@@ -975,7 +975,7 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None): ...@@ -975,7 +975,7 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
dst.write(code_str+"\n") dst.write(code_str+"\n")
for e_str in evid_code: for e_str in evid_code:
dst.write(e_str+'\n') dst.write(e_str+'\n')
print "Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell()) print(("Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())))
# STEP 2: write annotations # STEP 2: write annotations
annot_offset = dst.tell() annot_offset = dst.tell()
...@@ -1012,11 +1012,11 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None): ...@@ -1012,11 +1012,11 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
(o, q, e) = concat_terms[t] (o, q, e) = concat_terms[t]
s = pack('?BI', q, evid_index[e], encode(t, term_code)) s = pack('?BI', q, evid_index[e], encode(t, term_code))
dst.write(s) dst.write(s)
print "Wrote GO annotations, now at @%d" % dst.tell() print(("Wrote GO annotations, now at @%d" % dst.tell()))
# Next, the ontology definition... # Next, the ontology definition...
obo_offset = dst.tell() # remember the position where the OBO starts obo_offset = dst.tell() # remember the position where the OBO starts
sorted_terms = sorted(terms.iteritems(), key=operator.itemgetter(0)) sorted_terms = sorted(iter(terms.items()), key=operator.itemgetter(0))
for [t, _] in sorted_terms: for [t, _] in sorted_terms:
(term_name, term_onto, term_is) = terms[t] (term_name, term_onto, term_is) = terms[t]
s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is)) s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
...@@ -1029,12 +1029,10 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None): ...@@ -1029,12 +1029,10 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
s = pack('BI', index, encode(sup_term[3:], term_code)) s = pack('BI', index, encode(sup_term[3:], term_code))
dst.write(s) dst.write(s)
dst.write(term_name + '\n') dst.write(term_name + '\n')
print "Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell()) print(("Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())))
# Finally, write the offsets to quickly access annotations and definitions, resp # Finally, write the offsets to quickly access annotations and definitions, resp
dst.write(pack('II', annot_offset, obo_offset)) dst.write(pack('II', annot_offset, obo_offset))
# done, close # done, close
dst.close() dst.close()
print "Completed at", time.asctime() print(("Completed at", time.asctime()))
################################################### ###################################################
# This module is a supplement to the Python guide # # This module is a supplement to the Python guide #
# Version 2.2016.1 (8/3/2016) # # Version 1.20160 (19/12/2016) #
################################################### ###################################################
''' '''
This module contains code for that can help solving bioinformatics problems. This module contains code that can help solve bioinformatics problems.
See the accompanying Python guide for more explanations and examples. See the accompanying Python guide document for more explanations and examples.
Alphabet is a class that defines valid symbols that we then use to make up valid Alphabet is a class that defines valid symbols that we then use to make up valid
biological sequences. Note that we also define variables corresponding to biological sequences. Note that we also define variables corresponding to
DNA, RNA and Protein sequences that can be used directly. DNA, RNA and Protein sequences that can be used directly.
Sequence defines basic parts and operations on biological sequences. Sequence is a class that defines basic parts and operations on biological sequences.
Alignment defines an alignment of sequences (how symbols in different sequences line Alignment is a class that defines an alignment of sequences (how symbols in different sequences line
up when placed on-top). Alignment methods should generate instances of this class. up when placed on-top of one another). Alignment methods should generate instances of this class.
SubstMatrix defines a substitution matrix, i.e. a scoring system for performing SubstMatrix is a class that defines a substitution matrix, i.e. a scoring system for performing
alignments. You can read these from files or construct them manually. alignments. You can read these from files or construct them manually.
GeneProfile defines parts and operations for gene expression profiles. Essentially, GeneProfile is a class that defines parts and operations for gene expression profiles. Essentially,
the class will help to index expression data by gene name (rows) and by sample name (columns). the class will help to index expression data by gene name (rows) and by sample name (columns).
There are several methods not tied to a particular class because they construct new instances, There are several methods not tied to a particular class because they construct new instances,
e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc. e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc.
You need to have numpy installed (see http://www.numpy.org/). You need to have numpy installed (see http://www.numpy.org/).
Should work with Python v2.6-2.7 (see http://www.python.org/). Should work with Python v2.6-3.5 (see http://www.python.org/).
Has not been written to work with Python v3 and later--but this should be easy to do.
The code may contain bugs--please report to m.boden@uq.edu.au The code may contain bugs--please report to m.boden@uq.edu.au
''' '''
import math, numpy, urllib, urllib2 import math, numpy, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse
###############################################################################
# Alphabet #
###############################################################################
class Alphabet(): class Alphabet():
""" A minimal class for alphabets """ """ A minimal class for alphabets
Alphabets include DNA, RNA and Protein """
def __init__(self, symbolString): def __init__(self, symbolString):
self.symbols = symbolString self.symbols = symbolString
def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ')" results in 3 def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ'))" results in 3
return len(self.symbols) return len(self.symbols) # will tell you the length of the symbols in an Alphabet instance
def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
return sym in self.symbols return sym in self.symbols # will tell you if 'A' is in the symbols in an Alphabet instance
def __iter__(self): # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print sym" prints A, C, G and T on separate lines def __iter__(self): # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print sym" prints A, C, G and T on separate lines
tsyms = tuple(self.symbols) tsyms = tuple(self.symbols)
return tsyms.__iter__() return tsyms.__iter__()
def __getitem__(self, ndx): """ Below we declare alphabet variables that are going to be available when
""" Retrieve the symbol(s) at the specified index (or slice of indices) """ this module (this .py file) is imported """
return self.symbols[ndx]
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
return self.symbols.index(sym)
def __str__(self):
return self.symbols
""" Below we declare alphabet variables that are going to be available when
this module is imported """
DNA_Alphabet = Alphabet('ACGT') DNA_Alphabet = Alphabet('ACGT')
RNA_Alphabet = Alphabet('ACGU') RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY') Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX') Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
###############################################################################
# Sequence #
###############################################################################
class Sequence(): class Sequence():
""" A biological sequence class. Stores the sequence itself, """ A biological sequence class. Stores the sequence itself,
the alphabet and a name. the alphabet and a name.
Usage: Usage:
Create an instance of Sequence - have to pass in sequence, alphabet and name - gappy is an optional argument
>>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC') >>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC')
>>> print seq1 >>> print(seq1)
ABC: ACGGGAGAGG ABC: ACGGGAGAGG
>>> 'C' in seq1 >>> 'C' in seq1
True True
>>> for sym in seq1: >>> for sym in seq1:
... print sym ... print(sym)
""" """
def __init__(self, sequence, alphabet, name = '', gappy = False, annot = ''): def __init__(self, sequence, alphabet, name = '', gappy = False):
""" Construct a sequence from a string, an alphabet (gappy or not) and a name. """ Construct a sequence from a string, an alphabet (gappy or not) and a name.
The parameter gappy is for sequences when used in alignments, which means that '-' is allowed. """ The parameter gappy is for sequences when used in alignments. """
for sym in sequence: for sym in sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: ' + sym) raise RuntimeError('Invalid symbol: ' + sym)
self.sequence = sequence self.sequence = sequence # Store sequence
self.alphabet = alphabet self.alphabet = alphabet # Store alphabet
self.name = name self.name = name # Store name
self.gappy = gappy self.gappy = gappy
self.annot = annot # some annotation, e.g. species
def __len__(self): # the "len" operator def __len__(self): # the "len" operator
return len(self.sequence) return len(self.sequence)
def __iter__(self): # method that allows us to iterate over a sequence def __iter__(self): # method that allows us to iterate over a sequence
...@@ -104,7 +87,7 @@ class Sequence(): ...@@ -104,7 +87,7 @@ class Sequence():
return self.sequence[ndx] return self.sequence[ndx]
def writeFasta(self): def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """ """ Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.annot + '\n' fasta = '>' + self.name + '\n'
data = self.sequence data = self.sequence
nlines = (len(self.sequence) - 1) / 60 + 1 nlines = (len(self.sequence) - 1) / 60 + 1
for i in range(nlines): for i in range(nlines):
...@@ -126,244 +109,96 @@ class Sequence(): ...@@ -126,244 +109,96 @@ class Sequence():
def find(self, findme): def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """ """ Find the position of the specified symbol or sub-sequence """
return self.sequence.find(findme) return self.sequence.find(findme)
# Challenge - who can tell me what find() does in under 30 seconds?
# Demo this and teach them how to read pydocs.
# Can we use it if we want to find multiple occurrences of a sequence?
# What can we use instead?
def findAll(self, findme):
    """ Find all occurrences of the specified symbol or sub-sequence.
        Uses a sliding window approach - very common when searching for motifs
        or contamination: a window of len(findme) symbols is moved over the
        sequence one position at a time, every window is printed, and the
        windows equal to findme are flagged.
        Then, calculate the fraction of the sequence made up of 'findme'
        (number of matches * len(findme) / len(sequence)). Overlapping matches
        are each counted in full, so the value can exceed 1.0.
        If the fraction is above 0.20 (i.e. 20%), replace every occurrence of
        'findme' with a single 'N'.
        Returns a tuple (positions, fraction, newSeq) where positions is the
        list of 0-based match start indices and newSeq is the masked sequence
        string, or the unchanged sequence string when the threshold is not
        exceeded. """
    word = len(findme)
    seqlen = len(self.sequence)
    store = []
    for s in range(seqlen):
        cur = self.sequence[s:s+word]
        if cur == findme:
            store.append(s)
            print(cur, "Match at position ", s)
        else:
            print(cur)
    print("---------------")
    count = len(store)
    # An empty sequence cannot contain anything; avoid dividing by zero.
    percentage = float(count*word) / seqlen if seqlen > 0 else 0.0
    print("Percentage of findme in sequence: ", percentage)
    # Default to the untouched sequence so the return value is always defined.
    newSeq = self.sequence
    # 'percentage' is a fraction, so 20% corresponds to 0.20 (not 20).
    if percentage > 0.20:
        print("Removing contamination")
        newSeq = self.sequence.replace(findme, "N")
    return store, percentage, newSeq
###############################################################################
# Alignment #
###############################################################################
class Alignment(): class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where """ A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'. The number of columns in the alignment is given by alignlen. one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
Example usage: Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)] >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs) >>> print(Alignment(seqs))
THIS-LI-NE- THIS-LI-NE-
--ISALIGNED """ --ISALIGNED """
def __init__(self, seqs): def __init__(self, seqs):
self.alphabet = None
self.alignlen = -1 self.alignlen = -1
self.seqs = seqs self.seqs = seqs
self.namelen = 0
for s in seqs: for s in seqs:
if self.alphabet == None:
self.alphabet = s.alphabet
elif self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: contains a mix of alphabets")
if self.alignlen == -1: if self.alignlen == -1:
self.alignlen = len(s) self.alignlen = len(s)
elif self.alignlen != len(s): elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: lengths vary") raise RuntimeError("Alignment invalid")
self.namelen = max(len(s.name), self.namelen) def __str__(self):
def __str__(self): string = u''
string = '' namelen = 0
for seq in self.seqs:
namelen = max(len(seq.name), namelen)
for seq in self.seqs: for seq in self.seqs:
string += seq.name.ljust(self.namelen+1) string += seq.name.ljust(namelen+1)
for sym in seq: for sym in seq:
string += sym string += sym
string += '\n' string += '\n'
return string return string
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment: the number of sequences. """
return len(self.seqs)
def __getitem__(self, ndx):
return self.seqs[ndx]
def calcDistances(self, measure, a=1.0):
    """ Calculate the evolutionary distance between all pairs of sequences
        in this alignment, using the given measure. Measure can be one of
        'fractional', 'poisson', 'gamma', 'jc' or 'k2p' (case-insensitive).
        If 'gamma' or 'k2p' is given, then the parameter a must also be
        specified (or else it will use the default value of 1.0).
        Definitions of each distance metric are found in Zvelebil and Baum p268-276.
        These are mostly intended for DNA, but adapted for protein (as below).
        Note however that there are alternative distance matrices for proteins (p276).
        Only 'fractional' is currently computed; the other recognised measures
        raise RuntimeError('Not implemented: ...').
        Returns a symmetric numpy array (number of sequences squared) with
        zeros on the diagonal.
        Raises RuntimeError for an unknown measure, for a measure that is not
        implemented, or when a pair of sequences has no column in which both
        are ungapped (the fractional distance is undefined in that case). """
    measure = measure.lower()
    if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
        raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
    a = float(a)  # validated here; not used by the 'fractional' measure
    nseqs = len(self.seqs)
    distmat = numpy.zeros((nseqs, nseqs))
    # Loop through each pair of sequences
    for i in range(nseqs):
        for j in range(i + 1, nseqs):
            seqA = self.seqs[i]
            seqB = self.seqs[j]
            # Calculate the fractional distance (p) first
            L = 0   # number of columns where neither sequence has a gap
            D = 0   # number of those columns where the symbols differ
            for k in range(self.alignlen):
                if seqA[k] != '-' and seqB[k] != '-':
                    L += 1
                    if seqA[k] != seqB[k]:
                        D += 1
            if L == 0:
                # No comparable sites: report it instead of dividing by zero
                raise RuntimeError('Cannot compute distance: sequences %d and %d share no ungapped columns' % (i, j))
            p = float(D)/L
            # Now calculate the specified measure based on p
            if measure == 'fractional':
                dist = p
            else:
                raise RuntimeError('Not implemented: %s' % measure)
            distmat[i, j] = distmat[j, i] = dist
    return distmat
def writeClustal(self):
    """ Write the alignment to a string using the Clustal file format.
        The alignment is emitted in blocks of up to 60 columns. Each line of
        a block holds a sequence name, left-justified to the longest name
        plus one and followed by a space, then that sequence's slice of the
        alignment. Every full 60-column block is followed by an empty line;
        a final shorter block (if any) is not.
        Returns the formatted string (without the 'CLUSTAL' header line). """
    symbolsPerLine = 60
    maxNameLength = self.namelen + 1
    parts = []
    # Floor division keeps the block count an integer on Python 2 and 3.
    wholeRows = self.alignlen // symbolsPerLine
    for i in range(wholeRows):
        for seq in self.seqs:
            parts.append(seq.name.ljust(maxNameLength) + ' ')
            parts.append(seq[i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n')
        parts.append('\n')
    # Possible last row
    lastRowLength = self.alignlen - wholeRows*symbolsPerLine
    if lastRowLength > 0:
        for seq in self.seqs:
            parts.append(seq.name.ljust(maxNameLength) + ' ')
            parts.append(seq[-lastRowLength:] + '\n')
    return ''.join(parts)
def writeHTML(self, filename):
    """ Generate HTML that displays the alignment in colour.
        Writes to the file named filename: a small HTML page whose <pre>
        section holds a column ruler followed by one line per sequence
        (name left-justified to the longest name, then every symbol wrapped
        in a <font> tag with a background colour).
        The ruler's first line marks every tenth column with the last digit
        of its tens count and ends with the alignment length; a second line
        of '0' marks is added when the alignment is longer than 10 columns.
        Alphabets of at most 5 symbols use the DNA/RNA colour table, all
        others the amino acid table; symbols without an entry are white. """
    indent = ''.ljust(self.namelen) + ' '
    columns = range(self.alignlen - 1)
    if len(self.alphabet) <= 5: # DNA or RNA
        colours = {'A':'green','C':'orange','G':'red','T':'#66bbff','U':'#66bbff'}
    else: # amino acids
        colours = {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'}
    # The with-block closes the file even if writing a row fails.
    with open(filename, 'w') as fh:
        fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
        # Tens ruler: floor division so column 10 is labelled '1' on Python 3 too.
        tens = ''.join(str(i//10+1)[-1] if (i+1) % 10 == 0 else ' ' for i in columns)
        fh.write(indent + tens + '%s\n' % (self.alignlen))
        if self.alignlen > 10:
            units = ''.join('0' if (i+1) % 10 == 0 else ' ' for i in columns)
            fh.write(indent + units + '\n')
        for seq in self.seqs:
            cells = ['<font style="BACKGROUND-COLOR: %s">%s</font>' % (colours.get(sym, 'white'), sym) for sym in seq]
            fh.write(seq.name.ljust(self.namelen) + ' ' + ''.join(cells) + '\n')
        fh.write('</pre></body></html>\n')
def scoreAlignment(self, substmat = None, gap = -1):
    """Score the alignment using a substitution matrix (substmat).
    Every column is scored for every pair of sequences: with a substitution
    matrix, a pair in which either symbol is a gap ('-') scores gap and any
    other pair scores substmat.get(symbolA, symbolB).
    If the alignment consists of more than two sequences, the minimum
    score of each column is used.
    If substmat is not specified (None), the count of matches is returned
    (a column counts when all of its symbols are identical).
    An alignment with fewer than two sequences has no pairs and scores 0.
    """
    nseqs = len(self.seqs)
    total = 0
    if nseqs < 2:
        # No pairs to compare; avoids adding an undefined column minimum.
        return total
    for pos in range(self.alignlen):
        lowest = None   # minimum pair score seen in this column
        for i in range(nseqs):
            symA = self.seqs[i][pos]
            for j in range(i+1, nseqs):
                symB = self.seqs[j][pos]
                if substmat is None:
                    score = 1 if symA == symB else 0
                elif symA == '-' or symB == '-':
                    score = gap
                else: # we have a substitution matrix and no gap
                    score = substmat.get(symA, symB)
                if lowest is None or lowest > score:
                    lowest = score
        total += lowest
    return total
############################################################################### def scoreAlignment(aln, substmat = None, gap = -1):
# Methods to create instances of Alignment # """Score an alignment (aln) using a substitution matrix (substmat).
############################################################################### If the alignment consists of more than two sequences, the minimum
score of each column is used.
def align(seqA, seqB, substMatrix, gap = -1): If substmat is not specified (None), the count of matches is returned.
""" Align seqA with seqB using the Needleman-Wunsch """
(global) algorithm. substMatrix is the substitution matrix to use and nseqs = len(aln.seqs)
gap is the linear gap penalty to use. """ total = 0
stringA, stringB = seqA.sequence, seqB.sequence for pos in range(aln.alignlen):
lenA, lenB = len(seqA), len(seqB) min = None
# Create the scoring matrix (S) and a matrix for traceback for i in range(nseqs):
S = numpy.zeros((lenA + 1, lenB + 1)) for j in range(i+1, nseqs):
Traceback = numpy.zeros((lenA + 1, lenB + 1)) gap_here = aln.seqs[i][pos] == '-' or aln.seqs[j][pos] == '-'
# Fill the first row and column of S with multiples of the gap penalty score = 0
for i in range(lenA + 1): if substmat == None:
S[i, 0] = i * gap if aln.seqs[i][pos] == aln.seqs[j][pos]:
for j in range(lenB + 1): score = 1
S[0, j] = j * gap else: # we have a substitution matrix
# Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback if gap_here:
for i in range(1, lenA + 1): score = gap
for j in range(1, lenB + 1): else:
match = S[i-1, j-1] + substMatrix.get(stringA[i-1], stringB[j-1]) score = substmat.get(aln.seqs[i][pos], aln.seqs[j][pos])
delete = S[i-1, j ] + gap if min == None:
insert = S[i , j-1] + gap min = score
Traceback[i, j] = numpy.argmax([match, delete, insert]) elif min > score:
S[i, j] = max([match, delete, insert]) min = score
# Trace back the optimal alignment total += min
alignA = '' return total
alignB = ''
# Start at the end
i = lenA
j = lenB
# Stop when we hit the end of a sequence
while i > 0 and j > 0:
if Traceback[i, j] == 1:
# Got here by a gap in sequence B (go up)
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif Traceback[i, j] == 2:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = stringB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = stringA[i-1] + alignA
alignB = stringB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., trace back all the way to S[0, 0])
while i > 0:
# Go up
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = stringB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
###############################################################################
# SubstMatrix #
###############################################################################
class SubstMatrix(): class SubstMatrix():
""" Create a substitution matrix for an alphabet. """ Create a substitution matrix for an alphabet.
Example usage: Example usage:
...@@ -375,11 +210,11 @@ class SubstMatrix(): ...@@ -375,11 +210,11 @@ class SubstMatrix():
... elif a == b: ... elif a == b:
... sm.set(a, b, +1) ... sm.set(a, b, +1)
... ...
>>> print sm >>> print(sm)
A 1 A 1
C -1 1 C -1 1
G -1 -1 1 G -1 -1 1
T -1 -1 -1 1 T -1 -1 -1 1
A C G T A C G T
>>> sm.get('C', 'T') >>> sm.get('C', 'T')
-1 -1
...@@ -401,7 +236,7 @@ class SubstMatrix(): ...@@ -401,7 +236,7 @@ class SubstMatrix():
def __str__(self): def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols) i = len(symbols)
string = '' string = u''
for a in symbols: for a in symbols:
string += a + ' ' string += a + ' '
for b in symbols[:len(symbols)-i+1]: for b in symbols[:len(symbols)-i+1]:
...@@ -417,17 +252,12 @@ class SubstMatrix(): ...@@ -417,17 +252,12 @@ class SubstMatrix():
def writeFile(self, filename): def writeFile(self, filename):
""" Write this substitution matrix to the given file. """ """ Write this substitution matrix to the given file. """
fh = open(filename, 'w') fh = open(filename, 'w')
file = '' file = u''
for key in self.scoremat: for key in self.scoremat:
file += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n' file += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
fh.write(file) fh.write(file)
fh.close() fh.close()
###############################################################################
# Below are some useful methods for loading data from strings and files. #
# They recognize the FASTA and Clustal formats (nothing fancy). #
###############################################################################
def readSubstMatrix(filename, alphabet): def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """ """ Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet) mat = SubstMatrix(alphabet)
...@@ -443,26 +273,25 @@ def readSubstMatrix(filename, alphabet): ...@@ -443,26 +273,25 @@ def readSubstMatrix(filename, alphabet):
mat.set(symbols[0], symbols[1], score) mat.set(symbols[0], symbols[1], score)
return mat return mat
def readFastaString(string, alphabet, gappy = False): """
Below are some useful methods for loading data from strings and files.
Recognize the FASTA and Clustal formats (nothing fancy).
"""
def readFastaString(string, alphabet):
""" Read the given string as FASTA formatted data and return the list of """ Read the given string as FASTA formatted data and return the list of
sequences contained within it. """ sequences contained within it. """
seqlist = [] # list of sequences contained in the string seqlist = [] # list of sequences contained in the string
seqname = '' # name of *current* sequence seqname = None # name of *current* sequence
seqannot = '' # annotation of *current* sequence
seqdata = [] # sequence data for *current* sequence seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines if len(line) == 0: # ignore empty lines
continue continue
if line[0] == '>': # start of new sequence if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current if seqname: # check if we've got one current
current = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot) current = Sequence(''.join(seqdata), alphabet, seqname)
seqlist.append(current) seqlist.append(current)
# now collect data about the new sequence # now collect data about the new sequence
parts = line[1:].split() # skip first char seqname = line[1:].split()[0] # skip first char
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
if len(parts) > 0: seqname = parts[0]
if len(parts) > 1: seqannot = line[len(seqname) + 2:] # the rest of the line
seqdata = [] seqdata = []
else: # we assume this is (more) data for current else: # we assume this is (more) data for current
cleanline = line.split() cleanline = line.split()
...@@ -470,26 +299,26 @@ def readFastaString(string, alphabet, gappy = False): ...@@ -470,26 +299,26 @@ def readFastaString(string, alphabet, gappy = False):
seqdata.extend(tuple(thisline.strip('*'))) seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains # we're done reading the file, but the last sequence remains
if seqname: if seqname:
lastseq = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot) lastseq = Sequence(''.join(seqdata), alphabet, seqname)
seqlist.append(lastseq) seqlist.append(lastseq)
return seqlist return seqlist
def readFastaFile(filename, alphabet, gappy = False): def readFastaFile(filename, alphabet):
""" Read the given FASTA formatted file and return the list of sequences """ Read the given FASTA formatted file and return the list of sequences
contained within it. """ contained within it. """
fh = open(filename) fh = open(filename)
data = fh.read() data = fh.read()
fh.close() fh.close()
seqlist = readFastaString(data, alphabet, gappy) seqlist = readFastaString(data, alphabet)
return seqlist return seqlist
def writeFastaFile(filename, seqs): def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """ """ Write the specified sequences to a FASTA file. """
fh = open(filename, 'w') fh = open(filename, 'w')
for seq in seqs: for seq in seqs:
fh.write(seq.writeFasta()) fh.write(str(seq))
fh.close() fh.close()
def readClustalString(string, alphabet): def readClustalString(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an """ Read a ClustalW2 alignment in the given string and return as an
Alignment object. """ Alignment object. """
...@@ -504,12 +333,12 @@ def readClustalString(string, alphabet): ...@@ -504,12 +333,12 @@ def readClustalString(string, alphabet):
continue continue
sections = line.split() sections = line.split()
name, seq = sections[0:2] name, seq = sections[0:2]
if seqs.has_key(name): if name in seqs:
seqs[name] += seq seqs[name] += seq
else: else:
seqs[name] = seq seqs[name] = seq
sequences = [] sequences = []
for name, seq in seqs.items(): for name, seq in list(seqs.items()):
sequences.append(Sequence(seq, alphabet, name, gappy = True)) sequences.append(Sequence(seq, alphabet, name, gappy = True))
return Alignment(sequences) return Alignment(sequences)
...@@ -522,17 +351,6 @@ def readClustalFile(filename, alphabet): ...@@ -522,17 +351,6 @@ def readClustalFile(filename, alphabet):
aln = readClustalString(data, alphabet) aln = readClustalString(data, alphabet)
return aln return aln
def writeClustalFile(filename, aln):
    """ Write the specified alignment to a Clustal file.
        filename: path of the file to create (or overwrite)
        aln: an object whose writeClustal() method returns the alignment
        rows as a string; that string is written after a header line. """
    # The with-block closes the file even if aln.writeClustal() raises.
    with open(filename, 'w') as fh:
        fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
        fh.write(aln.writeClustal())
###############################################################################
# GeneProfile #
###############################################################################
class GeneProfile(): class GeneProfile():
""" A class for gene expression data. """ A class for gene expression data.
Example usage: Example usage:
...@@ -557,7 +375,7 @@ class GeneProfile(): ...@@ -557,7 +375,7 @@ class GeneProfile():
def getSorted(self, index, descending=True): def getSorted(self, index, descending=True):
"""Get a list of (gene, value) tuples in descending order by value""" """Get a list of (gene, value) tuples in descending order by value"""
key_fn = lambda v: v[1][index] key_fn = lambda v: v[1][index]
return sorted(self.genes.items(), key=key_fn, reverse=descending) return sorted(list(self.genes.items()), key=key_fn, reverse=descending)
def addSample(self, sample_name, sample_dict): def addSample(self, sample_name, sample_dict):
"""Add a sample to the current data set. """Add a sample to the current data set.
sample_dict is a dictionary with the same keys as the current gene set. sample_dict is a dictionary with the same keys as the current gene set.
...@@ -579,7 +397,7 @@ class GeneProfile(): ...@@ -579,7 +397,7 @@ class GeneProfile():
if isinstance(sample_name, str): # a single sample-name if isinstance(sample_name, str): # a single sample-name
mysamples = [sample_name] mysamples = [sample_name]
else: # a list of sample-names else: # a list of sample-names
mysamples = sample_name mysamples = sample_name
for gene in self.genes: for gene in self.genes:
mygenes[gene] = [] mygenes[gene] = []
for name in mysamples: for name in mysamples:
...@@ -629,7 +447,7 @@ def readGeoFile(filename, id_column = 0): ...@@ -629,7 +447,7 @@ def readGeoFile(filename, id_column = 0):
manylines = fh.read() manylines = fh.read()
fh.close() fh.close()
data_rows = False # Indicates whether we're reading the data section or metadata data_rows = False # Indicates whether we're reading the data section or metadata
name = 'Unknown' name = u'Unknown'
cnt_data = 0 cnt_data = 0
for line in manylines.splitlines(): for line in manylines.splitlines():
if line.startswith('^DATASET'): if line.startswith('^DATASET'):
...@@ -662,28 +480,28 @@ def readGeoFile(filename, id_column = 0): ...@@ -662,28 +480,28 @@ def readGeoFile(filename, id_column = 0):
continue continue
if not ignore: if not ignore:
dataset[id] = tuple(values) dataset[id] = tuple(values)
print 'Data set %s contains %d genes' % (name, len(dataset.genes)) print('Data set %s contains %d genes' % (name, len(dataset.genes)))
return dataset return dataset
############################################################################### """
# Web service methods that find data in online databases. Web service methods that find data in online databases.
# Our implementations are mainly serviced by EBI. Our implementations are mainly serviced by EBI.
############################################################################### """
def getSequence(entryId, dbName, alphabet): def getSequence(entryId, dbName, alphabet):
""" Retrieve a single entry from a database """ Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' (Uniprot Accession) or 'SUMO1_MOUSE' (Uniprot Identifier) entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'. dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'.
See: http://www.uniprot.org/faq/28. """ See: http://www.uniprot.org/faq/28. """
url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' +\ if not isinstance(entryId, str):
dbName + '&format=fasta&id=' + entryId entryId = entryId.decode("utf-8")
url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=fasta&id=' + entryId
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
return readFastaString(data, alphabet)[0] return readFastaString(data.decode("utf-8"), alphabet)[0]
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def searchSequences(query, dbName = 'uniprot'): def searchSequences(query, dbName):
""" """
Retrieve multiple entries matching query from a database currently only via UniProtKB Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen' query: search term(s) e.g. 'organism:9606+AND+antigen'
...@@ -696,42 +514,40 @@ def searchSequences(query, dbName = 'uniprot'): ...@@ -696,42 +514,40 @@ def searchSequences(query, dbName = 'uniprot'):
url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query
# Get the entries # Get the entries
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
return data.splitlines() return data.splitlines()
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
elif dbName.startswith('refseq'): elif dbName.startswith('refseq'):
dbs = dbName.split(":") dbs = dbName.split(":")
if len(dbs) > 1: if len(dbs) > 1:
dbName = dbs[1] dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/' base = u'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
# Get the entries # Get the entries
try: try:
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
words = data.split("</Id>") words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]] words = [w[w.find("<Id>")+4:] for w in words[:-1]]
return words return words
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
return return
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'): def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
""" """
Map identifiers between databases (based on UniProtKB; Map identifiers between databases (based on UniProtKB;
see http://www.uniprot.org/faq/28) see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings) identifiers: a list of identifiers (list of strings)
frm: the abbreviation for the identifier FROM which to idmap frm: the abbreviation for the identifier FROM which to idmap
to: the abbreviation for the identifier TO which to idmap to: the abbreviation for the identifier TO which to idmap
Returns a dictionary with key (from) -> value (to). Returns a dictionary with key (from) -> value (to) """
ACC is Uniprot Accession (e.g. 'P42813'). url = u'http://www.uniprot.org/mapping/'
"""
url = 'http://www.uniprot.org/mapping/'
# construct query by concatenating the list of identifiers # construct query by concatenating the list of identifiers
if isinstance(identifiers, str): if isinstance(identifiers, str):
query = identifiers.strip() query = identifiers.strip()
else: # assume it is a list of strings else: # assume it is a list of strings
query = '' query = u''
for id in identifiers: for id in identifiers:
query = query + id.strip() + ' ' query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces query = query.strip() # remove trailing spaces
...@@ -742,8 +558,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'): ...@@ -742,8 +558,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
'query' : query 'query' : query
} }
if len(query) > 0: if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params)) request = urllib.request.Request(url, urllib.parse.urlencode(params))
response = urllib2.urlopen(request).read() response = urllib.request.urlopen(request).read()
d = dict() d = dict()
for row in response.splitlines()[1:]: for row in response.splitlines()[1:]:
pair = row.split('\t') pair = row.split('\t')
...@@ -752,43 +568,42 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'): ...@@ -752,43 +568,42 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
else: else:
return dict() return dict()
############################################################################### """
# Gene Ontology services. Gene Ontology services.
# See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
############################################################################### """
def getGODef(goterm): def getGODef(goterm):
""" """
Retrieve information about a GO term Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080' goterm: the identifier, e.g. 'GO:0002080'
""" """
# Construct URL # Construct URL
url = 'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm url = u'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
entry={'id': None, 'name': None, 'def': None} entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
for row in data.splitlines(): for row in data.splitlines():
index = row.find(':') index = row.find(':')
if index > 0 and len(row[index:]) > 1: if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip() field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces value = row[index+1:].strip(' "') # remove spaces
if field in entry.keys(): # check if we need field if field in list(entry.keys()): # check if we need field
if entry[field] == None: # check if assigned if entry[field] == None: # check if assigned
entry[field] = value entry[field] = value
return entry return entry
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def getGOTerms(genes, db='UniProtKB'): def getGOTerms(genes, db='UniProtKB'):
""" """
Retrieve all GO terms for a given set of genes (or single gene). Retrieve all GO terms for a given set of genes (or single gene).
db: use specified database, e.g. 'UniProtKB', 'UniGene', db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'. or 'Ensembl'.
The result is given as a map (key=gene name, value=list of unique The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms. terms) OR in the case of a single gene as a list of unique terms.
""" """
if type(genes) != list and type(genes) != set and type(genes) != tuple: if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes] # if 'genes' is a single gene, we make a single item list genes = [genes] # if 'genes' is a single gene, we make a single item list
map = dict() map = dict()
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein=' uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein='
...@@ -796,13 +611,13 @@ def getGOTerms(genes, db='UniProtKB'): ...@@ -796,13 +611,13 @@ def getGOTerms(genes, db='UniProtKB'):
terms = set() # empty result set terms = set() # empty result set
url = uri + gene.strip() # Construct URL url = uri + gene.strip() # Construct URL
try: # Get the entry: fill in the fields specified below try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore header row for row in data.splitlines()[1:]: # we ignore header row
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
terms.add(values[6]) # add term to result set terms.add(values[6]) # add term to result set
map[gene] = list(terms) # make a list of the set map[gene] = list(terms) # make a list of the set
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if len(genes) == 1: if len(genes) == 1:
return map[genes[0]] return map[genes[0]]
...@@ -811,12 +626,12 @@ def getGOTerms(genes, db='UniProtKB'): ...@@ -811,12 +626,12 @@ def getGOTerms(genes, db='UniProtKB'):
def getGenes(goterms, db='UniProtKB', taxo=None): def getGenes(goterms, db='UniProtKB', taxo=None):
""" """
Retrieve all genes/proteins for a given set of GO terms Retrieve all genes/proteins for a given set of GO terms
(or single GO term). (or single GO term).
db: use specified database, e.g. 'UniProtKB', 'UniGene', db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl' or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human) taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=gene name, value=list of unique The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms. terms) OR in the case of a single gene as a list of unique terms.
""" """
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple: if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
...@@ -831,502 +646,15 @@ def getGenes(goterms, db='UniProtKB', taxo=None): ...@@ -831,502 +646,15 @@ def getGenes(goterms, db='UniProtKB', taxo=None):
genes = set() # start with empty result set genes = set() # start with empty result set
url = uri + goterm.strip() # Construct URL url = uri + goterm.strip() # Construct URL
try: # Get the entry: fill in the fields specified below try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read() data = urllib.request.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
genes.add(values[1]) # add gene name to result set genes.add(values[1]) # add gene name to result set
map[goterm] = list(genes) map[goterm] = list(genes)
except urllib2.HTTPError, ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if len(goterms) == 1: if len(goterms) == 1:
return map[goterms[0]] return map[goterms[0]]
else: else:
return map return map
###############################################################################
# PhyloTree #
###############################################################################
class PhyloTree:
    """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
        Functionality includes labelling and traversing nodes; reading and writing to Newick format;
        association with sequence alignment; maximum parsimony inference of ancestral sequence;
        generation of single, bifurcating rooted tree by UPGMA.
        Known issues: Binary only; Parsimony does not handle gaps in alignment.
        Programmers should note that almost all functionality is implemented through recursion. """

    def __init__(self, root):
        """ Create a tree from a node that is "root" in the tree."""
        self.root = root
        self.aln = None  # no alignment until putAlignment is called

    def putAlignment(self, aln):
        """ Associate the tree with a set of sequences/alignment.
            Involves assigning the sequence to the leaf nodes. """
        self.aln = aln
        self.root._assignAlignment(aln)

    def __str__(self):
        """ Produce a printable representation of the tree, specifically the root of the tree. """
        return str(self.root)

    def strSequences(self, start = None, end = None):
        """ Produce a sequence representation of the tree, specifically the root of the tree.
            Specify the start and end positions in the alignment for the sequence to be printed
            (if None the min and max positions will be used).
            Returns None if no alignment has been associated with the tree. """
        if self.aln is None:
            return None
        my_start = start or 0
        my_end = end or self.aln.alignlen
        return self.root._printSequences(my_start, my_end)

    def findLabel(self, label):
        """ Retrieve/return the node with the specified label.
            Returns None if not found."""
        return self.root._findLabel(label)

    def getDescendantsOf(self, node, transitive = False):
        """ Retrieve and return the (list of) descendants (children) of a specified node.
            Node can be the label or the instance.
            transitive indicates if only the direct descendants (False) or if all descendants
            should be returned.
            If node does not exist, None is returned.
            If node has no descendants, an empty list will be returned."""
        if not isinstance(node, PhyloNode):
            node = self.findLabel(node)  # resolve a label to its node
        if node is not None:
            return node.getDescendants(transitive)
        return None

    def getAncestorsOf(self, node, transitive = False):
        """ Retrieve and return the ancestor (transitive=False) or
            ancestors (transitive=True) of a specified node.
            Node can be the label or the instance.
            If node does not exist, None is returned.
            If node is the root of the tree, None is returned."""
        if not isinstance(node, PhyloNode):
            node = self.findLabel(node)  # resolve a label to its node
        if node is None:
            return None
        myroot = self.root
        found = False
        branching = []  # nodes passed on the way down from the root, in order
        while not found and myroot is not None:
            branching.append(myroot)
            if myroot.left == node or myroot.right == node:
                found = True
                break
            if myroot.left:
                if myroot.left.isAncestorOf(node, transitive = True):
                    myroot = myroot.left
                else: # must be right branch then...
                    myroot = myroot.right
            else: # must be right branch then...
                myroot = myroot.right
        if found and transitive:
            return branching
        elif found and len(branching) > 0:
            return branching[-1]  # the direct parent is the last node visited
        return None

    def parsimony(self):
        """ Solve the "small parsimony problem",
            i.e. find the sequences on each of the internal nodes.
            See Jones and Pevzner, p. 368 and onwards, for details. """
        self.root._forwardParsimony(self.aln)  # setup and compute scores for all nodes
        self.root._backwardParsimony(self.aln) # use scores to determine sequences
        return self.root.getSequence() # return the sequence found at the root
###############################################################################
# PhyloNode #
###############################################################################
class PhyloNode:
    """ A class for a node in a rooted, binary (bifurcating) tree.
        Contains pointers to descendants/daughters (left and right),
        optional fields include data, label, sequence and dist.
        If parsimony is used scores and traceback pointers are available.
        A number of methods are named with a _ prefix. These can be, but
        are not intended to be used from outside the class. """

    def __init__(self, label = ''):
        """ Initialise an initially unlinked node.
            Populate fields left and right to link it with other nodes.
            Set label to name it.
            Use field data for any type of information associated with node.
            Use dist to indicate the distance to its parent (if any).
            Other fields are used internally, including sequence for associated alignment,
            seqscores, backleft and backright for maximum parsimony. """
        self.left = None
        self.right = None
        self.data = None
        self.label = label
        self.dist = None
        self.sequence = None # The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
        self.seqscores = None # The scores propagated from leaves via children
        self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
        self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols

    def __str__(self):
        """ Returns string with node (incl descendants) in a Newick style. """
        left = right = label = dist = ''
        if self.left:
            left = str(self.left)
        if self.right:
            right = str(self.right)
        if self.dist or self.dist == 0.0:  # a zero distance is printed too
            dist = ':' + str(self.dist)
        if self.label is not None:
            label = str(self.label)
            if not self.left and not self.right:
                return label + dist
            return '(' + left + ',' + right + ')' + label + dist
        # there is no label
        if not self.left and self.right:
            return ',' + right
        if self.left and not self.right:
            return left + ','
        if self.left and self.right:
            return '(' + left + ',' + right + ')' + dist
        return dist  # unlabelled leaf: always hand back a string

    def _printSequences(self, start, end):
        """ Returns string with node (incl descendants) in a Newick style,
            using the sequence symbols in positions start..end (exclusive) in place of the label. """
        left = right = label = dist = ''
        if self.left:
            left = self.left._printSequences(start, end)
        if self.right:
            right = self.right._printSequences(start, end)
        if self.dist:  # note: unlike __str__, a zero distance is left out here
            dist = ':' + str(self.dist)
        if self.sequence is not None:
            label = "".join(self.sequence[start:end]) + ""
            if not self.left and not self.right:
                return label + dist
            return '(' + left + ',' + right + ')' + label + dist
        # there is no sequence to use as label
        if not self.left and self.right:
            return ',' + right
        if self.left and not self.right:
            return left + ','
        if self.left and self.right:
            return '(' + left + ',' + right + ')' + dist
        return dist  # leaf without sequence: always hand back a string

    def _findLabel(self, label):
        """ Find a node by label at this node or in any descendants (recursively).
            The left sub-tree is searched before the right. Returns None if not found. """
        if self.label == label:
            return self
        if self.left:
            foundLeft = self.left._findLabel(label)
            if foundLeft:
                return foundLeft
        if self.right:
            return self.right._findLabel(label)
        return None

    def _propagateDistance(self, parent_dist):
        """ Convert absolute distances to relative.
            The only parameter is the absolute distance to the parent of this node. """
        travelled = self.dist                   # absolute distance to this node
        self.dist = parent_dist - self.dist     # relative distance to this node
        if self.left is not None:               # if there is a child node...
            self.left._propagateDistance(travelled) # pass absolute distance to this node
        if self.right is not None:
            self.right._propagateDistance(travelled)

    def _assignAlignment(self, aln):
        """ Assign an alignment to the node, which implies assigning a sequence to it if one is
            available in the alignment (the sequence whose name equals the node label). """
        self.sequence = None
        if self.left is not None:
            self.left._assignAlignment(aln)
        if self.right is not None:
            self.right._assignAlignment(aln)
        for seq in aln.seqs:
            if seq.name == self.label:
                self.sequence = seq
                break

    def _forwardParsimony(self, aln):
        """ Internal function that operates recursively to first initialise each node (forward),
            stopping only once a sequence has been assigned to the node,
            then to propagate scores from sequence assigned nodes to root (backward).
            Returns the score table of this node (one row per column, one entry per symbol).
            Raises RuntimeError for a leaf with no sequence.
            The scores of both children are indexed unconditionally, i.e. internal nodes
            need both a left and a right child. """
        if self.sequence is None: # no sequence has been assigned
            if self.left is None and self.right is None:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
            scoresleft = scoresright = None
            if self.left is not None:
                scoresleft = self.left._forwardParsimony(aln)
            if self.right is not None:
                scoresright = self.right._forwardParsimony(aln)
            # for each position in the alignment,
            # introduce (initially zero) score for each symbol in alphabet
            self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
            # for each position in the alignment,
            # allocate a position to put the left child symbol from which each current node symbol score was determined
            self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
            # allocate a position to put the right child symbol from which each current node symbol score was determined
            self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
            for col in range(aln.alignlen):
                for a_parent in range(len(aln.alphabet)):
                    best_score_left = +9999999
                    best_score_right = +9999999
                    best_symb_left = 0
                    best_symb_right = 0
                    for a_left in range(len(aln.alphabet)):
                        score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
                        if score < best_score_left:  # strict: the first of equally good symbols wins
                            best_symb_left = a_left
                            best_score_left = score
                    for a_right in range(len(aln.alphabet)):
                        score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
                        if score < best_score_right:
                            best_symb_right = a_right
                            best_score_right = score
                    self.seqscores[col][a_parent] = best_score_left + best_score_right
                    self.backleft[col][a_parent] = best_symb_left
                    self.backright[col][a_parent] = best_symb_right
        else:
            # a known sequence: zero cost for the observed symbol, prohibitive cost for any other
            self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
        return self.seqscores

    def _backwardParsimony(self, aln, seq = None):
        """ Internal function that operates recursively to inspect scores to determine
            most parsimonious sequence, from root to leaves.
            seq is the sequence chosen for this node by its parent (None for the root).
            Returns the sequence of this node. """
        if self.sequence is None: # no sequence has been assigned
            leftbuf = []
            rightbuf = []
            if self.left is None and self.right is None:  # no children, so terminal, cannot propagate scores
                raise RuntimeError("No sequence assigned to leaf node:", self.label)
            if seq is None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
                currbuf = []
                for col in range(aln.alignlen):
                    min_score = 999999
                    min_symb = None
                    left_symb = None
                    right_symb = None
                    for a_parent in range(len(aln.alphabet)):
                        if self.seqscores[col][a_parent] < min_score:
                            min_score = self.seqscores[col][a_parent]
                            min_symb = a_parent
                            left_symb = self.backleft[col][a_parent]
                            right_symb = self.backright[col][a_parent]
                    currbuf.append(aln.alphabet[min_symb])
                    leftbuf.append(aln.alphabet[left_symb])
                    rightbuf.append(aln.alphabet[right_symb])
                self.sequence = Sequence(currbuf, aln.alphabet, self.label, gappy = True)
            else: # Non-root, but not leaf
                self.sequence = seq
                col = 0
                for sym_parent in self.sequence:
                    a_parent = aln.alphabet.index(sym_parent)
                    left_symb = self.backleft[col][a_parent]
                    right_symb = self.backright[col][a_parent]
                    leftbuf.append(aln.alphabet[left_symb])
                    rightbuf.append(aln.alphabet[right_symb])
                    col += 1
            self.left._backwardParsimony(aln, Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
            self.right._backwardParsimony(aln, Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
        return self.sequence

    def getSequence(self):
        """ Get the sequence for the node. Return None if no sequence is assigned.
            Requires that an alignment is associated with the tree, and that sequence names match node labels.
            If the explored node is not a leaf, the sequence can be determined by parsimony. """
        if self.sequence is not None: # a sequence has been assigned
            return self.sequence
        # scores may exist (self.seqscores) but deriving a sequence from them here is not yet implemented
        return None

    def isAncestorOf(self, node, transitive = True):
        """ Decide if this node is the ancestor of specified node.
            If transitive is True (default), all descendants are included.
            If transitive is False, only direct descendants are included.
            Always returns True or False. """
        if node == self.left or node == self.right:
            return True
        if transitive:
            if self.left and self.left.isAncestorOf(node, transitive):
                return True
            if self.right:
                return self.right.isAncestorOf(node, transitive)
        return False

    def getDescendants(self, transitive = False):
        """ Retrieve and return (list of) nodes descendant of this.
            If transitive is False (default), only direct descendants are included.
            If transitive is True, all descendants are (recursively) included. """
        children = []
        if self.left:
            children.append(self.left)
        if self.right:
            children.append(self.right)
        if not transitive:
            return children
        grandchildren = []
        for c in children:
            d = c.getDescendants(transitive)
            if d:
                grandchildren.extend(d)
        children.extend(grandchildren)  # direct children first, then deeper levels
        return children
###############################################################################
# Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
# Methods for processing files of trees on the Newick format
###############################################################################
def runUPGMA(aln, measure, absoluteDistances = False):
    """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
        Use specified distance metric (see sequence.calcDistances).
        If absoluteDistances is True, the tree will be assigned the total distance from provided species.
        Otherwise, the relative addition at each path will be assigned.
        Returns a PhyloTree. Raises RuntimeError if the alignment holds no sequences. """
    D = {}  # distances between "live" clusters, keyed by the canonical node pair
    N = {}  # The number of sequences in each node
    M = aln.calcDistances(measure) # determine all pairwise distances
    nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
    if not nodes:
        raise RuntimeError("Cannot build a tree from an alignment without sequences")
    # For each node-pair, assign the distance between them.
    for i, node in enumerate(nodes):
        node.sequence = aln.seqs[i]
        node.dist = 0.0
        N[node] = 1 # each cluster contains a single sequence
        for j in range(0, i):
            D[_getkey(node, nodes[j])] = M[i, j]
    if len(nodes) == 1:
        return PhyloTree(nodes[0])  # nothing to merge: the only leaf is the root
    # Now: treat each node as a cluster,
    # until there is only one cluster left,
    # find the *closest* pair of clusters, and
    # merge that pair into a new cluster (to replace the two that merged).
    # In each case, the new cluster is represented by the (phylo)node that is formed.
    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
        # The two nodes that are closest to one another according to supplied metric;
        # min() keeps the first pair among equally close ones.
        closest_pair = min(D, key=D.get)
        # So we know the closest, now we need to merge...
        x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
        y = closest_pair[1]
        z = PhyloNode() # create a new node for the cluster z
        z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
        Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
        Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
        dz = {} # new distances to cluster z
        for w in N: # for each node w ...
            # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
            dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
            dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
            dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
        N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
        for w in dz: # we have to run through the nodes again, now not including the removed x and y
            D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
        z.left = x # link the phylogenetic tree
        z.right = y
    if not absoluteDistances:
        x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
        y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
        z.dist = 0.0 # root z is at distance 0 from merged x and y
    return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
    """ Build the canonical key for an unordered pair: a 2-tuple with the
        smaller element (as decided by <=) first. """
    return (node1, node2) if node1 <= node2 else (node2, node1)
def _findComma(string, level = 0):
    """ Return the index of the first comma that sits at the given parenthesis
        nesting depth (0 = outside all parentheses), or -1 if there is none. """
    depth = 0
    for pos, ch in enumerate(string):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
        elif ch == ',' and depth == level:
            return pos
    return -1
def parseNewickNode(string):
    """ Recursively turn a Newick sub-string into a PhyloNode (with its descendants).
        A string without parentheses is a leaf "label[:dist]"; otherwise the text between
        the first '(' and the last ')' holds the two children, and whatever follows the
        last ')' is this node's "label[:dist]".
        Raises RuntimeError on unbalanced parentheses or a missing separating comma. """
    opening = string.find('(')
    closing = string.rfind(')')     # position of the last ')', -1 if absent
    if opening == -1 and closing == -1:     # we are at leaf
        fields = string.split(':')
        leaf = PhyloNode(fields[0])
        if len(fields) >= 2:
            leaf.dist = float(fields[1])
        return leaf
    if opening < 0 or closing < 0:
        raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
    # remove parentheses
    embed = string[opening + 1:closing]
    tail = string[closing + 1:]
    # find where corresp comma is
    comma = _findComma(embed)
    if comma == -1:
        raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
    left_part = embed[:comma].strip()
    right_part = embed[comma + 1:].strip()
    fields = tail.split(':')
    parent = PhyloNode(fields[0])
    if len(fields) >= 2:
        parent.dist = float(fields[1])
    parent.left = parseNewickNode(left_part)
    parent.right = parseNewickNode(right_part)
    return parent
def parseNewick(string):
    """ Main method for parsing a Newick string into a (phylogenetic) tree.
        Handles labels (on both leaves and internal nodes), and includes distances (if provided).
        Anything from the first ';' onwards is ignored.
        Returns an instance of a PhyloTree. """
    terminator = string.find(';')
    newick = string if terminator == -1 else string[:terminator]
    root = parseNewickNode(newick)
    return PhyloTree(root)
def readNewickFile(filename):
    """ Read file on Newick format.
        The whole file content is handed to parseNewick.
        Returns an instance of a PhyloTree."""
    # the with-block closes the file also when reading fails
    with open(filename) as f:
        string = f.read()
    return parseNewick(string)
def writeNewickFile(filename, tree):
    """ Write the specified tree to a Newick file.
        The file is created or overwritten with the string form of the tree. """
    # the with-block closes the file also when writing fails
    with open(filename, 'w') as fh:
        fh.write(str(tree))
###############################################################################
# Below is code that will be run if the module is "run", and not just "imported".
###############################################################################
if __name__=='__main__':
    # Small demonstration: build two DNA sequences, a match/mismatch
    # substitution matrix, and align the sequences with it.
    x = Sequence('ACTGA', DNA_Alphabet, 'x')
    print("Sequence", x, "is constructed from the symbols", x.alphabet.symbols)
    print("( There are", x.count('A'), "occurrences of the symbol 'A' in", x.sequence, ")")
    y = Sequence('TACGA', DNA_Alphabet, 'y')
    print("Sequence", y, "is constructed from the symbols", y.alphabet.symbols)
    print()
    print("( The sub-sequence 'CG' starts at index", y.find('CG'), "of", y.sequence, ")")
    print()
    sm = SubstMatrix(DNA_Alphabet)
    for a in DNA_Alphabet:
        for b in DNA_Alphabet:
            if a==b:
                sm.set(a, b, +2) # match
            else:
                sm.set(a, b, -1) # mismatch
    print("Below is a substitution matrix for the alphabet", DNA_Alphabet.symbols)
    print(sm)
    print()
    aln = align(x, y, sm, -2)
    print("Below is the alignment between x and y")
    print(aln)
\ No newline at end of file
...@@ -21,7 +21,7 @@ class NN(): ...@@ -21,7 +21,7 @@ class NN():
self.b_hid = numpy.random.randn(nHidden) # biases hidden layer self.b_hid = numpy.random.randn(nHidden) # biases hidden layer
self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out
self.b_out = numpy.random.randn(nOutput) # biases output layer self.b_out = numpy.random.randn(nOutput) # biases output layer
print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)) print("Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)))
def writeFile(self, filename): def writeFile(self, filename):
""" Save NN to a file. """ """ Save NN to a file. """
...@@ -110,7 +110,7 @@ class NN(): ...@@ -110,7 +110,7 @@ class NN():
multi_targ = [ target ] multi_targ = [ target ]
for i in range(niter): for i in range(niter):
mse = 0.0 mse = 0.0
entries = range(len(multi_input)) entries = list(range(len(multi_input)))
if shuffle: if shuffle:
random.shuffle(entries) random.shuffle(entries)
for p in entries: for p in entries:
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
Module with methods and classes for phylogeny. Module with methods and classes for phylogeny.
@author: mikael @author: mikael
''' '''
##import sequence import sequence
class PhyloTree: class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships. """ Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
...@@ -140,7 +140,19 @@ class PhyloNode: ...@@ -140,7 +140,19 @@ class PhyloNode:
return left+',' return left+','
elif self.left and self.right: elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist return '(' + left + ',' + right + ')' + dist
def __le__(self, other):
""" Returns indication of less than other node. """
return other and self.__hash__() <= other.__hash__()
def __eq__(self, other):
""" Returns indication of equivalence to other node. """
return other and self.__hash__() == other.__hash__()
def __hash__(self):
""" Returns hash of object. """
return hash((self.label, self.dist, self.sequence))
def _printSequences(self, start, end): def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """ """ Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = '' left = right = label = dist = ''
...@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False): ...@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
find the *closest* pair of clusters, and find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged). merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """ In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged for pair in D: # check all pairs which should be merged
dist = D[pair] dist = D[pair]
if dist < closest_dist or closest_dist == None: if closest_dist == None or dist < closest_dist:
closest_dist = dist closest_dist = dist
closest_pair = pair closest_pair = pair
# So we know the closest, now we need to merge... # So we know the closest, now we need to merge...
...@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False): ...@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
y = closest_pair[1] y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N Nx = N.pop(x, None) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N Ny = N.pop(y, None) # find number of sequences in y, remove the cluster from list N
if Nx == None or Ny == None:
continue
dz = {} # new distances to cluster z dz = {} # new distances to cluster z
for w in N: # for each node w ... for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y) # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
......
...@@ -277,7 +277,7 @@ def _readDistrib(linelist): ...@@ -277,7 +277,7 @@ def _readDistrib(linelist):
if len(d) == 0: if len(d) == 0:
return None return None
alpha = Alphabet(symstr) alpha = Alphabet(symstr)
if '*' in d.keys(): # tot provided if '*' in list(d.keys()): # tot provided
for sym in d: for sym in d:
if sym != '*': if sym != '*':
d[sym] = d[sym] * d['*'] d[sym] = d[sym] * d['*']
...@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'): ...@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
ncol = len(counts) ncol = len(counts)
if len(name) == 1: # proper symbol if len(name) == 1: # proper symbol
symcount[name] = counts symcount[name] = counts
alpha = Alphabet(''.join(symcount.keys())) alpha = Alphabet(''.join(list(symcount.keys())))
distribs = [] distribs = []
for col in range(ncol): for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount]) d = dict([(sym, symcount[sym][col]) for sym in symcount])
...@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'): ...@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
""" """
d = readMultiCounts(filename, format=format) d = readMultiCounts(filename, format=format)
if len(d) > 0: if len(d) > 0:
return d.values()[0] return list(d.values())[0]
################################################################################################# #################################################################################################
# Joint class # Joint class
...@@ -628,12 +628,12 @@ class IndepJoint(Joint): ...@@ -628,12 +628,12 @@ class IndepJoint(Joint):
def displayMatrix(self, count = False): def displayMatrix(self, count = False):
""" Pretty-print matrix """ """ Pretty-print matrix """
print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas)))) print((" \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))))
for a in self.alphas[0]: for a in self.alphas[0]:
if count: if count:
print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True))) print(("%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))))
else: else:
print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a))) print(("%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))))
def __str__(self): def __str__(self):
""" Text representation of the table. Note that size is an issue so big tables """ Text representation of the table. Note that size is an issue so big tables
...@@ -718,5 +718,3 @@ class NaiveBayes(): ...@@ -718,5 +718,3 @@ class NaiveBayes():
prob *= condprob[i][key[i]] or 0.0 prob *= condprob[i][key[i]] or 0.0
out.observe(outsym, prob) out.observe(outsym, prob)
return out return out
...@@ -37,7 +37,7 @@ def base_percentages(reads): ...@@ -37,7 +37,7 @@ def base_percentages(reads):
for nuc in seq: for nuc in seq:
all_seqs.append(nuc) all_seqs.append(nuc)
counts=dict(Counter(all_seqs)) counts=dict(Counter(all_seqs))
nucs=counts.keys() nucs=list(counts.keys())
freqs={} freqs={}
for nuc in nucs: for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values()) freqs[nuc]=float(counts[nuc])/sum(counts.values())
...@@ -67,7 +67,7 @@ you will get an accurate "percentage of reads aligned" statistic. ...@@ -67,7 +67,7 @@ you will get an accurate "percentage of reads aligned" statistic.
mapped=len(mapped)+len(mapped) mapped=len(mapped)+len(mapped)
else: else:
mapped=len(mapped) mapped=len(mapped)
print "number of mapped reads",mapped print("number of mapped reads",mapped)
return store_reads return store_reads
...@@ -108,9 +108,9 @@ def subgroups(mapped_reads): ...@@ -108,9 +108,9 @@ def subgroups(mapped_reads):
group3.append(read) group3.append(read)
else: else:
pass pass
print len(group1),"in p<1e-3 group" print(len(group1),"in p<1e-3 group")
print len(group2),"in 1e-3<=p<1e-2 group" print(len(group2),"in 1e-3<=p<1e-2 group")
print len(group3),"in 1e-2<=p<1 group" print(len(group3),"in 1e-2<=p<1 group")
return group1,group2,group3 return group1,group2,group3
...@@ -124,7 +124,7 @@ def dinuc_freq(mapped_reads): ...@@ -124,7 +124,7 @@ def dinuc_freq(mapped_reads):
for nuc in seq: for nuc in seq:
all_seqs.append(nuc) all_seqs.append(nuc)
counts=dict(Counter(all_seqs)) counts=dict(Counter(all_seqs))
nucs=counts.keys() nucs=list(counts.keys())
freqs={} freqs={}
for nuc in nucs: for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values()) freqs[nuc]=float(counts[nuc])/sum(counts.values())
...@@ -135,7 +135,7 @@ def dinuc_freq(mapped_reads): ...@@ -135,7 +135,7 @@ def dinuc_freq(mapped_reads):
for nuc in seq: for nuc in seq:
all_seqs.append(nuc) all_seqs.append(nuc)
counts=dict(Counter(all_seqs)) counts=dict(Counter(all_seqs))
dinucs=counts.keys() dinucs=list(counts.keys())
dinuc_counts={} dinuc_counts={}
for i in dinucs: for i in dinucs:
val=float(counts[i])/sum(counts.values()) val=float(counts[i])/sum(counts.values())
...@@ -178,11 +178,11 @@ Calculations are based on the the length of the (possibly hard-clipped) sequence ...@@ -178,11 +178,11 @@ Calculations are based on the the length of the (possibly hard-clipped) sequence
length=int(read[8]) length=int(read[8])
lengths.append(length) lengths.append(length)
mean_len=np.mean(lengths) mean_len=np.mean(lengths)
print "group"+str(i+1)+"mean",mean_len print("group"+str(i+1)+"mean",mean_len)
max_len=np.max(lengths) max_len=np.max(lengths)
print "group"+str(i+1)+"max length",max_len print("group"+str(i+1)+"max length",max_len)
min_len=np.min(lengths) min_len=np.min(lengths)
print "group"+str(i+1)+"min length",min_len print("group"+str(i+1)+"min length",min_len)
data.append(["group"+str(i+1),mean_len,max_len,min_len]) data.append(["group"+str(i+1),mean_len,max_len,min_len])
return data return data
...@@ -221,15 +221,15 @@ def plot_base_composition(reads,sym): ...@@ -221,15 +221,15 @@ def plot_base_composition(reads,sym):
all_nucs.append(nucs) all_nucs.append(nucs)
all_items=[] all_items=[]
counts=[] counts=[]
pos=range(1,len(seq)+1) pos=list(range(1,len(seq)+1))
for dicts in all_nucs: for dicts in all_nucs:
for item in dicts.items(): for item in list(dicts.items()):
all_items.append(item) all_items.append(item)
all_items.sort(key=operator.itemgetter(0)) all_items.sort(key=operator.itemgetter(0))
groups= [map(operator.itemgetter(1),list(group)) for key, group in itertools.groupby(all_items, operator.itemgetter(0))] groups= [list(map(operator.itemgetter(1),list(group))) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
for group in groups: for group in groups:
counts.append(group.count(sym)) counts.append(group.count(sym))
print counts print(counts)
plt.figure(1, figsize=(8,8)) plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8]) ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(pos,counts,facecolor='g') plt.bar(pos,counts,facecolor='g')
...@@ -261,7 +261,7 @@ def raw_count_reader(filename): ...@@ -261,7 +261,7 @@ def raw_count_reader(filename):
def get_RPKM(data,num_map1,num_map2,num_map3,num_map4): def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
"""provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias""" """provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias"""
all_rpkms=[];final={} all_rpkms=[];final={}
for i,s,ii,ss,v in data.values(): for i,s,ii,ss,v in list(data.values()):
rpkms=[] rpkms=[]
num_mapped_reads=[num_map1,num_map2,num_map3,num_map4] num_mapped_reads=[num_map1,num_map2,num_map3,num_map4]
vals=[i,s,ii,ss] vals=[i,s,ii,ss]
...@@ -276,15 +276,15 @@ def get_RPKM(data,num_map1,num_map2,num_map3,num_map4): ...@@ -276,15 +276,15 @@ def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
rpkms.append(rpkm) rpkms.append(rpkm)
all_rpkms.append(rpkms) all_rpkms.append(rpkms)
#return gene names and rpkms #return gene names and rpkms
for i in range(0,len(data.keys())): for i in range(0,len(list(data.keys()))):
final[data.keys()[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])] final[list(data.keys())[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
return final return final
def write_RPKM_data(RPKM_data,filename):
    """write RPKM data to a file

    RPKM_data: dictionary whose keys are written with "%s" and whose values
               are indexable with at least four numeric entries.
    filename:  path of the file to create (an existing file is overwritten).

    Each entry becomes one tab-separated line: the key followed by the
    first four values, each truncated to an integer with int().
    """
    # `with` closes the file even when a write or a conversion fails.
    with open(filename, 'w') as f:
        # Walk the dictionary once instead of rebuilding key and value
        # lists for every row.
        for name, values in RPKM_data.items():
            f.write("%s\t%d\t%d\t%d\t%d\n" % (name, int(values[0]), int(values[1]), int(values[2]), int(values[3])))
...@@ -316,13 +316,13 @@ def plotreprpkm(rpkm_data,timepoint): ...@@ -316,13 +316,13 @@ def plotreprpkm(rpkm_data,timepoint):
one=[] one=[]
two=[] two=[]
if timepoint=="t1": if timepoint=="t1":
for i in range(0,len(rpkm_data.values())): for i in range(0,len(list(rpkm_data.values()))):
one.append(int(rpkm_data.values()[i][0])) one.append(int(list(rpkm_data.values())[i][0]))
two.append(int(rpkm_data.values()[i][1])) two.append(int(list(rpkm_data.values())[i][1]))
else: else:
for i in range(0,len(rpkm_data.values())): for i in range(0,len(list(rpkm_data.values()))):
one.append(int(rpkm_data.values()[i][2])) one.append(int(list(rpkm_data.values())[i][2]))
two.append(int(rpkm_data.values()[i][3])) two.append(int(list(rpkm_data.values())[i][3]))
plt.plot(one,two,'o') plt.plot(one,two,'o')
pcc=pearson_def(one,two) pcc=pearson_def(one,two)
R2=pcc**2 R2=pcc**2
...@@ -343,15 +343,15 @@ def plotMAreprpkm(rpkm_data,timepoint): ...@@ -343,15 +343,15 @@ def plotMAreprpkm(rpkm_data,timepoint):
m=[] m=[]
a=[] a=[]
if timepoint=="t1": if timepoint=="t1":
for i in range(0,len(rpkm_data.values())): for i in range(0,len(list(rpkm_data.values()))):
y=np.log2(rpkm_data.values()[i][0]+1)-np.log2(rpkm_data.values()[i][1]+1) y=np.log2(list(rpkm_data.values())[i][0]+1)-np.log2(list(rpkm_data.values())[i][1]+1)
x=(np.log2(rpkm_data.values()[i][0]+1)+np.log2(rpkm_data.values()[i][1]+1))/2 x=(np.log2(list(rpkm_data.values())[i][0]+1)+np.log2(list(rpkm_data.values())[i][1]+1))/2
m.append(y) m.append(y)
a.append(x) a.append(x)
else: else:
for i in range(0,len(rpkm_data.values())): for i in range(0,len(list(rpkm_data.values()))):
y=np.log2(rpkm_data.values()[i][2]+1)-np.log2(rpkm_data.values()[i][3]+1) y=np.log2(list(rpkm_data.values())[i][2]+1)-np.log2(list(rpkm_data.values())[i][3]+1)
x=(np.log2(rpkm_data.values()[i][2]+1)+np.log2(rpkm_data.values()[i][3]+1))/2 x=(np.log2(list(rpkm_data.values())[i][2]+1)+np.log2(list(rpkm_data.values())[i][3]+1))/2
m.append(y) m.append(y)
a.append(x) a.append(x)
plt.figure(1, figsize=(8,8)) plt.figure(1, figsize=(8,8))
...@@ -370,18 +370,18 @@ def plotMAreprpkm(rpkm_data,timepoint): ...@@ -370,18 +370,18 @@ def plotMAreprpkm(rpkm_data,timepoint):
def get_cv(data1,condition): def get_cv(data1,condition):
cvs=[] cvs=[]
if condition=="t1": if condition=="t1":
for i in range(0,len(data1.values())): for i in range(0,len(list(data1.values()))):
mean = np.mean([data1.values()[i][0],data1.values()[i][1]]) mean = np.mean([list(data1.values())[i][0],list(data1.values())[i][1]])
std=np.std([data1.values()[i][0],data1.values()[i][1]]) std=np.std([list(data1.values())[i][0],list(data1.values())[i][1]])
if mean==0.0 and std==0.0: if mean==0.0 and std==0.0:
pass pass
else: else:
cv=float(mean+1)/(std+1) cv=float(mean+1)/(std+1)
cvs.append(cv) cvs.append(cv)
else: else:
for i in range(0,len(data1.values())): for i in range(0,len(list(data1.values()))):
mean = np.mean([data1.values()[i][2],data1.values()[i][3]]) mean = np.mean([list(data1.values())[i][2],list(data1.values())[i][3]])
std=np.std([data1.values()[i][2],data1.values()[i][3]]) std=np.std([list(data1.values())[i][2],list(data1.values())[i][3]])
if mean==0.0 and std==0.0: if mean==0.0 and std==0.0:
pass pass
else: else:
...@@ -430,7 +430,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]): ...@@ -430,7 +430,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
avg_rpkm2=[] avg_rpkm2=[]
sig_logfc2=[] sig_logfc2=[]
sig_avg_rpkm2=[] sig_avg_rpkm2=[]
for i,ii,s,ss in rpkm_data.values(): for i,ii,s,ss in list(rpkm_data.values()):
fc=np.log2(float(s+1)/(i+1)) fc=np.log2(float(s+1)/(i+1))
if fc<cutoff[0] or fc>cutoff[1]: if fc<cutoff[0] or fc>cutoff[1]:
sig_logfc.append(fc) sig_logfc.append(fc)
...@@ -438,7 +438,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]): ...@@ -438,7 +438,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
else: else:
logfc.append(fc) logfc.append(fc)
avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2) avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
for i,ii,s,ss in rpkm_data.values(): for i,ii,s,ss in list(rpkm_data.values()):
fc2=np.log2(float(ss+1)/(ii+1)) fc2=np.log2(float(ss+1)/(ii+1))
if fc2<cutoff[0] or fc2>cutoff[1]: if fc2<cutoff[0] or fc2>cutoff[1]:
sig_logfc2.append(fc2) sig_logfc2.append(fc2)
...@@ -470,7 +470,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05): ...@@ -470,7 +470,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
avg_rpkm2=[] avg_rpkm2=[]
sig_logfc2=[] sig_logfc2=[]
sig_avg_rpkm2=[] sig_avg_rpkm2=[]
for i,ii,s,ss,pval in rpkm_data.values(): for i,ii,s,ss,pval in list(rpkm_data.values()):
fc=np.log2(float(s+1)/(i+1)) fc=np.log2(float(s+1)/(i+1))
if float(pval)<cutoff: if float(pval)<cutoff:
sig_logfc.append(fc) sig_logfc.append(fc)
...@@ -478,7 +478,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05): ...@@ -478,7 +478,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
else: else:
logfc.append(fc) logfc.append(fc)
avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2) avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
for i,ii,s,ss,pval in rpkm_data.values(): for i,ii,s,ss,pval in list(rpkm_data.values()):
fc2=np.log2(float(ss+1)/(ii+1)) fc2=np.log2(float(ss+1)/(ii+1))
if float(pval)<cutoff: if float(pval)<cutoff:
sig_logfc2.append(fc2) sig_logfc2.append(fc2)
...@@ -506,7 +506,7 @@ def Welcht(rpkm): ...@@ -506,7 +506,7 @@ def Welcht(rpkm):
"""Performs Welchs T-statistic (one-tailed)""" """Performs Welchs T-statistic (one-tailed)"""
ts=[] ts=[]
result={} result={}
for i,ii,s,ss in rpkm.values(): for i,ii,s,ss in list(rpkm.values()):
sd1=np.std([i,ii]) sd1=np.std([i,ii])
sd2=np.std([s,ss]) sd2=np.std([s,ss])
t=(np.mean([s,ss])-np.mean([i,ii]))/(math.sqrt(((float(sd2)/2)+(float(sd1)/2)))) t=(np.mean([s,ss])-np.mean([i,ii]))/(math.sqrt(((float(sd2)/2)+(float(sd1)/2))))
...@@ -521,8 +521,8 @@ def Welcht(rpkm): ...@@ -521,8 +521,8 @@ def Welcht(rpkm):
pval=pval pval=pval
pvals.append(pval) pvals.append(pval)
corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg") corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg")
for i in range(0,len(rpkm.values())): for i in range(0,len(list(rpkm.values()))):
result[rpkm.keys()[i]]=[rpkm.values()[i][0],rpkm.values()[i][1],rpkm.values()[i][2],rpkm.values()[i][3],corr_pvals[i]] result[list(rpkm.keys())[i]]=[list(rpkm.values())[i][0],list(rpkm.values())[i][1],list(rpkm.values())[i][2],list(rpkm.values())[i][3],corr_pvals[i]]
return result return result
...@@ -551,7 +551,7 @@ def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-H ...@@ -551,7 +551,7 @@ def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-H
rank = n - i rank = n - i
pvalue, index = vals pvalue, index = vals
new_values.append((n/rank) * pvalue) new_values.append((n/rank) * pvalue)
for i in xrange(0, int(n)-1): for i in range(0, int(n)-1):
if new_values[i] < new_values[i+1]: if new_values[i] < new_values[i+1]:
new_values[i+1] = new_values[i] new_values[i+1] = new_values[i]
for i, vals in enumerate(values): for i, vals in enumerate(values):
......
...@@ -257,7 +257,7 @@ class BedFile(): ...@@ -257,7 +257,7 @@ class BedFile():
self.rows = entries self.rows = entries
self.format = format self.format = format
self.indices = self._createIndices() self.indices = self._createIndices()
def _read(self, filename, format = 'Limited'): def _read(self, filename, format = 'Limited'):
""" Read a BED file. """ Read a BED file.
format: specifies the format of the file, format: specifies the format of the file,
...@@ -276,7 +276,7 @@ class BedFile(): ...@@ -276,7 +276,7 @@ class BedFile():
"Strand", e.g. "Strand", e.g.
chr4 185772359 185772424 - chr4 185772359 185772424 -
chr18 20513381 20513401 + chr18 20513381 20513401 +
also supports a 5th label field also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949 chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863 chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g. "Summit", e.g.
...@@ -361,7 +361,7 @@ class BedFile(): ...@@ -361,7 +361,7 @@ class BedFile():
acceptHeaderRows -= 1 # count down the number of header rows that can occur acceptHeaderRows -= 1 # count down the number of header rows that can occur
f.close() f.close()
return rows return rows
def __iter__(self):
    """ Return an iterator over self.rows, so the object can be used directly in a for-loop. """
    return iter(self.rows)
...@@ -381,11 +381,11 @@ class BedFile(): ...@@ -381,11 +381,11 @@ class BedFile():
index_name = {} index_name = {}
for i in range(len(self.rows)): for i in range(len(self.rows)):
row = self.rows[i] row = self.rows[i]
if not index_start.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_start: # seeing chromosome entry first time
index_start[row.chrom] = [] index_start[row.chrom] = []
if not index_centre.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_centre: # seeing chromosome entry first time
index_centre[row.chrom] = [] index_centre[row.chrom] = []
if not index_end.has_key(row.chrom): # seeing chromosome entry first time if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = [] index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i)) index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i)) index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
...@@ -477,7 +477,7 @@ class BedFile(): ...@@ -477,7 +477,7 @@ class BedFile():
Note that if the name is not unique, the last entry with the name will be returned. Note that if the name is not unique, the last entry with the name will be returned.
""" """
return self.indices[3][myname] return self.indices[3][myname]
def closest(self, myloc, minimum = True): def closest(self, myloc, minimum = True):
""" Find the closest entry in the current BedFile to a given location. """ Find the closest entry in the current BedFile to a given location.
Return a tuple with the absolute distance and the entry that is closest. Return a tuple with the absolute distance and the entry that is closest.
...@@ -607,7 +607,7 @@ class BedFile(): ...@@ -607,7 +607,7 @@ class BedFile():
if not earliest_start: # not yet initialised if not earliest_start: # not yet initialised
earliest_start = start earliest_start = start
latest_end = end latest_end = end
else: else:
if start > latest_end: # new entry if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end) entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks': if self.format == 'Peaks':
...@@ -639,7 +639,7 @@ class BedFile(): ...@@ -639,7 +639,7 @@ class BedFile():
if not earliest_start: # not yet initialised if not earliest_start: # not yet initialised
earliest_start = start earliest_start = start
latest_end = end latest_end = end
else: else:
if start > latest_end: # new entry if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end) entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks': if self.format == 'Peaks':
...@@ -660,7 +660,7 @@ class BedFile(): ...@@ -660,7 +660,7 @@ class BedFile():
entry.addOption(name = rows[idx].name, strand = rows[idx].strand) entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry) newrows.append(entry)
return BedFile(newrows, format = self.format) return BedFile(newrows, format = self.format)
def write(self, filename, format = 'BED6', header = None): def write(self, filename, format = 'BED6', header = None):
""" Save the data """ Save the data
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported. format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
...@@ -697,7 +697,7 @@ def readBedFile(filename, format = 'Limited'): ...@@ -697,7 +697,7 @@ def readBedFile(filename, format = 'Limited'):
"Strand", e.g. "Strand", e.g.
chr4 185772359 185772424 - chr4 185772359 185772424 -
chr18 20513381 20513401 + chr18 20513381 20513401 +
also supports a 5th label field also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949 chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863 chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g. "Summit", e.g.
...@@ -714,7 +714,7 @@ def readBedFile(filename, format = 'Limited'): ...@@ -714,7 +714,7 @@ def readBedFile(filename, format = 'Limited'):
chr1 931838 9 chr1 931838 9
""" """
return BedFile(filename, format) return BedFile(filename, format)
def writeBedFile(entries, filename, format = 'BED6', header = None): def writeBedFile(entries, filename, format = 'BED6', header = None):
""" Save the BED entries to a BED file. """ Save the BED entries to a BED file.
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported. format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
...@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None): ...@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
for row in entries: for row in entries:
if format == 'Peaks': if format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser #f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue)) f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited': elif format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd)) f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
else: else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand)) f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n") f.write("\n")
f.close() f.close()
...@@ -760,7 +760,7 @@ try: ...@@ -760,7 +760,7 @@ try:
except ImportError: except ImportError:
strerror = lambda x: 'strerror not supported' strerror = lambda x: 'strerror not supported'
from os.path import exists from os.path import exists
from itertools import izip from itertools import chain
def true_long_type(): def true_long_type():
""" """
...@@ -805,7 +805,7 @@ def base_to_bin(x): ...@@ -805,7 +805,7 @@ def base_to_bin(x):
def create_byte_table():
    """create BYTE_TABLE

    Returns a dictionary mapping every byte value 0..255 to the result of
    byte_to_bases for that value.
    """
    # NOTE(review): if byte_to_bases returns a lazy iterator under Python 3
    # (e.g. a map object), each table entry can only be consumed once;
    # confirm against byte_to_bases and materialise the result if so.
    return {x: byte_to_bases(x) for x in range(2**8)}
...@@ -821,9 +821,9 @@ def split16(x): ...@@ -821,9 +821,9 @@ def split16(x):
def create_twobyte_table():
    """create TWOBYTE_TABLE

    Returns a dictionary mapping every 16-bit value 0..65535 to the bases of
    its two bytes: the value is split with split16 and the results of
    byte_to_bases for both halves are concatenated into one list.
    """
    d = {}
    for x in range(2**16):
        c, f = split16(x)
        # Store a concrete list rather than an itertools.chain object: a
        # chain is exhausted after one pass, so a table entry would be empty
        # on every read after the first. list() also makes the concatenation
        # work whether byte_to_bases returns a list or a lazy iterator.
        d[x] = list(byte_to_bases(c)) + list(byte_to_bases(f))
    return d
BYTE_TABLE = create_byte_table() BYTE_TABLE = create_byte_table()
...@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size): ...@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
""" """
longs_len = len(longs) longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size) # dna = ctypes.create_string_buffer(array_size)
dna = array('c', 'N' * longs_len) dna = array('b', 'N' * longs_len)
# translate from 32-bit blocks to bytes # translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed) # this method ensures correct endianess (byteswap as neeed)
bytes = array('B') bytes = array('B')
...@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size): ...@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)]) first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset i = 16 - first_base_offset
if array_size < i: i = array_size if array_size < i: i = array_size
dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i]) dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist) # middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]: for byte in bytes[4:-4]:
dna[i:i + 4] = array('c', BYTE_TABLE[byte]) dna[i:i + 4] = array('b', BYTE_TABLE[byte])
i += 4 i += 4
# last block # last block
last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)])) last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset] dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna return dna
...@@ -889,7 +889,7 @@ class TwoBitFile(dict): ...@@ -889,7 +889,7 @@ class TwoBitFile(dict):
self._file_handle = open(foo, 'rb') self._file_handle = open(foo, 'rb')
self._load_header() self._load_header()
self._load_index() self._load_index()
for name, offset in self._offset_dict.iteritems(): for name, offset in self._offset_dict.items():
self[name] = TwoBitSequence(self._file_handle, offset, self[name] = TwoBitSequence(self._file_handle, offset,
self._byteswapped) self._byteswapped)
return return
...@@ -926,13 +926,16 @@ class TwoBitFile(dict): ...@@ -926,13 +926,16 @@ class TwoBitFile(dict):
if remaining == 0: break if remaining == 0: break
name_size = array('B') name_size = array('B')
name_size.fromfile(file_handle, 1) name_size.fromfile(file_handle, 1)
if byteswapped: name_size.byteswap() if byteswapped:
name = array('c') name_size.byteswap()
if byteswapped: name.byteswap() name = array('b')
if byteswapped:
name.byteswap()
name.fromfile(file_handle, name_size[0]) name.fromfile(file_handle, name_size[0])
offset = array(LONG) offset = array(LONG)
offset.fromfile(file_handle, 1) offset.fromfile(file_handle, 1)
if byteswapped: offset.byteswap() if byteswapped:
offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0])) sequence_offsets.append((name.tostring(), offset[0]))
remaining -= 1 remaining -= 1
self._sequence_offsets = sequence_offsets self._sequence_offsets = sequence_offsets
...@@ -943,7 +946,7 @@ class TwoBitFile(dict): ...@@ -943,7 +946,7 @@ class TwoBitFile(dict):
d = {} d = {}
file_handle = self._file_handle file_handle = self._file_handle
byteswapped = self._byteswapped byteswapped = self._byteswapped
for name, offset in self._offset_dict.iteritems(): for name, offset in self._offset_dict.items():
file_handle.seek(offset) file_handle.seek(offset)
dna_size = array(LONG) dna_size = array(LONG)
dna_size.fromfile(file_handle, 1) dna_size.fromfile(file_handle, 1)
...@@ -1078,7 +1081,7 @@ class TwoBitSequence(object): ...@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
if byteswapped: fourbyte_dna.byteswap() if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset, string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size) last_base_offset, region_size)
for start, size in izip(n_block_starts, n_block_sizes): for start, size in zip(n_block_starts, n_block_sizes):
end = start + size end = start + size
if end <= min_: continue if end <= min_: continue
if start > max_: break if start > max_: break
...@@ -1086,14 +1089,14 @@ class TwoBitSequence(object): ...@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
if end > max_: end = max_ if end > max_: end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('c', 'N'*(end-start)) string_as_array[start:end] = array('b', 'N'*(end-start))
lower = str.lower lower = str.lower
first_masked_region = max(0, first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1) bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts), last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_, 1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region)) lo=first_masked_region))
for start, size in izip(mask_block_starts[first_masked_region:last_masked_region], for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]): mask_block_sizes[first_masked_region:last_masked_region]):
end = start + size end = start + size
if end <= min_: continue if end <= min_: continue
...@@ -1102,9 +1105,9 @@ class TwoBitSequence(object): ...@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
if end > max_: end = max_ if end > max_: end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring())) string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
if not len(string_as_array) == max_ - min_: if not len(string_as_array) == max_ - min_:
raise RuntimeError, "Sequence was longer than it should be" raise RuntimeError("Sequence was longer than it should be")
if reverse: if reverse:
return self.reverseComplement(string_as_array.tostring()) return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring() return string_as_array.tostring()
...@@ -1124,7 +1127,7 @@ class TwoBitSequence(object): ...@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
""" """
return self.__getslice__(0, None) return self.__getslice__(0, None)
class TwoBitFileError(StandardError): class TwoBitFileError(Exception):
""" """
Base exception for TwoBit module Base exception for TwoBit module
""" """
......
...@@ -55,10 +55,11 @@ class Sequence(object): ...@@ -55,10 +55,11 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """ 'R', 'S', 'T', 'V', 'W', 'Y'] """
try: # convert sequence data into a compact array representation #try: # convert sequence data into a compact array representation
self.sequence = array.array('c', ''.join([s.upper() for s in sequence])) # self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
except TypeError: #except TypeError:
raise RuntimeError('Sequence data is not specified correctly: must be iterable') # raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self.sequence = sequence
# Assign an alphabet # Assign an alphabet
self.alphabet = None self.alphabet = None
...@@ -133,15 +134,15 @@ class Sequence(object): ...@@ -133,15 +134,15 @@ class Sequence(object):
Calling self.__getitem__(3) is equivalent to self[3] Calling self.__getitem__(3) is equivalent to self[3]
""" """
if type(ndx) is slice: if type(ndx) is slice:
return self.sequence[ndx].tostring() return ''.join(self.sequence[ndx])
else: else:
return self.sequence[ndx] return self.sequence[ndx]
def writeFasta(self): def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """ """ Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n' fasta = '>' + self.name + ' ' + self.info + '\n'
data = self.sequence.tostring() data = ''.join(self.sequence)
nlines = (len(self.sequence) - 1) / 60 + 1 nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1))
for i in range(nlines): for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n' lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq fasta += lineofseq
...@@ -164,7 +165,7 @@ class Sequence(object): ...@@ -164,7 +165,7 @@ class Sequence(object):
def find(self, findme):
    """ Find the position of the specified symbol or sub-sequence.

    findme: the symbol or sub-sequence (a string) to look for
    Returns the lowest index at which findme starts, or -1 if it does not
    occur (the semantics of str.find).
    """
    # self.sequence is joined first so that it may be a string or any
    # iterable of one-character strings.
    return ''.join(self.sequence).find(findme)
""" """
Below are some useful methods for loading data from strings and files. Below are some useful methods for loading data from strings and files.
...@@ -438,8 +439,8 @@ class Alignment(): ...@@ -438,8 +439,8 @@ class Alignment():
column index, entropy, number of gaps, and symbols in order of decreasing probability. column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case, theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """ theta2 is the threshold for showing symbols at all, and in lower case. """
print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen) print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2) print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
for col in range(self.alignlen): for col in range(self.alignlen):
d = Distrib(self.alphabet) d = Distrib(self.alphabet)
gaps = 0 gaps = 0
...@@ -448,21 +449,21 @@ class Alignment(): ...@@ -448,21 +449,21 @@ class Alignment():
d.observe(seq[col]) d.observe(seq[col])
else: else:
gaps += 1 gaps += 1
print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps, print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
symprobs = d.getProbsort() symprobs = d.getProbsort()
(_, maxprob) = symprobs[0] (_, maxprob) = symprobs[0]
if maxprob >= theta1: if maxprob >= theta1:
print "%d\tTRUE\t" % int(maxprob * 100), print(("%d\tTRUE\t" % int(maxprob * 100),))
else: else:
print "%d\t\t" % int(maxprob * 100), print(("%d\t\t" % int(maxprob * 100),))
for (sym, prob) in symprobs: for (sym, prob) in symprobs:
if prob >= theta1: if prob >= theta1:
print sym, "%d%%" % int(prob * 100), print((sym, "%d%%" % int(prob * 100),))
elif prob >= theta2 and lowercase: elif prob >= theta2 and lowercase:
print sym.lower(), "%d%%" % int(prob * 100), print((sym.lower(), "%d%%" % int(prob * 100),))
elif prob >= theta2: elif prob >= theta2:
print sym, "%d%%" % int(prob * 100), print((sym, "%d%%" % int(prob * 100),))
print print()
def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False): def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
""" Display a table with rows for each alignment column, showing """ Display a table with rows for each alignment column, showing
...@@ -644,7 +645,7 @@ class Alignment(): ...@@ -644,7 +645,7 @@ class Alignment():
return distmat return distmat
def writeHTML(self, filename=None): def writeHTML(self, filename=None):
""" Generate HTML that displays the alignment in color. """ Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym) Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue' and that each symbol maps to a text string naming the color, e.g. 'blue'
""" """
...@@ -681,10 +682,9 @@ class Alignment(): ...@@ -681,10 +682,9 @@ class Alignment():
htmlstr += html htmlstr += html
htmlstr += '<pre>' htmlstr += '<pre>'
if filename: if filename:
fh = open(filename, 'w') with open(filename, 'w+') as fh:
fh.write(htmlstr) fh.write(htmlstr)
fh.write('</body></html>\n') fh.write('</body></html>\n')
fh.close()
else: else:
return htmlstr return htmlstr
...@@ -985,12 +985,12 @@ def readClustal(string, alphabet): ...@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
index = name.find('/') index = name.find('/')
if index >= 0: if index >= 0:
name = name[0:index] name = name[0:index]
if seqs.has_key(name): if name in seqs:
seqs[name] += seqstr seqs[name] += seqstr
else: else:
seqs[name] = seqstr seqs[name] = seqstr
sequences = [] sequences = []
for name, seqstr in seqs.items(): for name, seqstr in list(seqs.items()):
sequences.append(Sequence(seqstr, alphabet, name, gappy = True)) sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
return Alignment(sequences) return Alignment(sequences)
...@@ -1180,12 +1180,12 @@ class PWM(object): ...@@ -1180,12 +1180,12 @@ class PWM(object):
def display(self, format = 'COLUMN'):
    """ Print the matrix to standard output, one row per alphabet symbol.

    format: 'COLUMN' prints a header row of 1-based column numbers followed
            by one tab-separated row of values per symbol;
            'JASPAR' prints one row per symbol with the values in brackets.
    Any other format value prints nothing.
    """
    if format == 'COLUMN':
        print(" \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length))))
        for j in range(len(self.alphabet)):
            print("%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j])))
    elif format == 'JASPAR':
        for j in range(len(self.alphabet)):
            print("%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j])))
def search(self, sequence, lowerBound=0): def search(self, sequence, lowerBound=0):
""" Find matches to the motif in a specified sequence. Returns a list """ Find matches to the motif in a specified sequence. Returns a list
...@@ -1229,7 +1229,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None): ...@@ -1229,7 +1229,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
""" Get the sequence identified by the given ID from the given database """ Get the sequence identified by the given ID from the given database
(e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence (e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
object. An error is caused if the sequence ID is not found. If start and object. An error is caused if the sequence ID is not found. If start and
end are given, then only that section of the sequence is returned. end are given, then only that section of the sequence is returned.
Note: more flexible search options are supported by using webservice.fetch Note: more flexible search options are supported by using webservice.fetch
directly.""" directly."""
...@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None): ...@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for i in range(MAX_TRY): for i in range(MAX_TRY):
try: try:
fastaData = fetch(id, database) fastaData = fetch(id, database).decode("utf-8")
seq = readFasta(fastaData)[0] seq = readFasta(fastaData)[0]
break break
except: except:
from time import sleep from time import sleep
print 'Failed on {i}th try for id {id}'.format(i=i, id=id) print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
sleep(0.1) sleep(0.1)
try: try:
return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info) return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
...@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'): ...@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
if __name__ == '__main__':
    # Ad-hoc smoke test: load an alignment and report how many sequences it has.
    # NOTE(review): the path is hard-coded to one developer's machine.
    seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
    # Pass the parts as separate arguments; wrapping them in an extra pair of
    # parentheses would print the repr of a tuple instead of the message.
    print('Read', len(seqs), 'sequences')
...@@ -71,7 +71,7 @@ class SeqNN(): ...@@ -71,7 +71,7 @@ class SeqNN():
im[row, _onehotIndex(alpha, subseqs[k])] = 1 im[row, _onehotIndex(alpha, subseqs[k])] = 1
if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1 if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
row += 1 row += 1
print "There are", row, "entries in data set" print("There are", row, "entries in data set")
if targets: if targets:
return im, om return im, om
else: else:
...@@ -85,7 +85,7 @@ class SeqNN(): ...@@ -85,7 +85,7 @@ class SeqNN():
im, om = self._encodeseq(seqs, targets) im, om = self._encodeseq(seqs, targets)
for i in range(niter): # train first NN for i in range(niter): # train first NN
rmse = self.nn1.train(im, om, eta = eta, niter = 1) rmse = self.nn1.train(im, om, eta = eta, niter = 1)
print i, ":", rmse print(i, ":", rmse)
if not self.cascade: # if there's no cascaded NN, finish here if not self.cascade: # if there's no cascaded NN, finish here
return rmse return rmse
nn1seqs = [] # a list of new SS sequences ... nn1seqs = [] # a list of new SS sequences ...
...@@ -95,7 +95,7 @@ class SeqNN(): ...@@ -95,7 +95,7 @@ class SeqNN():
im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
for i in range(niter): # train cascaded NN for i in range(niter): # train cascaded NN
rmse = self.nn2.train(im, om, eta = eta, niter = 1) rmse = self.nn2.train(im, om, eta = eta, niter = 1)
print i, ":", rmse print(i, ":", rmse)
return rmse return rmse
def testAll(self, seqs, targets): def testAll(self, seqs, targets):
......
...@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4): ...@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
specified width average of 100. specified width average of 100.
""" """
sum = 0.0 sum = 0.0
order = range(0, len(calls) - 1, +1) # we are extending calls downstream order = list(range(0, len(calls) - 1, +1)) # we are extending calls downstream
cnt = 0 cnt = 0
for i in order: # extend to the right for i in order: # extend to the right
if calls[i]: # to extend a call is required in the first place if calls[i]: # to extend a call is required in the first place
...@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4): ...@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
AND extend this list upstream containing a specified width average of 100. AND extend this list upstream containing a specified width average of 100.
""" """
sum = 0.0 sum = 0.0
order = range(len(calls) - 1, 0, -1) # we are extending calls upstream/to-the-left order = list(range(len(calls) - 1, 0, -1)) # we are extending calls upstream/to-the-left
cnt = 0 cnt = 0
for i in order: # extend to the right for i in order: # extend to the right
if calls[i]: # a requirement to extend is to have a call in the first place if calls[i]: # a requirement to extend is to have a call in the first place
......
...@@ -291,7 +291,7 @@ class TupleEntries(object): ...@@ -291,7 +291,7 @@ class TupleEntries(object):
def __iter__(self): def __iter__(self):
return self return self
def next(self): def __next__(self):
""" Step through sequence of entries, either """ Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or (if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations.""" (if sparse) with calls to tuple store based on all possible symbol combinations."""
......
import gzip
import os
import urllib.error
import urllib.parse
import urllib.request
from collections import Counter
from io import StringIO
from time import sleep

import stats
""" This module is collection of functions for accessing the EBI REST web services, """ This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW. including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
The class EBI takes precautions so as not to send too many requests when
performing BLAST and ClustalW queries.
See See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
""" """
# Base URLs of the remote services queried by the functions in this module.
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO/' # Use UQ mirror when available
__uniprotUrl__ = 'http://www.uniprot.org/' # used for UniProt search and identifier mapping requests
def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database.

    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
    dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn';
        see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
    format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb
        (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
    See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax

    Returns the response body as bytes, exactly as received; decode it
    before processing it as text.
    Raises RuntimeError if the server answers with an HTTP error, or if the
    response body starts with 'ERROR'.
    """
    # Construct URL
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
    # Get the entry
    try:
        data = urllib.request.urlopen(url).read()
    except urllib.error.HTTPError as ex:
        # "except (HTTPError, ex):" would evaluate the undefined name ex and
        # turn every failure into a NameError; bind the exception with "as".
        raise RuntimeError(ex.read()) from ex
    if data.startswith(b'ERROR'):
        raise RuntimeError(data)
    return data
def search(query, dbName='uniprot', format='list', limit=100):
    """
    Retrieve multiple entries matching query from a database (UniProtKB or NCBI RefSeq).

    query: search term(s) e.g. 'organism:9606+AND+antigen'
    dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
    format: file format e.g. 'list', 'fasta' or 'txt'
    limit: max number of results (specify None to send no limit to the service)
    See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
    See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils

    Returns, with all payloads as bytes exactly as received from the service:
      - for format 'list': a list of entries (one per response line for
        UniProt, one per <Id> element for RefSeq);
      - for RefSeq with format 'fasta' and at least one hit: the body of the
        follow-up efetch request;
      - for RefSeq otherwise: the empty string '';
      - for UniProt with any other format: the response body.
    Returns None if dbName starts with neither 'uniprot' nor 'refseq'.
    Raises RuntimeError if the server answers with an HTTP error.
    """
    if dbName.startswith('uniprot'):
        # Construct URL; the limit parameter is omitted when no limit is requested
        url = __uniprotUrl__ + dbName + '/?format=' + format
        if limit is not None:
            url += '&limit=' + str(limit)
        url += '&query=' + query
        # Get the entries
        try:
            data = urllib.request.urlopen(url).read()
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read()) from ex
        if format == 'list':
            return data.splitlines()
        return data
    elif dbName.startswith('refseq'):
        # "refseq:protein" -> the E-utils database name is the part after the colon
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
        if limit is not None:
            # Without this guard the URL would carry the literal text "retmax=None".
            # NOTE(review): with retmax omitted the service applies its own default cap - confirm.
            url += "&retmax=" + str(limit)
        # Get the entries
        try:
            data = urllib.request.urlopen(url).read()
            # The response is bytes, so the markers must be bytes too.
            chunks = data.split(b"</Id>")
            words = [w[w.find(b"<Id>") + 4:] for w in chunks[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
                id_list = ",".join(w.decode('utf-8') for w in words)
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id=" + id_list
                return urllib.request.urlopen(url).read()
            else:
                return ''
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read()) from ex
    return
# Taxonomic identifier -> [species name, identifier tag, ...]
# NOTE(review): the tags look like the frm/to abbreviations accepted by idmap - confirm.
authorised_database_tag = {
    9606: ['Homo sapiens', 'ACC', 'ID'],
    3702: ['Arabidopsis thaliana', 'TAIR_ID'],
    4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
    10090: ['Mus musculus', 'MGI_ID'],
}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
    """
    Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)

    identifiers: a list of identifiers (list of strings), or a single string
    frm: the tag/abbreviation for the identifier FROM which to idmap
    to: the tag/abbreviation for the identifier TO which to idmap
    format: the results format to use
    reverse: reverse the returned mapping key (to) -> value (from)
    Returns a dictionary with key (from) -> value (to); both are strings.
    Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
    An empty dictionary is returned, without contacting the service, when no
    identifiers are given.
    """
    url = __uniprotUrl__ + 'mapping/'
    # construct query by concatenating the list of identifiers
    if isinstance(identifiers, str):
        query = identifiers.strip()
    else: # assume it is a list of strings
        query = ' '.join(ident.strip() for ident in identifiers).strip()
    if not query:
        return dict()
    params = {
        'from' : frm,
        'to' : to,
        'format' : format,
        'query' : query
    }
    # POST data must be bytes: urlopen rejects a str body with a TypeError.
    request = urllib.request.Request(url, urllib.parse.urlencode(params).encode('utf-8'))
    # Decode once here so the rows can be split with ordinary string methods.
    response = urllib.request.urlopen(request).read().decode('utf-8')
    d = dict()
    for row in response.splitlines()[1:]: # the first row is skipped
        pair = row.split('\t')
        if len(pair) < 2: # blank or malformed row: nothing to map
            continue
        if not reverse:
            d[pair[0]] = pair[1]
        else:
            d[pair[1]] = pair[0]
    return d
""" """
Gene Ontology service (QuickGO) Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries. Note that this service can be slow for queries involving a large number of entries.
""" """
def _collectGOTerms(genes, database):
    """ Return a flat list with one entry per (gene, GO term) pair reported by getGOTerms. """
    annot = getGOTerms(list(genes), database)
    if isinstance(annot, dict):
        return [t for terms in annot.values() for t in terms]
    # For a single gene getGOTerms returns the bare collection of terms instead of a map.
    return list(annot)

def getGOReport(positives, background = None, database = 'UniProtKB'):
    """ Generate a complete GO term report for a set of genes (positives).
    Each GO term is also assigned an enrichment p-value (on basis of background, if provided).

    positives: the genes of interest (any iterable; duplicates are ignored)
    background: optional iterable of genes to compare against; genes that are
        also in positives are removed from it before counting
    database: passed on to getGOTerms
    Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background,
    sorted by decreasing Foreground_no, OR
    (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]),
    sorted by increasing p-value.
    E-value is a Bonferroni-corrected p-value (p-value multiplied by the number of distinct foreground terms).
    One getGODef request is made per reported term to obtain its description.
    """
    pos = set(positives)
    # Count each term once up front instead of calling list.count per term.
    fg_counts = Counter(_collectGOTerms(pos, database))
    term_cnt = {}
    if background is None:
        term_cnt.update(fg_counts)
        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
    else: # a background is provided
        neg = set(background).difference(pos)
        bg_counts = Counter(_collectGOTerms(neg, database))
        nPos = len(pos)
        nNeg = len(neg)
        for t, fg_hit in fg_counts.items():
            bg_hit = bg_counts[t] # 0 when the term never occurs in the background
            fg_nohit = nPos - fg_hit
            bg_nohit = nNeg - bg_hit
            term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
        sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
    n_terms = len(fg_counts)
    ret = []
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background is not None:
            # NOTE(review): t[1][1] already includes the foreground hits, so the
            # fourth field counts them twice; kept as-is for compatibility - confirm intent.
            ret.append((t[0], t[1][2] * n_terms, t[1][0], t[1][0]+t[1][1], defin['name']))
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret
def getGODef(goterm):
    """
    Retrieve information about a GO term

    goterm: the identifier, e.g. 'GO:0002080'
    Returns a dictionary with the keys 'id', 'name' and 'def'. Each value is
    the text of the first matching "field: value" line in the response, or
    None if no such line was found.
    Raises RuntimeError if the server answers with an HTTP error.
    """
    # Construct URL
    url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
    # Get the entry: fill in the fields specified below
    entry = {'id': None, 'name': None, 'def': None}
    try:
        # Decode so the rows can be searched and stripped with str arguments.
        data = urllib.request.urlopen(url).read().decode('utf-8')
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read()) from ex
    for row in data.splitlines():
        index = row.find(':')
        if index > 0 and len(row[index:]) > 1:
            field = row[0:index].strip()
            value = row[index+1:].strip(' "') # remove spaces and quotation marks
            # only fields we need, and only their first occurrence
            if field in entry and entry[field] is None:
                entry[field] = value
    return entry
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
    """
    Retrieve all GO terms for a given set of genes (or single gene).

    genes: a list, set or tuple of gene names, or a single gene name
    database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
    The result is given as a map (key=gene name, value=set of unique terms) OR
    in the case of a single gene as a set of unique terms (empty unless the
    service reported exactly one gene).
    If completeAnnot is True (default is False) then the above "terms" is the first element
    in a tuple with (gene-terms-map, gene-taxon-id).
    Raises RuntimeError if the server answers with an HTTP error.
    """
    if isinstance(genes, (list, set, tuple)):
        genes = list(genes) # a set cannot be indexed for the single-gene result below
    else:
        genes = [genes]
    termsmap = dict()
    taxonmap = dict()
    uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
    # build queries (batches of genes)
    queryLength = 2000
    queries = []
    query = None
    for gene in genes:
        if query is None:
            query = gene
        elif len(query) < queryLength:
            query += ','+gene
        else:
            queries.append(query)
            query = gene
    if query is not None:
        queries.append(query)
    # execute queries, each involving a number of genes
    for query in queries:
        # Construct URL
        url = __ebiGOUrl__ + uri_string + query
        try:
            urlreq = urllib.request.Request(url)
            urlreq.add_header('Accept-encoding', 'gzip')
            response = urllib.request.urlopen(urlreq)
            raw = response.read()
            if response.info().get('Content-Encoding') == 'gzip':
                # The body is bytes, so decompress it directly; a text buffer cannot hold it.
                raw = gzip.decompress(raw)
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read()) from ex
        data = raw.decode('utf-8')
        for row in data.splitlines()[1:]: # we ignore first (header) row
            values = row.split('\t')
            if len(values) >= 7:
                key = values[1]
                if key in termsmap:
                    termsmap[key].add(values[6])
                else:
                    termsmap[key] = set([values[6]])
                    # the taxon is recorded the first time a gene is seen
                    taxonmap[key] = int(values[4])
    if completeAnnot:
        if len(genes) == 1:
            if len(termsmap) == 1:
                return (termsmap[genes[0]], taxonmap[genes[0]])
            else:
                return (set(), None)
        else:
            return (termsmap, taxonmap)
    else:
        if len(genes) == 1:
            if len(termsmap) == 1:
                return termsmap[genes[0]]
            else:
                return set()
        else:
            return termsmap
def getGenes(goterms, database='UniProtKB', taxo=None):
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).

    goterms: a list, set or tuple of GO term identifiers, or a single identifier
    database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
    taxo: use specific taxonomic identifier, e.g. 9606 (human)
    The result is given as a map (key=GO term, value=list of unique genes) OR
    in the case of a single GO term as a list of unique genes.
    Raises RuntimeError if the server answers with an HTTP error.
    """
    if isinstance(goterms, (list, set, tuple)):
        goterms = list(goterms) # a set cannot be indexed for the single-term result below
    else:
        goterms = [goterms]
    genes_by_term = dict()
    if taxo is None:
        uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
    else:
        uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
    for goterm in goterms:
        genes = set()
        # Construct URL
        url = __ebiGOUrl__ + uri_string + goterm.strip()
        try:
            # Decode so the rows can be split with a str separator.
            data = urllib.request.urlopen(url).read().decode('utf-8')
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read()) from ex
        for row in data.splitlines()[1:]: # we ignore first (header) row
            values = row.split('\t')
            if len(values) >= 7:
                genes.add(values[1])
        genes_by_term[goterm] = list(genes)
    if len(goterms) == 1:
        return genes_by_term[goterms[0]]
    else:
        return genes_by_term
class EBI(object):
    """ Small client for job-based EBI REST tool services.

    A job is submitted with run() (or the blocking convenience wrapper submit()),
    polled with status() and collected with result(). A lock file named
    '<service>.lock' in the current working directory holds the id of the job in
    flight, so that only one job per service is submitted at a time.
    """

    __email__ = 'anon@uq.edu.au' # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
    __checkInterval__ = 2 # how long to wait between checking job status

    def __init__(self, service=None):
        """ Initialise service session.
        service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
        """
        self.service = service
        self.lockFile = '%s.lock' % service

    @staticmethod
    def _fetch(url, data=None):
        """ Open url (POSTing data, given as bytes, when it is not None) and
        return the raw response body as bytes. The response is always closed. """
        with urllib.request.urlopen(url, data) as response:
            return response.read()

    @staticmethod
    def _asText(payload):
        """ Decode a response body to str; payloads that are not valid UTF-8
        (e.g. binary result types) are returned unchanged as bytes. """
        try:
            return payload.decode('utf-8')
        except UnicodeDecodeError:
            return payload

    def createLock(self):
        """ Create a lock file to prevent submission of more than 1 job
        at a time by a single user. The file holds the current job id. """
        with open(self.lockFile, 'w') as fh:
            fh.write(self.jobId)

    def removeLock(self):
        """ Remove the lock file. """
        os.remove(self.lockFile)

    def isLocked(self):
        """ Check if there is a lock on this service. If there is, check if
        the job is complete, and if so remove the lock. Return True if still
        locked and False if not. """
        if not os.path.exists(self.lockFile):
            return False
        with open(self.lockFile, 'r') as fh:
            jobId = fh.read()
        if self.status(jobId) == 'RUNNING':
            # remember the running job so that callers can report/poll it
            self.jobId = jobId
            return True
        self.removeLock()
        return False

    # BLAST and CLUSTALW services

    def run(self, params):
        """ Submit a job to the given service with the given parameters, given
        as a dictionary. Return the jobId (a str).
        For ncbiblast, params['database'] is a list of database names (a single
        name given as a str is also accepted); each one is sent as its own
        'database' field. The caller's dictionary is not modified.
        Raises RuntimeError if no service was specified or a job is still running. """
        if self.service is None:
            raise RuntimeError('No service specified')
        if self.isLocked():
            raise RuntimeError('You currently have a %s job running. You must '
                               'wait until it is complete before submitting another job. Go to '
                               '%s%s/status/%s to check the status of the job.'
                               % (self.service, self.__ebiServiceUrl__, self.service, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
            databaseList = params['database']
            if isinstance(databaseList, str):
                databaseList = [databaseList]
            fields = [(key, value) for (key, value) in params.items() if key != 'database']
            fields.extend(('database', db) for db in databaseList)
            encodedParams = urllib.parse.urlencode(fields)
        else:
            encodedParams = urllib.parse.urlencode(params)
        print(url)
        # urlopen requires POST data as bytes; the service answers with the job id
        self.jobId = self._fetch(url, encodedParams.encode('utf-8')).decode('utf-8').strip()
        self.createLock()
        return self.jobId

    def status(self, jobId=None):
        """ Check the status of the given job (or the current job if none is
        specified), and return the result as a str (e.g. 'RUNNING', 'FINISHED'). """
        if jobId is None:
            jobId = self.jobId
        url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
        return self._fetch(url).decode('utf-8').strip()

    def resultTypes(self):
        """ Get the available result types. Will only work on a finished job. """
        url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
        return self._asText(self._fetch(url))

    def result(self, resultType):
        """ Get the result of the current job of the specified type.
        Returns a str, or bytes if the payload is not UTF-8 text.
        Raises RuntimeError if resultType is 'error' or the result cannot be retrieved. """
        url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
        try:
            result = self._asText(self._fetch(url))
        except urllib.error.HTTPError:
            if resultType == 'error':
                raise RuntimeError('An unknown error occurred while processing the job (check your input)')
            # ask the service for its error report instead; that request always raises
            return self.result('error')
        if resultType == 'error':
            raise RuntimeError('An error occurred: %s' % result)
        return result

    def submit(self, params, resultTypes):
        """ Submit a new job to the service with the given parameters and wait
        for it to finish. Return the output in the specified format: a single
        result if one result type is requested, otherwise a list of results.
        Note that the contact email is added to params under the key 'email'.
        Raises RuntimeError if the job does not end with status 'FINISHED'. """
        params['email'] = self.__email__
        self.run(params)
        print('Submitted new', self.service, 'job, jobId:', self.jobId)
        print('Please be patient while the job is completed')
        status = self.status()
        while status == 'RUNNING':
            sleep(self.__checkInterval__)
            status = self.status()
        if status != 'FINISHED':
            raise RuntimeError('An error occurred and the job could not be completed')
        print('Job complete.')
        self.removeLock()
        if not isinstance(resultTypes, list):
            resultTypes = [resultTypes]
        results = [self.result(resultType) for resultType in resultTypes]
        if len(results) == 1:
            return results[0]
        return results
...@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
neg[word] = 1 neg[word] = 1
logratio = RCDict() # DNA dictionary for storing the log-ration between pos and neg logratio = RCDict() # DNA dictionary for storing the log-ration between pos and neg
for (word, cnt_pos) in pos.items(): for (word, cnt_pos) in list(pos.items()):
cnt_neg = 0.0001 cnt_neg = 0.0001
try: try:
cnt_neg = neg[word] cnt_neg = neg[word]
...@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pass pass
logratio[word] = math.log(float(cnt_pos) / float(cnt_neg)) logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
allpos = logratio.items() # extract all pairs of words:log-ratio allpos = list(logratio.items()) # extract all pairs of words:log-ratio
sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
print "Enriched words (sorted by ln pos/neg)" print("Enriched words (sorted by ln pos/neg)")
print "Word \tln pos/neg\tE-value" print("Word \tln pos/neg\tE-value")
for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
cnt_pos = int(pos[word]) cnt_pos = int(pos[word])
try: cnt_neg = int(neg[word]) try: cnt_neg = int(neg[word])
...@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100): ...@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False) pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
# Correct for multiple testing (very conservatively) # Correct for multiple testing (very conservatively)
eval = pval * len(allpos) eval = pval * len(allpos)
print "%s\t%6.3f \t%e" % (word, lgr, eval) print("%s\t%6.3f \t%e" % (word, lgr, eval))
def getReverse(distribs): def getReverse(distribs):
""" Construct a new list of probability distributions of DNA, by """ Construct a new list of probability distributions of DNA, by
...@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'): ...@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
except KeyError: except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif) usage(sys.argv[0], "Unknown motif %s" % motif)
return return
print "Motif %s:" % motif print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg) pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR') pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg) pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR') pwm2.display(format='JASPAR')
...@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'): ...@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
# plot the average score curve # plot the average score curve
# print >> sys.stderr, "" # print >> sys.stderr, ""
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0 x = list(range(-(seq_len/2), (seq_len/2))) # call center of sequence X=0
lbl = "%s" % (motif) lbl = "%s" % (motif)
plt.plot(x, avg_motif_score, label=lbl) plt.plot(x, avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl) #plt.plot(x, smoothed_avg_motif_score, label=lbl)
...@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
except KeyError: except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif) usage(sys.argv[0], "Unknown motif %s" % motif)
return return
print "Motif %s:" % motif print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg) pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR') pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg) pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR') pwm2.display(format='JASPAR')
...@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
# divide number of sequences with hit by total number of hits # divide number of sequences with hit by total number of hits
site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ] site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits) print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
# STATISTICS # STATISTICS
# Get the cumulative hit counts in concentric windows # Get the cumulative hit counts in concentric windows
...@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
for i in range(hw, seq_len-motif_width+1-hw): for i in range(hw, seq_len-motif_width+1-hw):
smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1) smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0 x = list(range(-(seq_len/2), (seq_len/2))) # call center of sequence X=0
lbl = "%s, t=%.2f" % (motif, threshold) lbl = "%s, t=%.2f" % (motif, threshold)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue)) #lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt.plot(x, smoothed_site_probability, label=lbl) plt.plot(x, smoothed_site_probability, label=lbl)
...@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices ...@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
def usage(name, errmsg=None):
    """ Print command-line help for this script to standard output.
    name: program name shown in the usage line (callers pass sys.argv[0])
    errmsg: optional error message; if given, it is printed first as 'Error: <errmsg>'
    """
    if errmsg is not None:
        print("Error: %s" % errmsg)
    print("""Usage: %s [options]
    -f <fasta-filename> (required)
    -d discover enriched words
    -w <word width, default 8>
    -p <peak width, default 100>
    -m <peak margin, default 100>
    -s <JASPAR-ID> scan for JASPAR motif
    -h print this help""" % name)
if __name__ == '__main__': if __name__ == '__main__':
try: try:
optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:') optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
except getopt.GetoptError, err: except getopt.GetoptError as err:
usage(sys.argv[0], str(err)) usage(sys.argv[0], str(err))
sys.exit(2) sys.exit(2)
FILENAME = None FILENAME = None
...@@ -301,7 +301,7 @@ if __name__ == '__main__': ...@@ -301,7 +301,7 @@ if __name__ == '__main__':
sys.exit(3) sys.exit(3)
seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN) seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
if DISCOVER_MODE: if DISCOVER_MODE:
print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN) print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN) countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE: elif SCAN_MODE:
scanMotifReport(seqs, MOTIF_ID) scanMotifReport(seqs, MOTIF_ID)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment