Commit ac6c5d6b authored by Mikael Boden

python3_5

parent 934c2bff
@@ -95,8 +95,8 @@ def betacf(a, b, x):
h *= delta
if (abs(delta-1.0) < EPS): break
if (m > MAXIT): print >> sys.stderr, ("a or b too big or MAXIT too small "
"in betacf")
if (m > MAXIT): print(("a or b too big or MAXIT too small "
"in betacf"), file=sys.stderr)
return h
@@ -118,5 +118,5 @@ def gammaln(x):
def die(string):
print >> sys.stderr, string
print(string, file=sys.stderr)
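This pair illustrates the central change of the commit: the Python 2 statement form print >> sys.stderr, ... becomes a call to the print() function with a file keyword argument. A minimal self-contained sketch of the Python 3 idiom:

import sys

def warn(msg):
    # Python 3: print is a function; file= redirects output to any stream
    print(msg, file=sys.stderr)

warn("a or b too big or MAXIT too small in betacf")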
@@ -105,7 +105,7 @@ class GeneExpression:
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
"""
if names == None:
return self.genes.keys()
return list(self.genes.keys())
elif isinstance(names, str):
return self.matrix[self.genes[names],:]
else:
@@ -148,7 +148,7 @@ class GeneExpression:
except:
index = samples
mygenes = {}
for (name, ndx) in self.genes.items():
for (name, ndx) in list(self.genes.items()):
mygenes[name] = self.matrix[ndx, index]
return mygenes
@@ -165,7 +165,7 @@ class GeneExpression:
sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
except:
sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
name_tuples = sorted(list(self.genes.items()), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
names = []
if descending:
for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order
@@ -199,7 +199,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding ratios. """
mygenes = {}
mdiv = self.matrix[:, index1] / self.matrix[:, index2]
for (name, ndx) in self.genes.items():
for (name, ndx) in list(self.genes.items()):
mygenes[name] = mdiv[ndx]
return mygenes
@@ -208,7 +208,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes = {}
mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
for (name, ndx) in self.genes.items():
for (name, ndx) in list(self.genes.items()):
mygenes[name] = mlr[ndx]
return mygenes
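getRatio and getLogRatio rely on numpy's element-wise arithmetic: one expression column is divided by another and, for log-ratios, log2 is applied to the quotient. A small illustration using only numpy:

import numpy as np
a = np.array([4.0, 2.0, 1.0])  # expression under condition 1
b = np.array([1.0, 2.0, 4.0])  # expression under condition 2
print(a / b)                   # [ 4.    1.    0.25]
print(np.log2(a / b))          # [ 2.  0. -2.], symmetric around 0 unlike raw ratios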
@@ -218,7 +218,7 @@ class GeneExpression:
index = self.genes[probeID]
profile = self.matrix[index, :]
mygenes = {}
for (name, ndx) in self.genes.items():
for (name, ndx) in list(self.genes.items()):
other = self.matrix[ndx, :]
mygenes[name] = pearson(profile, other)
return mygenes
@@ -252,7 +252,7 @@ class GeneExpression:
# Calculate Z-score for the given column for each gene
zscore = (self.matrix[:, index] - mu) / sd
mygenes = {}
for (name, ndx) in self.genes.items():
for (name, ndx) in list(self.genes.items()):
try:
mygenes[name] = zscore[ndx, :]
except IndexError:
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
genes[name] = values
if len(genes) == 0:
raise RuntimeError('No data in file')
print 'Data set %s contains %d entries' % (dataset, len(genes))
print('Data set %s contains %d genes' % (dataset, len(genes)))
if cnt_null > 0:
print 'Data set has %d null-values' % (cnt_null)
print('Data set has %d null-values' % (cnt_null))
return GeneExpression(dataset, headers[2:], genes)
@@ -357,40 +357,29 @@ def pearson(X, Y):
return 0
return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
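As a quick sanity check on pearson() above, numpy's corrcoef computes the same product-moment statistic; a minimal sketch:

import numpy as np
X = np.array([1.0, 2.0, 3.0, 4.0])
Y = np.array([2.0, 4.1, 5.9, 8.2])
r = np.corrcoef(X, Y)[0, 1]  # off-diagonal entry is the Pearson coefficient
print('r = %5.3f' % r)       # close to 1.0 for this near-linear pair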
# ------------------- Example ---------------------
# ------------------- Example (basically exercise 7 in prac 9) ---------------------
ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft')
if __name__=='__main__':
ratio = GeneExpression('GDS3716_ratio')
ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33, 0))
ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34, 1))
ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35, 2))
ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36, 3))
ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37, 4))
ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38, 5))
ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39, 6))
ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40, 7))
ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41, 8))
ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24, 9))
ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
print ge3716.getHeaders()
z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
cln2_profile = ge38.getGenes('CLN2')
pcorr = ge38.getPearson('CLN2')
gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
sorted = gp.sort('PC_CLN2', True)
print sorted[0], ge38.getGenes(sorted[0])
print sorted[1], ge38.getGenes(sorted[1])
g = readGEOFile('GDS3198.soft', id_column = 1)
meanfold = {}
for gene in g.genes:
profile = g.getGenes(gene)
meanfold[gene] = (np.log2(profile[0] / profile[3]) + np.log2(profile[1] / profile[4]) + np.log2(profile[2] / profile[5])) / 3
import matplotlib.pyplot as plt
scores = [y for y in list(meanfold.values()) if not np.isnan(y)]
hist, bins = np.histogram(scores, bins=50)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.show()
result = sorted(list(meanfold.items()), key=lambda v: v[1])
print('========== Wildtype may down-regulate ==========')
for r in result[0:100]:
print(r[0], r[1])
print('========== Wildtype may up-regulate ==========')
for r in result[-1:-100:-1]:
print(r[0], r[1])
@@ -138,7 +138,7 @@ class GibbsMotif():
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop
self.q = q
@@ -312,7 +312,7 @@ class GibbsAlign():
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
print("LL @ %5d=\t%5.2f" % (round, LL))
# end main for-loop
self.q = q
@@ -92,7 +92,7 @@ class GO():
say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
"""
print "Started at", time.asctime()
print(("Started at", time.asctime()))
# Get GO definitions
terms = readOBOFile(obofile)
for term in terms:
@@ -108,7 +108,7 @@ class GO():
cset.add((term, prel))
except KeyError:
pass
print "Read %d GO definitions" % len(terms)
print(("Read %d GO definitions" % len(terms)))
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_cnt = 0
@@ -126,7 +126,7 @@ class GO():
terms_map = {term: (evid, qual != 'NOT')}
self.annots[gene] = (taxa, terms_map)
src.close()
print "Read annotations for %d genes" % gene_cnt
print(("Read annotations for %d genes" % gene_cnt))
def _makeIntoList(self, id_or_ids):
if type(id_or_ids) != list and type(id_or_ids) != set and type(id_or_ids) != tuple:
@@ -283,7 +283,7 @@ class GO():
def getAllAnnots(self):
""" Retrieve all annotated gene products """
return self.annots.keys()
return list(self.annots.keys())
def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
""" Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
@@ -328,12 +328,12 @@ class GO():
cnt = fg_list.count(t)
if cnt >= threshold:
term_cnt[t] = cnt
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
print('Could not find definition of %s' % t[0])
else:
ret.append((t[0], t[1], defin[2], defin[0]))
return ret
@@ -360,7 +360,7 @@ class GO():
# Process background: find terms of genes
bg_list = []
if background == None: # need to use the full set
background = self.annots.keys()
background = list(self.annots.keys())
negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
nNeg = len(negatives)
bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
@@ -383,12 +383,12 @@ class GO():
evalue = pvalue * len(term_set) # Bonferroni correction
if evalue <= threshold: # check if significance req is fulfilled
term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
print('Could not find definition of %s' % t[0])
else:
ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
return ret
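The loop above fills a 2x2 contingency table per term (hits and non-hits among positives versus negatives), obtains a p-value from Fisher's exact test (the module uses its own stats helper, e.g. stats.getFETpval as in the BinGO variant below) and applies a Bonferroni correction. A sketch of the same computation; using scipy here is an assumption, not a dependency of this module:

from scipy.stats import fisher_exact  # assumption: scipy is installed
fg_hit, bg_hit = 12, 40               # genes annotated with the term, foreground vs background
fg_nohit, bg_nohit = 88, 860          # genes without the term
_, pvalue = fisher_exact([[fg_hit, bg_hit], [fg_nohit, bg_nohit]])
evalue = pvalue * 500                 # Bonferroni: multiply by the number of terms tested
print('p = %.3g, corrected e = %.3g' % (pvalue, evalue))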
@@ -448,17 +448,17 @@ class BinGO():
found = set()
try:
(_, closure, _) = self.termdefs[term]
for (t, r) in closure.items():
for (t, r) in list(closure.items()):
if (not rel) or r == rel:
found.add(t)
found.update(self._getSuperTerms(t, rel))
except KeyError:
print 'Could not find GO:%s' % (''.join(decode(term, self.term_code)))
print('Could not find GO:%s' % (''.join(decode(term, self.term_code))))
return found
def _getChildTerms(self, term, rel = None):
found = set()
for (child, termdef) in self.termdefs.items():
for (child, termdef) in list(self.termdefs.items()):
(_, parents_dict, _) = termdef
try:
myrel = parents_dict[term]
@@ -491,7 +491,7 @@ class BinGO():
mymap[gene_name] = set()
try:
(taxa, terms) = self._getGeneEntry(i)
for (term, evid_and_qual) in terms.items():
for (term, evid_and_qual) in list(terms.items()):
if evid_and_qual[1] and not evid: # if True and no evidence is specified
direct.add(term)
mymap[gene_name].add(term)
@@ -505,7 +505,7 @@ class BinGO():
# STEP 2: Find the transitive closure of each term identified, store as a dictionary
indirect = {}
for t in direct:
if not indirect.has_key(t):
if t not in indirect:
indirect[t] = set(self._getSuperTerms(t))
# STEP 3: compile and return results
for gene in mymap:
@@ -521,7 +521,7 @@ class BinGO():
def getAllGenes(self):
names = []
for g in self._decodeGeneIDs(self.annot_index.keys()):
for g in self._decodeGeneIDs(list(self.annot_index.keys())):
names.append(''.join(g))
return names
@@ -545,7 +545,7 @@ class BinGO():
gene_name = decode(g, self.gene_code)
(mytaxa, tdict) = self._getGeneEntry(g)
if not taxa or taxa == mytaxa:
for annot_term in tdict.keys():
for annot_term in list(tdict.keys()):
if tdict[annot_term] == evid:
if annot_term in terms:
try:
@@ -642,10 +642,10 @@ class BinGO():
(annot_offset, obo_offset) = unpack('II', buf)
break
except error as inst:
print "Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()
print(("Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()))
exit(3)
print "Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)
print "Annotations start at", annot_offset, "\nDefinitions start at", obo_offset
print(("Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)))
print(("Annotations start at", annot_offset, "\nDefinitions start at", obo_offset))
return f
#FIXME: write code to perform test of taxa enrichment
@@ -655,7 +655,7 @@ class BinGO():
Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
indicating the enrichment of each term among the "top" genes, ranked in descending order by score (by default).
"""
fg_map = self.getTerms(gene_score_map.keys(), include_more_general = include_more_general)
fg_map = self.getTerms(list(gene_score_map.keys()), include_more_general = include_more_general)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
@@ -663,7 +663,7 @@ class BinGO():
term_set = set(fg_list)
term_pval = {}
if len(negatives_score_map) > 0:
bg_map = self.getTerms(negatives_score_map.keys(), include_more_general = include_more_general)
bg_map = self.getTerms(list(negatives_score_map.keys()), include_more_general = include_more_general)
for t in term_set:
pos = []
neg = []
@@ -698,13 +698,13 @@ class BinGO():
else:
term_pval[t] = (p, 1.0)
sorted_pval = sorted(term_pval.items(), key=lambda v: v[1][0], reverse=False)
sorted_pval = sorted(list(term_pval.items()), key=lambda v: v[1][0], reverse=False)
ret = []
for t in sorted_pval:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
print('Could not find definition of %s' % t[0])
else:
ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
return ret
@@ -739,7 +739,7 @@ class BinGO():
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
@@ -747,13 +747,13 @@ class BinGO():
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
print('Could not find definition of %s' % t[0])
else:
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
@@ -773,7 +773,7 @@ def encode(code_me, encode_strings):
accum *= codelen
break
except IndexError as e:
print e, code_me
print(e, code_me)
return code
def decode(code, encode_strings):
@@ -787,7 +787,7 @@ def decode(code, encode_strings):
code -= accum[pos] * indices[pos]
string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
except IndexError as e:
print e, code
print(e, code)
return string
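encode and decode implement a mixed-radix packing: each string position has its own alphabet (the encode_strings), and the per-position indices are folded into one integer. A self-contained sketch of the same idea, not the module's exact arithmetic:

codes = ['AB', 'XYZ']  # one alphabet per position
def pack_str(s):
    value, base = 0, 1
    for ch, alpha in zip(s, codes):
        value += alpha.index(ch) * base  # index within this position's alphabet
        base *= len(alpha)               # radix grows by the alphabet size
    return value
def unpack_str(value):
    out = []
    for alpha in codes:
        out.append(alpha[value % len(alpha)])
        value //= len(alpha)
    return ''.join(out)
print(pack_str('BY'), unpack_str(pack_str('BY')))  # 3 BY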
def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
......@@ -837,10 +837,10 @@ def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
term = None
raise "No GO term on line: " + line
evid = fields[columns[4]]
if not evid_codes.has_key(evid):
if evid not in evid_codes:
evid = None
onto = fields[columns[5]]
if not onto_codes.has_key(onto):
if onto not in onto_codes:
onto = None
taxa_idx = line.find('taxon:')
if taxa_idx == -1:
@@ -902,7 +902,7 @@ def readOBOFile(obofile):
return terms
def writeBitFile(annotFile, obofile, destFile, taxas = None):
print "Started at", time.asctime()
print(("Started at", time.asctime()))
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_index = [{} for _ in range(6)] # count different characters in different positions
@@ -942,27 +942,27 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
pos += 1
prev_gene = gene
src.close()
print "Read annotations for %d genes" % gene_cnt
print(("Read annotations for %d genes" % gene_cnt))
gene_code = ['' for _ in range(6)]
term_code = ['' for _ in range(7)]
for d in range(len(gene_index)):
arr = ['?' for _ in gene_index[d]]
for (ch, index) in gene_index[d].items():
for (ch, index) in list(gene_index[d].items()):
arr[index] = ch
gene_code[d] = ''.join(arr)
for d in range(len(term_index)):
arr = ['?' for _ in term_index[d]]
for (ch, index) in term_index[d].iteritems():
for (ch, index) in term_index[d].items():
arr[index] = ch
term_code[d] = ''.join(arr)
evid_code = ['' for _ in range(len(evid_index))]
for (e, ndx) in evid_index.items():
for (e, ndx) in list(evid_index.items()):
evid_code[ndx] = e
# Get GO definitions
terms = readOBOFile(obofile)
print "Read %d GO definitions" % len(terms)
print(("Read %d GO definitions" % len(terms)))
# re-open, now with the aim of copying info
src = open(annotFile, 'r')
@@ -975,7 +975,7 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
dst.write(code_str+"\n")
for e_str in evid_code:
dst.write(e_str+'\n')
print "Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())
print(("Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())))
# STEP 2: write annotations
annot_offset = dst.tell()
@@ -1012,11 +1012,11 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
(o, q, e) = concat_terms[t]
s = pack('?BI', q, evid_index[e], encode(t, term_code))
dst.write(s)
print "Wrote GO annotations, now at @%d" % dst.tell()
print(("Wrote GO annotations, now at @%d" % dst.tell()))
# Next, the ontology definition...
obo_offset = dst.tell() # remember the position where the OBO starts
sorted_terms = sorted(terms.iteritems(), key=operator.itemgetter(0))
sorted_terms = sorted(iter(terms.items()), key=operator.itemgetter(0))
for [t, _] in sorted_terms:
(term_name, term_onto, term_is) = terms[t]
s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
@@ -1029,12 +1029,10 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
s = pack('BI', index, encode(sup_term[3:], term_code))
dst.write(s)
dst.write(term_name + '\n')
print "Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())
print(("Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())))
# Finally, write the offsets to quickly access annotations and definitions, resp
dst.write(pack('II', annot_offset, obo_offset))
# done, close
dst.close()
print "Completed at", time.asctime()
print(("Completed at", time.asctime()))
###################################################
# This module is a supplement to the Python guide #
# Version 2.2016.1 (8/3/2016) #
# Version 1.20160 (19/12/2016) #
###################################################
'''
This module contains code for that can help solving bioinformatics problems.
See the accompanying Python guide for more explanations and examples.
'''
This module contains code that can help solve bioinformatics problems.
See the accompanying Python guide document for more explanations and examples.
Alphabet is a class that defines valid symbols that we then use to make up valid
biological sequences. Note that we also define variables corresponding to
Alphabet is a class that defines valid symbols that we then use to make up valid
biological sequences. Note that we also define variables corresponding to
DNA, RNA and Protein sequences that can be used directly.
Sequence defines basic parts and operations on biological sequences.
Sequence is a class that defines basic parts and operations on biological sequences.
Alignment defines an alignment of sequences (how symbols in different sequences line
up when placed on-top). Alignment methods should generate instances of this class.
Alignment is a class that defines an alignment of sequences (how symbols in different sequences line
up when placed on-top of one another). Alignment methods should generate instances of this class.
SubstMatrix defines a substitution matrix, i.e. a scoring system for performing
SubstMatrix is a class that defines a substitution matrix, i.e. a scoring system for performing
alignments. You can read these from files or construct them manually.
GeneProfile defines parts and operations for gene expression profiles. Essentially,
GeneProfile is a class that defines parts and operations for gene expression profiles. Essentially,
the class will help to index expression data by gene name (rows) and by sample name (columns).
There are several methods not tied to a particular class because they construct new instances,
There are several methods not tied to a particular class because they construct new instances,
e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc.
You need to have numpy installed (see http://www.numpy.org/).
Should work with Python v2.6-2.7 (see http://www.python.org/).
Has not been written to work with Python v3 and later--but this should be easy to do.
You need to have numpy installed (see http://www.numpy.org/).
Should work with Python v2.6-3.5 (see http://www.python.org/).
The code may contain bugs--please report to m.boden@uq.edu.au
'''
import math, numpy, urllib, urllib2
import math, numpy, urllib.request, urllib.parse, urllib.error
###############################################################################
# Alphabet #
###############################################################################
class Alphabet():
""" A minimal class for alphabets """
""" A minimal class for alphabets
Alphabets include DNA, RNA and Protein """
def __init__(self, symbolString):
self.symbols = symbolString
def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ')" results in 3
return len(self.symbols)
def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
return sym in self.symbols
def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ'))" results in 3
return len(self.symbols) # will tell you the length of the symbols in an Alphabet instance
def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
return sym in self.symbols # will tell you if 'A' is in the symbols in an Alphabet instance
def __iter__(self): # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print(sym)" prints A, C, G and T on separate lines
tsyms = tuple(self.symbols)
return tsyms.__iter__()
def __getitem__(self, ndx):
""" Retrieve the symbol(s) at the specified index (or slice of indices) """
return self.symbols[ndx]
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
return self.symbols.index(sym)
def __str__(self):
return self.symbols
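A short demonstration of the operators Alphabet implements, assuming the module has been imported:

dna = Alphabet('ACGT')
print(len(dna))         # 4, via __len__
print('A' in dna)       # True, via __contains__
print(dna[1:3])         # CG, via __getitem__ with a slice
print(dna.index('G'))   # 2
for sym in dna:         # iteration via __iter__
    print(sym)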
""" Below we declare alphabet variables that are going to be available when
this module is imported """
""" Below we declare alphabet variables that are going to be available when
this module (this .py file) is imported """
DNA_Alphabet = Alphabet('ACGT')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
###############################################################################
# Sequence #
###############################################################################
class Sequence():
""" A biological sequence class. Stores the sequence itself,
the alphabet and a name.
""" A biological sequence class. Stores the sequence itself,
the alphabet and a name.
Usage:
Create an instance of Sequence - have to pass in sequence, alphabet and name - gappy is an optional argument
>>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC')
>>> print seq1
>>> print(seq1)
ABC: ACGGGAGAGG
>>> 'C' in seq1
True
>>> for sym in seq1:
... print sym
... print(sym)
"""
def __init__(self, sequence, alphabet, name = '', gappy = False, annot = ''):
def __init__(self, sequence, alphabet, name = '', gappy = False):
""" Construct a sequence from a string, an alphabet (gappy or not) and a name.
The parameter gappy is for sequences when used in alignments, which means that '-' is allowed. """
The parameter gappy is for sequences when used in alignments. """
for sym in sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: ' + sym)
self.sequence = sequence
self.alphabet = alphabet
self.name = name
self.sequence = sequence # Store sequence
self.alphabet = alphabet # Store alphabet
self.name = name # Store name
self.gappy = gappy
self.annot = annot # some annotation, e.g. species
def __len__(self): # the "len" operator
return len(self.sequence)
def __iter__(self): # method that allows us to iterate over a sequence
@@ -104,7 +87,7 @@ class Sequence():
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.annot + '\n'
fasta = '>' + self.name + '\n'
data = self.sequence
nlines = (len(self.sequence) - 1) // 60 + 1
for i in range(nlines):
@@ -126,244 +109,96 @@ class Sequence():
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.find(findme)
# Challenge - who can tell me what find() does in under 30 seconds?
# Demo this and teach them how to read pydocs.
# Can we use it if we want to find multiple occurrences of a sequence?
# What can we use instead?
def findAll(self, findme):
""" Find all occurrences of the specified symbol or sub-sequence.
Uses a sliding window approach - very common when searching for motifs
or contamination.
Then, calculate the percentage of sequence made up of 'findme'.
If the percentage is above 20%, replace all occurrences of 'findme' with Ns.
"""
word = len(findme)
store = []
count = 0
for s in range(0, len(self.sequence)):
cur = self.sequence[s:s+word]
if cur == findme:
store.append(s)
count += 1
print(cur, "Match at position ", s)
else:
print(cur)
print("---------------")
percentage = float(count*word) / len(self.sequence) * 100
print("Percentage of findme in sequence: ", percentage)
newSeq = self.sequence # default: unchanged if below the threshold
if percentage > 20:
print("Removing contamination")
newSeq = self.sequence.replace(findme, "N")
return store, percentage, newSeq
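A hypothetical run of findAll, assuming the module is imported; here 80% of the sequence is made up of the motif, so the contamination branch triggers:

seq = Sequence('ACGTTACGTT', DNA_Alphabet, 'demo')
hits, pct, cleaned = seq.findAll('ACGT')
print(hits, pct, cleaned)   # [0, 5] 80.0 NTNT, after the per-window trace above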
###############################################################################
# Alignment #
###############################################################################
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
>>> print(Alignment(seqs))
THIS-LI-NE-
--ISALIGNED """
def __init__(self, seqs):
self.alphabet = None
self.alignlen = -1
self.seqs = seqs
self.namelen = 0
for s in seqs:
if self.alphabet == None:
self.alphabet = s.alphabet
elif self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: contains a mix of alphabets")
if self.alignlen == -1:
self.alignlen = len(s)
elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: lengths vary")
self.namelen = max(len(s.name), self.namelen)
def __str__(self):
string = ''
raise RuntimeError("Alignment invalid")
def __str__(self):
string = u''
namelen = 0
for seq in self.seqs:
namelen = max(len(seq.name), namelen)
for seq in self.seqs:
string += seq.name.ljust(self.namelen+1)
string += seq.name.ljust(namelen+1)
for sym in seq:
string += sym
string += '\n'
return string
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment: the number of sequences. """
return len(self.seqs)
def __getitem__(self, ndx):
return self.seqs[ndx]
def calcDistances(self, measure, a=1.0):
""" Calculate the evolutionary distance between all pairs of sequences
in this alignment, using the given measure. Measure can be one of
'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
given, then the parameter a must also be specified (or else it will use
the default value of 1.0).
Definitions of each distance metric are found in Zvelebil and Baum p268-276.
These are mostly intended for DNA, but adapted for protein (as below).
Note however that there are alternative distance matrices for proteins (p276).
"""
measure = measure.lower()
if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
a = float(a)
distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
# Loop through each pair of sequences
for i in range(len(self.seqs)):
for j in range(i + 1, len(self.seqs)):
seqA = self.seqs[i]
seqB = self.seqs[j]
# Calculate the fractional distance (p) first
# The two sequences of interest are in seqA and seqB
L = 0
D = 0
for k in range(self.alignlen):
# For every non-gapped column, put to L
# For every non-gapped column where the sequences are
# different, put to D
if seqA[k] != '-' and seqB[k] != '-':
L += 1
if seqA[k] != seqB[k]:
D += 1
p = float(D)/L
# Now calculate the specified measure based on p
if measure == 'fractional':
dist = p
else:
raise RuntimeError('Not implemented: %s' % measure)
distmat[i, j] = distmat[j, i] = dist
return distmat
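Only the 'fractional' measure is implemented in full: p = D/L, the differing ungapped columns over all ungapped columns. A worked example, assuming the module is imported:

aln = Alignment([Sequence('ACGT-A', DNA_Alphabet, 's1', gappy = True),
                 Sequence('ACCT-T', DNA_Alphabet, 's2', gappy = True)])
D = aln.calcDistances('fractional')
print(D)   # 2 mismatches over 5 ungapped columns: 0.4 off the diagonal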
def writeClustal(self):
""" Write the alignment to a string using the Clustal file format. """
symbolsPerLine = 60
maxNameLength = self.namelen + 1
mystring = ''
wholeRows = self.alignlen // symbolsPerLine
for i in range(wholeRows):
for j in range(len(self.seqs)):
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
mystring += '\n'
# Possible last row
lastRowLength = self.alignlen - wholeRows*symbolsPerLine
if lastRowLength > 0:
for j in range(len(self.seqs)):
if maxNameLength > 0:
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][-lastRowLength:] + '\n'
return mystring
def writeHTML(self, filename):
""" Generate HTML that displays the alignment in colour.
"""
fh = open(filename, 'w')
fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += str(i//10+1)[-1]
else:
html += ' '
html += '%s\n' % (self.alignlen)
fh.write(html)
if self.alignlen > 10:
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0'
else:
html += ' '
html += '\n'
fh.write(html)
if len(self.alphabet) <= 5: # DNA or RNA
colours = {'A':'green','C':'orange','G':'red','T':'#66bbff','U':'#66bbff'}
else: # amino acids
colours = {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'}
for seq in self.seqs:
html = seq.name.ljust(self.namelen) + ' '
for sym in seq:
try:
colour = colours[sym]
except KeyError:
colour = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (colour, sym)
html += '\n'
fh.write(html)
fh.write('</pre></body></html>\n')
fh.close()
def scoreAlignment(self, substmat = None, gap = -1):
"""Score the alignment using a substitution matrix (substmat).
If the alignment consists of more than two sequences, the minimum
score of each column is used.
If substmat is not specified (None), the count of matches is returned.
"""
nseqs = len(self.seqs)
total = 0
for pos in range(self.alignlen):
min = None
for i in range(nseqs):
for j in range(i+1, nseqs):
gap_here = self.seqs[i][pos] == '-' or self.seqs[j][pos] == '-'
score = 0
if substmat == None:
if self.seqs[i][pos] == self.seqs[j][pos]:
score = 1
else: # we have a substitution matrix
if gap_here:
score = gap
else:
score = substmat.get(self.seqs[i][pos], self.seqs[j][pos])
if min == None:
min = score
elif min > score:
min = score
total += min
return total
###############################################################################
# Methods to create instances of Alignment #
###############################################################################
def align(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Needleman-Wunsch
(global) algorithm. substMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
stringA, stringB = seqA.sequence, seqB.sequence
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S) and a matrix for traceback
S = numpy.zeros((lenA + 1, lenB + 1))
Traceback = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = i * gap
for j in range(lenB + 1):
S[0, j] = j * gap
# Calculate the optimum score at each location in the matrix, noting which option was chosen for traceback
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(stringA[i-1], stringB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
Traceback[i, j] = numpy.argmax([match, delete, insert])
S[i, j] = max([match, delete, insert])
# Trace back the optimal alignment
alignA = ''
alignB = ''
# Start at the end
i = lenA
j = lenB
# Stop when we hit the end of a sequence
while i > 0 and j > 0:
if Traceback[i, j] == 1:
# Got here by a gap in sequence B (go up)
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif Traceback[i, j] == 2:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = stringB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = stringA[i-1] + alignA
alignB = stringB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., trace back all the way to S[0, 0])
while i > 0:
# Go up
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = stringB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
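A usage sketch for the global aligner, assuming the module is imported; the substitution matrix is a simple match/mismatch scheme built with SubstMatrix (defined further below):

sm = SubstMatrix(DNA_Alphabet)
for a in DNA_Alphabet:
    for b in DNA_Alphabet:
        sm.set(a, b, +1 if a == b else -1)
result = align(Sequence('ACGTT', DNA_Alphabet, 'x'),
               Sequence('ACTT', DNA_Alphabet, 'y'), sm, gap = -2)
print(result)   # the gapped alignment of x and y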
def scoreAlignment(aln, substmat = None, gap = -1):
"""Score an alignment (aln) using a substitution matrix (substmat).
If the alignment consists of more than two sequences, the minimum
score of each column is used.
If substmat is not specified (None), the count of matches is returned.
"""
nseqs = len(aln.seqs)
total = 0
for pos in range(aln.alignlen):
min = None
for i in range(nseqs):
for j in range(i+1, nseqs):
gap_here = aln.seqs[i][pos] == '-' or aln.seqs[j][pos] == '-'
score = 0
if substmat == None:
if aln.seqs[i][pos] == aln.seqs[j][pos]:
score = 1
else: # we have a substitution matrix
if gap_here:
score = gap
else:
score = substmat.get(aln.seqs[i][pos], aln.seqs[j][pos])
if min == None:
min = score
elif min > score:
min = score
total += min
return total
###############################################################################
# SubstMatrix #
###############################################################################
class SubstMatrix():
""" Create a substitution matrix for an alphabet.
Example usage:
@@ -375,11 +210,11 @@ class SubstMatrix():
... elif a == b:
... sm.set(a, b, +1)
...
>>> print sm
A 1
C -1 1
G -1 -1 1
T -1 -1 -1 1
>>> print(sm)
A 1
C -1 1
G -1 -1 1
T -1 -1 -1 1
A C G T
>>> sm.get('C', 'T')
-1
@@ -401,7 +236,7 @@ class SubstMatrix():
def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols)
string = ''
string = u''
for a in symbols:
string += a + ' '
for b in symbols[:len(symbols)-i+1]:
@@ -417,17 +252,12 @@ class SubstMatrix():
def writeFile(self, filename):
""" Write this substitution matrix to the given file. """
fh = open(filename, 'w')
file = ''
file = u''
for key in self.scoremat:
file += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
fh.write(file)
fh.close()
###############################################################################
# Below are some useful methods for loading data from strings and files. #
# They recognize the FASTA and Clustal formats (nothing fancy). #
###############################################################################
def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet)
@@ -443,26 +273,25 @@ def readSubstMatrix(filename, alphabet):
mat.set(symbols[0], symbols[1], score)
return mat
def readFastaString(string, alphabet, gappy = False):
"""
Below are some useful methods for loading data from strings and files.
Recognize the FASTA and Clustal formats (nothing fancy).
"""
def readFastaString(string, alphabet):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it. """
seqlist = [] # list of sequences contained in the string
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
seqlist = [] # list of sequences contained in the string
seqname = None # name of *current* sequence
seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines
continue
if line[0] == '>': # start of new sequence
if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current
current = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
current = Sequence(''.join(seqdata), alphabet, seqname)
seqlist.append(current)
# now collect data about the new sequence
parts = line[1:].split() # skip first char
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
if len(parts) > 0: seqname = parts[0]
if len(parts) > 1: seqannot = line[len(seqname) + 2:] # the rest of the line
seqname = line[1:].split()[0] # skip first char
seqdata = []
else: # we assume this is (more) data for current
cleanline = line.split()
@@ -470,26 +299,26 @@ def readFastaString(string, alphabet, gappy = False):
seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains
if seqname:
lastseq = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
lastseq = Sequence(''.join(seqdata), alphabet, seqname)
seqlist.append(lastseq)
return seqlist
def readFastaFile(filename, alphabet, gappy = False):
""" Read the given FASTA formatted file and return the list of sequences
def readFastaFile(filename, alphabet):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. """
fh = open(filename)
data = fh.read()
fh.close()
seqlist = readFastaString(data, alphabet, gappy)
seqlist = readFastaString(data, alphabet)
return seqlist
def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """
fh = open(filename, 'w')
for seq in seqs:
fh.write(seq.writeFasta()) # note: str(seq) gives "name: sequence", not FASTA
fh.close()
def readClustalString(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an
Alignment object. """
@@ -504,12 +333,12 @@ def readClustalString(string, alphabet):
continue
sections = line.split()
name, seq = sections[0:2]
if seqs.has_key(name):
if name in seqs:
seqs[name] += seq
else:
seqs[name] = seq
sequences = []
for name, seq in seqs.items():
for name, seq in list(seqs.items()):
sequences.append(Sequence(seq, alphabet, name, gappy = True))
return Alignment(sequences)
@@ -522,17 +351,6 @@ def readClustalFile(filename, alphabet):
aln = readClustalString(data, alphabet)
return aln
def writeClustalFile(filename, aln):
""" Write the specified alignment to a Clustal file. """
fh = open(filename, 'w')
fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
fh.write(aln.writeClustal())
fh.close()
###############################################################################
# GeneProfile #
###############################################################################
class GeneProfile():
""" A class for gene expression data.
Example usage:
@@ -557,7 +375,7 @@ class GeneProfile():
def getSorted(self, index, descending=True):
"""Get a list of (gene, value) tuples in descending order by value"""
key_fn = lambda v: v[1][index]
return sorted(self.genes.items(), key=key_fn, reverse=descending)
return sorted(list(self.genes.items()), key=key_fn, reverse=descending)
def addSample(self, sample_name, sample_dict):
"""Add a sample to the current data set.
sample_dict is a dictionary with the same keys as the current gene set.
@@ -579,7 +397,7 @@ class GeneProfile():
if isinstance(sample_name, str): # a single sample-name
mysamples = [sample_name]
else: # a list of sample-names
mysamples = sample_name
mysamples = sample_name
for gene in self.genes:
mygenes[gene] = []
for name in mysamples:
@@ -629,7 +447,7 @@ def readGeoFile(filename, id_column = 0):
manylines = fh.read()
fh.close()
data_rows = False # Indicates whether we're reading the data section or metadata
name = 'Unknown'
name = u'Unknown'
cnt_data = 0
for line in manylines.splitlines():
if line.startswith('^DATASET'):
@@ -662,28 +480,28 @@ def readGeoFile(filename, id_column = 0):
continue
if not ignore:
dataset[id] = tuple(values)
print 'Data set %s contains %d genes' % (name, len(dataset.genes))
print('Data set %s contains %d genes' % (name, len(dataset.genes)))
return dataset
###############################################################################
# Web service methods that find data in online databases.
# Our implementations are mainly serviced by EBI.
###############################################################################
"""
Web service methods that find data in online databases.
Our implementations are mainly serviced by EBI.
"""
def getSequence(entryId, dbName, alphabet):
""" Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' (Uniprot Accession) or 'SUMO1_MOUSE' (Uniprot Identifier)
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'.
See: http://www.uniprot.org/faq/28. """
url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' +\
dbName + '&format=fasta&id=' + entryId
if not isinstance(entryId, str):
entryId = entryId.decode("utf-8")
url ='http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' + dbName + '&format=fasta&id=' + entryId
try:
data = urllib2.urlopen(url).read()
return readFastaString(data, alphabet)[0]
except urllib2.HTTPError, ex:
data = urllib.request.urlopen(url).read()
return readFastaString(data.decode("utf-8"), alphabet)[0]
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
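A usage sketch, requiring network access to EBI's dbfetch service:

seq = getSequence('P63166', 'uniprotkb', Protein_Alphabet)  # SUMO1_MOUSE, per the docstring
print(seq.name, len(seq))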
def searchSequences(query, dbName = 'uniprot'):
def searchSequences(query, dbName):
"""
Retrieve multiple entries matching a query from a database; currently only implemented for UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
@@ -696,42 +514,40 @@ def searchSequences(query, dbName = 'uniprot'):
url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
data = urllib.request.urlopen(url).read()
return data.splitlines()
except urllib2.HTTPError, ex:
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
base = u'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
data = urllib.request.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
return words
except urllib2.HTTPError, ex:
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
return
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
"""
Map identifiers between databases (based on UniProtKB;
Map identifiers between databases (based on UniProtKB;
see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the abbreviation for the identifier FROM which to idmap
to: the abbreviation for the identifier TO which to idmap
Returns a dictionary with key (from) -> value (to).
ACC is Uniprot Accession (e.g. 'P42813').
"""
url = 'http://www.uniprot.org/mapping/'
Returns a dictionary with key (from) -> value (to) """
url = u'http://www.uniprot.org/mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
query = u''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
@@ -742,8 +558,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
request = urllib.request.Request(url, urllib.parse.urlencode(params))
response = urllib.request.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
@@ -752,43 +568,42 @@
else:
return dict()
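A usage sketch for idmap, requiring network access; the mapped value depends on the live UniProt service:

mapping = idmap(['P42813'], frm = 'ACC', to = 'P_REFSEQ_AC')
print(mapping)   # {'P42813': <the matching RefSeq accession>}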
###############################################################################
# Gene Ontology services.
# See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
###############################################################################
"""
Gene Ontology services.
See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
"""
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = 'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
url = u'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
data = urllib.request.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces
if field in entry.keys(): # check if we need field
if field in list(entry.keys()): # check if we need field
if entry[field] == None: # check if assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
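A usage sketch, requiring network access to the QuickGO service:

entry = getGODef('GO:0002080')     # the docstring's example term
print(entry['id'], entry['name'])  # fields parsed from the OBO-format response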
def getGOTerms(genes, db='UniProtKB'):
"""
Retrieve all GO terms for a given set of genes (or single gene).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'.
The result is given as a map (key=gene name, value=list of unique
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
if type(genes) != list and type(genes) != set and type(genes) != tuple:
if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes] # if 'genes' is a single gene, we make a single item list
map = dict()
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein='
......@@ -796,13 +611,13 @@ def getGOTerms(genes, db='UniProtKB'):
terms = set() # empty result set
url = uri + gene.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
data = urllib.request.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore header row
values = row.split('\t')
if len(values) >= 7:
terms.add(values[6]) # add term to result set
map[gene] = list(terms) # make a list of the set
except urllib2.HTTPError, ex:
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if len(genes) == 1:
return map[genes[0]]
@@ -811,12 +626,12 @@ def getGenes(goterms, db='UniProtKB', taxo=None):
def getGenes(goterms, db='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms
Retrieve all genes/proteins for a given set of GO terms
(or single GO term).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=gene name, value=list of unique
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
@@ -831,502 +646,15 @@ def getGenes(goterms, db='UniProtKB', taxo=None):
genes = set() # start with empty result set
url = uri + goterm.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
data = urllib.request.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1]) # add gene name to result set
map[goterm] = list(genes)
except urllib2.HTTPError, ex:
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
else:
return map
###############################################################################
# PhyloTree #
###############################################################################
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def __init__(self, root):
""" Create a tree from a node that is "root" in the tree."""
self.root = root
def putAlignment(self, aln):
""" Associate the tree with a set of sequences/alignment.
Involves assigning the sequence to the leaf nodes. """
self.aln = aln
self.root._assignAlignment(aln)
def __str__(self):
""" Produce a printable representation of the tree, specifically the root of the tree. """
return str(self.root)
def strSequences(self, start = None, end = None):
""" Produce a sequence representation of the tree, specifically the root of the tree.
Specify the start and end positions in the alignment for the sequence to be printed
(if None the min and max positions will be used). """
if self.aln != None:
my_start = start or 0
my_end = end or self.aln.alignlen
return self.root._printSequences(my_start, my_end)
def findLabel(self, label):
""" Retrieve/return the node with the specified label.
Returns None if not found."""
return self.root._findLabel(label)
def getDescendantsOf(self, node, transitive = False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
transitive indicates if only the direct descendants (False) or if all descendants
should be returned.
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
node = self.root.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
def getAncestorsOf(self, node, transitive = False):
""" Retrieve and return the ancestor (transitive=False) or
ancestors (transitive=True) of a specified node.
Node can be the label or the instance.
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
node = self.root.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
myroot = myroot.right
if found and transitive:
return branching
elif found and len(branching) > 0:
return branching[len(branching)-1]
return None
def parsimony(self):
""" Solve the "small parsimony problem",
i.e. find the sequences on each of the internal nodes.
See Jones and Pevzner, p. 368 and onwards, for details. """
self.root._forwardParsimony(self.aln) # setup and compute scores for all nodes
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
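A minimal end-to-end sketch of the small-parsimony workflow, assuming the module is imported; the tree is assembled by hand since no Newick parser appears in this excerpt:

root = PhyloNode('root')
root.left, root.right = PhyloNode('s1'), PhyloNode('s2')
tree = PhyloTree(root)
aln = Alignment([Sequence('AC', DNA_Alphabet, 's1', gappy = True),
                 Sequence('AG', DNA_Alphabet, 's2', gappy = True)])
tree.putAlignment(aln)               # assigns sequences to leaves by matching labels
print(tree.parsimony())              # ancestral sequence at the root; ties broken by alphabet order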
###############################################################################
# PhyloNode #
###############################################################################
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, label = ''):
""" Initialise an initially unlinked node.
Populate fields left and right to link it with other nodes.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, backleft and backright for maximum parsimony. """
self.left = None
self.right = None
self.data = None
self.label = label
self.dist = None
self.sequence = None # The sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
self.seqscores = None # The scores propagated from leaves via children
self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = str(self.left)
if self.right:
right = str(self.right)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.label != None:
label = str(self.label)
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = self.left._printSequences(start, end)
if self.right:
right = self.right._printSequences(start, end)
if self.dist:
dist = ':' + str(self.dist)
if self.sequence != None:
label = "".join(self.sequence[start:end]) + ""
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _findLabel(self, label):
""" Find a node by label at this node or in any descendants (recursively). """
if self.label == label:
return self
else:
if self.left:
foundLeft = self.left._findLabel(label)
if foundLeft:
return foundLeft
if self.right:
return self.right._findLabel(label)
return None
def _propagateDistance(self, parent_dist):
""" Convert absolute distances to relative.
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
def _assignAlignment(self, aln):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self.sequence = None
if self.left != None:
self.left._assignAlignment(aln)
if self.right != None:
self.right._assignAlignment(aln)
for seq in aln.seqs:
if seq.name == self.label:
self.sequence = seq
break
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen):
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0
best_symb_right = 0
for a_left in range(len(aln.alphabet)):
score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a_left
best_score_left = score
for a_right in range(len(aln.alphabet)):
score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a_right
best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right
else:
self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
return self.seqscores
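# Unit-cost sketch of the scoring above (hypothetical two-leaf case): with
# alphabet (A, C), a column where the left leaf holds 'A' and the right leaf
# holds 'C' gives every parent symbol a score of best_left + best_right = 1,
# e.g. parent 'A' reaches the left 'A' for 0 and the right 'C' for 1
# substitution; backleft/backright record which child symbols gave that score.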
def _backwardParsimony(self, aln, seq = None):
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
self.sequence = Sequence(currbuf, aln.alphabet, self.label, gappy = True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
col += 1
self.left._backwardParsimony(aln, Sequence(leftbuf, aln.alphabet, self.left.label, gappy = True)) # name inferred sequences after the child node, not this node
self.right._backwardParsimony(aln, Sequence(rightbuf, aln.alphabet, self.right.label, gappy = True))
return self.sequence
def getSequence(self):
""" Get the sequence for the node. Return None if no sequence is assigned.
Requires that an alignment is associated with the tree, and that sequence names match node labels.
If the explored node is not a leaf, the sequence can be determined by parsimony. """
if self.sequence != None: # a sequence has been assigned
return self.sequence
elif self.seqscores != None: # inferred by parsimony but not yet assigned
return None # determine most parsimonious sequence, not yet implemented
def isAncestorOf(self, node, transitive = True):
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
elif transitive:
if self.left:
statusLeft = self.left.isAncestorOf(node, transitive)
if statusLeft: return True
if self.right:
return self.right.isAncestorOf(node, transitive)
else:
return False
def getDescendants(self, transitive = False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
if not transitive:
return children
else:
grandchildren = []
for c in children:
d = c.getDescendants(transitive)
if d:
grandchildren.extend(d)
children.extend(grandchildren)
return children
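# Usage sketch (labels hypothetical): for a tree ((A,B)X,C)Root,
# root.getDescendants() returns the direct children [X, C], while
# root.getDescendants(transitive = True) additionally includes A and B.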
###############################################################################
# Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
# Methods for processing files of trees on the Newick format
###############################################################################
def runUPGMA(aln, measure, absoluteDistances = False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, each node is assigned its total (absolute) distance from the leaves.
Otherwise, each branch is assigned its relative (incremental) distance."""
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
if closest_dist == None or dist < closest_dist: # test None first; comparing None with < is a TypeError in Python 3
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
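# Usage sketch (assumes an Alignment aln has been read, e.g. via readClustal,
# and that 'fractional' names a measure supported by sequence.calcDistances;
# the measure name is illustrative only):
# tree = runUPGMA(aln, 'fractional')
# print(tree) # Newick string with relative branch lengths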
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
def _findComma(string, level = 0):
""" Find first comma at specified level of embedding """
mylevel = 0
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
return i
return -1
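# Example (illustrative): in 'A:0.1,(B,C):0.2' the level-0 comma is at
# index 5, so _findComma('A:0.1,(B,C):0.2') returns 5; the comma between
# B and C sits at embedding level 1 and is skipped.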
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
y = tail.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def parseNewick(string):
""" Main method for parsing a Newick string into a (phylogenetic) tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns an instance of a PhyloTree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return PhyloTree(parseNewickNode(string))
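# Usage sketch (string illustrative):
# tree = parseNewick('((A:0.1,B:0.2):0.05,C:0.3);')
# print(tree) # re-serialises the tree via PhyloTree's __str__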
def readNewickFile(filename):
""" Read file on Newick format.
Returns an instance of a PhyloTree."""
f = open(filename)
string = ''.join(f)
return parseNewick(string)
def writeNewickFile(filename, tree):
""" Write the specified tree to a Newick file. """
fh = open(filename, 'w')
fh.write(tree.__str__())
fh.close()
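# Round-trip sketch (file names hypothetical):
# tree = readNewickFile('example.nwk')
# writeNewickFile('example_copy.nwk', tree)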
###############################################################################
# Below is code that will be run if the module is "run", and not just "imported".
###############################################################################
if __name__=='__main__':
x = Sequence('ACTGA', DNA_Alphabet, 'x')
print "Sequence", x, "is constructed from the symbols", x.alphabet.symbols
print "( There are", x.count('A'), "occurrences of the symbol 'A' in", x.sequence, ")"
y = Sequence('TACGA', DNA_Alphabet, 'y')
print "Sequence", y, "is constructed from the symbols", y.alphabet.symbols
print
print "( The sub-sequence 'CG' starts at index", y.find('CG'), "of", y.sequence, ")"
print
sm = SubstMatrix(DNA_Alphabet)
for a in DNA_Alphabet:
for b in DNA_Alphabet:
if a==b:
sm.set(a, b, +2) # match
else:
sm.set(a, b, -1) # mismatch
print "Below is a substitution matrix for the alphabet", DNA_Alphabet.symbols
print sm
print
aln = align(x, y, sm, -2)
print "Below is the alignment between x and y"
print aln
\ No newline at end of file
......@@ -21,7 +21,7 @@ class NN():
self.b_hid = numpy.random.randn(nHidden) # biases hidden layer
self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out
self.b_out = numpy.random.randn(nOutput) # biases output layer
print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output))
print("Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output)))
def writeFile(self, filename):
""" Save NN to a file. """
......@@ -110,7 +110,7 @@ class NN():
multi_targ = [ target ]
for i in range(niter):
mse = 0.0
entries = range(len(multi_input))
entries = list(range(len(multi_input)))
if shuffle:
random.shuffle(entries)
for p in entries:
......
......@@ -2,7 +2,7 @@
Module with methods and classes for phylogeny.
@author: mikael
'''
##import sequence
import sequence
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
......@@ -140,7 +140,19 @@ class PhyloNode:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def __le__(self, other):
""" Returns indication of less than other node. """
return other and self.__hash__() <= other.__hash__()
def __eq__(self, other):
""" Returns indication of equivalence to other node. """
return other and self.__hash__() == other.__hash__()
def __hash__(self):
""" Returns hash of object. """
return hash((self.label, self.dist, self.sequence))
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
......@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
if dist < closest_dist or closest_dist == None:
if closest_dist == None or dist < closest_dist:
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
......@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
Nx = N.pop(x, None) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y, None) # find number of sequences in y, remove the cluster from list N
if Nx == None or Ny == None:
continue
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
......
......@@ -277,7 +277,7 @@ def _readDistrib(linelist):
if len(d) == 0:
return None
alpha = Alphabet(symstr)
if '*' in d.keys(): # tot provided
if '*' in list(d.keys()): # tot provided
for sym in d:
if sym != '*':
d[sym] = d[sym] * d['*']
......@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
ncol = len(counts)
if len(name) == 1: # proper symbol
symcount[name] = counts
alpha = Alphabet(''.join(symcount.keys()))
alpha = Alphabet(''.join(list(symcount.keys())))
distribs = []
for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount])
......@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
"""
d = readMultiCounts(filename, format=format)
if len(d) > 0:
return d.values()[0]
return list(d.values())[0]
#################################################################################################
# Joint class
......@@ -628,12 +628,12 @@ class IndepJoint(Joint):
def displayMatrix(self, count = False):
""" Pretty-print matrix """
print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))
print((" \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))))
for a in self.alphas[0]:
if count:
print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))
print(("%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))))
else:
print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))
print(("%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))))
def __str__(self):
""" Text representation of the table. Note that size is an issue so big tables
......@@ -718,5 +718,3 @@ class NaiveBayes():
prob *= condprob[i][key[i]] or 0.0
out.observe(outsym, prob)
return out
......@@ -37,7 +37,7 @@ def base_percentages(reads):
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
nucs=list(counts.keys())
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
......@@ -67,7 +67,7 @@ you will get an accurate "percentage of reads aligned" statistic.
mapped=len(mapped)+len(mapped)
else:
mapped=len(mapped)
print "number of mapped reads",mapped
print("number of mapped reads",mapped)
return store_reads
......@@ -108,9 +108,9 @@ def subgroups(mapped_reads):
group3.append(read)
else:
pass
print len(group1),"in p<1e-3 group"
print len(group2),"in 1e-3<=p<1e-2 group"
print len(group3),"in 1e-2<=p<1 group"
print(len(group1),"in p<1e-3 group")
print(len(group2),"in 1e-3<=p<1e-2 group")
print(len(group3),"in 1e-2<=p<1 group")
return group1,group2,group3
......@@ -124,7 +124,7 @@ def dinuc_freq(mapped_reads):
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
nucs=list(counts.keys())
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
......@@ -135,7 +135,7 @@ def dinuc_freq(mapped_reads):
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
dinucs=counts.keys()
dinucs=list(counts.keys())
dinuc_counts={}
for i in dinucs:
val=float(counts[i])/sum(counts.values())
......@@ -178,11 +178,11 @@ Calculations are based on the length of the (possibly hard-clipped) sequence
length=int(read[8])
lengths.append(length)
mean_len=np.mean(lengths)
print "group"+str(i+1)+"mean",mean_len
print("group"+str(i+1)+"mean",mean_len)
max_len=np.max(lengths)
print "group"+str(i+1)+"max length",max_len
print("group"+str(i+1)+"max length",max_len)
min_len=np.min(lengths)
print "group"+str(i+1)+"min length",min_len
print("group"+str(i+1)+"min length",min_len)
data.append(["group"+str(i+1),mean_len,max_len,min_len])
return data
......@@ -221,15 +221,15 @@ def plot_base_composition(reads,sym):
all_nucs.append(nucs)
all_items=[]
counts=[]
pos=range(1,len(seq)+1)
pos=list(range(1,len(seq)+1))
for dicts in all_nucs:
for item in dicts.items():
for item in list(dicts.items()):
all_items.append(item)
all_items.sort(key=operator.itemgetter(0))
groups= [map(operator.itemgetter(1),list(group)) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
groups= [list(map(operator.itemgetter(1),list(group))) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
for group in groups:
counts.append(group.count(sym))
print counts
print(counts)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(pos,counts,facecolor='g')
......@@ -261,7 +261,7 @@ def raw_count_reader(filename):
def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
"""provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias"""
all_rpkms=[];final={}
for i,s,ii,ss,v in data.values():
for i,s,ii,ss,v in list(data.values()):
rpkms=[]
num_mapped_reads=[num_map1,num_map2,num_map3,num_map4]
vals=[i,s,ii,ss]
......@@ -276,15 +276,15 @@ def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
rpkms.append(rpkm)
all_rpkms.append(rpkms)
#return gene names and rpkms
for i in range(0,len(data.keys())):
final[data.keys()[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
for i in range(0,len(list(data.keys()))):
final[list(data.keys())[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
return final
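# For reference (standard definition, not specific to this file):
# RPKM = (raw_count * 1e9) / (total_mapped_reads * feature_length_bp),
# which is the length and library-size normalisation the docstring refers to.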
def write_RPKM_data(RPKM_data,filename):
"""write RPKM data to a file"""
f=open(filename,'w')
for i in range(0,len(RPKM_data)):
f.write("%s\t%d\t%d\t%d\t%d\n"%(RPKM_data.keys()[i],int(RPKM_data.values()[i][0]),int(RPKM_data.values()[i][1]),int(RPKM_data.values()[i][2]),int(RPKM_data.values()[i][3])))
f.write("%s\t%d\t%d\t%d\t%d\n"%(list(RPKM_data.keys())[i],int(list(RPKM_data.values())[i][0]),int(list(RPKM_data.values())[i][1]),int(list(RPKM_data.values())[i][2]),int(list(RPKM_data.values())[i][3])))
f.close()
......@@ -316,13 +316,13 @@ def plotreprpkm(rpkm_data,timepoint):
one=[]
two=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][0]))
two.append(int(rpkm_data.values()[i][1]))
for i in range(0,len(list(rpkm_data.values()))):
one.append(int(list(rpkm_data.values())[i][0]))
two.append(int(list(rpkm_data.values())[i][1]))
else:
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][2]))
two.append(int(rpkm_data.values()[i][3]))
for i in range(0,len(list(rpkm_data.values()))):
one.append(int(list(rpkm_data.values())[i][2]))
two.append(int(list(rpkm_data.values())[i][3]))
plt.plot(one,two,'o')
pcc=pearson_def(one,two)
R2=pcc**2
......@@ -343,15 +343,15 @@ def plotMAreprpkm(rpkm_data,timepoint):
m=[]
a=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][0]+1)-np.log2(rpkm_data.values()[i][1]+1)
x=(np.log2(rpkm_data.values()[i][0]+1)+np.log2(rpkm_data.values()[i][1]+1))/2
for i in range(0,len(list(rpkm_data.values()))):
y=np.log2(list(rpkm_data.values())[i][0]+1)-np.log2(list(rpkm_data.values())[i][1]+1)
x=(np.log2(list(rpkm_data.values())[i][0]+1)+np.log2(list(rpkm_data.values())[i][1]+1))/2
m.append(y)
a.append(x)
else:
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][2]+1)-np.log2(rpkm_data.values()[i][3]+1)
x=(np.log2(rpkm_data.values()[i][2]+1)+np.log2(rpkm_data.values()[i][3]+1))/2
for i in range(0,len(list(rpkm_data.values()))):
y=np.log2(list(rpkm_data.values())[i][2]+1)-np.log2(list(rpkm_data.values())[i][3]+1)
x=(np.log2(list(rpkm_data.values())[i][2]+1)+np.log2(list(rpkm_data.values())[i][3]+1))/2
m.append(y)
a.append(x)
plt.figure(1, figsize=(8,8))
......@@ -370,18 +370,18 @@ def plotMAreprpkm(rpkm_data,timepoint):
def get_cv(data1,condition):
cvs=[]
if condition=="t1":
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][0],data1.values()[i][1]])
std=np.std([data1.values()[i][0],data1.values()[i][1]])
for i in range(0,len(list(data1.values()))):
mean = np.mean([list(data1.values())[i][0],list(data1.values())[i][1]])
std=np.std([list(data1.values())[i][0],list(data1.values())[i][1]])
if mean==0.0 and std==0.0:
pass
else:
cv=float(mean+1)/(std+1)
cvs.append(cv)
else:
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][2],data1.values()[i][3]])
std=np.std([data1.values()[i][2],data1.values()[i][3]])
for i in range(0,len(list(data1.values()))):
mean = np.mean([list(data1.values())[i][2],list(data1.values())[i][3]])
std=np.std([list(data1.values())[i][2],list(data1.values())[i][3]])
if mean==0.0 and std==0.0:
pass
else:
......@@ -430,7 +430,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss in rpkm_data.values():
for i,ii,s,ss in list(rpkm_data.values()):
fc=np.log2(float(s+1)/(i+1))
if fc<cutoff[0] or fc>cutoff[1]:
sig_logfc.append(fc)
......@@ -438,7 +438,7 @@ def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
else:
logfc.append(fc)
avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
for i,ii,s,ss in rpkm_data.values():
for i,ii,s,ss in list(rpkm_data.values()):
fc2=np.log2(float(ss+1)/(ii+1))
if fc2<cutoff[0] or fc2>cutoff[1]:
sig_logfc2.append(fc2)
......@@ -470,7 +470,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss,pval in rpkm_data.values():
for i,ii,s,ss,pval in list(rpkm_data.values()):
fc=np.log2(float(s+1)/(i+1))
if float(pval)<cutoff:
sig_logfc.append(fc)
......@@ -478,7 +478,7 @@ def plotMA_pval(rpkm_data,cutoff=0.05):
else:
logfc.append(fc)
avg_rpkm.append(np.log2(s+1)+np.log2(i+1)/2)
for i,ii,s,ss,pval in rpkm_data.values():
for i,ii,s,ss,pval in list(rpkm_data.values()):
fc2=np.log2(float(ss+1)/(ii+1))
if float(pval)<cutoff:
sig_logfc2.append(fc2)
......@@ -506,7 +506,7 @@ def Welcht(rpkm):
"""Performs Welchs T-statistic (one-tailed)"""
ts=[]
result={}
for i,ii,s,ss in rpkm.values():
for i,ii,s,ss in list(rpkm.values()):
sd1=np.std([i,ii])
sd2=np.std([s,ss])
t=(np.mean([s,ss])-np.mean([i,ii]))/math.sqrt((sd2**2)/2.0+(sd1**2)/2.0) # Welch's t uses variances (sd**2/n, with n = 2 per group), not standard deviations
......@@ -521,8 +521,8 @@ def Welcht(rpkm):
pval=pval
pvals.append(pval)
corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg")
for i in range(0,len(rpkm.values())):
result[rpkm.keys()[i]]=[rpkm.values()[i][0],rpkm.values()[i][1],rpkm.values()[i][2],rpkm.values()[i][3],corr_pvals[i]]
for i in range(0,len(list(rpkm.values()))):
result[list(rpkm.keys())[i]]=[list(rpkm.values())[i][0],list(rpkm.values())[i][1],list(rpkm.values())[i][2],list(rpkm.values())[i][3],corr_pvals[i]]
return result
......@@ -551,7 +551,7 @@ def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-H
rank = n - i
pvalue, index = vals
new_values.append((n/rank) * pvalue)
for i in xrange(0, int(n)-1):
for i in range(0, int(n)-1):
if new_values[i] < new_values[i+1]:
new_values[i+1] = new_values[i]
for i, vals in enumerate(values):
......
......@@ -257,7 +257,7 @@ class BedFile():
self.rows = entries
self.format = format
self.indices = self._createIndices()
def _read(self, filename, format = 'Limited'):
""" Read a BED file.
format: specifies the format of the file,
......@@ -276,7 +276,7 @@ class BedFile():
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
......@@ -361,7 +361,7 @@ class BedFile():
acceptHeaderRows -= 1 # count down the number of header rows that can occur
f.close()
return rows
def __iter__(self):
return self.rows.__iter__()
......@@ -381,11 +381,11 @@ class BedFile():
index_name = {}
for i in range(len(self.rows)):
row = self.rows[i]
if not index_start.has_key(row.chrom): # seeing chromosome entry first time
if not row.chrom in index_start: # seeing chromosome entry first time
index_start[row.chrom] = []
if not index_centre.has_key(row.chrom): # seeing chromosome entry first time
if not row.chrom in index_centre: # seeing chromosome entry first time
index_centre[row.chrom] = []
if not index_end.has_key(row.chrom): # seeing chromosome entry first time
if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
......@@ -477,7 +477,7 @@ class BedFile():
Note that if the name is not unique, the last entry with the name will be returned.
"""
return self.indices[3][myname]
def closest(self, myloc, minimum = True):
""" Find the closest entry in the current BedFile to a given location.
Return a tuple with the absolute distance and the entry that is closest.
......@@ -607,7 +607,7 @@ class BedFile():
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
......@@ -639,7 +639,7 @@ class BedFile():
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
......@@ -660,7 +660,7 @@ class BedFile():
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
return BedFile(newrows, format = self.format)
def write(self, filename, format = 'BED6', header = None):
""" Save the data
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
......@@ -697,7 +697,7 @@ def readBedFile(filename, format = 'Limited'):
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
......@@ -714,7 +714,7 @@ def readBedFile(filename, format = 'Limited'):
chr1 931838 9
"""
return BedFile(filename, format)
def writeBedFile(entries, filename, format = 'BED6', header = None):
""" Save the BED entries to a BED file.
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
......@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
for row in entries:
if format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close()
......@@ -760,7 +760,7 @@ try:
except ImportError:
strerror = lambda x: 'strerror not supported'
from os.path import exists
from itertools import izip
from itertools import chain
def true_long_type():
"""
......@@ -805,7 +805,7 @@ def base_to_bin(x):
def create_byte_table():
"""create BYTE_TABLE"""
d = {}
for x in xrange(2**8):
for x in range(2**8):
d[x] = byte_to_bases(x)
return d
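# Sketch of the packing this table reverses (2bit format): one byte stores
# four bases at two bits each, so all 256 byte values map onto the
# 4**4 = 256 possible 4-base strings; BYTE_TABLE caches that mapping once.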
......@@ -821,9 +821,9 @@ def split16(x):
def create_twobyte_table():
"""create TWOBYTE_TABLE"""
d = {}
for x in xrange(2**16):
for x in range(2**16):
c, f = split16(x)
d[x] = byte_to_bases(c) + byte_to_bases(f)
d[x] = tuple(chain(byte_to_bases(c), byte_to_bases(f))) # materialise: a bare chain iterator would be exhausted after its first use
return d
BYTE_TABLE = create_byte_table()
......@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
"""
longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size)
dna = array('c', 'N' * longs_len)
dna = array('b', b'N' * longs_len) # typecode 'b' needs bytes in Python 3, not str
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
bytes = array('B')
......@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i: i = array_size
dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]:
dna[i:i + 4] = array('c', BYTE_TABLE[byte])
dna[i:i + 4] = array('b', BYTE_TABLE[byte])
i += 4
# last block
last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna
......@@ -889,7 +889,7 @@ class TwoBitFile(dict):
self._file_handle = open(foo, 'rb')
self._load_header()
self._load_index()
for name, offset in self._offset_dict.iteritems():
for name, offset in self._offset_dict.items():
self[name] = TwoBitSequence(self._file_handle, offset,
self._byteswapped)
return
......@@ -926,13 +926,16 @@ class TwoBitFile(dict):
if remaining == 0: break
name_size = array('B')
name_size.fromfile(file_handle, 1)
if byteswapped: name_size.byteswap()
name = array('c')
if byteswapped: name.byteswap()
if byteswapped:
name_size.byteswap()
name = array('b')
if byteswapped:
name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG)
offset.fromfile(file_handle, 1)
if byteswapped: offset.byteswap()
if byteswapped:
offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0]))
remaining -= 1
self._sequence_offsets = sequence_offsets
......@@ -943,7 +946,7 @@ class TwoBitFile(dict):
d = {}
file_handle = self._file_handle
byteswapped = self._byteswapped
for name, offset in self._offset_dict.iteritems():
for name, offset in self._offset_dict.items():
file_handle.seek(offset)
dna_size = array(LONG)
dna_size.fromfile(file_handle, 1)
......@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size)
for start, size in izip(n_block_starts, n_block_sizes):
for start, size in zip(n_block_starts, n_block_sizes):
end = start + size
if end <= min_: continue
if start > max_: break
......@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', 'N'*(end-start))
string_as_array[start:end] = array('b', 'N'*(end-start))
lower = bytes.lower # tostring() yields bytes in Python 3, so use bytes.lower
first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region))
for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]):
end = start + size
if end <= min_: continue
......@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
if not len(string_as_array) == max_ - min_:
raise RuntimeError, "Sequence was longer than it should be"
raise RuntimeError("Sequence was longer than it should be")
if reverse:
return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring()
......@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
"""
return self.__getslice__(0, None)
class TwoBitFileError(StandardError):
class TwoBitFileError(Exception):
"""
Base exception for TwoBit module
"""
......
......@@ -55,10 +55,11 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
try: # convert sequence data into a compact array representation
self.sequence = array.array('c', ''.join([s.upper() for s in sequence]))
except TypeError:
raise RuntimeError('Sequence data is not specified correctly: must be iterable')
#try: # convert sequence data into a compact array representation
# self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
#except TypeError:
# raise RuntimeError('Sequence data is not specified correctly: must be iterable')
self.sequence = sequence
# Assign an alphabet
self.alphabet = None
......@@ -133,15 +134,15 @@ class Sequence(object):
Calling self.__getitem__(3) is equivalent to self[3]
"""
if type(ndx) is slice:
return self.sequence[ndx].tostring()
return ''.join(self.sequence[ndx])
else:
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n'
data = self.sequence.tostring()
nlines = (len(self.sequence) - 1) / 60 + 1
data = ''.join(self.sequence)
nlines = (len(self.sequence) - 1) // 60 + 1 # integer division; ceil over the old /60+1 over-counted by one line for exact multiples of 60
for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
......@@ -164,7 +165,7 @@ class Sequence(object):
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.tostring().find(findme)
return ''.join(self.sequence).find(findme)
"""
Below are some useful methods for loading data from strings and files.
......@@ -438,8 +439,8 @@ class Alignment():
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)
print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)
print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
for col in range(self.alignlen):
d = Distrib(self.alphabet)
gaps = 0
......@@ -448,21 +449,21 @@ class Alignment():
d.observe(seq[col])
else:
gaps += 1
print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,
print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
symprobs = d.getProbsort()
(_, maxprob) = symprobs[0]
if maxprob >= theta1:
print "%d\tTRUE\t" % int(maxprob * 100),
print(("%d\tTRUE\t" % int(maxprob * 100),))
else:
print "%d\t\t" % int(maxprob * 100),
print(("%d\t\t" % int(maxprob * 100),))
for (sym, prob) in symprobs:
if prob >= theta1:
print sym, "%d%%" % int(prob * 100),
print((sym, "%d%%" % int(prob * 100),))
elif prob >= theta2 and lowercase:
print sym.lower(), "%d%%" % int(prob * 100),
print((sym.lower(), "%d%%" % int(prob * 100),))
elif prob >= theta2:
print sym, "%d%%" % int(prob * 100),
print
print((sym, "%d%%" % int(prob * 100),))
print()
def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
""" Display a table with rows for each alignment column, showing
......@@ -644,7 +645,7 @@ class Alignment():
return distmat
def writeHTML(self, filename=None):
""" Generate HTML that displays the alignment in color.
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
......@@ -681,10 +682,9 @@ class Alignment():
htmlstr += html
htmlstr += '<pre>'
if filename:
fh = open(filename, 'w')
fh.write(htmlstr)
fh.write('</body></html>\n')
fh.close()
with open(filename, 'w+') as fh:
fh.write(htmlstr)
fh.write('</body></html>\n')
else:
return htmlstr
......@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
index = name.find('/')
if index >= 0:
name = name[0:index]
if seqs.has_key(name):
if name in seqs:
seqs[name] += seqstr
else:
seqs[name] = seqstr
sequences = []
for name, seqstr in seqs.items():
for name, seqstr in list(seqs.items()):
sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
return Alignment(sequences)
......@@ -1180,12 +1180,12 @@ class PWM(object):
def display(self, format = 'COLUMN'):
if format == 'COLUMN':
print " \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))
print((" \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))))
for j in range(len(self.alphabet)):
print "%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
print(("%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
elif format == 'JASPAR':
for j in range(len(self.alphabet)):
print "%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
print(("%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
def search(self, sequence, lowerBound=0):
""" Find matches to the motif in a specified sequence. Returns a list
......@@ -1229,7 +1229,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
""" Get the sequence identified by the given ID from the given database
(e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
object. An error is caused if the sequence ID is not found. If start and
end are given, then only that section of the sequence is returned.
end are given, then only that section of the sequence is returned.
Note: more flexible search options are supported by using webservice.fetch
directly."""
......@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for i in range(MAX_TRY):
try:
fastaData = fetch(id, database)
fastaData = fetch(id, database).decode("utf-8")
seq = readFasta(fastaData)[0]
break
except:
from time import sleep
print 'Failed on {i}th try for id {id}'.format(i=i, id=id)
print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
sleep(0.1)
try:
return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
......@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
if __name__ == '__main__':
seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
print 'Read', len(seqs), 'sequences'
print('Read', len(seqs), 'sequences')
......@@ -71,7 +71,7 @@ class SeqNN():
im[row, _onehotIndex(alpha, subseqs[k])] = 1
if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
row += 1
print "There are", row, "entries in data set"
print("There are", row, "entries in data set")
if targets:
return im, om
else:
......@@ -85,7 +85,7 @@ class SeqNN():
im, om = self._encodeseq(seqs, targets)
for i in range(niter): # train first NN
rmse = self.nn1.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
print(i, ":", rmse)
if not self.cascade: # if there's no cascaded NN, finish here
return rmse
nn1seqs = [] # a list of new SS sequences ...
......@@ -95,7 +95,7 @@ class SeqNN():
im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
for i in range(niter): # train cascaded NN
rmse = self.nn2.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
print(i, ":", rmse)
return rmse
def testAll(self, seqs, targets):
......
......@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
specified width average of 100.
"""
sum = 0.0
order = range(0, len(calls) - 1, +1) # we are extending calls downstream
order = list(range(0, len(calls) - 1, +1)) # we are extending calls downstream
cnt = 0
for i in order: # extend to the right
if calls[i]: # to extend a call is required in the first place
......@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
AND extend this list upstream containing a specified width average of 100.
"""
sum = 0.0
order = range(len(calls) - 1, 0, -1) # we are extending calls upstream/to-the-left
order = list(range(len(calls) - 1, 0, -1)) # we are extending calls upstream/to-the-left
cnt = 0
for i in order: # extend to the right
if calls[i]: # a requirement to extend is to have a call in the first place
......
......@@ -291,7 +291,7 @@ class TupleEntries(object):
def __iter__(self):
return self
def next(self):
def __next__(self):
""" Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations."""
......
import urllib, urllib2
import os
from time import sleep
import stats
from StringIO import StringIO
import gzip
""" This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
The class EBI takes precautions taken as to not send too many requests when
performing BLAST and ClustalW queries.
See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
"""
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO/' # Use UQ mirror when available
__uniprotUrl__ = 'http://www.uniprot.org/' #
def fetch(entryId, dbName='uniprotkb', format='fasta'):
"""
Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
"""
# Construct URL
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry
try:
data = urllib2.urlopen(url).read()
if data.startswith('ERROR'):
raise RuntimeError(data)
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def search(query, dbName='uniprot', format='list', limit=100):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
format: file format e.g. 'list', 'fasta' or 'txt'
limit: max number of results (specify None for all results)
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
if limit == None: # no limit to number of results returned
url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query
else:
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
if format == 'list':
return data.splitlines()
else:
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries
try:
data = urllib2.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list':
return words
elif format == 'fasta' and len(words) > 0:
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words:
url += w + ","
data = urllib2.urlopen(url).read()
return data
else:
return ''
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
return
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
3702: ['Arabidopsis thaliana', 'TAIR_ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
"""
Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the tag/abbreviation for the identifier FROM which to idmap
to: the tag/abbreviation for the identifier TO which to idmap
format: the results format to use
reverse: reverse the returned mapping key (to) -> value (from)
Returns a dictionary with key (from) -> value (to)
Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
"""
url = __uniprotUrl__ + 'mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : format,
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
if not reverse:
d[pair[0]] = pair[1]
else:
d[pair[1]] = pair[0]
return d
else:
return dict()
"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def getGOReport(positives, background = None, database = 'UniProtKB'):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = getGOTerms(pos, database)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = getGOTerms(neg, database)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = getGODef(t[0])
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name']))
else:
ret.append((t[0], t[1], defin['name']))
return ret
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces and quotation marks
if field in entry.keys(): # check if we need this field
if entry[field] == None: # check if not yet assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
"""
Retrieve all GO terms for a given set of genes (or single gene).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If completeAnnot is True (default is False) then the above "terms" is the first element
in a tuple with (gene-terms-map, gene-taxon-id).
"""
if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes]
termsmap = dict()
taxonmap = dict()
uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
# build queries (batches of genes)
queryLength = 2000
queries = []
query = None
for gene in genes:
if query == None:
query = gene
elif len(query) < queryLength:
query += ','+gene
else:
queries.append(query)
query = gene
if query != None:
queries.append(query)
# execute queries, each involving a number of genes
for query in queries:
# Construct URL
url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below
try:
urlreq = urllib2.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
else:
data = response.read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
key = values[1]
if termsmap.has_key(key):
termsmap[key].add(values[6])
else:
termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4])
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if completeAnnot:
if len(genes) == 1:
if len(termsmap) == 1:
return (termsmap[genes[0]], taxonmap[genes[0]])
else:
return (set(), None)
else:
return (termsmap, taxonmap)
else:
if len(genes) == 1:
if len(termsmap) == 1:
return termsmap[genes[0]]
else:
return set()
else:
return termsmap
def getGenes(goterms, database='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
"""
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
goterms = [goterms]
map = dict()
if taxo == None:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
else:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
for goterm in goterms:
genes = set()
# Construct URL
url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below
try:
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1])
map[goterm] = list(genes)
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
else:
return map
class EBI(object):
__email__ = 'anon@uq.edu.au' # to whom emails about jobs should go
__ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
__checkInterval__ = 2 # how long to wait between checking job status
def __init__(self, service=None):
""" Initialise service session.
service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
"""
self.service = service
self.lockFile = '%s.lock' % service
def createLock(self):
""" Create a lock file to prevent submission of more than 1 job
at a time by a single user. """
fh = open(self.lockFile, 'w')
fh.write(self.jobId)
fh.close()
def removeLock(self):
""" Remove the lock file. """
os.remove(self.lockFile)
def isLocked(self):
""" Check if there is a lock on this service. If there is, check if
the job is complete, and if so remove the lock. Return True if still
locked and False if not. """
if os.path.exists(self.lockFile):
fh = open(self.lockFile, 'r')
jobId = fh.read()
fh.close()
status = self.status(jobId)
if status == 'RUNNING':
self.jobId = jobId
return True
else:
self.removeLock()
return False
else:
return False
"""
BLAST and CLUSTALW services
"""
def run(self, params):
""" Submit a job to the given service with the given parameters, given
as a dictionary. Return the jobId. """
if self.service == None:
raise RuntimeError('No service specified')
if self.isLocked():
raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
%sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
url = self.__ebiServiceUrl__ + self.service + '/run/'
# ncbiblast database parameter needs special handling
if self.service == 'ncbiblast':
databaseList = params['database']
del params['database']
databaseData = ''
for db in databaseList:
databaseData += '&database=' + db
encodedParams = urllib.urlencode(params)
encodedParams += databaseData
else:
encodedParams = urllib.urlencode(params)
print url
self.jobId = urllib2.urlopen(url, encodedParams).read()
self.createLock()
return self.jobId
def status(self, jobId=None):
""" Check the status of the given job (or the current job if none is
specified), and return the result. """
if jobId is None:
jobId = self.jobId
url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
status = urllib2.urlopen(url).read()
return status
def resultTypes(self):
""" Get the available result types. Will only work on a finished job. """
url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
resultTypes = urllib2.urlopen(url).read()
return resultTypes
def result(self, resultType):
""" Get the result of the given job of the specified type. """
url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
try:
result = urllib2.urlopen(url).read()
if resultType == 'error':
raise RuntimeError('An error occurred: %s' % result)
except urllib2.HTTPError:
if resultType == 'error':
raise RuntimeError('An unknown error occurred while processing the job (check your input)')
else:
self.result('error')
return result
def submit(self, params, resultTypes):
""" Submit a new job to the service with the given parameters.
Return the output in the specified format. """
params['email'] = self.__email__
self.run(params)
print 'Submitted new', self.service, 'job, jobId:', self.jobId
print 'Please be patient while the job is completed'
status = 'RUNNING'
observe = 0
while status == 'RUNNING':
observe = observe + 1
status = self.status()
sleep(self.__checkInterval__)
if status != 'FINISHED':
raise RuntimeError('An error occurred and the job could not be completed')
print 'Job complete.'
self.removeLock()
if type(resultTypes) != list:
resultTypes = [resultTypes]
results = []
for resultType in resultTypes:
results.append(self.result(resultType))
if len(results) == 1:
return results[0]
else:
return results
import urllib.request, urllib.parse, urllib.error # parse and error submodules are used explicitly below
import os
from time import sleep
import stats
from io import BytesIO # gzip responses are binary, so BytesIO (not StringIO) is needed
import gzip
""" This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
The class EBI takes precautions taken as to not send too many requests when
performing BLAST and ClustalW queries.
See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
"""
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO/' # Use UQ mirror when available
__uniprotUrl__ = 'http://www.uniprot.org/' #
def fetch(entryId, dbName='uniprotkb', format='fasta'):
"""
Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
"""
# Construct URL
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry
try:
data = urllib.request.urlopen(url).read()
if data.startswith(b'ERROR'):
raise RuntimeError(data)
return data
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
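# A minimal usage sketch for fetch(), assuming network access; 'P63166'
# (mouse SUMO1, from the docstring above) is only an illustrative accession.
if __name__ == '__main__':
    fasta = fetch('P63166', dbName='uniprotkb', format='fasta')
    print(fasta.decode('utf-8'))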
def search(query, dbName='uniprot', format='list', limit=100):
"""
Retrieve multiple entries matching a query from a database; UniProtKB and NCBI RefSeq (via E-utils) are supported
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
format: file format e.g. 'list', 'fasta' or 'txt'
limit: max number of results (specify None for all results)
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
if limit == None: # no limit to number of results returned
url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query
else:
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries
try:
data = urllib.request.urlopen(url).read().decode('utf-8') # decode bytes so splitlines() yields strings
if format == 'list':
return data.splitlines()
else:
return data
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries
try:
data = urllib.request.urlopen(url).read().decode('utf-8') # decode bytes before splitting on the XML tags below
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list':
return words
elif format == 'fasta' and len(words) > 0:
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words:
url += w + ","
data = urllib.request.urlopen(url).read().decode('utf-8') # return the FASTA text as a string
return data
else:
return ''
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
return
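# A usage sketch for search(), assuming network access; the query string is
# illustrative only (UniProtKB syntax: human entries matching 'antigen',
# capped at 10 hits).
if __name__ == '__main__':
    hits = search('organism:9606+AND+antigen', dbName='uniprot', format='list', limit=10)
    for hit in hits:
        print(hit)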
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
3702: ['Arabidopsis thaliana', 'TAIR_ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
"""
Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the tag/abbreviation for the identifier FROM which to idmap
to: the tag/abbreviation for the identifier TO which to idmap
format: the results format to use
reverse: reverse the returned mapping key (to) -> value (from)
Returns a dictionary with key (from) -> value (to)
Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
"""
url = __uniprotUrl__ + 'mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : format,
'query' : query
}
if len(query) > 0:
request = urllib.request.Request(url, urllib.parse.urlencode(params).encode('utf-8')) # POST data must be bytes in Python 3
response = urllib.request.urlopen(request).read().decode('utf-8') # decode so rows can be split on '\t'
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
if not reverse:
d[pair[0]] = pair[1]
else:
d[pair[1]] = pair[0]
return d
else:
return dict()
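# A usage sketch for idmap(), assuming network access; the two UniProtKB
# accessions are illustrative. Maps UniProt accessions to RefSeq protein IDs.
if __name__ == '__main__':
    mapping = idmap(['P63166', 'P63165'], frm='ACC', to='P_REFSEQ_AC')
    for acc, refseq in mapping.items():
        print(acc, '->', refseq)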
"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def getGOReport(positives, background = None, database = 'UniProtKB'):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = getGOTerms(pos, database)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = getGOTerms(neg, database)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = getGODef(t[0])
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name']))
else:
ret.append((t[0], t[1], defin['name']))
return ret
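# A usage sketch for getGOReport() without a background set, assuming network
# access; the accessions are illustrative. With no background, each row is a
# tuple (GO term ID, foreground count, term name), sorted by count.
if __name__ == '__main__':
    report = getGOReport(['P63166', 'P63165'])
    for row in report[0:10]: # top-10 terms by foreground count
        print(row)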
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib.request.urlopen(url).read().decode('utf-8') # decode bytes so string methods work on each row
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces and quotation marks
if field in entry: # check if we need this field
if entry[field] == None: # check if not yet assigned
entry[field] = value
return entry
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
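# A usage sketch for getGODef(), assuming network access; 'GO:0002080' is the
# illustrative term from the docstring above.
if __name__ == '__main__':
    d = getGODef('GO:0002080')
    print(d['id'], d['name'])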
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
"""
Retrieve all GO terms for a given set of genes (or single gene).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If completeAnnot is True (default is False) then the above "terms" is the first element
in a tuple with (gene-terms-map, gene-taxon-id).
"""
if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes]
termsmap = dict()
taxonmap = dict()
uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
# build queries (batches of genes)
queryLength = 2000
queries = []
query = None
for gene in genes:
if query == None:
query = gene
elif len(query) < queryLength:
query += ','+gene
else:
queries.append(query)
query = gene
if query != None:
queries.append(query)
# execute queries, each involving a number of genes
for query in queries:
# Construct URL
url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below
try:
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = BytesIO(response.read()) # gzip payloads are binary, so wrap them in BytesIO
f = gzip.GzipFile(fileobj=buf)
data = f.read().decode('utf-8')
else:
data = response.read().decode('utf-8')
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
key = values[1]
if key in termsmap:
termsmap[key].add(values[6])
else:
termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4])
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if completeAnnot:
if len(genes) == 1:
if len(termsmap) == 1:
return (termsmap[genes[0]], taxonmap[genes[0]])
else:
return (set(), None)
else:
return (termsmap, taxonmap)
else:
if len(genes) == 1:
if len(termsmap) == 1:
return termsmap[genes[0]]
else:
return set()
else:
return termsmap
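# A usage sketch for getGOTerms(), assuming network access; a single gene
# returns a set of terms, a list of genes returns a gene-to-terms map.
if __name__ == '__main__':
    terms = getGOTerms('P63166')
    print('P63166 has %d GO terms' % len(terms))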
def getGenes(goterms, database='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=GO term, value=list of unique gene names) OR
in the case of a single GO term as a list of unique gene names.
"""
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
goterms = [goterms]
map = dict()
if taxo == None:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
else:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
for goterm in goterms:
genes = set()
# Construct URL
url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below
try:
data = urllib.request.urlopen(url).read().decode('utf-8') # decode bytes so rows can be split on '\t'
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1])
map[goterm] = list(genes)
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
else:
return map
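# A usage sketch for getGenes(), assuming network access; restricts the query
# to human (taxon 9606) and reports how many gene products carry the term.
if __name__ == '__main__':
    members = getGenes('GO:0002080', taxo=9606)
    print('GO:0002080 annotates %d human entries' % len(members))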
class EBI(object):
__email__ = 'anon@uq.edu.au' # to whom emails about jobs should go
__ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
__checkInterval__ = 2 # how long to wait between checking job status
def __init__(self, service=None):
""" Initialise service session.
service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
"""
self.service = service
self.lockFile = '%s.lock' % service
def createLock(self):
""" Create a lock file to prevent submission of more than 1 job
at a time by a single user. """
fh = open(self.lockFile, 'w')
fh.write(self.jobId)
fh.close()
def removeLock(self):
""" Remove the lock file. """
os.remove(self.lockFile)
def isLocked(self):
""" Check if there is a lock on this service. If there is, check if
the job is complete, and if so remove the lock. Return True if still
locked and False if not. """
if os.path.exists(self.lockFile):
fh = open(self.lockFile, 'r')
jobId = fh.read()
fh.close()
status = self.status(jobId)
if status == 'RUNNING':
self.jobId = jobId
return True
else:
self.removeLock()
return False
else:
return False
"""
BLAST and CLUSTALW services
"""
def run(self, params):
""" Submit a job to the given service with the given parameters, given
as a dictionary. Return the jobId. """
if self.service == None:
raise RuntimeError('No service specified')
if self.isLocked():
raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
%sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
url = self.__ebiServiceUrl__ + self.service + '/run/'
# ncbiblast database parameter needs special handling
if self.service == 'ncbiblast':
databaseList = params['database']
del params['database']
databaseData = ''
for db in databaseList:
databaseData += '&database=' + db
encodedParams = urllib.parse.urlencode(params)
encodedParams += databaseData
else:
encodedParams = urllib.parse.urlencode(params)
print(url)
self.jobId = urllib.request.urlopen(url, encodedParams.encode('utf-8')).read().decode('utf-8') # POST data must be bytes; decode the returned job identifier
self.createLock()
return self.jobId
def status(self, jobId=None):
""" Check the status of the given job (or the current job if none is
specified), and return the result. """
if jobId is None:
jobId = self.jobId
url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
status = urllib.request.urlopen(url).read().decode('utf-8') # decode so comparisons like status == 'RUNNING' work
return status
def resultTypes(self):
""" Get the available result types. Will only work on a finished job. """
url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
resultTypes = urllib.request.urlopen(url).read().decode('utf-8')
return resultTypes
def result(self, resultType):
""" Get the result of the given job of the specified type. """
url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
try:
result = urllib.request.urlopen(url).read()
if resultType == 'error':
raise RuntimeError('An error occurred: %s' % result)
except urllib.error.HTTPError:
if resultType == 'error':
raise RuntimeError('An unknown error occurred while processing the job (check your input)')
else:
self.result('error')
return result
def submit(self, params, resultTypes):
""" Submit a new job to the service with the given parameters.
Return the output in the specified format. """
params['email'] = self.__email__
self.run(params)
print('Submitted new', self.service, 'job, jobId:', self.jobId)
print('Please be patient while the job is completed')
status = 'RUNNING'
observe = 0
while status == 'RUNNING':
observe = observe + 1
status = self.status()
sleep(self.__checkInterval__)
if status != 'FINISHED':
raise RuntimeError('An error occurred and the job could not be completed')
print('Job complete.')
self.removeLock()
if type(resultTypes) != list:
resultTypes = [resultTypes]
results = []
for resultType in resultTypes:
results.append(self.result(resultType))
if len(results) == 1:
return results[0]
else:
return results
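# A usage sketch for the EBI class, assuming network access; the parameter
# names follow the EBI ncbiblast REST service, and both the database choice
# and the protein sequence are purely illustrative.
if __name__ == '__main__':
    blast = EBI('ncbiblast')
    params = {'program': 'blastp',
              'database': ['uniprotkb_swissprot'],
              'stype': 'protein',
              'sequence': 'MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPM'}
    result = blast.submit(params, 'out') # blocks until the job is FINISHED
    print(result)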
......@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
neg[word] = 1
logratio = RCDict() # DNA dictionary for storing the log-ratio between pos and neg
for (word, cnt_pos) in pos.items():
for (word, cnt_pos) in list(pos.items()):
cnt_neg = 0.0001
try:
cnt_neg = neg[word]
......@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pass
logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
allpos = logratio.items() # extract all pairs of words:log-ratio
allpos = list(logratio.items()) # extract all pairs of words:log-ratio
sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
print "Enriched words (sorted by ln pos/neg)"
print "Word \tln pos/neg\tE-value"
print("Enriched words (sorted by ln pos/neg)")
print("Word \tln pos/neg\tE-value")
for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
cnt_pos = int(pos[word])
try: cnt_neg = int(neg[word])
......@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
# Correct for multiple testing (very conservatively)
eval = pval * len(allpos)
print "%s\t%6.3f \t%e" % (word, lgr, eval)
print("%s\t%6.3f \t%e" % (word, lgr, eval))
def getReverse(distribs):
""" Construct a new list of probability distributions of DNA, by
......@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
......@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
# plot the average score curve
# print >> sys.stderr, ""
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
x = list(range(-(seq_len//2), (seq_len//2))) # call center of sequence X=0; integer division required for range() in Python 3
lbl = "%s" % (motif)
plt.plot(x, avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
......@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
print("Motif %s:" % motif)
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
print("Motif %s (reverse complement):" % motif)
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
......@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
# divide number of sequences with hit by total number of hits
site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits)
print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
# STATISTICS
# Get the cumulative hit counts in concentric windows
......@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
for i in range(hw, seq_len-motif_width+1-hw):
smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
x = list(range(-(seq_len//2), (seq_len//2))) # call center of sequence X=0; integer division required for range() in Python 3
lbl = "%s, t=%.2f" % (motif, threshold)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt.plot(x, smoothed_site_probability, label=lbl)
......@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
def usage(name, errmsg = None):
if errmsg != None:
print "Error: %s" % errmsg
print """Usage: %s [options]
print("Error: %s" % errmsg)
print("""Usage: %s [options]
-f <fasta-filename> (required)
-d discover enriched words
-w <word width, default 8>
-p <peak width, default 100>
-m <peak margin, default 100>
-s <JASPAR-ID> scan for JASPAR motif
-h print this help""" % name
-h print this help""" % name)
if __name__ == '__main__':
try:
optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
except getopt.GetoptError, err:
except getopt.GetoptError as err:
usage(sys.argv[0], str(err))
sys.exit(2)
FILENAME = None
......@@ -301,7 +301,7 @@ if __name__ == '__main__':
sys.exit(3)
seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
if DISCOVER_MODE:
print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE:
scanMotifReport(seqs, MOTIF_ID)
......