Commit 881688af authored by Mikael Boden

webservice_in_Python_3

parent de69764a
......@@ -118,6 +118,7 @@ class GO():
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
print(gene, symb, qual, term, evid, onto, taxa)
try:
(taxa_q, terms_map) = self.annots[gene]
terms_map[term] = (evid, qual != 'NOT')
......
......@@ -48,7 +48,7 @@ class PhyloTree:
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
node = self.root.findLabel(node)
node = self.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
......@@ -60,22 +60,24 @@ class PhyloTree:
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
node = self.root.findLabel(node)
node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
# check if "myroot" is a leaf node, i.e. does not have children
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
if myroot.left != None: # myroot has a "left" child
# check if the "left" child of "myroot" is the ancestor of "node"
if myroot.left.isAncestorOf(node, transitive = True): # if yes,
myroot = myroot.left # move to the "left" child
else: # if not,
myroot = myroot.right # move to the "right" child
else: # myroot does NOT have a "left" child, so let's move "right"
myroot = myroot.right
if found and transitive:
return branching
......@@ -91,6 +93,8 @@ class PhyloTree:
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
def canonise(self):
    """ Put the tree into canonical form by delegating to the root node's
        recursive _canonise. Nothing is returned. """
    top = self.root
    top._canonise()
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
......@@ -212,6 +216,18 @@ class PhyloNode:
self.sequence = seq
break
def _canonise(self):
    """ Internal function that operates recursively to put the subtree rooted at
        this node into a canonical order: at every node with two children, the
        child whose subtree yields the smaller label ends up as the left child.
        Returns the smallest leaf label (by the ordinary < comparison of labels)
        found in the subtree; for a leaf this is simply its own label.
        A node with a single child is left as is and reports that child's result.
    """
    if self.left is None and self.right is None: # at leaf
        return self.label
    if self.left is None: # only a right child: nothing to order here
        return self.right._canonise()
    if self.right is None: # only a left child: nothing to order here
        return self.left._canonise()
    myleft = self.left._canonise()
    myright = self.right._canonise()
    if myleft > myright:
        # the smaller label is found on the right, so swap the children
        self.left, self.right = self.right, self.left
        return myright
    return myleft
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
......@@ -459,3 +475,6 @@ def readNewick(filename):
string = ''.join(f)
return parseNewick(string)
def writeNewickFile(filename, my_tree):
    """ Write the string form of my_tree (i.e. str(my_tree)) to the file named
        filename, replacing any previous content. No trailing newline is added.
        (Presumably the string form of a tree is its Newick representation.) """
    with open(filename, 'w') as out:
        out.write(str(my_tree))
......@@ -33,17 +33,17 @@ from prob import *
# Sequence ------------------****
class Sequence(object):
""" A biological sequence. Stores the sequence itself (as a compact array),
the alphabet (i.e., type of sequence it is), and optionally a name and further
""" A biological sequence. Stores the sequence itself (as a compact array),
the alphabet (i.e., type of sequence it is), and optionally a name and further
information. """
sequence = None # The array of symbols that make up the sequence
sequence = None # The array of symbols that make up the sequence
alphabet = None # The alphabet from which symbols come
name = None # The name (identifier) of a sequence
info = None # Other information (free text; e.g. annotations)
length = None # The number of symbols that the sequence is composed of
gappy = None # True if the sequence has "gaps", i.e. positions that represent deletions relative another sequence
def __init__(self, sequence, alphabet = None, name = '', info = '', gappy = False):
""" Create a sequence with the sequence data. Specifying the alphabet,
name and other information about the sequence are all optional.
......@@ -56,14 +56,11 @@ class Sequence(object):
will output the standard protein alphabet:
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
#try: # convert sequence data into a compact array representation
# self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
#except TypeError:
# raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self.sequence = sequence
# Assign an alphabet
# If no alphabet is provided, attempts to identify the alphabet from sequence
self.alphabet = None
if not alphabet is None:
for sym in self.sequence:
......@@ -75,7 +72,7 @@ class Sequence(object):
alpha = predefAlphabets[alphaName]
valid = True
for sym in self.sequence:
if not sym in alpha and (sym != '-' or not gappy):
if not sym in alpha and (sym != '-' or not gappy):
valid = False
break
if valid:
......@@ -83,17 +80,17 @@ class Sequence(object):
break
if self.alphabet is None:
raise RuntimeError('Could not identify alphabet from sequence: %s' % name)
# Store other information
self.name = name
self.info = info
self.length = len(self.sequence)
self.gappy = gappy
def __len__(self):
    """ Defines what the "len" operator returns for an instance of Sequence,
        namely the number of symbols held in self.sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
        >>> print(len(seq))
        9
    """
    symbols = self.sequence
    return len(symbols)
......@@ -104,42 +101,42 @@ class Sequence(object):
for sym in self:
str += sym
return str
def __iter__(self):
    """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
        >>> seq = Sequence('AGGAT', DNA_Alphabet)
        >>> for sym in seq:
                print(sym)
        will print A, G, G, A, T (each on a separate row)
        The symbols are copied into a tuple first, and the iterator runs over that copy.
    """
    return iter(tuple(self.sequence))
def __contains__(self, item):
    """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
        >>> print('T' in seq)
        True
        which is equivalent to
        >>> print(seq.__contains__('T'))
        True
        >>> print('X' in seq)
        False
        The item is compared against each symbol in turn, so it only matches
        a single position of the sequence (never a run of several symbols).
    """
    return any(sym == item for sym in self.sequence)
def __getitem__(self, ndx):
    """ Retrieve a specified index (or a "slice" of indices) of the sequence data.
        Calling self.__getitem__(3) is equivalent to self[3]
        A slice gives back the selected symbols joined into one string;
        any other index gives back whatever self.sequence holds at that index.
    """
    if isinstance(ndx, slice):
        return ''.join(self.sequence[ndx])
    return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n'
......@@ -149,7 +146,7 @@ class Sequence(object):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
return fasta
def count(self, findme = None):
""" Get the number of occurrences of specified symbol findme OR
if findme = None, return a dictionary of counts of all symbols in alphabet """
......@@ -318,13 +315,12 @@ def getCount(seqs, findme = None):
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'
one symbol is gap '-'
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print(Alignment(seqs))
>>> seqs = [Sequence('THIS-LI-NE-', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print (Alignment(seqs))
THIS-LI-NE-
--ISALIGNED
"""
--ISALIGNED """
alignlen = None
seqs = None
......@@ -381,7 +377,7 @@ class Alignment():
maxNameLength = self.getnamelen() + 1
string = ''
wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows):
for i in range(int(wholeRows)):
for j in range(len(self.seqs)):
string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
......@@ -646,49 +642,65 @@ class Alignment():
distmat[i, j] = distmat[j, i] = dist
return distmat
def writeHTML(self, filename = None):
    """ Generate HTML that displays the alignment in color.
        Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
        and that each symbol maps to a text string naming the color, e.g. 'blue'
        Symbols without such an annotation are shown on a 'white' background.
        Above the sequences a ruler of up to four rows is printed: every 10th column
        carries its column number written top-down (one digit per row), and the
        first row ends with the alignment length.
        filename: if given (and not empty), the page is also written to this file
        returns: the complete HTML page as a string
    """
    maxNameLength = self.getnamelen()
    margin = ''.ljust(maxNameLength) + ' ' # blank name column in front of each ruler row

    def rulerRow(digitAt):
        # One ruler row over columns 1..alignlen-1: blank except at every 10th column,
        # where digitAt is handed the column number divided by 10 (as a decimal string).
        # Integer division keeps this independent of how floats are printed.
        return ''.join(digitAt(str((i + 1) // 10)) if (i + 1) % 10 == 0 else ' '
                       for i in range(self.alignlen - 1))

    parts = ['''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n''']
    # Row 1: leading digit of each marked column, then the alignment length
    parts.append(margin + rulerRow(lambda tens: tens[0]) + '%s\n' % (self.alignlen))
    if self.alignlen > 10:
        # Row 2: second digit; columns 10..90 have none, so they get their trailing '0'
        parts.append(margin + rulerRow(lambda tens: tens[1] if len(tens) > 1 else '0') + '\n')
    if self.alignlen > 100:
        # Row 3: blank below column 100, trailing '0' for columns 100..990,
        # otherwise the last digit of the column number divided by 10
        parts.append(margin + rulerRow(
            lambda tens: ' ' if len(tens) < 2 else (tens[-1] if len(tens) > 2 else '0')) + '\n')
    if self.alignlen > 1000:
        # Row 4: trailing '0' for columns from 1000 onwards
        parts.append(margin + rulerRow(lambda tens: '0' if len(tens) > 2 else ' ') + '\n')
    for seq in self.seqs:
        parts.append(seq.name.ljust(maxNameLength) + ' ')
        for sym in seq:
            color = self.alphabet.getAnnotation('html-color', sym)
            if not color:
                color = 'white'
            parts.append('<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym))
        parts.append('\n')
    parts.append('</pre></body></html>')
    html = ''.join(parts)
    if filename:
        # the with-block closes the file even if the write fails
        with open(filename, 'w') as fh:
            fh.write(html)
    return html
def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
""" Display a table with rows for each alignment column, showing
......@@ -1239,7 +1251,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for i in range(MAX_TRY):
try:
fastaData = fetch(id, database).decode("utf-8")
fastaData = fetch(id, database)
seq = readFasta(fastaData)[0]
break
except:
......
......@@ -121,22 +121,27 @@ this module is imported """
# Predefined alphabets; the "_wN", "_wX" and "_wSTOP" variants extend the base
# alphabet with one extra symbol ('N', 'X' and '*' respectively)
Bool_Alphabet = Alphabet('TF')
DNA_Alphabet = Alphabet('ACGT')
DNA_Alphabet_wN = Alphabet('ACGTN')
RNA_Alphabet_wN = Alphabet('ACGUN')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC')
# Name -> alphabet lookup. Each "wN" name must point at the alphabet of the same
# nucleic acid: 'DNAwN' is 'ACGTN' (with T), 'RNAwN' is 'ACGUN' (with U).
predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'DNA': DNA_Alphabet,
                   'RNA': RNA_Alphabet,
                   'DNAwN': DNA_Alphabet_wN,
                   'RNAwN': RNA_Alphabet_wN,
                   'Protein': Protein_Alphabet,
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP' : Protein_wSTOP,
                   'DSSP_Alphabet' : DSSP_Alphabet,
                   'DSSP3_Alphabet' : DSSP3_Alphabet}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
# NOTE(review): 'Bool_Alphabet' comes before 'DNA', so presumably a sequence made up
# only of 'T' symbols is identified as Bool rather than DNA -- confirm this is intended
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
......
......@@ -32,11 +32,13 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry
try:
data = urllib.request.urlopen(url).read()
if data.startswith(b'ERROR'):
data = urllib.request.urlopen(url).read().decode("utf-8")
print (type(data))
if data.startswith("ERROR"):
raise RuntimeError(data)
return data
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
def search(query, dbName='uniprot', format='list', limit=100):
......@@ -57,12 +59,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries
try:
data = urllib.request.urlopen(url).read()
data = urllib.request.urlopen(url).read().decode("utf-8")
if format == 'list':
return data.splitlines()
else:
return data
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
......@@ -72,7 +74,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries
try:
data = urllib.request.urlopen(url).read()
data = urllib.request.urlopen(url).read().decode("utf-8")
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list':
......@@ -81,11 +83,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words:
url += w + ","
data = urllib.request.urlopen(url).read()
data = urllib.request.urlopen(url).read().decode("utf-8")
return data
else:
return ''
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
return
......@@ -199,7 +201,7 @@ def getGODef(goterm):
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib.request.urlopen(url).read()
data = urllib.request.urlopen(url).read().decode("utf-8")
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
......@@ -209,7 +211,7 @@ def getGODef(goterm):
if entry[field] == None: # check if not yet assigned
entry[field] = value
return entry
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
......@@ -252,9 +254,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
data = f.read().decode("utf-8")
else:
data = response.read()
data = response.read().decode("utf-8")
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
......@@ -264,7 +266,7 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
else:
termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4])
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if completeAnnot:
if len(genes) == 1:
......@@ -304,13 +306,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below
try:
data = urllib.request.urlopen(url).read()
data = urllib.request.urlopen(url).read().decode("utf-8")
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1])
map[goterm] = list(genes)
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment