Commit 881688af authored by Mikael Boden's avatar Mikael Boden

webservice_in_Python_3

parent de69764a
...@@ -118,6 +118,7 @@ class GO(): ...@@ -118,6 +118,7 @@ class GO():
if line.startswith('!'): if line.startswith('!'):
continue continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns) (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
print(gene, symb, qual, term, evid, onto, taxa)
try: try:
(taxa_q, terms_map) = self.annots[gene] (taxa_q, terms_map) = self.annots[gene]
terms_map[term] = (evid, qual != 'NOT') terms_map[term] = (evid, qual != 'NOT')
......
...@@ -48,7 +48,7 @@ class PhyloTree: ...@@ -48,7 +48,7 @@ class PhyloTree:
If node does not exist, None is returned. If node does not exist, None is returned.
If node has no descendants, an empty list will be returned.""" If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode): if not isinstance(node, PhyloNode):
node = self.root.findLabel(node) node = self.findLabel(node)
if node: if node:
return node.getDescendants(transitive) return node.getDescendants(transitive)
return None return None
...@@ -60,22 +60,24 @@ class PhyloTree: ...@@ -60,22 +60,24 @@ class PhyloTree:
If node does not exist, None is returned. If node does not exist, None is returned.
If node is the root of the tree, None is returned.""" If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode): if not isinstance(node, PhyloNode):
node = self.root.findLabel(node) node = self.findLabel(node)
if node: if node:
myroot = self.root myroot = self.root
found = False found = False
branching = [] branching = []
while not found and myroot != None: while not found and myroot != None:
branching.append(myroot) branching.append(myroot)
# check if "myroot" is a leaf node, i.e. does not have children
if myroot.left == node or myroot.right == node: if myroot.left == node or myroot.right == node:
found = True found = True
break break
if myroot.left: if myroot.left != None: # myroot has a "left" child
if myroot.left.isAncestorOf(node, transitive = True): # check if the "left" child of "myroot" is the ancestor of "node"
myroot = myroot.left if myroot.left.isAncestorOf(node, transitive = True): # if yes,
else: # must be right branch then... myroot = myroot.left # move to the "left" child
myroot = myroot.right else: # if not,
else: # must be right branch then... myroot = myroot.right # move to the "right" child
else: # myroot does NOT have a "left" child, so let's move "right"
myroot = myroot.right myroot = myroot.right
if found and transitive: if found and transitive:
return branching return branching
...@@ -91,6 +93,8 @@ class PhyloTree: ...@@ -91,6 +93,8 @@ class PhyloTree:
self.root._backwardParsimony(self.aln) # use scores to determine sequences self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root return self.root.getSequence() # return the sequence found at the root
def canonise(self):
self.root._canonise()
class PhyloNode: class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree. """ A class for a node in a rooted, binary (bifurcating) tree.
...@@ -212,6 +216,18 @@ class PhyloNode: ...@@ -212,6 +216,18 @@ class PhyloNode:
self.sequence = seq self.sequence = seq
break break
def _canonise(self):
if self.left == None and self.right == None: # at leaf
return self.label
myleft = self.left._canonise()
myright = self.right._canonise();
if myleft > myright:
tmpnode = self.left
self.left = self.right
self.right = tmpnode
return myright
return myleft
def _forwardParsimony(self, aln): def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward), """ Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node, stopping only once a sequence has been assigned to the node,
...@@ -459,3 +475,6 @@ def readNewick(filename): ...@@ -459,3 +475,6 @@ def readNewick(filename):
string = ''.join(f) string = ''.join(f)
return parseNewick(string) return parseNewick(string)
def writeNewickFile(filename, my_tree):
with open(filename, 'w') as fh:
print(my_tree, end="", file=fh)
...@@ -57,13 +57,10 @@ class Sequence(object): ...@@ -57,13 +57,10 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """ 'R', 'S', 'T', 'V', 'W', 'Y'] """
#try: # convert sequence data into a compact array representation
# self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
#except TypeError:
# raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self.sequence = sequence self.sequence = sequence
# Assign an alphabet # Assign an alphabet
# If no alphabet is provided, attempts to identify the alphabet from sequence
self.alphabet = None self.alphabet = None
if not alphabet is None: if not alphabet is None:
for sym in self.sequence: for sym in self.sequence:
...@@ -93,7 +90,7 @@ class Sequence(object): ...@@ -93,7 +90,7 @@ class Sequence(object):
def __len__(self): def __len__(self):
""" Defines what the "len" operator returns for an instance of Sequence, e.g. """ Defines what the "len" operator returns for an instance of Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet) >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print(len(seq)) >>> print (len(seq))
9 9
""" """
return len(self.sequence) return len(self.sequence)
...@@ -109,7 +106,7 @@ class Sequence(object): ...@@ -109,7 +106,7 @@ class Sequence(object):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g. """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet) >>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq: >>> for sym in seq:
print(sym) print (sym)
will print A, G, G, A, T (each on a separate row) will print A, G, G, A, T (each on a separate row)
""" """
tsyms = tuple(self.sequence) tsyms = tuple(self.sequence)
...@@ -118,12 +115,12 @@ class Sequence(object): ...@@ -118,12 +115,12 @@ class Sequence(object):
def __contains__(self, item): def __contains__(self, item):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g. """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet) >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print('T' in seq) >>> print ('T' in seq)
True True
which is equivalent to which is equivalent to
>>> print(seq.__contains__('T')) >>> print (seq.__contains__('T'))
True True
>>> print('X' in seq) >>> print ('X' in seq)
False False
""" """
for sym in self.sequence: for sym in self.sequence:
...@@ -320,11 +317,10 @@ class Alignment(): ...@@ -320,11 +317,10 @@ class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where """ A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-' one symbol is gap '-'
Example usage: Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)] >>> seqs = [Sequence('THIS-LI-NE-', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print(Alignment(seqs)) >>> print (Alignment(seqs))
THIS-LI-NE- THIS-LI-NE-
--ISALIGNED --ISALIGNED """
"""
alignlen = None alignlen = None
seqs = None seqs = None
...@@ -381,7 +377,7 @@ class Alignment(): ...@@ -381,7 +377,7 @@ class Alignment():
maxNameLength = self.getnamelen() + 1 maxNameLength = self.getnamelen() + 1
string = '' string = ''
wholeRows = self.alignlen / symbolsPerLine wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows): for i in range(int(wholeRows)):
for j in range(len(self.seqs)): for j in range(len(self.seqs)):
string += self.seqs[j].name.ljust(maxNameLength) + ' ' string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n' string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
...@@ -646,49 +642,65 @@ class Alignment(): ...@@ -646,49 +642,65 @@ class Alignment():
distmat[i, j] = distmat[j, i] = dist distmat[i, j] = distmat[j, i] = dist
return distmat return distmat
def writeHTML(self, filename=None): def writeHTML(self, filename = None):
""" Generate HTML that displays the alignment in color. """ Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym) Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue' and that each symbol maps to a text string naming the color, e.g. 'blue'
""" """
if filename == None: html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
htmlstr = '<pre>\n'
else:
htmlstr = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'
maxNameLength = self.getnamelen() maxNameLength = self.getnamelen()
html = ''.ljust(maxNameLength) + ' ' html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1): for i in range(self.alignlen - 1):
if (i+1) % 10 == 0: if (i+1) % 10 == 0:
html += str(i/10+1)[-1] html += str(i/10+1)[0]
else: else:
html += ' ' html += ' '
html += '%s\n' % (self.alignlen) html += '%s\n' % (self.alignlen)
htmlstr += html
if self.alignlen > 10: if self.alignlen > 10:
html = ''.ljust(maxNameLength) + ' ' html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1): for i in range(self.alignlen - 1):
if (i+1) % 10 == 0: if (i+1) % 10 == 0:
html += '0' index = len(str(i/10 + 1).split('.')[0])
html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
else:
html += ' '
html += '\n'
if self.alignlen > 100:
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0 and i >= 99:
index = len(str(i/10 + 1).split('.')[0])
html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
else:
html += ' '
html += '\n'
if self.alignlen > 1000:
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
else: else:
html += ' ' html += ' '
html += '\n' html += '\n'
htmlstr += html
for seq in self.seqs: for seq in self.seqs:
html = seq.name.ljust(maxNameLength) + ' ' html += seq.name.ljust(maxNameLength) + ' '
for sym in seq: for sym in seq:
color = self.alphabet.getAnnotation('html-color', sym) color = self.alphabet.getAnnotation('html-color', sym)
if not color: if not color:
color = 'white' color = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym) html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
html += '\n' html += '\n'
htmlstr += html html += '</pre></body></html>'
htmlstr += '<pre>'
if filename: if filename:
with open(filename, 'w+') as fh: fh = open(filename, 'w')
fh.write(htmlstr) fh.write(html)
fh.write('</body></html>\n') fh.close()
else: return html
return htmlstr
def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None): def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
""" Display a table with rows for each alignment column, showing """ Display a table with rows for each alignment column, showing
...@@ -1239,7 +1251,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None): ...@@ -1239,7 +1251,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for i in range(MAX_TRY): for i in range(MAX_TRY):
try: try:
fastaData = fetch(id, database).decode("utf-8") fastaData = fetch(id, database)
seq = readFasta(fastaData)[0] seq = readFasta(fastaData)[0]
break break
except: except:
......
...@@ -121,22 +121,27 @@ this module is imported """ ...@@ -121,22 +121,27 @@ this module is imported """
Bool_Alphabet = Alphabet('TF') Bool_Alphabet = Alphabet('TF')
DNA_Alphabet = Alphabet('ACGT') DNA_Alphabet = Alphabet('ACGT')
DNA_Alphabet_wN = Alphabet('ACGTN') DNA_Alphabet_wN = Alphabet('ACGTN')
RNA_Alphabet_wN = Alphabet('ACGUN')
RNA_Alphabet = Alphabet('ACGU') RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY') Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX') Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Protein_Alphabet_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*') Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
DSSP_Alphabet = Alphabet('GHITEBSC') DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC') DSSP3_Alphabet = Alphabet('HEC')
predefAlphabets = {'DNA': DNA_Alphabet, predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'DNA': DNA_Alphabet,
'RNA': RNA_Alphabet, 'RNA': RNA_Alphabet,
'DNAwN': Alphabet('ACGTN'), 'DNAwN': RNA_Alphabet_wN,
'RNAwN': Alphabet('ACGUN'), 'RNAwN': DNA_Alphabet_wN,
'Protein': Protein_Alphabet, 'Protein': Protein_Alphabet,
'ProteinwX': Protein_wX} 'ProteinwX': Protein_wX,
'ProteinwSTOP' : Protein_wSTOP,
'DSSP_Alphabet' : DSSP_Alphabet,
'DSSP3_Alphabet' : DSSP3_Alphabet}
# The preferred order in which a predefined alphabet is assigned to a sequence # The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid) # (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder = ['DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX'] preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations # Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'}) DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'}) RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
......
...@@ -32,11 +32,13 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'): ...@@ -32,11 +32,13 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry # Get the entry
try: try:
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
if data.startswith(b'ERROR'): print (type(data))
if data.startswith("ERROR"):
raise RuntimeError(data) raise RuntimeError(data)
return data return data
except(urllib.error.HTTPError, ex):
except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def search(query, dbName='uniprot', format='list', limit=100): def search(query, dbName='uniprot', format='list', limit=100):
...@@ -57,12 +59,12 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -57,12 +59,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries # Get the entries
try: try:
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
if format == 'list': if format == 'list':
return data.splitlines() return data.splitlines()
else: else:
return data return data
except(urllib.error.HTTPError, ex): except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
elif dbName.startswith('refseq'): elif dbName.startswith('refseq'):
dbs = dbName.split(":") dbs = dbName.split(":")
...@@ -72,7 +74,7 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -72,7 +74,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit) url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries # Get the entries
try: try:
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
words = data.split("</Id>") words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]] words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list': if format == 'list':
...@@ -81,11 +83,11 @@ def search(query, dbName='uniprot', format='list', limit=100): ...@@ -81,11 +83,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id=" url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words: for w in words:
url += w + "," url += w + ","
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
return data return data
else: else:
return '' return ''
except(urllib.error.HTTPError, ex): except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
return return
...@@ -199,7 +201,7 @@ def getGODef(goterm): ...@@ -199,7 +201,7 @@ def getGODef(goterm):
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
entry={'id': None, 'name': None, 'def': None} entry={'id': None, 'name': None, 'def': None}
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
for row in data.splitlines(): for row in data.splitlines():
index = row.find(':') index = row.find(':')
if index > 0 and len(row[index:]) > 1: if index > 0 and len(row[index:]) > 1:
...@@ -209,7 +211,7 @@ def getGODef(goterm): ...@@ -209,7 +211,7 @@ def getGODef(goterm):
if entry[field] == None: # check if not yet assigned if entry[field] == None: # check if not yet assigned
entry[field] = value entry[field] = value
return entry return entry
except(urllib.error.HTTPError, ex): except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False): def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
...@@ -252,9 +254,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False): ...@@ -252,9 +254,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
if response.info().get('Content-Encoding') == 'gzip': if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read()) buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf) f = gzip.GzipFile(fileobj=buf)
data = f.read() data = f.read().decode("utf-8")
else: else:
data = response.read() data = response.read().decode("utf-8")
for row in data.splitlines()[1:]: # we ignore first (header) row for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
...@@ -264,7 +266,7 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False): ...@@ -264,7 +266,7 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
else: else:
termsmap[key] = set([values[6]]) termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4]) taxonmap[key] = int(values[4])
except(urllib.error.HTTPError, ex): except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if completeAnnot: if completeAnnot:
if len(genes) == 1: if len(genes) == 1:
...@@ -304,13 +306,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None): ...@@ -304,13 +306,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
url = __ebiGOUrl__ + uri_string + goterm.strip() url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read().decode("utf-8")
for row in data.splitlines()[1:]: # we ignore first (header) row for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t') values = row.split('\t')
if len(values) >= 7: if len(values) >= 7:
genes.add(values[1]) genes.add(values[1])
map[goterm] = list(genes) map[goterm] = list(genes)
except(urllib.error.HTTPError, ex): except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
if len(goterms) == 1: if len(goterms) == 1:
return map[goterms[0]] return map[goterms[0]]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment