webservice_in_Python_3

881688af · Mikael Boden · de69764a · 881688af · 881688af · 881688af
Commit 881688af authored Jul 21, 2017 by Mikael Boden
Hide whitespace changes
Inline Side-by-side

Showing with 122 additions and 83 deletions

godata.py godata.py +1 -0

phylo.py phylo.py +27 -8

sequence.py sequence.py +66 -54

sym.py sym.py +11 -6

webservice.py webservice.py +17 -15

No files found.
--- a/godata.py
+++ b/godata.py
@@ -118,6 +118,7 @@ class GO():
            if line.startswith('!'):
                continue
            (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
+            print(gene, symb, qual, term, evid, onto, taxa)
            try:
                (taxa_q, terms_map) = self.annots[gene]
                terms_map[term] = (evid, qual != 'NOT')

--- a/phylo.py
+++ b/phylo.py
@@ -48,7 +48,7 @@ class PhyloTree:
            If node does not exist, None is returned.
            If node has no descendants, an empty list will be returned."""
        if not isinstance(node, PhyloNode):
-            node = self.root.findLabel(node)
+            node = self.findLabel(node)
        if node:
            return node.getDescendants(transitive)
        return None
@@ -60,22 +60,24 @@ class PhyloTree:
            If node does not exist, None is returned.
            If node is the root of the tree, None is returned."""
        if not isinstance(node, PhyloNode):
-            node = self.root.findLabel(node)
+            node = self.findLabel(node)
        if node:
            myroot = self.root
            found = False
            branching = []
            while not found and myroot != None:
                branching.append(myroot)
+                # check if "node" is a direct child (left or right) of "myroot"
                if myroot.left == node or myroot.right == node:
                    found = True
                    break
-                if myroot.left:
-                    if myroot.left.isAncestorOf(node, transitive = True):
-                        myroot = myroot.left
-                    else: # must be right branch then...
-                        myroot = myroot.right
-                else: # must be right branch then...
+                if myroot.left != None: # myroot has a "left" child
+                    # check if the "left" child of "myroot" is the ancestor of "node"
+                    if myroot.left.isAncestorOf(node, transitive = True): # if yes,
+                        myroot = myroot.left    # move to the "left" child
+                    else:                       # if not,
+                        myroot = myroot.right   # move to the "right" child
+                else: # myroot does NOT have a "left" child, so let's move "right"
                    myroot = myroot.right
            if found and transitive:
                return branching
@@ -91,6 +93,8 @@ class PhyloTree:
        self.root._backwardParsimony(self.aln) # use scores to determine sequences
        return self.root.getSequence() # return the sequence found at the root

+    def canonise(self): # delegates to the root node's _canonise, which reorders children recursively
+        self.root._canonise()

 class PhyloNode:
    """ A class for a node in a rooted, binary (bifurcating) tree.
@@ -212,6 +216,18 @@ class PhyloNode:
                self.sequence = seq
                break

+    def _canonise(self):
+        """ Recursively swap children so that the subtree holding the smallest leaf label is on the left.
+            Returns the smallest leaf label below this node; assumes internal nodes have both children. """
+        if self.left is None and self.right is None: # at leaf
+            return self.label
+        myleft = self.left._canonise()
+        myright = self.right._canonise()
+        if myleft > myright:
+            self.left, self.right = self.right, self.left
+            return myright
+        return myleft
+
    def _forwardParsimony(self, aln):
        """ Internal function that operates recursively to first initialise each node (forward),
            stopping only once a sequence has been assigned to the node,
@@ -459,3 +475,6 @@ def readNewick(filename):
    string = ''.join(f)
    return parseNewick(string)

+def writeNewickFile(filename, my_tree): # writes the printed form of my_tree to filename, without a trailing newline
+    with open(filename, 'w') as fh:
+        print(my_tree, end="", file=fh)
--- a/sequence.py
+++ b/sequence.py
@@ -33,17 +33,17 @@ from prob import *
 # Sequence ------------------****

 class Sequence(object):
-    """ A biological sequence. Stores the sequence itself (as a compact array),
-    the alphabet (i.e., type of sequence it is), and optionally a name and further
+    """ A biological sequence. Stores the sequence itself (as a compact array), 
+    the alphabet (i.e., type of sequence it is), and optionally a name and further 
    information. """
-
-    sequence = None # The array of symbols that make up the sequence
+    
+    sequence = None # The array of symbols that make up the sequence 
    alphabet = None # The alphabet from which symbols come
    name =     None # The name (identifier) of a sequence
    info =     None # Other information (free text; e.g. annotations)
    length =   None # The number of symbols that the sequence is composed of
    gappy =    None # True if the sequence has "gaps", i.e. positions that represent deletions relative another sequence
-
+    
    def __init__(self, sequence, alphabet = None, name = '', info = '', gappy = False):
        """ Create a sequence with the sequence data. Specifying the alphabet,
        name and other information about the sequence are all optional.
@@ -56,14 +56,11 @@ class Sequence(object):
        will output the standard protein alphabet:
        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'] """
-
-        #try: # convert sequence data into a compact array representation
-        #    self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
-        #except TypeError:
-        #    raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
+        
        self.sequence = sequence
-
+        
        # Assign an alphabet
+        # If no alphabet is provided, attempts to identify the alphabet from sequence
        self.alphabet = None
        if not alphabet is None:
            for sym in self.sequence:
@@ -75,7 +72,7 @@ class Sequence(object):
                alpha = predefAlphabets[alphaName]
                valid = True
                for sym in self.sequence:
-                    if not sym in alpha and (sym != '-' or not gappy):
+                    if not sym in alpha and (sym != '-' or not gappy):  
                        valid = False
                        break
                if valid:
@@ -83,17 +80,17 @@ class Sequence(object):
                    break
            if self.alphabet is None:
                raise RuntimeError('Could not identify alphabet from sequence: %s' % name)
-
+        
        # Store other information
        self.name = name
        self.info = info
        self.length = len(self.sequence)
        self.gappy = gappy
-
+        
    def __len__(self):
        """ Defines what the "len" operator returns for an instance of Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
-        >>> print(len(seq))
+        >>> print (len(seq))
        9
        """
        return len(self.sequence)
@@ -104,42 +101,42 @@ class Sequence(object):
        for sym in self:
            str += sym
        return str
-
+    
    def __iter__(self):
        """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
        >>> seq = Sequence('AGGAT', DNA_Alphabet)
        >>> for sym in seq:
-                print(sym)
+                print (sym)
        will print A, G, G, A, T (each on a separate row)
-        """
+        """ 
        tsyms = tuple(self.sequence)
        return tsyms.__iter__()
-
+    
    def __contains__(self, item):
        """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
-        >>> print('T' in seq)
+        >>> print ('T' in seq)
        True
-            which is equivalent to
-        >>> print(seq.__contains__('T'))
+            which is equivalent to 
+        >>> print (seq.__contains__('T'))
        True
-        >>> print('X' in seq)
+        >>> print ('X' in seq)
        False
-        """
+        """ 
        for sym in self.sequence:
            if sym == item:
                return True
        return False
-
+        
    def __getitem__(self, ndx):
        """ Retrieve a specified index (or a "slice" of indices) of the sequence data.
-            Calling self.__getitem__(3) is equivalent to self[3]
+            Calling self.__getitem__(3) is equivalent to self[3] 
        """
        if type(ndx) is slice:
            return ''.join(self.sequence[ndx])
        else:
            return self.sequence[ndx]
-
+        
    def writeFasta(self):
        """ Write one sequence in FASTA format to a string and return it. """
        fasta = '>' + self.name + ' ' + self.info + '\n'
@@ -149,7 +146,7 @@ class Sequence(object):
            lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
            fasta += lineofseq
        return fasta
-
+    
    def count(self, findme = None):
        """ Get the number of occurrences of specified symbol findme OR
            if findme = None, return a dictionary of counts of all symbols in alphabet """
@@ -318,13 +315,12 @@ def getCount(seqs, findme = None):

 class Alignment():
    """ A sequence alignment class. Stores two or more sequences of equal length where
-    one symbol is gap '-'
+    one symbol is gap '-' 
    Example usage:
-    >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
-    >>> print(Alignment(seqs))
+    >>> seqs = [Sequence('THIS-LI-NE-', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
+    >>> print (Alignment(seqs))
     THIS-LI-NE-
-     --ISALIGNED
-    """
+     --ISALIGNED """

    alignlen = None
    seqs = None
@@ -381,7 +377,7 @@ class Alignment():
        maxNameLength =  self.getnamelen() + 1
        string = ''
        wholeRows = self.alignlen / symbolsPerLine
-        for i in range(wholeRows):
+        for i in range(int(wholeRows)):
            for j in range(len(self.seqs)):
                string += self.seqs[j].name.ljust(maxNameLength) + ' '
                string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
@@ -646,49 +642,65 @@ class Alignment():
                distmat[i, j] = distmat[j, i] = dist
        return distmat

-    def writeHTML(self, filename=None):
+    def writeHTML(self, filename = None):
        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
-        if filename == None:
-            htmlstr = '<pre>\n'
-        else:
-            htmlstr = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'
+        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
        maxNameLength =  self.getnamelen()
-        html = ''.ljust(maxNameLength) + ' '
+        html += ''.ljust(maxNameLength) + ' '
        for i in range(self.alignlen - 1):
            if (i+1) % 10 == 0:
-                html += str(i/10+1)[-1]
+                html += str(i/10+1)[0]
            else:
                html += ' '
        html += '%s\n' % (self.alignlen)
-        htmlstr += html
+
        if self.alignlen > 10:
-            html = ''.ljust(maxNameLength) + ' '
+            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
                if (i+1) % 10 == 0:
-                    html += '0'
+                    index = len(str(i/10 + 1).split('.')[0])
+                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
+                else:
+                    html += ' '
+            html += '\n'
+
+        if self.alignlen > 100:
+            html += ''.ljust(maxNameLength) + ' '
+            for i in range(self.alignlen - 1):
+                if (i+1) % 10 == 0 and i >= 99:
+                    index = len(str(i/10 + 1).split('.')[0])
+                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
+
+                else:
+                    html += ' '
+            html += '\n'
+
+        if self.alignlen > 1000:
+            html += ''.ljust(maxNameLength) + ' '
+            for i in range(self.alignlen - 1):
+                if (i+1) % 10 == 0:
+                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
+
                else:
                    html += ' '
            html += '\n'
-            htmlstr += html
        for seq in self.seqs:
-            html = seq.name.ljust(maxNameLength) + ' '
+            html += seq.name.ljust(maxNameLength) + ' '
            for sym in seq:
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
-            htmlstr += html
-        htmlstr += '<pre>'
+        html += '</pre></body></html>'
        if filename:
-            with open(filename, 'w+') as fh:
-                fh.write(htmlstr)
-                fh.write('</body></html>\n')
-        else:
-            return htmlstr
+            fh = open(filename, 'w')
+            fh.write(html)
+            fh.close()
+        return html

 def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
    """ Display a table with rows for each alignment column, showing
@@ -1239,7 +1251,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):

    for i in range(MAX_TRY):
        try:
-            fastaData = fetch(id, database).decode("utf-8")
+            fastaData = fetch(id, database)
            seq = readFasta(fastaData)[0]
            break
        except:

--- a/sym.py
+++ b/sym.py
@@ -121,22 +121,27 @@ this module is imported """
 Bool_Alphabet = Alphabet('TF')
 DNA_Alphabet = Alphabet('ACGT')
 DNA_Alphabet_wN = Alphabet('ACGTN')
+RNA_Alphabet_wN = Alphabet('ACGUN')
 RNA_Alphabet = Alphabet('ACGU')
 Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
 Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
-Protein_Alphabet_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
+Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
 DSSP_Alphabet = Alphabet('GHITEBSC')
 DSSP3_Alphabet = Alphabet('HEC')

-predefAlphabets = {'DNA': DNA_Alphabet,
+predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
+                   'DNA': DNA_Alphabet,
                   'RNA': RNA_Alphabet,
-                   'DNAwN': Alphabet('ACGTN'),
-                   'RNAwN': Alphabet('ACGUN'),
+                   'DNAwN': DNA_Alphabet_wN, # 'ACGTN'
+                   'RNAwN': RNA_Alphabet_wN, # 'ACGUN'
                   'Protein': Protein_Alphabet,
-                   'ProteinwX': Protein_wX}
+                   'ProteinwX': Protein_wX,
+                   'ProteinwSTOP' : Protein_wSTOP,
+                   'DSSP_Alphabet' : DSSP_Alphabet,
+                   'DSSP3_Alphabet' : DSSP3_Alphabet}
 # The preferred order in which a predefined alphabet is assigned to a sequence
 # (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
-preferredOrder = ['DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX']
+preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
 # Useful annotations
 DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
 RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})

--- a/webservice.py
+++ b/webservice.py
@@ -32,11 +32,13 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
    # Get the entry
    try:
-        data = urllib.request.urlopen(url).read()
-        if data.startswith(b'ERROR'):
+        data = urllib.request.urlopen(url).read().decode("utf-8")
+        print (type(data))
+        if data.startswith("ERROR"):
            raise RuntimeError(data)
        return data
-    except(urllib.error.HTTPError, ex):
+
+    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())

 def search(query, dbName='uniprot', format='list', limit=100):
@@ -57,12 +59,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
        # Get the entries
        try:
-            data = urllib.request.urlopen(url).read()
+            data = urllib.request.urlopen(url).read().decode("utf-8")
            if format == 'list':
                return data.splitlines()
            else:
                return data
-        except(urllib.error.HTTPError, ex):
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
@@ -72,7 +74,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
        # Get the entries
        try:
-            data = urllib.request.urlopen(url).read()
+            data = urllib.request.urlopen(url).read().decode("utf-8")
            words = data.split("</Id>")
            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
            if format == 'list':
@@ -81,11 +83,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
                for w in words:
                    url += w + ","
-                data = urllib.request.urlopen(url).read()
+                data = urllib.request.urlopen(url).read().decode("utf-8")
                return data
            else:
                return ''
-        except(urllib.error.HTTPError, ex):
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    return

@@ -199,7 +201,7 @@ def getGODef(goterm):
    # Get the entry: fill in the fields specified below
    try:
        entry={'id': None, 'name': None, 'def': None}
-        data = urllib.request.urlopen(url).read()
+        data = urllib.request.urlopen(url).read().decode("utf-8")
        for row in data.splitlines():
            index = row.find(':')
            if index > 0 and len(row[index:]) > 1:
@@ -209,7 +211,7 @@ def getGODef(goterm):
                    if entry[field] == None:      # check if not yet assigned
                        entry[field] = value
        return entry
-    except(urllib.error.HTTPError, ex):
+    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())

 def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
@@ -252,9 +254,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
-                data = f.read()
+                data = f.read().decode("utf-8")
            else:
-                data = response.read()
+                data = response.read().decode("utf-8")
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
@@ -264,7 +266,7 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
                    else:
                        termsmap[key] = set([values[6]])
                        taxonmap[key] = int(values[4])
-        except(urllib.error.HTTPError, ex):
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    if completeAnnot:
        if len(genes) == 1:
@@ -304,13 +306,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
        url = __ebiGOUrl__ + uri_string + goterm.strip()
        # Get the entry: fill in the fields specified below
        try:
-            data = urllib.request.urlopen(url).read()
+            data = urllib.request.urlopen(url).read().decode("utf-8")
            for row in data.splitlines()[1:]:  # we ignore first (header) row
                values = row.split('\t')
                if len(values) >= 7:
                    genes.add(values[1])
            map[goterm] = list(genes)
-        except(urllib.error.HTTPError, ex):
+        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return map[goterms[0]]