added_regex_search_in_gappy_sequences

8fa94535 · Mikael Boden · 9889428a · 8fa94535 · 8fa94535 · 8fa94535
Commit 8fa94535 authored Jun 03, 2019 by Mikael Boden
7 changed files
--- a/annotation_test.py
+++ b/annotation_test.py
 import annotations
 import phylo
+tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_m
organii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
 # tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
@@ -10,17 +12,17 @@ import phylo
-working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
+# working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
+#
-tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
+# tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
+#
-print (tree)
+# print (tree)
-print (tree.nexus_annotations.annotations)
+# print (tree.nexus_annotations.annotations)
+#
-tree.swap_annotations("PDB")
+# tree.swap_annotations("PDB")
+#
-print (tree)
+# print (tree)
-print (tree.nexus_annotations.annotations)
+# print (tree.nexus_annotations.annotations)
 #
 # tree.write_to_nexus(working_dir + "output.nexus")

--- a/annotations.py
+++ b/annotations.py
 from collections import defaultdict
 from phylo import *
+import phylo
 import matplotlib
 import random
@@ -146,3 +147,5 @@ class NexusAnnotations():
    def generate_colour_list(self, num):
        return num
+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
--- a/gtf.py
+++ b/gtf.py
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
    f.close()
 if __name__ == '__main__':
-    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
+    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
    print(bf.chroms.keys())
    g = bf.generate('chr12')
    print(next(g))

--- a/phylo.py
+++ b/phylo.py
--- a/sequence.py
+++ b/sequence.py
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
                    if parse_defline:
                        parsed = parseDefline(seqinfo[0])
                        seqname = parsed[0]
-                    else:
-                        seqname = seqinfo[0]
                        seqinfo = line[1:]
+                    else: # we are not parsing the sequence name so no need to duplicate it in the info
+                        seqname = seqinfo[0]
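+                        if len(seqinfo) > 0: # NOTE(review): always true once seqinfo[0] was read above; a name-only defline still yields '' because seqinfo[1:] is empty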
+                            edited_info = ''
+                            for infopart in seqinfo[1:]:
+                                edited_info += infopart + ' '
+                            seqinfo = edited_info
+                        else:
+                            seqinfo = ''
                except IndexError as errmsg:
                    if not ignore:
                        raise RuntimeError(errmsg)
@@ -717,60 +724,62 @@ class Alignment():
                distmat[i, j] = distmat[j, i] = dist
        return distmat
-    def writeHTML(self, filename = None):
+    def writeHTML(self, filename = None, col_start = None, col_end = None):
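        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'.
            col_start and col_end optionally restrict the output to alignment columns
            col_start <= i < col_end; when omitted (or given as None/0) they default to
            the first column and to the alignment length, respectively.
        """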
+        col_start = col_start or 0
+        col_end = col_end or self.alignlen
        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
+        html += '''<p style="font-size:12px">\n'''
        maxNameLength =  self.getnamelen()
        html += ''.ljust(maxNameLength) + ' '
        for i in range(self.alignlen - 1):
-            if (i+1) % 10 == 0:
+            if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                html += str(i/10+1)[0]
-            else:
+            elif (i >= col_start and i < col_end):
                html += ' '
-        html += '%s\n' % (self.alignlen)
+#        html += '%s\n' % (col_end)
+        html += '\n'
        if self.alignlen > 10:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0:
+                if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
-                else:
+                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        if self.alignlen > 100:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0 and i >= 99:
+                if (i+1) % 10 == 0 and i >= 99  and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
+                elif (i >= col_start and i < col_end):
-                else:
                    html += ' '
            html += '\n'
        if self.alignlen > 1000:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0:
+                if (i+1) % 10 == 0  and (i >= col_start and i < col_end):
                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
+                elif (i >= col_start and i < col_end):
-                else:
                    html += ' '
            html += '\n'
        for seq in self.seqs:
            html += seq.name.ljust(maxNameLength) + ' '
-            for sym in seq:
+            for sym in seq[col_start:col_end]:
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
-        html += '</pre></body></html>'
+        html += '</p></pre></body></html>'
        if filename:
            fh = open(filename, 'w')
            fh.write(html)
@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        return self.pattern
-    def search(self, sequence):
+    def search(self, sequence, gappy = False):
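        """ Find matches to the motif in the specified sequence. Returns a list
        of triples, of the form (position, matched string, score). Note that
        the score is always 1.0 because a regexp either matches
        or doesn't.
        If gappy is True and the sequence itself is gappy, the regexp is matched
        against the de-gapped sequence instead: the matched string then contains
        no gaps, and the position is taken from the index list returned by
        getDegapped() rather than from the de-gapped string. """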
        if not type(sequence) is Sequence:
            sequence = Sequence(sequence)
+        if gappy == False or sequence.gappy == False:
            sequenceString = sequence[:]
            results = []
            for match in self.regex.finditer(sequenceString):
                results.append((match.start(), match.group(), 1.0))
            return results
+        else:  # if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
+            degapped, idxs = sequence.getDegapped()
+            results = []
+            for match in self.regex.finditer(''.join(degapped)):
+                results.append((idxs[match.start()], match.group(), 1.0))
+            return results
 class PWM(object):

--- a/sym.py
+++ b/sym.py
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'Protein': Protein_Alphabet,
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP' : Protein_wSTOP,
+                   'ProteinwGAP': Protein_wGAP,
                   'DSSP_Alphabet' : DSSP_Alphabet,
                   'DSSP3_Alphabet' : DSSP3_Alphabet}
 # The preferred order in which a predefined alphabet is assigned to a sequence
 # (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
-preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
+preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
+                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
 # Useful annotations
 DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
 RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
-Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+Protein_Alphabet.annotateAll('html-color', {
+# orange
+'G': "#F5A259",
+# green
+'N':"#00f900",
+'Q':"#00f900",
+'S': "#00f900",
+'T': "#00f900",
+# red
+'K': "#f62f00",
+'R': "#f62f00",
+# blue/purple
+'A':"#92b2f3",
+'I': "#92b2f3",
+'L': "#92b2f3",
+'M': "#92b2f3",
+'V': "#92b2f3",
+'W': "#92b2f3",
+'F': "#92b2f3",
+# yellow
+'P': "#FFFB00",
+# pink
+'C':"#F59692",
+# aqua
+'H': "#04B2B3",
+'Y': "#04B2B3",
+# purple
+'D':"#CE64CB",
+'E':"#CE64CB"})
 # ------------------ Substitution Matrix ------------------

--- a/webservice.py
+++ b/webservice.py
 import urllib.request
+import urllib.parse
 import os
 from time import sleep
 import stats
@@ -21,6 +22,7 @@ __ebiGOUrl__ =      'https://www.ebi.ac.uk/QuickGO/services/'
 __uniprotUrl__ = 'http://www.uniprot.org/'
 __ebiSearchUrl__ = 'http://www.ebi.ac.uk/ebisearch/'
 def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
 def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
@@ -55,9 +58,13 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    if dbName.startswith('uniprot'):
        # Construct URL
        if limit == None:  # no limit to number of results returned
-            url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query + '&columns='+ columns
+            url = "{}{}/?format={}&query={}&columns={}".format(__uniprotUrl__, dbName, format,
+                                                               urllib.parse.quote(query),
+                                                               columns)
        else:
-            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query + '&columns='+ columns
+            url = "{}{}/?format={}&limit={}&query={}&columns={}".format(__uniprotUrl__, dbName, format, str(limit),
+                                                                        urllib.parse.quote(query), columns)
        # Get the entries
        try:
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
+        url = base + "esearch.fcgi?db={}&term={}+AND+srcdb_refseq[" \
+              "prop]&retmax={}".format(dbName, urllib.parse.quote(query), str(limit))
+        print (url)
        # Get the entries
        try:
            data = urllib.request.urlopen(url).read().decode("utf-8")
            words = data.split("</Id>")
-            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
+            words = [w[w.find("<Id>") + 4:] for w in words[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
@@ -93,6 +107,7 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
            raise RuntimeError(ex.read())
    return
 authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
                           3702: ['Arabidopsis thaliana', 'TAIR_ID'],
                           4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
 Note that this service can be slow for queries involving a large number of entries.
 """
-def getGOReport(positives, background = None):
+def getGOReport(positives, background=None):
    """ Generate a complete GO term report for a set of genes (positives).
        Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
        Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background != None:
-            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name']))
+            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0] + t[1][1], defin['name']))
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret
 def getGODef(goterm):
    """
    Retrieve information about a GO term
@@ -165,7 +182,7 @@ def getGODef(goterm):
    url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
    # Get the entry: fill in the fields specified below
    try:
-        entry={'id': None, 'name': None, 'aspect': None}
+        entry = {'id': None, 'name': None, 'aspect': None}
        data = urllib.request.urlopen(url).read().decode("utf-8")
        ret = json.loads(data)
        for row in ret['results']:
@@ -179,6 +196,7 @@ def getGODef(goterm):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
 def getGOTerms(genes):
    """
    Retrieve all GO terms for a given set of genes (or single gene).
@@ -237,6 +255,7 @@ def getGOTerms(genes):
            raise RuntimeError(ex.read())
    return map
 def getGenes(goterms, taxo=None):
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
            else:
                break
            termcnt += 1
-        uri_string = 'annotation/search?limit=' + str(limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
+        uri_string = 'annotation/search?limit=' + str(
+            limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
        for i in range(len(termbatch)):
            term = termbatch[i]
            uri_string += term + "," if i < len(termbatch) - 1 else term
@@ -295,8 +315,8 @@ def getGenes(goterms, taxo=None):
            raise RuntimeError(ex.read())
    return map
-class EBI(object):
+class EBI(object):
    __email__ = 'anon@uq.edu.au'  # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/'  # Use UQ mirror when available
    __checkInterval__ = 2  # how long to wait between checking job status
@@ -349,7 +369,8 @@ class EBI(object):
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
                                  wait until it is complete before submitting another job. Go to
-                                  %sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
+                                  %sstatus/%s to check the status of the job.""" % (
+            self.service, self.__ebiServiceUrl__, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
@@ -423,8 +444,8 @@ class EBI(object):
        else:
            return results
-def getUniProtDict(ids, cols="", db='uniprot', identities=None):
+def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    """
    :param ids: The list of UniProt IDs
@@ -467,7 +488,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
    """
    # Format the lists of IDs and columns correctly
    cols = ",".join(cols)
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
        if type(identities) != list:
            identities = [identities] * len(ids)
        elif len(identities) != len(ids):
-            raise RuntimeError('Either supply a single identity threshold or supply one for each identifier in the list')
+            raise RuntimeError(
+                'Either supply a single identity threshold or supply one for each identifier in the list')
        # Check that the identity thresholds are valid values
        for x in identities:
            if x not in [1.0, 0.9, 0.5]:
-                raise RuntimeError("UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x))
+                raise RuntimeError(
+                    "UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x))
        # Add the query syntax around the identifiers
        updated_ids = ""
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    url = 'https://www.uniprot.org/' + db + '/'
    params = {
        'format': 'tab',
        'query': updated_ids,
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    # For each record we retrieve, split the line by tabs and build up the UniProt dict
    for line in page.split("\n")[1:]:
        if line:
-            splitlines= line.split("\t")
+            splitlines = line.split("\t")
            id_dict = {}
            pos = 1
            for col in cols.split(","):
                id_dict[col] = None if splitlines[pos] == "" else splitlines[pos]
-                pos +=1
+                pos += 1
            up_dict[splitlines[0]] = id_dict
    return up_dict