added_regex_search_in_gappy_sequences

8fa94535 · Mikael Boden · 9889428a · 8fa94535 · 8fa94535 · 8fa94535
Commit 8fa94535 authored Jun 03, 2019 by Mikael Boden
7 changed files
--- a/annotation_test.py
+++ b/annotation_test.py
 import annotations
 import phylo
+tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_m
organii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")

 # tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")

@@ -10,17 +12,17 @@ import phylo



-working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
-
-tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
-
-print (tree)
-print (tree.nexus_annotations.annotations)
-
-tree.swap_annotations("PDB")
-
-print (tree)
-print (tree.nexus_annotations.annotations)
+# working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
+#
+# tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
+#
+# print (tree)
+# print (tree.nexus_annotations.annotations)
+#
+# tree.swap_annotations("PDB")
+#
+# print (tree)
+# print (tree.nexus_annotations.annotations)
 #
 # tree.write_to_nexus(working_dir + "output.nexus")


--- a/annotations.py
+++ b/annotations.py
 from collections import defaultdict
 from phylo import *
+import phylo
 import matplotlib
 import random

@@ -146,3 +147,5 @@ class NexusAnnotations():
    def generate_colour_list(self, num):
        return num

+# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
+
--- a/gtf.py
+++ b/gtf.py
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
    f.close()

 if __name__ == '__main__':
-    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
+    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
    print(bf.chroms.keys())
    g = bf.generate('chr12')
    print(next(g))

--- a/phylo.py
+++ b/phylo.py
--- a/sequence.py
+++ b/sequence.py
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
                    if parse_defline:
                        parsed = parseDefline(seqinfo[0])
                        seqname = parsed[0]
-                    else:
+                        seqinfo = line[1:]
+                    else: # we are not parsing the sequence name so no need to duplicate it in the info
                        seqname = seqinfo[0]
-                    seqinfo = line[1:]
+                        if len(seqinfo) > 0: # more than a name
+                            edited_info = ''
+                            for infopart in seqinfo[1:]:
+                                edited_info += infopart + ' '
+                            seqinfo = edited_info
+                        else:
+                            seqinfo = ''
                except IndexError as errmsg:
                    if not ignore:
                        raise RuntimeError(errmsg)
@@ -717,60 +724,62 @@ class Alignment():
                distmat[i, j] = distmat[j, i] = dist
        return distmat

-    def writeHTML(self, filename = None):
+    def writeHTML(self, filename = None, col_start = None, col_end = None):
        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
+        col_start = col_start or 0
+        col_end = col_end or self.alignlen
        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
+        html += '''<p style="font-size:12px">\n'''
        maxNameLength =  self.getnamelen()
        html += ''.ljust(maxNameLength) + ' '
        for i in range(self.alignlen - 1):
-            if (i+1) % 10 == 0:
+            if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                html += str(i/10+1)[0]
-            else:
+            elif (i >= col_start and i < col_end):
                html += ' '
-        html += '%s\n' % (self.alignlen)
+#        html += '%s\n' % (col_end)
+        html += '\n'

        if self.alignlen > 10:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0:
+                if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
-                else:
+                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'

        if self.alignlen > 100:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0 and i >= 99:
+                if (i+1) % 10 == 0 and i >= 99  and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
-
-                else:
+                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'

        if self.alignlen > 1000:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-                if (i+1) % 10 == 0:
+                if (i+1) % 10 == 0  and (i >= col_start and i < col_end):
                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
-
-                else:
+                elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        for seq in self.seqs:
            html += seq.name.ljust(maxNameLength) + ' '
-            for sym in seq:
+            for sym in seq[col_start:col_end]:
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
-        html += '</pre></body></html>'
+        html += '</p></pre></body></html>'
        if filename:
            fh = open(filename, 'w')
            fh.write(html)
@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        return self.pattern

-    def search(self, sequence):
+    def search(self, sequence, gappy = False):
        """ Find matches to the motif in the specified sequence. Returns a list
        of triples, of the form (position, matched string, score). Note that
        the score is always 1.0 because a regexp either matches
        or doesn't. """
        if not type(sequence) is Sequence:
            sequence = Sequence(sequence)
-        sequenceString = sequence[:]
-        results = []
-        for match in self.regex.finditer(sequenceString):
-            results.append((match.start(), match.group(), 1.0))
-        return results
-
+        if gappy == False or sequence.gappy == False:
+            sequenceString = sequence[:]
+            results = []
+            for match in self.regex.finditer(sequenceString):
+                results.append((match.start(), match.group(), 1.0))
+            return results
+        else:  # if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
+            degapped, idxs = sequence.getDegapped()
+            results = []
+            for match in self.regex.finditer(''.join(degapped)):
+                results.append((idxs[match.start()], match.group(), 1.0))
+            return results

 class PWM(object):


--- a/sym.py
+++ b/sym.py
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'Protein': Protein_Alphabet,
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP' : Protein_wSTOP,
+                   'ProteinwGAP': Protein_wGAP,
                   'DSSP_Alphabet' : DSSP_Alphabet,
                   'DSSP3_Alphabet' : DSSP3_Alphabet}
 # The preferred order in which a predefined alphabet is assigned to a sequence
 # (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
-preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
+preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
+                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
 # Useful annotations
 DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
 RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
-Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+Protein_Alphabet.annotateAll('html-color', {
+# orange
+'G': "#F5A259",
+# green
+'N':"#00f900",
+'Q':"#00f900",
+'S': "#00f900",
+'T': "#00f900",
+# red
+'K': "#f62f00",
+'R': "#f62f00",
+# blue/purple
+'A':"#92b2f3",
+'I': "#92b2f3",
+'L': "#92b2f3",
+'M': "#92b2f3",
+'V': "#92b2f3",
+'W': "#92b2f3",
+'F': "#92b2f3",
+# yellow
+'P': "#FFFB00",
+# pink
+'C':"#F59692",
+# aqua
+'H': "#04B2B3",
+'Y': "#04B2B3",
+# purple
+'D':"#CE64CB",
+'E':"#CE64CB"})

 # ------------------ Substitution Matrix ------------------


--- a/webservice.py
+++ b/webservice.py