Commit 8fa94535 authored by Mikael Boden

added_regex_search_in_gappy_sequences

parent 9889428a
Pipeline #46 failed with stages
import annotations
import phylo
tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_mo
rganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
......@@ -10,17 +12,17 @@ import phylo
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
print (tree)
print (tree.nexus_annotations.annotations)
tree.swap_annotations("PDB")
print (tree)
print (tree.nexus_annotations.annotations)
# working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
# tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
# print (tree)
# print (tree.nexus_annotations.annotations)
#
# tree.swap_annotations("PDB")
#
# print (tree)
# print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
......
from collections import defaultdict
from phylo import *
import phylo
import matplotlib
import random
......@@ -146,3 +147,5 @@ class NexusAnnotations():
def generate_colour_list(self, num):
    """ Placeholder: hands back `num` unchanged.

    NOTE(review): the name suggests this should build a list of `num`
    colours, but no list is generated here -- confirm the intended
    behaviour before relying on the return value.
    """
    return num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
......@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f.close()
if __name__ == '__main__':
bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
print(bf.chroms.keys())
g = bf.generate('chr12')
print(next(g))
......
This diff is collapsed.
......@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if parse_defline:
parsed = parseDefline(seqinfo[0])
seqname = parsed[0]
else:
seqinfo = line[1:]
else: # we are not parsing the sequence name so no need to duplicate it in the info
seqname = seqinfo[0]
seqinfo = line[1:]
if len(seqinfo) > 0: # more than a name
edited_info = ''
for infopart in seqinfo[1:]:
edited_info += infopart + ' '
seqinfo = edited_info
else:
seqinfo = ''
except IndexError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
......@@ -717,60 +724,62 @@ class Alignment():
distmat[i, j] = distmat[j, i] = dist
return distmat
def writeHTML(self, filename = None):
def writeHTML(self, filename = None, col_start = None, col_end = None):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
col_start = col_start or 0
col_end = col_end or self.alignlen
html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
html += '''<p style="font-size:12px">\n'''
maxNameLength = self.getnamelen()
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
html += str(i/10+1)[0]
else:
elif (i >= col_start and i < col_end):
html += ' '
html += '%s\n' % (self.alignlen)
# html += '%s\n' % (col_end)
html += '\n'
if self.alignlen > 10:
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
index = len(str(i/10 + 1).split('.')[0])
html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
else:
elif (i >= col_start and i < col_end):
html += ' '
html += '\n'
if self.alignlen > 100:
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0 and i >= 99:
if (i+1) % 10 == 0 and i >= 99 and (i >= col_start and i < col_end):
index = len(str(i/10 + 1).split('.')[0])
html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
else:
elif (i >= col_start and i < col_end):
html += ' '
html += '\n'
if self.alignlen > 1000:
html += ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
else:
elif (i >= col_start and i < col_end):
html += ' '
html += '\n'
for seq in self.seqs:
html += seq.name.ljust(maxNameLength) + ' '
for sym in seq:
for sym in seq[col_start:col_end]:
color = self.alphabet.getAnnotation('html-color', sym)
if not color:
color = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
html += '\n'
html += '</pre></body></html>'
html += '</p></pre></body></html>'
if filename:
fh = open(filename, 'w')
fh.write(html)
......@@ -1187,19 +1196,25 @@ class Regexp(object):
def __str__(self):
return self.pattern
def search(self, sequence, gappy = False):
    """ Find matches to the motif in the specified sequence. Returns a list
    of triples, of the form (position, matched string, score). Note that
    the score is always 1.0 because a regexp either matches
    or doesn't.

    sequence: a Sequence instance; anything else is passed to the Sequence
        constructor first.
    gappy: when True AND the sequence itself is flagged as gappy, the regexp
        is run on the de-gapped sequence. Each reported position is then
        looked up in the index list returned by getDegapped() (presumably
        the position in the original, gapped sequence -- confirm against
        Sequence.getDegapped), and the matched string contains no gaps.
        In every other case the regexp is run on the sequence as is.
    """
    if not isinstance(sequence, Sequence):
        sequence = Sequence(sequence)
    if gappy and sequence.gappy:
        # Match on the de-gapped symbols, then translate each hit back via idxs.
        # NOTE(review): a zero-length match at the very end of the de-gapped
        # string has start == len(string); if idxs holds one entry per
        # de-gapped symbol that lookup would fail -- confirm.
        degapped, idxs = sequence.getDegapped()
        return [(idxs[match.start()], match.group(), 1.0)
                for match in self.regex.finditer(''.join(degapped))]
    return [(match.start(), match.group(), 1.0)
            for match in self.regex.finditer(sequence[:])]
class PWM(object):
......
......@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein': Protein_Alphabet,
'ProteinwX': Protein_wX,
'ProteinwSTOP' : Protein_wSTOP,
'ProteinwGAP': Protein_wGAP,
'DSSP_Alphabet' : DSSP_Alphabet,
'DSSP3_Alphabet' : DSSP3_Alphabet}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid).
# Earlier names take precedence, so 'ProteinwGAP' is only tried after the
# gap-free protein alphabets.
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
# 'html-color' maps each symbol to the background colour that
# Alignment.writeHTML looks up for it; symbols without an entry fall back to
# white there.
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
# Previous protein palette, kept for reference:
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet.annotateAll('html-color', {
    # orange
    'G': "#F5A259",
    # green
    'N':"#00f900",
    'Q':"#00f900",
    'S': "#00f900",
    'T': "#00f900",
    # red
    'K': "#f62f00",
    'R': "#f62f00",
    # blue/purple
    'A':"#92b2f3",
    'I': "#92b2f3",
    'L': "#92b2f3",
    'M': "#92b2f3",
    'V': "#92b2f3",
    'W': "#92b2f3",
    'F': "#92b2f3",
    # yellow
    'P': "#FFFB00",
    # pink
    'C':"#F59692",
    # aqua
    'H': "#04B2B3",
    'Y': "#04B2B3",
    # purple
    'D':"#CE64CB",
    'E':"#CE64CB"})
# ------------------ Substitution Matrix ------------------
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment