Commit 8fa94535 authored by Mikael Boden

added_regex_search_in_gappy_sequences

parent 9889428a
Pipeline #46 failed
import annotations
import phylo
tree = phylo.parseNewick("(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk") # tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...@@ -10,17 +12,17 @@ import phylo ...@@ -10,17 +12,17 @@ import phylo
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/" # working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus") # tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
print (tree) # print (tree)
print (tree.nexus_annotations.annotations) # print (tree.nexus_annotations.annotations)
#
tree.swap_annotations("PDB") # tree.swap_annotations("PDB")
#
print (tree) # print (tree)
print (tree.nexus_annotations.annotations) # print (tree.nexus_annotations.annotations)
# #
# tree.write_to_nexus(working_dir + "output.nexus") # tree.write_to_nexus(working_dir + "output.nexus")
......
from collections import defaultdict
from phylo import *
import phylo
import matplotlib
import random
@@ -146,3 +147,5 @@ class NexusAnnotations():
    def generate_colour_list(self, num):
        return num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
    f.close()

if __name__ == '__main__':
-    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
+    bf = GtfFile('/Users/mikael/simhome/NFIX/WT1689.gtf')
    print(bf.chroms.keys())
    g = bf.generate('chr12')
    print(next(g))
...
This diff is collapsed.
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defline
            if parse_defline:
                parsed = parseDefline(seqinfo[0])
                seqname = parsed[0]
-           else:
-               seqname = seqinfo[0]
-           seqinfo = line[1:]
+               seqinfo = line[1:]
+           else: # we are not parsing the sequence name so no need to duplicate it in the info
+               seqname = seqinfo[0]
+               if len(seqinfo) > 0: # more than a name
+                   edited_info = ''
+                   for infopart in seqinfo[1:]:
+                       edited_info += infopart + ' '
+                   seqinfo = edited_info
+               else:
+                   seqinfo = ''
        except IndexError as errmsg:
            if not ignore:
                raise RuntimeError(errmsg)
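A minimal sketch of the new defline handling, assuming readFasta returns Sequence objects that keep the defline remainder in .info; the FASTA record itself is made up for illustration:

fasta_text = '>seq1 putative kinase\nMKVLSTA\n'
seqs = readFasta(fasta_text, parse_defline = False)
print(seqs[0].name)   # expected 'seq1': the name is the first defline word
print(seqs[0].info)   # expected 'putative kinase ': the name is no longer duplicated in the info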
@@ -717,60 +724,62 @@ class Alignment():
                distmat[i, j] = distmat[j, i] = dist
        return distmat
-    def writeHTML(self, filename = None):
+    def writeHTML(self, filename = None, col_start = None, col_end = None):
        """ Generate HTML that displays the alignment in color.
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
+       col_start = col_start or 0
+       col_end = col_end or self.alignlen
        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
+       html += '''<p style="font-size:12px">\n'''
        maxNameLength = self.getnamelen()
        html += ''.ljust(maxNameLength) + ' '
        for i in range(self.alignlen - 1):
-           if (i+1) % 10 == 0:
+           if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                html += str(i/10+1)[0]
-           else:
+           elif (i >= col_start and i < col_end):
                html += ' '
-       html += '%s\n' % (self.alignlen)
+       # html += '%s\n' % (col_end)
+       html += '\n'
        if self.alignlen > 10:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-               if (i+1) % 10 == 0:
+               if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
-               else:
+               elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        if self.alignlen > 100:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-               if (i+1) % 10 == 0 and i >= 99:
+               if (i+1) % 10 == 0 and i >= 99 and (i >= col_start and i < col_end):
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) > 2) else '0'
-               else:
+               elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        if self.alignlen > 1000:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
-               if (i+1) % 10 == 0:
+               if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
-               else:
+               elif (i >= col_start and i < col_end):
                    html += ' '
            html += '\n'
        for seq in self.seqs:
            html += seq.name.ljust(maxNameLength) + ' '
-           for sym in seq:
+           for sym in seq[col_start:col_end]:
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
-       html += '</pre></body></html>'
+       html += '</p></pre></body></html>'
        if filename:
            fh = open(filename, 'w')
            fh.write(html)
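A hedged usage sketch of the new column window; readClustalFile is assumed to come from the same sequence module, and the file names are illustrative:

aln = readClustalFile('example.aln', Protein_Alphabet)
# Render only columns 100-149; omitting col_start/col_end falls back to
# 0..alignlen, i.e. the previous whole-alignment behaviour
aln.writeHTML('region.html', col_start = 100, col_end = 150)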
@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        return self.pattern
-   def search(self, sequence):
+   def search(self, sequence, gappy = False):
        """ Find matches to the motif in the specified sequence. Returns a list
        of triples, of the form (position, matched string, score). Note that
        the score is always 1.0 because a regexp either matches
        or doesn't. """
        if not type(sequence) is Sequence:
            sequence = Sequence(sequence)
+       if gappy == False or sequence.gappy == False:
            sequenceString = sequence[:]
            results = []
            for match in self.regex.finditer(sequenceString):
                results.append((match.start(), match.group(), 1.0))
            return results
+       else: # if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
+           degapped, idxs = sequence.getDegapped()
+           results = []
+           for match in self.regex.finditer(''.join(degapped)):
+               results.append((idxs[match.start()], match.group(), 1.0))
+           return results
class PWM(object):
...
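The commit's headline change in use: a sketch with a made-up motif and gapped sequence, assuming the Sequence constructor accepts a gappy flag and the Protein_wGAP alphabet added elsewhere in this commit:

rex = Regexp('N[ST]')                                 # made-up motif
seq = Sequence('MKN--TA', Protein_wGAP, name = 'q1', gappy = True)
print(rex.search(seq))                # gaps interrupt the motif on the raw string: expect []
print(rex.search(seq, gappy = True))  # matched on the de-gapped string, start mapped back to
                                      # the gapped coordinate via idxs: expect [(2, 'NT', 1.0)]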
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'Protein': Protein_Alphabet,
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP' : Protein_wSTOP,
+                  'ProteinwGAP': Protein_wGAP,
                   'DSSP_Alphabet' : DSSP_Alphabet,
                   'DSSP3_Alphabet' : DSSP3_Alphabet}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
-preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
+preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
+                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
-Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
+Protein_Alphabet.annotateAll('html-color', {
+    # orange
+    'G': "#F5A259",
+    # green
+    'N': "#00f900",
+    'Q': "#00f900",
+    'S': "#00f900",
+    'T': "#00f900",
+    # red
+    'K': "#f62f00",
+    'R': "#f62f00",
+    # blue/purple
+    'A': "#92b2f3",
+    'I': "#92b2f3",
+    'L': "#92b2f3",
+    'M': "#92b2f3",
+    'V': "#92b2f3",
+    'W': "#92b2f3",
+    'F': "#92b2f3",
+    # yellow
+    'P': "#FFFB00",
+    # pink
+    'C': "#F59692",
+    # aqua
+    'H': "#04B2B3",
+    'Y': "#04B2B3",
+    # purple
+    'D': "#CE64CB",
+    'E': "#CE64CB"})
# ------------------ Substitution Matrix ------------------
...
import urllib.request
import urllib.parse
import os
from time import sleep
import stats
@@ -21,6 +22,7 @@ __ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__ = 'http://www.uniprot.org/'
__ebiSearchUrl__ = 'http://www.ebi.ac.uk/ebisearch/'

def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())

def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
@@ -55,9 +58,13 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
    if dbName.startswith('uniprot'):
        # Construct URL
        if limit == None: # no limit to number of results returned
-            url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query + '&columns='+ columns
+            url = "{}{}/?format={}&query={}&columns={}".format(__uniprotUrl__, dbName, format,
+                                                               urllib.parse.quote(query),
+                                                               columns)
        else:
-            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query + '&columns='+ columns
+            url = "{}{}/?format={}&limit={}&query={}&columns={}".format(__uniprotUrl__, dbName, format, str(limit),
+                                                                        urllib.parse.quote(query), columns)
        # Get the entries
        try:
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
url = base + "esearch.fcgi?db={}&term={}+AND+srcdb_refseq[" \
"prop]&retmax={}".format(dbName, urllib.parse.quote(query), str(limit))
print (url)
        # Get the entries
        try:
            data = urllib.request.urlopen(url).read().decode("utf-8")
            words = data.split("</Id>")
-            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
+            words = [w[w.find("<Id>") + 4:] for w in words[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
@@ -93,6 +107,7 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
            raise RuntimeError(ex.read())
    return
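A usage sketch of the updated search; the query strings are illustrative and the calls hit live services. The colon-prefixed dbName reaching the Entrez route is an assumption based on the split(":") handling above:

# UniProt route: spaces in the query are now safe because of urllib.parse.quote
ids = search('antigen 85A', dbName = 'uniprot', format = 'list', limit = 5)
print(ids)
# NCBI route (assumed dbName convention): the rewritten URL also restricts
# hits to RefSeq via srcdb_refseq[prop]
refseq_ids = search('glucoamylase', dbName = 'ncbi:protein', format = 'list', limit = 5)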
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
                           3702: ['Arabidopsis thaliana', 'TAIR_ID'],
                           4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
-def getGOReport(positives, background = None):
+def getGOReport(positives, background=None):
""" Generate a complete GO term report for a set of genes (positives). """ Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided). Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...@@ -148,11 +164,12 @@ def getGOReport(positives, background = None): ...@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for t in sorted_cnt: for t in sorted_cnt:
defin = getGODef(t[0]) defin = getGODef(t[0])
if background != None: if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name'])) ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0] + t[1][1], defin['name']))
else: else:
ret.append((t[0], t[1], defin['name'])) ret.append((t[0], t[1], defin['name']))
return ret return ret
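Calling the report as the docstring describes; the accessions are placeholders and the call queries QuickGO live:

report = getGOReport(['P63165', 'P63279'])   # placeholder UniProt accessions
for row in report[:3]:
    print(row)   # without a background: (GO_Term_ID, Foreground_no, Term_description)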
def getGODef(goterm):
    """
    Retrieve information about a GO term
@@ -165,7 +182,7 @@ def getGODef(goterm):
    url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
    # Get the entry: fill in the fields specified below
    try:
-        entry={'id': None, 'name': None, 'aspect': None}
+        entry = {'id': None, 'name': None, 'aspect': None}
        data = urllib.request.urlopen(url).read().decode("utf-8")
        ret = json.loads(data)
        for row in ret['results']:
@@ -179,6 +196,7 @@ def getGODef(goterm):
    except urllib.error.HTTPError as ex:
        raise RuntimeError(ex.read())
def getGOTerms(genes):
    """
    Retrieve all GO terms for a given set of genes (or single gene).
@@ -237,6 +255,7 @@ def getGOTerms(genes):
        raise RuntimeError(ex.read())
    return map

def getGenes(goterms, taxo=None):
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
            else:
                break
            termcnt += 1
-        uri_string = 'annotation/search?limit=' + str(limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
+        uri_string = 'annotation/search?limit=' + str(
+            limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
        for i in range(len(termbatch)):
            term = termbatch[i]
            uri_string += term + "," if i < len(termbatch) - 1 else term
@@ -295,8 +315,8 @@ def getGenes(goterms, taxo=None):
        raise RuntimeError(ex.read())
    return map
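A sketch of a batched lookup; the GO term and taxon are illustrative, and the taxon must be a string because it is concatenated into the URI. The shape of the returned mapping (GO term to gene identifiers) is assumed here:

gene_map = getGenes(['GO:0006281'], taxo = '9606')   # DNA repair, Homo sapiens
for term, genes in gene_map.items():
    print(term, len(genes))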

class EBI(object):
    __email__ = 'anon@uq.edu.au'  # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/'  # Use UQ mirror when available
    __checkInterval__ = 2  # how long to wait between checking job status
@@ -349,7 +369,8 @@ class EBI(object):
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
-%sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
+%sstatus/%s to check the status of the job.""" % (
+                self.service, self.__ebiServiceUrl__, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
@@ -423,8 +444,8 @@ class EBI(object):
        else:
            return results

def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    """
    :param ids: The list of UniProt IDs
@@ -467,7 +488,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
    """
    # Format the lists of IDs and columns correctly
    cols = ",".join(cols)
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    if type(identities) != list:
        identities = [identities] * len(ids)
    elif len(identities) != len(ids):
-        raise RuntimeError('Either supply a single identity threshold or supply one for each identifier in the list')
+        raise RuntimeError(
+            'Either supply a single identity threshold or supply one for each identifier in the list')
    # Check that the identity thresholds are valid values
    for x in identities:
        if x not in [1.0, 0.9, 0.5]:
raise RuntimeError("UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x)) raise RuntimeError(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x))
    # Add the query syntax around the identifiers
    updated_ids = ""
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    url = 'https://www.uniprot.org/' + db + '/'
    params = {
        'format': 'tab',
        'query': updated_ids,
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
    # For each record we retrieve, split the line by tabs and build up the UniProt dict
    for line in page.split("\n")[1:]:
        if line:
-            splitlines= line.split("\t")
+            splitlines = line.split("\t")
            id_dict = {}
            pos = 1
            for col in cols.split(","):
                id_dict[col] = None if splitlines[pos] == "" else splitlines[pos]
-                pos +=1
+                pos += 1
            up_dict[splitlines[0]] = id_dict
    return up_dict
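A usage sketch matching the docstring; the accessions and column names are illustrative of UniProt's legacy tab-format columns and would need to be valid for the live service:

up = getUniProtDict(['P63165', 'P63279'], cols = ['genes', 'organism'])
print(up['P63165']['genes'])   # None whenever UniProt returns an empty field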