Commit 6b05a37e authored by Mikael Boden

updated_webservice.py_with_new_protocols_for_accessing_uniprot_and_GO

parent eff5f374
...@@ -7,7 +7,7 @@ matplotlib_colours={'aliceblue':'#F0F8FF','aqua':'#00FFFF','aquamarine':'#7FFFD4 ...@@ -7,7 +7,7 @@ matplotlib_colours={'aliceblue':'#F0F8FF','aqua':'#00FFFF','aquamarine':'#7FFFD4
twenty_colours ={'dodgerblue':'#1E90FF', 'orangered':'#FF4500', 'greenyellow':'#ADFF2F', 'orchid':'#DA70D6'} twenty_colours ={'dodgerblue':'#1E90FF', 'orangered':'#FF4500', 'greenyellow':'#ADFF2F', 'orchid':'#DA70D6'}
symbols = ["*", "^", "!", "#", "~", "+", ":", "<", ">", "@", "%", "=", "-"] symbols = ["*", "^", "!", "#", "~", "+", ":", "<", ">", "@", "%", "=", "-", "a", "b", "c", "d", "e" "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo", "pp", "qq", "rr", "ss", "tt", "uu", "vv", "ww", "xx", "yy", "zz"]
class NexusAnnotations(): class NexusAnnotations():
""" """
...@@ -59,6 +59,10 @@ class NexusAnnotations(): ...@@ -59,6 +59,10 @@ class NexusAnnotations():
if annot not in self.annotation_symbols: if annot not in self.annotation_symbols:
symbol = self.generate_annotation_symbol(annot) symbol = self.generate_annotation_symbol(annot)
self.add_annotation_symbol(annot, symbol) self.add_annotation_symbol(annot, symbol)
else:
symbol = self.generate_annotation_symbol(annot)
self.add_annotation_symbol(annot, symbol)
# Add in a symbol to represent this annotation if one doesn't already exist # Add in a symbol to represent this annotation if one doesn't already exist
...@@ -113,6 +117,8 @@ class NexusAnnotations(): ...@@ -113,6 +117,8 @@ class NexusAnnotations():
symbol = random.choice(self.symbol_list) symbol = random.choice(self.symbol_list)
if symbol not in self.annotation_symbols.values(): if symbol not in self.annotation_symbols.values():
return symbol return symbol
else:
return symbol
i+=1 i+=1
def add_annotation_symbol(self, symbol, annotation): def add_annotation_symbol(self, symbol, annotation):
...@@ -127,13 +133,16 @@ class NexusAnnotations(): ...@@ -127,13 +133,16 @@ class NexusAnnotations():
:return: A unique colour :return: A unique colour
""" """
i = 0 # i = 0
while i < len(self.colour_dict.values()): # while i < len(self.colour_dict.values()):
colour = random.choice(list(self.colour_dict.values())) # colour = random.choice(list(self.colour_dict.values()))
if colour not in self.used_colours: # if colour not in self.used_colours:
return colour # return colour
i+=1 # else:
# return colour
# i+=1
colour = random.choice(list(self.colour_dict.values()))
return colour
def generate_colour_list(self, num): def generate_colour_list(self, num):
return num return num
...@@ -4,6 +4,8 @@ from time import sleep ...@@ -4,6 +4,8 @@ from time import sleep
import stats import stats
from io import StringIO from io import StringIO
import gzip import gzip
import ssl
import json
""" This module is collection of functions for accessing the EBI REST web services, """ This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW. including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
...@@ -11,14 +13,13 @@ import gzip ...@@ -11,14 +13,13 @@ import gzip
performing BLAST and ClustalW queries. performing BLAST and ClustalW queries.
See See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and http://www.ebi.ac.uk/Tools/webservices/tutorials
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
""" """
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available __ebiUrl__ = 'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO-Old/' # Use UQ mirror when available __ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__ = 'http://www.uniprot.org/' # __uniprotUrl__ = 'http://www.uniprot.org/'
__ebiSearchUrl__ = 'http://www.ebi.ac.uk/ebisearch/'
def fetch(entryId, dbName='uniprotkb', format='fasta'): def fetch(entryId, dbName='uniprotkb', format='fasta'):
""" """
...@@ -27,16 +28,17 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'): ...@@ -27,16 +28,17 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases) format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
""" """
# Construct URL # Construct URL
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry # Get the entry
try: try:
data = urllib.request.urlopen(url).read().decode("utf-8") data = urllib.request.urlopen(url).read().decode("utf-8")
if data.startswith("ERROR"): if data.startswith("ERROR"):
raise RuntimeError(data) raise RuntimeError(data)
return data return data
except urllib.error.HTTPError as ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
...@@ -95,53 +97,13 @@ authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'], ...@@ -95,53 +97,13 @@ authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'], 4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']} 10090: ['Mus musculus', 'MGI_ID']}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
"""
Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the tag/abbreviation for the identifier FROM which to idmap
to: the tag/abbreviation for the identifier TO which to idmap
format: the results format to use
reverse: reverse the returned mapping key (to) -> value (from)
Returns a dictionary with key (from) -> value (to)
Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
"""
url = __uniprotUrl__ + 'mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : format,
'query' : query
}
if len(query) > 0:
request = urllib.request.Request(url, urllib.parse.urlencode(params))
response = urllib.request.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
if not reverse:
d[pair[0]] = pair[1]
else:
d[pair[1]] = pair[0]
return d
else:
return dict()
""" """
Gene Ontology service (QuickGO) Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries. Note that this service can be slow for queries involving a large number of entries.
""" """
def getGOReport(positives, background = None, database = 'UniProtKB'): def getGOReport(positives, background = None):
""" Generate a complete GO term report for a set of genes (positives). """ Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided). Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...@@ -149,7 +111,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'): ...@@ -149,7 +111,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
E-value is a Bonferroni-corrected p-value. E-value is a Bonferroni-corrected p-value.
""" """
pos = set(positives) pos = set(positives)
fg_map = getGOTerms(pos, database) fg_map = getGOTerms(pos)
fg_list = [] fg_list = []
for id in fg_map: for id in fg_map:
for t in fg_map[id]: for t in fg_map[id]:
...@@ -159,7 +121,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'): ...@@ -159,7 +121,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
neg = set() neg = set()
if background != None: if background != None:
neg = set(background).difference(pos) neg = set(background).difference(pos)
bg_map = getGOTerms(neg, database) bg_map = getGOTerms(neg)
for id in bg_map: for id in bg_map:
for t in bg_map[id]: for t in bg_map[id]:
bg_list.append(t) bg_list.append(t)
...@@ -195,129 +157,108 @@ def getGODef(goterm): ...@@ -195,129 +157,108 @@ def getGODef(goterm):
Retrieve information about a GO term Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080' goterm: the identifier, e.g. 'GO:0002080'
""" """
# Construct URL # first turn off server certificate verification
url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
# Construct URL with query term
url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
# Get the entry: fill in the fields specified below # Get the entry: fill in the fields specified below
try: try:
entry={'id': None, 'name': None, 'def': None} entry={'id': None, 'name': None, 'aspect': None}
data = urllib.request.urlopen(url).read().decode("utf-8") data = urllib.request.urlopen(url).read().decode("utf-8")
for row in data.splitlines(): ret = json.loads(data)
index = row.find(':') for row in ret['results']:
if index > 0 and len(row[index:]) > 1: for key in entry:
field = row[0:index].strip() try:
value = row[index+1:].strip(' "') # remove spaces and quotation marks entry[key] = row[key]
if field in list(entry.keys()): # check if we need this field except:
if entry[field] == None: # check if not yet assigned entry[key] = None
entry[field] = value entry['def'] = row['definition']['text']
return entry return entry
except urllib.error.HTTPError as ex: except urllib.error.HTTPError as ex:
raise RuntimeError(ex.read()) raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False): def getGOTerms(genes):
""" """
Retrieve all GO terms for a given set of genes (or single gene). Retrieve all GO terms for a given set of genes (or single gene).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl' The result is given as a map (key=gene name, value=list of unique terms).
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If completeAnnot is True (default is False) then the above "terms" is the first element
in a tuple with (gene-terms-map, gene-taxon-id).
""" """
if type(genes) != list and type(genes) != set and type(genes) != tuple: if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes] genes = [genes]
termsmap = dict() map = dict()
taxonmap = dict() uri_string = 'annotation/search?geneProductId='
uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein=' batchsize = 100 # size of query batch
# build queries (batches of genes) genecnt = 0
queryLength = 2000 while genecnt < len(genes):
queries = [] genebatch = []
query = None for index in range(batchsize):
for gene in genes: if genecnt < len(genes):
if query == None: genebatch.append(genes[genecnt])
query = gene
elif len(query) < queryLength:
query += ','+gene
else:
queries.append(query)
query = gene
if query != None:
queries.append(query)
# execute queries, each involving a number of genes
for query in queries:
# Construct URL
url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below
try:
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read().decode("utf-8")
else: else:
data = response.read().decode("utf-8") break
for row in data.splitlines()[1:]: # we ignore first (header) row genecnt += 1
values = row.split('\t') for i in range(len(genebatch)):
if len(values) >= 7: gene = genebatch[i]
key = values[1] uri_string += gene + "," if i < len(genes) - 1 else gene
if key in termsmap: # Construct URL
termsmap[key].add(values[6]) url = __ebiGOUrl__ + uri_string
# Get the entry: fill in the fields specified below
try:
# first turn off server certificate verification
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read().decode("utf-8")
else:
data = response.read().decode("utf-8")
ret = json.loads(data)
for row in ret['results']:
genename = row['geneProductId'] # would look like "UniProtKB:A0A140VJQ9"
gotermid = row['goId'] # would look like "GO:0002080"
if not genename in map:
map[genename] = set([gotermid])
else: else:
termsmap[key] = set([values[6]]) map[genename].add(gotermid)
taxonmap[key] = int(values[4]) except urllib.error.HTTPError as ex:
except urllib.error.HTTPError as ex: raise RuntimeError(ex.read())
raise RuntimeError(ex.read()) return map
if completeAnnot:
if len(genes) == 1:
if len(termsmap) == 1:
return (termsmap[genes[0]], taxonmap[genes[0]])
else:
return (set(), None)
else:
return (termsmap, taxonmap)
else:
if len(genes) == 1:
if len(termsmap) == 1:
return termsmap[genes[0]]
else:
return set()
else:
return termsmap
def getGenes(goterms, database='UniProtKB', taxo=None): def getGenes(goterms, taxo=None):
""" """
Retrieve all genes/proteins for a given set of GO terms (or single GO term). Retrieve all genes/proteins for a given set of GO terms (or single GO term).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl' Genes that are annotated with a more specific GO term than those given are included.
taxo: use specific taxonomic identifier, e.g. 9606 (human) taxo: use specific taxonomic identifier, e.g. 9606 (human); default is all
The result is given as a map (key=gene name, value=list of unique terms) OR The result is given as a map (key=gene name, value=list of unique terms).
in the case of a single gene as a list of unique terms.
""" """
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple: if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
goterms = [goterms] goterms = [goterms]
map = dict() map = dict()
if taxo == None: uri_string = 'annotation/search?taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
uri_string = 'GAnnotation?format=tsv&db=' + database + '&term=' for i in range(len(goterms)):
else: goterm = goterms[i]
uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term=' uri_string += goterm + "," if i < len(goterms) - 1 else goterm
for goterm in goterms: # Get the entry: fill in the fields specified below
genes = set() try:
# Construct URL # first turn off server certificate verification
url = __ebiGOUrl__ + uri_string + goterm.strip() if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
# Get the entry: fill in the fields specified below ssl._create_default_https_context = ssl._create_unverified_context
try: data = urllib.request.urlopen(__ebiGOUrl__ + uri_string).read().decode("utf-8")
data = urllib.request.urlopen(url).read().decode("utf-8") ret = json.loads(data)
for row in data.splitlines()[1:]: # we ignore first (header) row for row in ret['results']:
values = row.split('\t') genename = row['geneProductId'] # would look like "UniProtKB:A0A140VJQ9"
if len(values) >= 7: gotermid = row['goId'] # would look like "GO:0002080"
genes.add(values[1]) if not gotermid in map:
map[goterm] = list(genes) map[gotermid] = set([genename])
except urllib.error.HTTPError as ex: else:
raise RuntimeError(ex.read()) map[gotermid].add(genename)
if len(goterms) == 1: except urllib.error.HTTPError as ex:
return map[goterms[0]] raise RuntimeError(ex.read())
else: return map
return map
class EBI(object): class EBI(object):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment