webservice.py 17.7 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1 2 3 4 5 6
import urllib.request
import os
from time import sleep
import stats
from io import StringIO
import gzip
7 8
import ssl
import json
Mikael Boden's avatar
Mikael Boden committed
9 10 11 12 13 14 15

""" This module is collection of functions for accessing the EBI REST web services,
    including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
    The class EBI takes precautions taken as to not send too many requests when
    performing BLAST and ClustalW queries.

    See
16
    http://www.ebi.ac.uk/Tools/webservices/tutorials
Mikael Boden's avatar
Mikael Boden committed
17 18
    """

19 20 21 22
__ebiUrl__ =        'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__ =      'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__ =    'http://www.uniprot.org/'
__ebiSearchUrl__ =  'http://www.ebi.ac.uk/ebisearch/'
Mikael Boden's avatar
Mikael Boden committed
23 24 25 26 27 28 29 30

def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database
    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
    dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
    format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
    See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
31 32

    http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
Mikael Boden's avatar
Mikael Boden committed
33 34
    """
     # Construct URL
35
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db=' + dbName + '&format=' + format + '&id=' + entryId
Mikael Boden's avatar
Mikael Boden committed
36 37
    # Get the entry
    try:
Mikael Boden's avatar
Mikael Boden committed
38 39
        data = urllib.request.urlopen(url).read().decode("utf-8")
        if data.startswith("ERROR"):
Mikael Boden's avatar
Mikael Boden committed
40 41
            raise RuntimeError(data)
        return data
Mikael Boden's avatar
Mikael Boden committed
42
    except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
43 44
        raise RuntimeError(ex.read())

Mikael Boden's avatar
Mikael Boden committed
45
def search(query, dbName='uniprot', format='list', limit=100, columns=""):
Mikael Boden's avatar
Mikael Boden committed
46 47 48 49 50 51 52 53 54 55 56 57
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
    query: search term(s) e.g. 'organism:9606+AND+antigen'
    dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
    format: file format e.g. 'list', 'fasta' or 'txt'
    limit: max number of results (specify None for all results)
    See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
    See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
    """
    if dbName.startswith('uniprot'):
        # Construct URL
        if limit == None: # no limit to number of results returned
Mikael Boden's avatar
Mikael Boden committed
58
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query + '&columns='+ columns
Mikael Boden's avatar
Mikael Boden committed
59
        else:
Mikael Boden's avatar
Mikael Boden committed
60
            url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query + '&columns='+ columns
Mikael Boden's avatar
Mikael Boden committed
61
        # Get the entries
Mikael Boden's avatar
Mikael Boden committed
62

Mikael Boden's avatar
Mikael Boden committed
63
        try:
Mikael Boden's avatar
Mikael Boden committed
64
            data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
65 66 67 68
            if format == 'list':
                return data.splitlines()
            else:
                return data
Mikael Boden's avatar
Mikael Boden committed
69
        except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
70 71 72 73 74 75 76 77 78
            raise RuntimeError(ex.read())
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
        # Get the entries
        try:
Mikael Boden's avatar
Mikael Boden committed
79
            data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
80 81 82 83 84 85 86 87
            words = data.split("</Id>")
            words = [w[w.find("<Id>")+4:] for w in words[:-1]]
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
                for w in words:
                    url += w + ","
Mikael Boden's avatar
Mikael Boden committed
88
                data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
89 90 91
                return data
            else:
                return ''
Mikael Boden's avatar
Mikael Boden committed
92
        except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
93 94 95 96 97 98 99 100 101 102 103 104 105 106
            raise RuntimeError(ex.read())
    return

authorised_database_tag = {9606:  ['Homo sapiens', 'ACC', 'ID'],
                           3702:  ['Arabidopsis thaliana', 'TAIR_ID'],
                           4932:  ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
                           10090: ['Mus musculus', 'MGI_ID']}

"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""

107
def getGOReport(positives, background = None):
Mikael Boden's avatar
Mikael Boden committed
108 109 110 111 112 113 114
    """ Generate a complete GO term report for a set of genes (positives).
        Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
        Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
        E-value is a Bonferroni-corrected p-value.
        """
    pos = set(positives)
115
    fg_map = getGOTerms(pos)
Mikael Boden's avatar
Mikael Boden committed
116 117 118 119 120 121 122 123 124
    fg_list = []
    for id in fg_map:
        for t in fg_map[id]:
            fg_list.append(t)
    bg_map = {}
    bg_list = []
    neg = set()
    if background != None:
        neg = set(background).difference(pos)
125
        bg_map = getGOTerms(neg)
Mikael Boden's avatar
Mikael Boden committed
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
        for id in bg_map:
            for t in bg_map[id]:
                bg_list.append(t)
    term_set = set(fg_list)
    term_cnt = {}

    nPos = len(pos)
    nNeg = len(neg)
    if background == None:
        for t in term_set:
            term_cnt[t] = fg_list.count(t)
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
    else: # a background is provided
        for t in term_set:
            fg_hit = fg_list.count(t)
            bg_hit = bg_list.count(t)
            fg_nohit = nPos - fg_hit
            bg_nohit = nNeg - bg_hit
            term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)

    ret = []
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background != None:
            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0]+t[1][1], defin['name']))
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret

def getGODef(goterm):
    """
    Retrieve information about a GO term
    goterm: the identifier, e.g. 'GO:0002080'
    """
161 162 163 164 165
    # first turn off server certificate verification
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    # Construct URL with query term
    url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
Mikael Boden's avatar
Mikael Boden committed
166 167
    # Get the entry: fill in the fields specified below
    try:
168
        entry={'id': None, 'name': None, 'aspect': None}
Mikael Boden's avatar
Mikael Boden committed
169
        data = urllib.request.urlopen(url).read().decode("utf-8")
170 171 172 173 174 175 176 177
        ret = json.loads(data)
        for row in ret['results']:
            for key in entry:
                try:
                    entry[key] = row[key]
                except:
                    entry[key] = None
            entry['def'] = row['definition']['text']
Mikael Boden's avatar
Mikael Boden committed
178
        return entry
Mikael Boden's avatar
Mikael Boden committed
179
    except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
180 181
        raise RuntimeError(ex.read())

182
def getGOTerms(genes):
Mikael Boden's avatar
Mikael Boden committed
183 184
    """
    Retrieve all GO terms for a given set of genes (or single gene).
185
    The result is given as a map (key=gene name, value=list of unique terms).
Mikael Boden's avatar
Mikael Boden committed
186 187 188
    """
    if type(genes) != list and type(genes) != set and type(genes) != tuple:
        genes = [genes]
189 190 191
    map = dict()
    batchsize = 100 # size of query batch
    genecnt = 0
Mikael Boden's avatar
Mikael Boden committed
192
    limitpage = 100 # number of record on each returned page
193 194 195 196 197
    while genecnt < len(genes):
        genebatch = []
        for index in range(batchsize):
            if genecnt < len(genes):
                genebatch.append(genes[genecnt])
Mikael Boden's avatar
Mikael Boden committed
198
            else:
199 200
                break
            genecnt += 1
Mikael Boden's avatar
Mikael Boden committed
201
        uri_string = 'annotation/search?limit=' + str(limitpage) + '&geneProductId='
202 203
        for i in range(len(genebatch)):
            gene = genebatch[i]
Mikael Boden's avatar
Mikael Boden committed
204 205 206 207 208 209 210 211 212 213
            uri_string += gene + "," if i < len(genebatch) - 1 else gene
        # Construct URL
        # Get the entry: fill in the fields specified below
        # first turn off server certificate verification
        if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        page = 1
        try:
            while (True):
                url = __ebiGOUrl__ + uri_string + '&page=' + str(page)
214 215 216 217 218 219 220 221 222 223
                urlreq = urllib.request.Request(url)
                urlreq.add_header('Accept-encoding', 'gzip')
                response = urllib.request.urlopen(urlreq)
                if response.info().get('Content-Encoding') == 'gzip':
                    buf = StringIO(response.read())
                    f = gzip.GzipFile(fileobj=buf)
                    data = f.read().decode("utf-8")
                else:
                    data = response.read().decode("utf-8")
                ret = json.loads(data)
Mikael Boden's avatar
Mikael Boden committed
224 225
                if page == 1 and int(ret['numberOfHits']) > limitpage * 100:
                    print('Warning:', ret['numberOfHits'], 'matches in a query. Be patient.')
226 227 228 229 230
                for row in ret['results']:
                    genename = row['geneProductId']  # would look like "UniProtKB:A0A140VJQ9"
                    gotermid = row['goId']  # would look like "GO:0002080"
                    if not genename in map:
                        map[genename] = set([gotermid])
Mikael Boden's avatar
Mikael Boden committed
231
                    else:
232
                        map[genename].add(gotermid)
Mikael Boden's avatar
Mikael Boden committed
233 234 235 236 237
                if len(ret['results']) < limitpage:
                    break
                page += 1
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
238
    return map
Mikael Boden's avatar
Mikael Boden committed
239

240
def getGenes(goterms, taxo=None):
Mikael Boden's avatar
Mikael Boden committed
241 242
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
243 244 245
    Genes that are annotated with a more specific GO term than those given are included.
    taxo: use specific taxonomic identifier, e.g. 9606 (human); default is all
    The result is given as a map (key=gene name, value=list of unique terms).
Mikael Boden's avatar
Mikael Boden committed
246 247 248 249
    """
    if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
        goterms = [goterms]
    map = dict()
Mikael Boden's avatar
Mikael Boden committed
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
    batchsize = 10 # size of query batch
    termcnt = 0
    limitpage = 100 # number of record on each returned page
    while termcnt < len(goterms):
        termbatch = []
        for index in range(batchsize):
            if termcnt < len(goterms):
                termbatch.append(goterms[termcnt])
            else:
                break
            termcnt += 1
        uri_string = 'annotation/search?limit=' + str(limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
        for i in range(len(termbatch)):
            term = termbatch[i]
            uri_string += term + "," if i < len(termbatch) - 1 else term
265 266 267
        # first turn off server certificate verification
        if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
Mikael Boden's avatar
Mikael Boden committed
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
        page = 1
        try:
            while (True):
                url = __ebiGOUrl__ + uri_string + '&page=' + str(page)
                urlreq = urllib.request.Request(url)
                urlreq.add_header('Accept-encoding', 'gzip')
                response = urllib.request.urlopen(urlreq)
                if response.info().get('Content-Encoding') == 'gzip':
                    buf = StringIO(response.read())
                    f = gzip.GzipFile(fileobj=buf)
                    data = f.read().decode("utf-8")
                else:
                    data = response.read().decode("utf-8")
                ret = json.loads(data)
                if page == 1 and int(ret['numberOfHits']) > limitpage * 100:
                    print('Warning:', ret['numberOfHits'], 'matches in a query. Be patient.')
                for row in ret['results']:
                    genename = row['geneProductId']  # would look like "UniProtKB:A0A140VJQ9"
                    gotermid = row['goId']  # would look like "GO:0002080"
                    if not gotermid in map:
                        map[gotermid] = set([genename])
                    else:
                        map[gotermid].add(genename)
                if len(ret['results']) < limitpage:
                    break
                page += 1
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
296
    return map
Mikael Boden's avatar
Mikael Boden committed
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424

class EBI(object):

    __email__ =         'anon@uq.edu.au'                            # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
    __checkInterval__ = 2                                           # how long to wait between checking job status

    def __init__(self, service=None):
        """ Initialise service session.
        service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
        """
        self.service = service
        self.lockFile = '%s.lock' % service

    def createLock(self):
        """ Create a lock file to prevent submission of more than 1 job
        at a time by a single user. """
        fh = open(self.lockFile, 'w')
        fh.write(self.jobId)
        fh.close()

    def removeLock(self):
        """ Remove the lock file. """
        os.remove(self.lockFile)

    def isLocked(self):
        """ Check if there is a lock on this service. If there is, check if
        the job is complete, and if so remove the lock. Return True if still
        locked and False if not. """
        if os.path.exists(self.lockFile):
            fh = open(self.lockFile, 'r')
            jobId = fh.read()
            fh.close()
            status = self.status(jobId)
            if status == 'RUNNING':
                self.jobId = jobId
                return True
            else:
                self.removeLock()
                return False
        else:
            return False

    """
    BLAST and CLUSTALW services
    """

    def run(self, params):
        """ Submit a job to the given service with the given parameters, given
        as a dictionary. Return the jobId. """
        if self.service == None:
            raise RuntimeError('No service specified')
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
                                  wait until it is complete before submitting another job. Go to
                                  %sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
            databaseList = params['database']
            del params['database']
            databaseData = ''
            for db in databaseList:
                databaseData += '&database=' + db
            encodedParams = urllib.parse.urlencode(params)
            encodedParams += databaseData
        else:
            encodedParams = urllib.parse.urlencode(params)
        print(url)
        self.jobId = urllib.request.urlopen(url, encodedParams).read()
        self.createLock()
        return self.jobId

    def status(self, jobId=None):
        """ Check the status of the given job (or the current job if none is
        specified), and return the result. """
        if jobId is None:
            jobId = self.jobId
        url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
        status = urllib.request.urlopen(url).read()
        return status

    def resultTypes(self):
        """ Get the available result types. Will only work on a finished job. """
        url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
        resultTypes = urllib.request.urlopen(url).read()
        return resultTypes

    def result(self, resultType):
        """ Get the result of the given job of the specified type. """
        url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
        try:
            result = urllib.request.urlopen(url).read()
            if resultType == 'error':
                raise RuntimeError('An error occurred: %s' % result)
        except(urllib.error.HTTPError):
            if resultType == 'error':
                raise RuntimeError('An unknown error occurred while processing the job (check your input)')
            else:
                self.result('error')
        return result

    def submit(self, params, resultTypes):
        """ Submit a new job to the service with the given parameters.
        Return the output in the specified format. """
        params['email'] = self.__email__
        self.run(params)
        print(('Submitted new', self.service, 'job, jobId:', self.jobId))
        print('Please be patient while the job is completed')
        status = 'RUNNING'
        observe = 0
        while status == 'RUNNING':
            observe = observe + 1
            status = self.status()
            sleep(self.__checkInterval__)
        if status != 'FINISHED':
            raise RuntimeError('An error occurred and the job could not be completed')
        print('Job complete.')
        self.removeLock()
        if type(resultTypes) != list:
            resultTypes = [resultTypes]
        results = []
        for resultType in resultTypes:
            results.append(self.result(resultType))
        if len(results) == 1:
            return results[0]
        else:
            return results