webservice.py 21.8 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1
import urllib.request
2
import urllib.parse
Mikael Boden's avatar
Mikael Boden committed
3 4 5 6 7
import os
from time import sleep
import stats
from io import StringIO
import gzip
8 9
import ssl
import json
Mikael Boden's avatar
Mikael Boden committed
10 11 12 13 14 15 16

""" This module is collection of functions for accessing the EBI REST web services,
    including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
    The class EBI takes precautions taken as to not send too many requests when
    performing BLAST and ClustalW queries.

    See
17
    http://www.ebi.ac.uk/Tools/webservices/tutorials
Mikael Boden's avatar
Mikael Boden committed
18 19
    """

20 21 22 23 24
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__ = 'http://www.uniprot.org/'
__ebiSearchUrl__ = 'http://www.ebi.ac.uk/ebisearch/'

Mikael Boden's avatar
Mikael Boden committed
25 26 27 28 29 30 31 32

def fetch(entryId, dbName='uniprotkb', format='fasta'):
    """
    Retrieve a single entry from a database
    entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
    dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
    format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
    See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
33 34

    http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
Mikael Boden's avatar
Mikael Boden committed
35
    """
36
    # Construct URL
37
    url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db=' + dbName + '&format=' + format + '&id=' + entryId
Mikael Boden's avatar
Mikael Boden committed
38 39
    # Get the entry
    try:
Mikael Boden's avatar
Mikael Boden committed
40 41
        data = urllib.request.urlopen(url).read().decode("utf-8")
        if data.startswith("ERROR"):
Mikael Boden's avatar
Mikael Boden committed
42 43
            raise RuntimeError(data)
        return data
Mikael Boden's avatar
Mikael Boden committed
44
    except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
45 46
        raise RuntimeError(ex.read())

47

Mikael Boden's avatar
Mikael Boden committed
48
def search(query, dbName='uniprot', format='list', limit=100, columns=""):
Mikael Boden's avatar
Mikael Boden committed
49 50 51 52 53 54 55 56 57 58 59
    """
    Retrieve multiple entries matching query from a database currently only via UniProtKB
    query: search term(s) e.g. 'organism:9606+AND+antigen'
    dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
    format: file format e.g. 'list', 'fasta' or 'txt'
    limit: max number of results (specify None for all results)
    See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
    See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
    """
    if dbName.startswith('uniprot'):
        # Construct URL
60 61 62 63
        if limit == None:  # no limit to number of results returned
            url = "{}{}/?format={}&query={}&columns={}".format(__uniprotUrl__, dbName, format,
                                                               urllib.parse.quote(query),
                                                               columns)
Mikael Boden's avatar
Mikael Boden committed
64
        else:
65 66 67
            url = "{}{}/?format={}&limit={}&query={}&columns={}".format(__uniprotUrl__, dbName, format, str(limit),
                                                                        urllib.parse.quote(query), columns)

Mikael Boden's avatar
Mikael Boden committed
68
        # Get the entries
Mikael Boden's avatar
Mikael Boden committed
69

Mikael Boden's avatar
Mikael Boden committed
70
        try:
Mikael Boden's avatar
Mikael Boden committed
71
            data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
72 73 74 75
            if format == 'list':
                return data.splitlines()
            else:
                return data
Mikael Boden's avatar
Mikael Boden committed
76
        except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
77
            raise RuntimeError(ex.read().decode("utf-8"))
Mikael Boden's avatar
Mikael Boden committed
78 79 80 81
    elif dbName.startswith('refseq'):
        dbs = dbName.split(":")
        if len(dbs) > 1:
            dbName = dbs[1]
82 83


Mikael Boden's avatar
Mikael Boden committed
84
        base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
85 86 87 88 89 90

        url = base + "esearch.fcgi?db={}&term={}+AND+srcdb_refseq[" \
              "prop]&retmax={}".format(dbName, urllib.parse.quote(query), str(limit))

        print (url)

Mikael Boden's avatar
Mikael Boden committed
91 92
        # Get the entries
        try:
Mikael Boden's avatar
Mikael Boden committed
93
            data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
94
            words = data.split("</Id>")
95
            words = [w[w.find("<Id>") + 4:] for w in words[:-1]]
Mikael Boden's avatar
Mikael Boden committed
96 97 98 99 100 101
            if format == 'list':
                return words
            elif format == 'fasta' and len(words) > 0:
                url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
                for w in words:
                    url += w + ","
Mikael Boden's avatar
Mikael Boden committed
102
                data = urllib.request.urlopen(url).read().decode("utf-8")
Mikael Boden's avatar
Mikael Boden committed
103 104 105
                return data
            else:
                return ''
Mikael Boden's avatar
Mikael Boden committed
106
        except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
107 108 109
            raise RuntimeError(ex.read())
    return

110 111 112 113

authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
                           3702: ['Arabidopsis thaliana', 'TAIR_ID'],
                           4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
Mikael Boden's avatar
Mikael Boden committed
114 115 116 117 118 119 120 121
                           10090: ['Mus musculus', 'MGI_ID']}

"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""

122 123

def getGOReport(positives, background=None):
Mikael Boden's avatar
Mikael Boden committed
124 125 126 127 128 129 130
    """ Generate a complete GO term report for a set of genes (positives).
        Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
        Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
        E-value is a Bonferroni-corrected p-value.
        """
    pos = set(positives)
131
    fg_map = getGOTerms(pos)
Mikael Boden's avatar
Mikael Boden committed
132 133 134 135 136 137 138 139 140
    fg_list = []
    for id in fg_map:
        for t in fg_map[id]:
            fg_list.append(t)
    bg_map = {}
    bg_list = []
    neg = set()
    if background != None:
        neg = set(background).difference(pos)
141
        bg_map = getGOTerms(neg)
Mikael Boden's avatar
Mikael Boden committed
142 143 144 145 146 147 148 149 150 151 152 153
        for id in bg_map:
            for t in bg_map[id]:
                bg_list.append(t)
    term_set = set(fg_list)
    term_cnt = {}

    nPos = len(pos)
    nNeg = len(neg)
    if background == None:
        for t in term_set:
            term_cnt[t] = fg_list.count(t)
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
154
    else:  # a background is provided
Mikael Boden's avatar
Mikael Boden committed
155 156 157 158 159 160 161 162 163 164 165 166
        for t in term_set:
            fg_hit = fg_list.count(t)
            bg_hit = bg_list.count(t)
            fg_nohit = nPos - fg_hit
            bg_nohit = nNeg - bg_hit
            term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)

    ret = []
    for t in sorted_cnt:
        defin = getGODef(t[0])
        if background != None:
167
            ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][0] + t[1][1], defin['name']))
Mikael Boden's avatar
Mikael Boden committed
168 169 170 171
        else:
            ret.append((t[0], t[1], defin['name']))
    return ret

172

Mikael Boden's avatar
Mikael Boden committed
173 174 175 176 177
def getGODef(goterm):
    """
    Retrieve information about a GO term
    goterm: the identifier, e.g. 'GO:0002080'
    """
178
    # first turn off server certificate verification
Mikael Boden's avatar
Mikael Boden committed
179
    if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
180 181 182
        ssl._create_default_https_context = ssl._create_unverified_context
    # Construct URL with query term
    url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
Mikael Boden's avatar
Mikael Boden committed
183 184
    # Get the entry: fill in the fields specified below
    try:
185
        entry = {'id': None, 'name': None, 'aspect': None}
Mikael Boden's avatar
Mikael Boden committed
186
        data = urllib.request.urlopen(url).read().decode("utf-8")
187
        ret = json.loads(data)
188

189
        for row in ret['results']:
190 191 192 193 194 195 196
            if row['id'] == goterm:
                for key in entry:
                    try:
                        entry[key] = row[key]
                    except:
                        entry[key] = None
                entry['def'] = row['definition']['text']
Mikael Boden's avatar
Mikael Boden committed
197
        return entry
Mikael Boden's avatar
Mikael Boden committed
198
    except urllib.error.HTTPError as ex:
Mikael Boden's avatar
Mikael Boden committed
199 200
        raise RuntimeError(ex.read())

201

202
def getGOTerms(genes):
Mikael Boden's avatar
Mikael Boden committed
203 204
    """
    Retrieve all GO terms for a given set of genes (or single gene).
205
    The result is given as a map (key=gene name, value=list of unique terms).
Mikael Boden's avatar
Mikael Boden committed
206 207 208
    """
    if type(genes) != list and type(genes) != set and type(genes) != tuple:
        genes = [genes]
209
    map = dict()
210
    batchsize = 100  # size of query batch
211
    genecnt = 0
212
    limitpage = 100  # number of record on each returned page
213 214 215 216 217
    while genecnt < len(genes):
        genebatch = []
        for index in range(batchsize):
            if genecnt < len(genes):
                genebatch.append(genes[genecnt])
Mikael Boden's avatar
Mikael Boden committed
218
            else:
219 220
                break
            genecnt += 1
Mikael Boden's avatar
Mikael Boden committed
221
        uri_string = 'annotation/search?limit=' + str(limitpage) + '&geneProductId='
222 223
        for i in range(len(genebatch)):
            gene = genebatch[i]
Mikael Boden's avatar
Mikael Boden committed
224 225 226 227
            uri_string += gene + "," if i < len(genebatch) - 1 else gene
        # Construct URL
        # Get the entry: fill in the fields specified below
        # first turn off server certificate verification
Mikael Boden's avatar
Mikael Boden committed
228
        if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
Mikael Boden's avatar
Mikael Boden committed
229 230 231 232 233
            ssl._create_default_https_context = ssl._create_unverified_context
        page = 1
        try:
            while (True):
                url = __ebiGOUrl__ + uri_string + '&page=' + str(page)
234 235 236
                urlreq = urllib.request.Request(url)
                urlreq.add_header('Accept-encoding', 'gzip')
                response = urllib.request.urlopen(urlreq)
Mikael Boden's avatar
Mikael Boden committed
237
                if response.info().__getitem__('Content-Encoding') == 'gzip':
238 239 240 241 242 243
                    buf = StringIO(response.read())
                    f = gzip.GzipFile(fileobj=buf)
                    data = f.read().decode("utf-8")
                else:
                    data = response.read().decode("utf-8")
                ret = json.loads(data)
Mikael Boden's avatar
Mikael Boden committed
244 245
                if page == 1 and int(ret['numberOfHits']) > limitpage * 100:
                    print('Warning:', ret['numberOfHits'], 'matches in a query. Be patient.')
246 247 248 249 250
                for row in ret['results']:
                    genename = row['geneProductId']  # would look like "UniProtKB:A0A140VJQ9"
                    gotermid = row['goId']  # would look like "GO:0002080"
                    if not genename in map:
                        map[genename] = set([gotermid])
Mikael Boden's avatar
Mikael Boden committed
251
                    else:
252
                        map[genename].add(gotermid)
Mikael Boden's avatar
Mikael Boden committed
253 254 255 256 257
                if len(ret['results']) < limitpage:
                    break
                page += 1
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
258
    return map
Mikael Boden's avatar
Mikael Boden committed
259

260

261
def getGenes(goterms, taxo=None):
Mikael Boden's avatar
Mikael Boden committed
262 263
    """
    Retrieve all genes/proteins for a given set of GO terms (or single GO term).
264 265 266
    Genes that are annotated with a more specific GO term than those given are included.
    taxo: use specific taxonomic identifier, e.g. 9606 (human); default is all
    The result is given as a map (key=gene name, value=list of unique terms).
Mikael Boden's avatar
Mikael Boden committed
267 268 269 270
    """
    if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
        goterms = [goterms]
    map = dict()
271
    batchsize = 10  # size of query batch
Mikael Boden's avatar
Mikael Boden committed
272
    termcnt = 0
273
    limitpage = 100  # number of record on each returned page
Mikael Boden's avatar
Mikael Boden committed
274 275 276 277 278 279 280 281
    while termcnt < len(goterms):
        termbatch = []
        for index in range(batchsize):
            if termcnt < len(goterms):
                termbatch.append(goterms[termcnt])
            else:
                break
            termcnt += 1
282 283
        uri_string = 'annotation/search?limit=' + str(
            limitpage) + '&taxonId=' + taxo + "&goId=" if taxo else 'annotation/search?goId='
Mikael Boden's avatar
Mikael Boden committed
284 285 286
        for i in range(len(termbatch)):
            term = termbatch[i]
            uri_string += term + "," if i < len(termbatch) - 1 else term
287
        # first turn off server certificate verification
Mikael Boden's avatar
Mikael Boden committed
288
        if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
289
            ssl._create_default_https_context = ssl._create_unverified_context
Mikael Boden's avatar
Mikael Boden committed
290 291 292 293 294 295 296
        page = 1
        try:
            while (True):
                url = __ebiGOUrl__ + uri_string + '&page=' + str(page)
                urlreq = urllib.request.Request(url)
                urlreq.add_header('Accept-encoding', 'gzip')
                response = urllib.request.urlopen(urlreq)
Mikael Boden's avatar
Mikael Boden committed
297
                if response.info().__getitem__('Content-Encoding') == 'gzip':
Mikael Boden's avatar
Mikael Boden committed
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
                    buf = StringIO(response.read())
                    f = gzip.GzipFile(fileobj=buf)
                    data = f.read().decode("utf-8")
                else:
                    data = response.read().decode("utf-8")
                ret = json.loads(data)
                if page == 1 and int(ret['numberOfHits']) > limitpage * 100:
                    print('Warning:', ret['numberOfHits'], 'matches in a query. Be patient.')
                for row in ret['results']:
                    genename = row['geneProductId']  # would look like "UniProtKB:A0A140VJQ9"
                    gotermid = row['goId']  # would look like "GO:0002080"
                    if not gotermid in map:
                        map[gotermid] = set([genename])
                    else:
                        map[gotermid].add(genename)
                if len(ret['results']) < limitpage:
                    break
                page += 1
        except urllib.error.HTTPError as ex:
            raise RuntimeError(ex.read())
318
    return map
Mikael Boden's avatar
Mikael Boden committed
319 320


321 322 323 324
class EBI(object):
    __email__ = 'anon@uq.edu.au'  # to whom emails about jobs should go
    __ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/'  # Use UQ mirror when available
    __checkInterval__ = 2  # how long to wait between checking job status
Mikael Boden's avatar
Mikael Boden committed
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373

    def __init__(self, service=None):
        """ Initialise service session.
        service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
        """
        self.service = service
        self.lockFile = '%s.lock' % service

    def createLock(self):
        """ Create a lock file to prevent submission of more than 1 job
        at a time by a single user. """
        fh = open(self.lockFile, 'w')
        fh.write(self.jobId)
        fh.close()

    def removeLock(self):
        """ Remove the lock file. """
        os.remove(self.lockFile)

    def isLocked(self):
        """ Check if there is a lock on this service. If there is, check if
        the job is complete, and if so remove the lock. Return True if still
        locked and False if not. """
        if os.path.exists(self.lockFile):
            fh = open(self.lockFile, 'r')
            jobId = fh.read()
            fh.close()
            status = self.status(jobId)
            if status == 'RUNNING':
                self.jobId = jobId
                return True
            else:
                self.removeLock()
                return False
        else:
            return False

    """
    BLAST and CLUSTALW services
    """

    def run(self, params):
        """ Submit a job to the given service with the given parameters, given
        as a dictionary. Return the jobId. """
        if self.service == None:
            raise RuntimeError('No service specified')
        if self.isLocked():
            raise RuntimeError("""You currently have a %s job running. You must
                                  wait until it is complete before submitting another job. Go to
374 375
                                  %sstatus/%s to check the status of the job.""" % (
            self.service, self.__ebiServiceUrl__, self.jobId))
Mikael Boden's avatar
Mikael Boden committed
376 377 378 379 380 381 382 383
        url = self.__ebiServiceUrl__ + self.service + '/run/'
        # ncbiblast database parameter needs special handling
        if self.service == 'ncbiblast':
            databaseList = params['database']
            del params['database']
            databaseData = ''
            for db in databaseList:
                databaseData += '&database=' + db
Mikael Boden's avatar
Mikael Boden committed
384 385
            encodedParams = urllib.parse.urlencode(params).encode('utf-8')
            encodedParams += databaseData.encode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
386
        else:
Mikael Boden's avatar
Mikael Boden committed
387
            encodedParams = urllib.parse.urlencode(params).encode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
388
        print(url)
Mikael Boden's avatar
Mikael Boden committed
389
        self.jobId = urllib.request.urlopen(url, encodedParams).read().decode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
390 391 392 393 394 395 396 397 398
        self.createLock()
        return self.jobId

    def status(self, jobId=None):
        """ Check the status of the given job (or the current job if none is
        specified), and return the result. """
        if jobId is None:
            jobId = self.jobId
        url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
Mikael Boden's avatar
Mikael Boden committed
399
        status = urllib.request.urlopen(url).read().decode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
400 401 402 403 404
        return status

    def resultTypes(self):
        """ Get the available result types. Will only work on a finished job. """
        url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
Mikael Boden's avatar
Mikael Boden committed
405
        resultTypes = urllib.request.urlopen(url).read().decode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
406 407 408 409 410 411
        return resultTypes

    def result(self, resultType):
        """ Get the result of the given job of the specified type. """
        url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
        try:
Mikael Boden's avatar
Mikael Boden committed
412
            result = urllib.request.urlopen(url).read().decode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
            if resultType == 'error':
                raise RuntimeError('An error occurred: %s' % result)
        except(urllib.error.HTTPError):
            if resultType == 'error':
                raise RuntimeError('An unknown error occurred while processing the job (check your input)')
            else:
                self.result('error')
        return result

    def submit(self, params, resultTypes):
        """ Submit a new job to the service with the given parameters.
        Return the output in the specified format. """
        params['email'] = self.__email__
        self.run(params)
        print(('Submitted new', self.service, 'job, jobId:', self.jobId))
        print('Please be patient while the job is completed')
        status = 'RUNNING'
        observe = 0
        while status == 'RUNNING':
            observe = observe + 1
            status = self.status()
            sleep(self.__checkInterval__)
        if status != 'FINISHED':
            raise RuntimeError('An error occurred and the job could not be completed')
        print('Job complete.')
        self.removeLock()
        if type(resultTypes) != list:
            resultTypes = [resultTypes]
        results = []
        for resultType in resultTypes:
            results.append(self.result(resultType))
        if len(results) == 1:
            return results[0]
        else:
            return results
Mikael Boden's avatar
Mikael Boden committed
448 449


450
def getUniProtDict(ids, cols="", db='uniprot', identities=None):
Mikael Boden's avatar
Mikael Boden committed
451 452 453 454 455 456 457 458 459 460 461 462 463 464
    """

    :param ids: The list of UniProt IDs
    :param cols: The list of UniProt database columns
    :param db: The database to search - uniprot or uniref
    :param identity: The identity to search uniref with
    :return: A dictionary mapping each UniProt ID to another dictionary where the keys are database columns and the
    values are the information stored within those columns
    """

    """
    *** EXAMPLE USAGE ***
    Get a list of UniProt IDs and a list of UniProt columns you're interested in.
    Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
465

Mikael Boden's avatar
Mikael Boden committed
466 467 468
    uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
    cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]    
    up_dict = getUniProtDict(uniprot_names, cols)
469

Mikael Boden's avatar
Mikael Boden committed
470 471 472 473 474 475 476 477
    for record in up_dict:
        print (record, up_dict[record].get("lineage(SUPERKINGDOM)"))

    print()

    for record in up_dict:
        print (record, up_dict[record].get("genes"))

478

Mikael Boden's avatar
Mikael Boden committed
479
    If a record doesn't have an entry in UniProt for that column it'll just return None
480

Mikael Boden's avatar
Mikael Boden committed
481 482
    print (up_dict['Q1JUQ1'])
    print (up_dict['Q1JUQ1']['lineage(KINGDOM)'])
483 484


Mikael Boden's avatar
Mikael Boden committed
485
    *** EXAMPLE USAGE FOR UNIREF SEARCHING ***
486

Mikael Boden's avatar
Mikael Boden committed
487
    up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities = 1.0)
488

Mikael Boden's avatar
Mikael Boden committed
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
    You can either pass a list of identities for each UniProt identifier (in which case the list of identities must be
    the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
    """

    # Format the lists of IDs and columns correctly
    cols = ",".join(cols)

    if db == 'uniprot':
        updated_ids = ' or '.join(ids)


    elif db == 'uniref':

        # If we just got a single value for an identity, create a list the same size as the queries with just this value
        if type(identities) != list:
            identities = [identities] * len(ids)
        elif len(identities) != len(ids):
506 507
            raise RuntimeError(
                'Either supply a single identity threshold or supply one for each identifier in the list')
Mikael Boden's avatar
Mikael Boden committed
508 509 510 511

        # Check that the identity thresholds are valid values
        for x in identities:
            if x not in [1.0, 0.9, 0.5]:
512 513
                raise RuntimeError(
                    "UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - " + str(x))
Mikael Boden's avatar
Mikael Boden committed
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536

        # Add the query syntax around the identifiers
        updated_ids = ""
        for query, identity in zip(ids, identities):
            updated_ids += "(member:" + query + "+AND+identity:" + str(identity) + ")+OR+"

        updated_ids = updated_ids[0:-4]

    else:
        raise RuntimeError('Database should be either uniprot or uniref')

    url = 'https://www.uniprot.org/' + db + '/'

    params = {
        'format': 'tab',
        'query': updated_ids,
        'columns': "id," + cols
    }

    data = urllib.parse.urlencode(params).encode('utf-8')
    request = urllib.request.Request(url, data)
    opener = urllib.request.build_opener()
    response = opener.open(request)
Mikael Boden's avatar
Mikael Boden committed
537
    page = response.read(20000000).decode('utf-8')
Mikael Boden's avatar
Mikael Boden committed
538 539 540 541 542
    up_dict = {}

    # For each record we retrieve, split the line by tabs and build up the UniProt dict
    for line in page.split("\n")[1:]:
        if line:
543
            splitlines = line.split("\t")
Mikael Boden's avatar
Mikael Boden committed
544 545 546 547
            id_dict = {}
            pos = 1
            for col in cols.split(","):
                id_dict[col] = None if splitlines[pos] == "" else splitlines[pos]
548
                pos += 1
Mikael Boden's avatar
Mikael Boden committed
549 550 551
            up_dict[splitlines[0]] = id_dict

    return up_dict