godata.py 48.5 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1
'''
Mikael Boden's avatar
Mikael Boden committed
2
Created on Jul 12, 2012, amended April 2015 and again May 2018
Mikael Boden's avatar
Mikael Boden committed
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28

Module for managing Gene Ontology data, in particular gene:terms
annotations and term definitions

It can be used on files you can download from geneontology.org.
The class GO is constructed from:
- annotation file which is (usually) specific to the species of interest
- OBO file which defines the GO terms and their relationships
  e.g.
  > go = GO('gene_association.goa_ref_human', 'go-basic.obo')
Internal data structures are created so that you can query
- what are the terms of my gene (or genes)? Use getTerms
- what are the genes of my term? Use getGenes
- what terms occur amongst my genes, ranked by their absolute count? Use getGOReport without background
- what terms are statistically enriched in my genes, relative a background set of genes? Use getGOReport with background

The class BinGO works with a compact (memory saving) binary format that aggregates information from an annotation
file and an OBO file. Therefore, you first need to construct this binary file, using writeBitFile.
Subsequently you can construct instances of BinGO and query terms and genes, roughly in the manner identified above for GO.

@author: mikael
'''

from struct import pack, unpack, calcsize, error
import operator
import time
Mikael Boden's avatar
Mikael Boden committed
29
import sys
Mikael Boden's avatar
Mikael Boden committed
30
import stats
Mikael Boden's avatar
Mikael Boden committed
31
import gzip
Mikael Boden's avatar
Mikael Boden committed
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85

# Character codes used by binary format to identify ontology
onto_codes = {
              'P': 'Biological process',
              'F': 'Molecular function',
              'C': 'Cellular component'}

# Labels for edges in the ontology graph, index is used in binary format
onto_rel = ['is_a', 'isect', 'part_of', 'has_part', 'regulates']

# Evidence codes assigned to annotations, an index is assigned when creating binary file and is stored in its header
evid_codes = { # Experimental Evidence Codes
    'EXP': 'Inferred from Experiment',
    'IDA': 'Inferred from Direct Assay',
    'IPI': 'Inferred from Physical Interaction',
    'IMP': 'Inferred from Mutant Phenotype',
    'IGI': 'Inferred from Genetic Interaction',
    'IEP': 'Inferred from Expression Pattern',
               #Computational Analysis Evidence Codes
    'ISS': 'Inferred from Sequence or Structural Similarity',
    'ISO': 'Inferred from Sequence Orthology',
    'ISA': 'Inferred from Sequence Alignment',
    'ISM': 'Inferred from Sequence Model',
    'IGC': 'Inferred from Genomic Context',
    'IBA': 'Inferred from Biological aspect of Ancestor',
    'IBD': 'Inferred from Biological aspect of Descendant',
    'IKR': 'Inferred from Key Residues',
    'IRD': 'Inferred from Rapid Divergence',
    'RCA': 'inferred from Reviewed Computational Analysis',
    'TAS': 'Traceable Author Statement',
    'NAS': 'Non-traceable Author Statement',
               #Curator Statement Evidence Codes
    'IC': 'Inferred by Curator',
    'ND': 'No biological Data available',
               #Automatically-assigned Evidence Codes
    'IEA': 'Inferred from Electronic Annotation',
               #Obsolete Evidence Codes
    'NR': 'Not Recorded'}

class GO():
    """ Classical interface for working with GO terms usually within the same species and when memory is not a major issue.
        Implementations are relatively efficient (for Python at least).
        Major functions:
        __init__: construct instance of GO session from an annotation file and an OBO file (geneontology.org)
        getTerms: get GO terms from gene or genes (transitively or not)
        getGenes: get genes that are annotated with given term or terms
        getGOReport: perform basic gene set enrichment
        """

    # Structures to hold all data relevant to session
    annots = {}     # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
    termdefs = {}   # definitions: termdefs[term] = (onto, set((term, rel)), name)
    children = {}   # redundant, parent-to-child structure: children[term] = set((term, rel))

Mikael Boden's avatar
Mikael Boden committed
86
    def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8), include_genes = None):
Mikael Boden's avatar
Mikael Boden committed
87 88 89 90 91 92 93 94
        """ Start GO session with specified data loaded:
        annotfile: name of annotation file, e.g.'gene_association.tair'
        OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
        Optionally, specify what columns in the annotation file that contains in order:
        gene, symb, qual, term, evid, onto. Note that index starts at 0 NOT 1.
        (The default seems to work for most annotation files, but sometime if you wish to cross reference
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
Mikael Boden's avatar
Mikael Boden committed
95
        Optionally, specify what genes should be included when reading the annotations; None (default) means include everything.
Mikael Boden's avatar
Mikael Boden committed
96
        """
Mikael Boden's avatar
Mikael Boden committed
97
        print(("Started at", time.asctime()))
Mikael Boden's avatar
Mikael Boden committed
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
        # Get GO definitions
        terms = readOBOFile(obofile)
        for term in terms:
            (term_name, term_onto, term_is) = terms[term]
            self.termdefs[term] = (term_onto, term_is, term_name)
            self.children[term] = set()

        for term in self.termdefs:
            (term_onto, term_is, term_name) = self.termdefs[term]
            for (parent, prel) in term_is:
                try:
                    cset = self.children[parent]
                    cset.add((term, prel))
                except KeyError:
                    pass
Mikael Boden's avatar
Mikael Boden committed
113
        print(("Read %d GO definitions" % len(terms)))
Mikael Boden's avatar
Mikael Boden committed
114
        # open annotation file to analyse and index data
Mikael Boden's avatar
Mikael Boden committed
115 116 117 118
        if annotFile.endswith(".gz"):
            src = gzip.open(annotFile, 'rt')
        else:
            src = open(annotFile, 'rt')
Mikael Boden's avatar
Mikael Boden committed
119 120 121 122 123 124 125
        gene_cnt = 0
        cnt = 0
        for line in src:
            cnt += 1
            if line.startswith('!'):
                continue
            (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
Mikael Boden's avatar
Mikael Boden committed
126 127 128 129 130 131 132 133
            if include_genes == None or gene in include_genes:
                try:
                    (taxa_q, terms_map) = self.annots[gene]
                    terms_map[term] = (evid, qual != 'NOT')
                except KeyError: # not a previously encountered gene
                    gene_cnt += 1
                    terms_map = {term: (evid, qual != 'NOT')}
                    self.annots[gene] = (taxa, terms_map)
Mikael Boden's avatar
Mikael Boden committed
134
        src.close()
Mikael Boden's avatar
Mikael Boden committed
135
        print(("Read annotations for %d genes" % gene_cnt))
Mikael Boden's avatar
Mikael Boden committed
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291

    def _makeIntoList(self, id_or_ids):
        if type(id_or_ids) != list and type(id_or_ids) != set and type(id_or_ids) != tuple:
            return [id_or_ids]
        return id_or_ids

    def getTerms(self, genes_or_gene, evid = None, onto = None, include_more_general = True):
        """ Retrieve all terms for a gene or a set/list/tuple of genes.
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
            If include_more_general is true, terms that are transitively related are included.
            With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
            When only one gene is provided, the result is simply a set of terms.
        """
        if type(genes_or_gene) != list and type(genes_or_gene) != set and type(genes_or_gene) != tuple:
            return self.getTerms4Gene(genes_or_gene, evid, onto, include_more_general)
        else:
            return self.getTerms4Genes(genes_or_gene, evid, onto, include_more_general)

    def getTerms4Genes(self, genes, evid = None, onto = None, include_more_general = True):
        """ Retrieve all GO terms for a given set/list/tuple of genes.
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
            If include_more_general is True (default) then transitively related terms are included.
            With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
        """
        gomap = {} # gene to GO terms map
        genes = self._makeIntoList(genes)
        for gene in genes:
            gomap[gene] = self.getTerms4Gene(gene, evid, onto, include_more_general)
        return gomap

    def getTerms4Gene(self, gene, evid = None, onto = None, include_more_general = True):
        """ Retrieve all GO terms for a given (single) gene.
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
            If include_more_general is True (default) then transitively related terms are included
            When only one gene is provided, the result is simply a set of terms.
        """
        direct = set()
        # STEP 1: Find all terms directly associated with specified genes
        try:
            (taxa, terms_map) = self.annots[gene]
            for term in terms_map:
                (term_evid, term_qual) = terms_map[term]
                if (evid == None or evid == term_evid) and term_qual:
                    direct.add(term)
        except KeyError:
            return set() # gene was not found, hence no annotations for it
        # STEP 2: Find terms associated with (indirect) parents of terms from STEP 1
        indirect = set()
        if include_more_general:
            for term in direct:
                parents = self.getParents(term, include_more_general)
                for parent in parents:
                    indirect.add(parent)
        return direct.union(indirect)

    def getGenes(self, terms_or_term, evid = None, taxa = None, rel = None, include_more_specific = False):
        """ Retrieve all genes that are annotated with specified term or terms,
            qualified by evidence, taxa and relation type, e.g. "is_a".
            If multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
            With a single term provided, a set of genes is returned.
        """
        if type(terms_or_term) != list and type(terms_or_term) != set and type(terms_or_term) != tuple:
            return self.getGenes4Term(terms_or_term, evid, taxa, rel, include_more_specific)
        else:
            return self.getGenes4Terms(terms_or_term, evid, taxa, rel, include_more_specific)

    def getGenes4Terms(self, terms, evid = None, taxa = None, rel = None, include_more_specific = False):
        """ Retrieve all genes that are annotated with specified terms,
            qualified by evidence, taxa and relation type, e.g. "is_a".
            Since multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
        """
        gomap = {} # term to genes map
        terms = self._makeIntoList(terms)
        for term in terms:
            gomap[term] = self.getGenes4Term(term, evid, taxa, rel, include_more_specific)
        return gomap

    def getGenes4Term(self, term, evid = None, taxa = None, rel = None, include_more_specific = False):
        """ Retrieve all genes that are annotated with specified term or terms,
            qualified by evidence, taxa and relation type, e.g. "is_a".
            With a single term provided, a set of genes is returned.
        """
        genes = self._getGenes4Term(term, evid, taxa, rel)
        if include_more_specific:
            terms = self.getChildren(term, rel, True) # not recursive yet
            for t in terms:
                tgenes = self._getGenes4Term(t, evid, taxa, rel)
                for g in tgenes:
                    genes.add(g)
        return genes

    def _getGenes4Term(self, term, evid = None, taxa = None, rel = None):
        """ Retrieve all genes that are annotated with specified term, and qualified by evidence, taxa etc. """
        genes = set()
        # Scour through all genes
        for gene in self.annots: # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
            (qtaxa, qterms) = self.annots[gene]
            if taxa == None or taxa == qtaxa:
                for qterm in qterms:
                    if qterm != term:
                        continue
                    (qevid, qqual) = qterms[term]
                    if (evid == None or evid == qevid) and qqual:
                        genes.add(gene)
                        break
        return genes

    def getChildren(self, parent_term_id_or_ids, rel = None, include_more_specific = False):
        """ Retrieve all direct children of the given (parent) term.
        """
        parent_terms = self._makeIntoList(parent_term_id_or_ids)
        cset = set()
        for parent in parent_terms:
            # definitions: children[term] = set((term, relation), ...)
            current = self.children[parent]
            for (child_term, child_rel) in current:
                if rel == None or rel == child_rel:
                    cset.add(child_term)
        if len(cset) > 0 and include_more_specific:
            grandkids = self.getChildren(cset, rel, True)
            for grandkid in grandkids:
                cset.add(grandkid)
        return cset

    def getParents(self, child_term_id, include_more_general = True):
        """ Retrieve all parents of the given term, transitively or not.
        """
        direct = set() # all GO terms which are parents to given term
        try:
            (onto_ch, terms_ch, name_ch) = self.termdefs[child_term_id]
            for (parent_id, parent_rel) in terms_ch:
                (onto_pa, terms_pa, name_pa) = self.termdefs[parent_id]
                direct.add(parent_id)
                if (include_more_general):
                    parents = self.getParents(parent_id, True)
                    for parent in parents:
                        direct.add(parent)
        except KeyError:
            pass # term was not found, possibly throw error?
        return direct

    def getTermdef(self, term_id):
        """ Retrieve information about a given term:
            ontology, parent terms, and name as a tuple.
        """
        try:
            (onto_ch, terms_set, term_name) = self.termdefs[term_id]
            return (onto_ch, terms_set, term_name)
        except KeyError:
            return ('Unknown', 'Unknown', 'Unknown')

    def getAllAnnots(self):
        """ Retrieve all annotated gene products """
Mikael Boden's avatar
Mikael Boden committed
292
        return list(self.annots.keys())
Mikael Boden's avatar
Mikael Boden committed
293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336

    def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
        """ Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
        """
        # (taxa, terms[term] = (evid, T/F))
        bg_genes = set()
        bg_list = []
        for gene in self.annots:
            if not gene in positives:
                bg_genes.add(gene)
                (qtaxa, qterms) = self.annots[gene]
                if taxa == None or qtaxa == taxa:
                    for t in qterms:
                        (qevid, qqual) = qterms[t]
                        if (evid == None or qevid == evid) and qqual:
                            bg_list.append(t)
                            if include_more_general:
                                for parent in self.getParents(t, True):
                                    bg_list.append(parent)
        return (bg_genes, bg_list)

    def getCountReport(self, positives, threshold = None, include_more_general = True):
        """ For a set of named gene products (positives) this method determines the counts of GO terms.
            Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) sorted by count.
            positives: names of gene products
            threshold: the count that must be reached for term to be reported (default is 0)
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
            """
        fg_list = [] # all terms, with multiple copies for counting
        fg_map = self.getTerms4Genes(positives, include_more_general = include_more_general) #
        for id in fg_map:
            for t in fg_map[id]:
                fg_list.append(t)
        term_set = set(fg_list)
        term_cnt = {}

        nPos = len(positives)
        if threshold == None:
            threshold = 0 # include all terms
        for t in term_set:
            cnt = fg_list.count(t)
            if cnt >= threshold:
                term_cnt[t] = cnt
Mikael Boden's avatar
Mikael Boden committed
337
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
Mikael Boden's avatar
Mikael Boden committed
338 339 340 341
        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
Mikael Boden's avatar
Mikael Boden committed
342
                print(('Could not find definition of %s' % t[0]))
Mikael Boden's avatar
Mikael Boden committed
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
            else:
                ret.append((t[0], t[1], defin[2], defin[0]))
        return ret

    def getEnrichmentReport(self, positives, background = None, evid = None, threshold = None, include_more_general = True):
        """ For a set of named gene products (positives) this method determines the enrichment of GO terms.
            Each GO term is also assigned an enrichment p-value (on basis of provided background, or on basis of all annotated genes, if not provided).
            Note that to use the full set as background can be computationally expensive, so to speed up subsequent runs, the results are cached.
            Returns a list of tuples (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
            E-value is a Bonferroni-corrected p-value.
            positives: names of gene products
            background: names of gene products (or None if all annotated gene products should be used; default)
            threshold: E-value that must be reached for term to be reported (default is 0.05)
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
            """
        # Process foreground: find terms of genes
        fg_list = [] # all terms, with multiple copies for counting
        fg_map = self.getTerms4Genes(positives, evid = evid, include_more_general = include_more_general) #
        for fg_gene in fg_map:
            for t in fg_map[fg_gene]:
                fg_list.append(t)
        nPos = len(positives)
        # Process background: find terms of genes
        bg_list = []
        if background == None: # need to use the full set
Mikael Boden's avatar
Mikael Boden committed
369
            background = list(self.annots.keys())
Mikael Boden's avatar
Mikael Boden committed
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
        negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
        nNeg = len(negatives)
        bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
        for bg_gene in bg_map:
            for t in bg_map[bg_gene]:
                bg_list.append(t)

        term_set = set(fg_list)
        term_cnt = {}

        if threshold == None:
            threshold = 0.05

        for t in term_set:
            fg_hit = fg_list.count(t) # number of foreground genes WITH GO term (number of terms in the list for the collective set of foreground genes)
            bg_hit = bg_list.count(t) # number of background genes WITH GO term (number of terms in the list for the collective set of background genes)
            fg_nohit = nPos - fg_hit  # total number of genes in foreground minus that number of hits
            bg_nohit = nNeg - bg_hit  # total number of genes in background minus that number of hits
            pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False) # one-tailed FET
            evalue = pvalue * len(term_set) # Bonferroni correction
            if evalue <= threshold: # check if significance req is fulfilled
                term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
Mikael Boden's avatar
Mikael Boden committed
392
        sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
Mikael Boden's avatar
Mikael Boden committed
393 394 395 396
        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
Mikael Boden's avatar
Mikael Boden committed
397
                print(('Could not find definition of %s' % t[0]))
Mikael Boden's avatar
Mikael Boden committed
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
            else:
                ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
        return ret

class BinGO():

    # Structures to hold all data relevant to session, all keys are "encoded"
    annots = {}     # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
    termdefs = {}   # definitions: termdefs[term] = (onto, terms[term] = relation, name)
    # Codes for encoding and decoding
    gene_code = None
    term_code = None
    evid_code = None
    # indices
    annot_index = {}
    # Files
    f = None

    def __init__(self, filename, taxa = None):
        """ The binary file contains all the data and will initialise
            gene annotations (annots) and term definitions (termdefs)
            and the encoding/decoding keys. """
        self.f = self._readBitFile(filename, taxa = taxa)

    def _decodeGeneIDs(self, gene_codes):
        if type(gene_codes) != list and type(gene_codes) != set and type(gene_codes) != tuple:
            gene_codes = [gene_codes]
        ids = []
        for i in gene_codes:
            s = decode(i, self.gene_code)
            ids.append(s)
        return ids

    def _encodeGeneIDs(self, gene_names):
        if type(gene_names) != list and type(gene_names) != set and type(gene_names) != tuple:
            gene_names = [gene_names]
        ids = []
        for i in gene_names:
            y = encode(i, self.gene_code)
            ids.append(y)
        return ids

    def _getGeneEntry(self, gene):
        peek = self.annot_index[gene]
        self.f.seek(peek, 0)
        buf = self.f.read(calcsize('IIH'))
        (gene_int, taxa_int, nterms) = unpack('IIH', buf)
        buf = self.f.read(nterms * calcsize('?BI'))
        terms_dict = {}
        for pos in range(0, len(buf) - 1, calcsize('?BI')):
            (qual_bool, evid_int, term_int) = unpack('?BI', buf[pos:pos+calcsize('?BI')])
            terms_dict[term_int] = (evid_int, qual_bool)
        return (taxa_int, terms_dict)

    def _getSuperTerms(self, term, rel = None):
        """ Recursively compute the transitive closure. """
        found = set()
        try:
            (_, closure, _) = self.termdefs[term]
Mikael Boden's avatar
Mikael Boden committed
457
            for (t, r) in list(closure.items()):
Mikael Boden's avatar
Mikael Boden committed
458 459 460 461
                if (not rel) or r == rel:
                    found.add(t)
                    found.update(self._getSuperTerms(t, rel))
        except KeyError:
Mikael Boden's avatar
Mikael Boden committed
462
            print(('Could not find GO:%s' % (''.join(decode(term, self.term_code)))))
Mikael Boden's avatar
Mikael Boden committed
463 464 465 466
        return found

    def _getChildTerms(self, term, rel = None):
        found = set()
Mikael Boden's avatar
Mikael Boden committed
467
        for (child, termdef) in list(self.termdefs.items()):
Mikael Boden's avatar
Mikael Boden committed
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499
            (_, parents_dict, _) = termdef
            try:
                myrel = parents_dict[term]
                if rel == myrel or not rel: found.add(child)
            except KeyError:
                pass
        return found

    def _getSpecificTerms(self, term, rel = None):
        direct = self._getChildTerms(term, rel)
        found = set()
        for t in direct:
            found.add(t)
            found.update(self._getSpecificTerms(t, rel))
        return found

    def getTerms(self, genes, evid = None, onto = None, include_more_general = True):
        """
        Retrieve all GO terms for a given set of genes (or single gene).
        The result is given as a map (key=gene name, value=list of unique terms) OR
        in the case of a single gene as a list of unique terms.
        If include_more_general is True (default) then transitively related terms are included
        """
        mymap = dict()
        # STEP 1: Find all terms directly associated with specified genes
        direct = set() # all GO terms (encoded)
        ids = self._encodeGeneIDs(genes)
        for i in ids:
            gene_name = ''.join(decode(i, self.gene_code))
            mymap[gene_name] = set()
            try:
                (taxa, terms) = self._getGeneEntry(i)
Mikael Boden's avatar
Mikael Boden committed
500
                for (term, evid_and_qual) in list(terms.items()):
Mikael Boden's avatar
Mikael Boden committed
501 502 503 504 505 506 507 508 509 510 511 512 513
                    if evid_and_qual[1] and not evid: # if True and no evidence is specified
                        direct.add(term)
                        mymap[gene_name].add(term)
                    elif self.evid_code[evid_and_qual[0]] == evid:
                        direct.add(term)
                        mymap[gene_name].add(term)
            except KeyError:
                pass
                #print 'Failed to find annotations for gene %s' % gene_name
        if include_more_general:
            # STEP 2: Find the transitive closure of each term identified, store as a dictionary
            indirect = {}
            for t in direct:
Mikael Boden's avatar
Mikael Boden committed
514
                if not t in indirect:
Mikael Boden's avatar
Mikael Boden committed
515 516 517 518 519 520 521 522 523 524 525 526 527 528 529
                    indirect[t] = set(self._getSuperTerms(t))
        # STEP 3: compile and return results
        for gene in mymap:
            term_ids = mymap[gene]
            all_ids = set(term_ids)
            if include_more_general:
                for term_id in term_ids:
                    all_ids.update(indirect[term_id])
            mymap[gene] = set()
            for term_enc in all_ids:
                mymap[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
        return mymap

    def getAllGenes(self):
        names = []
Mikael Boden's avatar
Mikael Boden committed
530
        for g in self._decodeGeneIDs(list(self.annot_index.keys())):
Mikael Boden's avatar
Mikael Boden committed
531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
            names.append(''.join(g))
        return names

    def getGenes(self, terms, evid = None, taxa = None, rel = None, include_more_specific = True):
        """ Retrieve all genes that are annotated with specified terms, and qualified by evidence, taxa etc. """
        """ TODO: Debug--suspect this implementation is incorrect. """
        term_ids = set()
        for t in terms:
            term_ids.add(encode(t[3:], self.term_code))
        # STEP 1 (optional): determine more specific terms to be included in query
        if include_more_specific:
            myterms = set()
            for t in term_ids:
                myterms.add(t)
                children = self._getSpecificTerms(t, rel)
                myterms.update(children)
            term_ids = myterms
        # STEP 2: identify genes with those terms
        found = {}
        for g in self.annot_index:
            gene_name = decode(g, self.gene_code)
            (mytaxa, tdict) = self._getGeneEntry(g)
            if not taxa or taxa == mytaxa:
Mikael Boden's avatar
Mikael Boden committed
554
                for annot_term in list(tdict.keys()):
Mikael Boden's avatar
Mikael Boden committed
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
                    if tdict[annot_term] == evid:
                        if annot_term in terms:
                            try:
                                added = found[gene_name]
                                added.add(annot_term)
                            except KeyError:
                                found[gene_name] = set([annot_term])
        # STEP 3: compile and return results
        for gene in found:
            term_ids = found[gene]
            all_ids = set(term_ids)
            found[gene] = set()
            for term_enc in all_ids:
                found[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
        return found

    def getTermdef(self, term):
        term_id = encode(term[3:], self.term_code)
        try:
            (onto_ch, terms_dict, name_peek) = self.termdefs[term_id]
            self.f.seek(name_peek, 0)
            term_name = self.f.readline()
            return (onto_codes[onto_ch], terms_dict, term_name)
        except KeyError:
            return ('Unknown', 'Unknown', 'Unknown')

    def _readBitFile(self, filename, taxa, termnames = False):
        f = open(filename, 'r')
        # STEP 1: header info
        ngene_code = None
        nterm_code = None
        nevid_code = None
        ngene_cnt = 0
        nterm_cnt = 0
        nevid_cnt = 0
        header = True
        total_gene_cnt = None
        current_gene_cnt = 0
        current_terms_cnt = 0
        annot_offset = 0
        obo_offset = 0
        while f:
            if not ngene_code:
                line = f.readline()
                fields = line.split()
                total_gene_cnt = int(fields[0])
                total_terms_cnt = int(fields[1])
                ngene_code = int(fields[2])
                nterm_code = int(fields[3])
                nevid_code = int(fields[4])
                self.gene_code = ['' for _ in range(ngene_code)]
                self.term_code = ['' for _ in range(nterm_code)]
                self.evid_code = ['' for _ in range(nevid_code)]
            elif ngene_cnt < ngene_code:
                line = f.readline()
                self.gene_code[ngene_cnt] = line.strip()
                ngene_cnt += 1
            elif nterm_cnt < nterm_code:
                line = f.readline()
                self.term_code[nterm_cnt] = line.strip()
                nterm_cnt += 1
            elif nevid_cnt < nevid_code:
                line = f.readline()
                self.evid_code[nevid_cnt] = line.strip()
                nevid_cnt += 1
            else: # we're not in the header
                if header: offset = f.tell()
                header = False
                try:
                    if current_gene_cnt < total_gene_cnt: # we are reading gene:terms annotations
                        peek = f.tell()
                        buf = f.read(calcsize('IIH'))
                        (gene_int, taxa_int, nterms) = unpack('IIH', buf)
                        current_gene_cnt += 1
                        if (not taxa) or (taxa_int == taxa or taxa_int in taxa):
                            self.annot_index[gene_int] = peek
                        bufsize = calcsize('?BI')
                        f.read(nterms * bufsize)
                    elif current_terms_cnt < total_terms_cnt: # we are reading term definitions (term is_a term, term, term, ...)
                        buf = f.read(calcsize('IcH'))
                        (term_int, onto_ch, nterms) = unpack('IcH', buf)
                        current_terms_cnt += 1
                        bufsize = calcsize('BI')
                        buf = f.read(nterms * bufsize)
                        terms_dict = {}
                        for pos in range(0, len(buf) - 1, bufsize):
                            (rel_ndx, sup_int) = unpack('BI', buf[pos:pos+bufsize])
                            terms_dict[sup_int] = rel_ndx
                        name_peek = f.tell()
                        f.readline() # skip putting name in memory, instead refer to the position in the file
                        self.termdefs[term_int] = (onto_ch, terms_dict, name_peek)
                    else:
                        buf = f.read(calcsize('II'))
                        (annot_offset, obo_offset) = unpack('II', buf)
                        break
                except error as inst:
Mikael Boden's avatar
Mikael Boden committed
651
                    print(("Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()))
Mikael Boden's avatar
Mikael Boden committed
652
                    exit(3)
Mikael Boden's avatar
Mikael Boden committed
653 654
        print(("Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)))
        print(("Annotations start at", annot_offset, "\nDefinitions start at", obo_offset))
Mikael Boden's avatar
Mikael Boden committed
655 656 657 658 659 660 661 662 663
        return f

    #FIXME: write code to perform test of taxa enrichment

    def getGOReport_byScore(self, gene_score_map, negatives_score_map = {}, include_more_general = True, descending_order = True):
        """ Generate a complete GO term report for a set of genes with associated scores.
            Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
            indicating the enrichment of term to "top" genes in descending order by score (by default).
        """
Mikael Boden's avatar
Mikael Boden committed
664
        fg_map = self.getTerms(list(gene_score_map.keys()), include_more_general = include_more_general)
Mikael Boden's avatar
Mikael Boden committed
665 666 667 668 669 670 671
        fg_list = []
        for id in fg_map:
            for t in fg_map[id]:
                fg_list.append(t)
        term_set = set(fg_list)
        term_pval = {}
        if len(negatives_score_map) > 0:
Mikael Boden's avatar
Mikael Boden committed
672
            bg_map = self.getTerms(list(negatives_score_map.keys()), include_more_general = include_more_general)
Mikael Boden's avatar
Mikael Boden committed
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706
        for t in term_set:
            pos = []
            neg = []
            for gene in gene_score_map:
                annot = fg_map[gene]
                if not annot == None:
                    if t in annot:
                        pos.append(gene_score_map[gene])
                    else:
                        neg.append(gene_score_map[gene])
            if len(pos) > 0 and len(neg) > 0:
                if descending_order:
                    p = stats.getRSpval(neg, pos)
                else:
                    p = stats.getRSpval(pos, neg)
            if len(negatives_score_map) > 0 and p <= 0.05:
                mpos = pos # scores of foreground genes with matching GO term
                mneg = [] # scores of background genes with matching GO terms
                for gene in negatives_score_map:
                    annot = bg_map[gene]
                    if not annot == None:
                        if t in annot:
                            mneg.append(negatives_score_map[gene])
                if len(mneg) > 0:
                    if descending_order:
                        p2 = stats.getRSpval(mneg, mpos)
                    else:
                        p2 = stats.getRSpval(mpos, mneg)
                else:
                    p2 = 0.0
                term_pval[t] = (p, p2)
            else:
                term_pval[t] = (p, 1.0)

Mikael Boden's avatar
Mikael Boden committed
707
        sorted_pval = sorted(list(term_pval.items()), key=lambda v: v[1][0], reverse=False)
Mikael Boden's avatar
Mikael Boden committed
708 709 710 711 712

        ret = []
        for t in sorted_pval:
            defin = self.getTermdef(t[0])
            if defin == None:
Mikael Boden's avatar
Mikael Boden committed
713
                print(('Could not find definition of %s' % t[0]))
Mikael Boden's avatar
Mikael Boden committed
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747
            else:
                ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
        return ret

    def getGOReport(self, positives, background = None, taxa = None, include_more_general = True):
        """ Generate a complete GO term report for a set of genes (positives).
            Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
            Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
            (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
            E-value is a Bonferroni-corrected p-value.
            """
        pos = set(positives)
        fg_map = self.getTerms(pos, include_more_general = include_more_general)
        fg_list = []
        for id in fg_map:
            for t in fg_map[id]:
                fg_list.append(t)
        bg_map = {}
        bg_list = []
        neg = set()
        if background != None:
            neg = set(background).difference(pos)
            bg_map = self.getTerms(neg, include_more_general = include_more_general)
            for id in bg_map:
                for t in bg_map[id]:
                    bg_list.append(t)
        term_set = set(fg_list)
        term_cnt = {}

        nPos = len(pos)
        nNeg = len(neg)
        if background == None:
            for t in term_set:
                term_cnt[t] = fg_list.count(t)
Mikael Boden's avatar
Mikael Boden committed
748
            sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1], reverse=True)
Mikael Boden's avatar
Mikael Boden committed
749 750 751 752 753 754 755
        else: # a background is provided
            for t in term_set:
                fg_hit = fg_list.count(t)
                bg_hit = bg_list.count(t)
                fg_nohit = nPos - fg_hit
                bg_nohit = nNeg - bg_hit
                term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
Mikael Boden's avatar
Mikael Boden committed
756
            sorted_cnt = sorted(list(term_cnt.items()), key=lambda v: v[1][2], reverse=False)
Mikael Boden's avatar
Mikael Boden committed
757 758 759 760 761

        ret = []
        for t in sorted_cnt:
            defin = self.getTermdef(t[0])
            if defin == None:
Mikael Boden's avatar
Mikael Boden committed
762
                print(('Could not find definition of %s' % t[0]))
Mikael Boden's avatar
Mikael Boden committed
763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781
            else:
                if background != None:
                    ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
                else:
                    ret.append((t[0], t[1], defin[2], defin[0]))
        return ret

def encode(code_me, encode_strings):
    code = 0
    accum = 1
    try:
        for pos in range(len(code_me)):
            codelen = len(encode_strings[pos])
            for i in range(codelen):
                if encode_strings[pos][i] == code_me[pos]:
                    code += accum * i
                    accum *= codelen
                    break
    except IndexError as e:
Mikael Boden's avatar
Mikael Boden committed
782
        print((e, code_me))
Mikael Boden's avatar
Mikael Boden committed
783 784 785 786 787 788 789 790 791 792 793 794 795
    return code

def decode(code, encode_strings):
    npos    = len(encode_strings)
    accum   = [1 for _ in range(npos)]
    try:
        for pos in range(1, npos): accum[pos] = accum[pos - 1] * len(encode_strings[pos - 1])
        indices = [-1 for _ in range(npos)]
        for pos in range(npos - 1, -1, -1): # go backwards, start at last (most significant) position
            indices[pos] = code / accum[pos]
            code -= accum[pos] * indices[pos]
        string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
    except IndexError as e:
Mikael Boden's avatar
Mikael Boden committed
796
        print((e, code))
Mikael Boden's avatar
Mikael Boden committed
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
    return string

def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
    """ Extract appropriate details from annotation file. This typically follows:
        1.  DB
            Database from which entry has been taken.
            Example: PDB
        2.  DB_Object_ID
            A unique identifier in the DB for the item being annotated.
            Here: PDB ID and chain ID of the PDB entry.
            Example: 2EKB_A
        3.  DB_Object_Symbol
            Here: PDB ID and chain ID of the PDB entry.
            Example:EKB_A
        4.  Qualifiers
            This column is used for flags that modify the interpretation of an annotation.
            This field may be equal to: NOT, colocalizes_with, contributes_to,
            NOT|contributes_to, NOT|colocalizes_with
            Example: NOT
        5.  GO Identifier
            The GO identifier for the term attributed to the DB_Object_ID.
            Example: GO:0005625
        6.  DB:Reference
            A single reference cited to support an annotation.
            Where an annotation cannot reference a paper, this field will contain
            a GO_REF identifier. See section 8 and
            http://www.geneontology.org/doc/GO.references
            for an explanation of the reference types used.
            Example: PMID:9058808
        7.  Evidence
            One of either EXP, IMP, IC, IGI, IPI, ISS, IDA, IEP, IEA, TAS, NAS,
            NR, ND or RCA.
            Example: TAS
        9.  Aspect
            One of the three ontologies: P (biological process), F (molecular function)
            or C (cellular component).
            Example: P

        In columns specify index (starts with 0 NOT 1) for gene, symb, qual, term, evid, onto
        """
    fields = line.strip().split('\t')
    gene = fields[columns[0]]
    symb = fields[columns[1]]
    qual = fields[columns[2]]
    term = fields[columns[3]]
    if not term.startswith('GO:'):
        term = None
        raise "No GO term on line: " + line
    evid = fields[columns[4]]
Mikael Boden's avatar
Mikael Boden committed
846
    if not evid in evid_codes:
Mikael Boden's avatar
Mikael Boden committed
847 848
        evid = None
    onto = fields[columns[5]]
Mikael Boden's avatar
Mikael Boden committed
849
    if not onto in onto_codes:
Mikael Boden's avatar
Mikael Boden committed
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
        onto = None
    taxa_idx = line.find('taxon:')
    if taxa_idx == -1:
        taxa = None
    else:
        taxa = line[taxa_idx:]
        taxa = taxa.split('\t')
        taxa_spec = taxa[0].split(':')
        taxa = int(taxa_spec[len(taxa_spec) - 1]) # pick last taxon ID
    return (gene, symb, qual, term, evid, onto, taxa)

def readOBOFile(obofile):
    """
    http://www.geneontology.org/GO.format.obo-1_2.shtml
    """
    src = open(obofile, 'r')
    terms = {}
    in_term_def = False
    in_type_def = False
    for line in src:
        if in_term_def:
            if line.startswith('id: '):
                term_id = line[4:14]
                term_is = set()
            elif line.startswith('name: '):
                term_name = line[6:].strip()
            elif line.startswith('def: '):
                # Note this is a multi-line field, delimited by "'s
                pass
            elif line.startswith('namespace: '):
                if   line[11] == 'b': term_onto = 'P'
                elif line[11] == 'm': term_onto = 'F'
                elif line[11] == 'c': term_onto = 'C'
            elif line.startswith('is_a: '):
                term_is.add((line[6:16], 'is_a'))
            elif line.startswith('relationship: '):
                fields = line.split()
                term_is.add((fields[2], fields[1]))
            elif line.startswith('intersection_of: '):
                fields = line.split()
                if fields[1].startswith('GO:'):
                    term_is.add((fields[1], 'isect'))
                else:
                    term_is.add((fields[2], fields[1]))
            elif line.startswith('is_obsolete: '):
                in_term_def = False # ignore this entry
        if line.startswith('[Term]'):
            if in_term_def: # already defining one, stash it before moving on to the next...
                terms[term_id] = (term_name, term_onto, term_is)
            elif in_type_def:
                in_type_def = False
            in_term_def = True
        if line.startswith('[Typedef]'):
            if in_term_def: # already defining one, stash it before moving on to the next...
Mikael Boden's avatar
Mikael Boden committed
904
                terms[term_id] = (term_name, term_onto, term_is)
Mikael Boden's avatar
Mikael Boden committed
905 906 907 908 909 910 911
                in_term_def= False
            in_type_def = True
    if in_term_def: #  defining one, stash it
        terms[term_id] = (term_name, term_onto, term_is)
    return terms

def writeBitFile(annotFile, obofile, destFile, taxas = None):
Mikael Boden's avatar
Mikael Boden committed
912
    print(("Started at", time.asctime()))
Mikael Boden's avatar
Mikael Boden committed
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
    # open annotation file to analyse and index data
    src = open(annotFile, 'r')
    gene_index = [{} for _ in range(6)]  # count different characters in different positions
    term_index = [{} for _ in range(7)]  # count different characters in different positions
    evid_index = {}
    gene_cnt = 0
    cnt = 0
    prev_gene = None
    for line in src:
        cnt += 1
        #if cnt > 100000:
        #    break
        if line.startswith('!'):
            continue
        (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if not (taxas and ((taxa == taxas) or (taxa in taxas))):  # The gene does NOT belong to a nominated taxon
            continue
        if not (gene == prev_gene): # not the same gene
            gene_cnt += 1
        try:
            evid_index[evid]
        except:
            evid_index[evid] = len(evid_index)
        pos = 0
        for ch in gene[0:6]:
            try:
                gene_index[pos][ch]
            except KeyError: # no match
                gene_index[pos][ch] = len(gene_index[pos])
            pos += 1
        pos = 0
        for ch in term[3:10]:
            try:
                term_index[pos][ch]
            except KeyError: # no match
                term_index[pos][ch] = len(term_index[pos])
            pos += 1
        prev_gene = gene
    src.close()
Mikael Boden's avatar
Mikael Boden committed
952
    print(("Read annotations for %d genes" % gene_cnt))
Mikael Boden's avatar
Mikael Boden committed
953 954 955 956 957

    gene_code = ['' for _ in range(6)]
    term_code = ['' for _ in range(7)]
    for d in range(len(gene_index)):
        arr = ['?' for _ in gene_index[d]]
Mikael Boden's avatar
Mikael Boden committed
958
        for (ch, index) in list(gene_index[d].items()):
Mikael Boden's avatar
Mikael Boden committed
959 960 961 962
            arr[index] = ch
        gene_code[d] = ''.join(arr)
    for d in range(len(term_index)):
        arr = ['?' for _ in term_index[d]]
Mikael Boden's avatar
Mikael Boden committed
963
        for (ch, index) in term_index[d].items():
Mikael Boden's avatar
Mikael Boden committed
964 965 966
            arr[index] = ch
        term_code[d] = ''.join(arr)
    evid_code = ['' for _ in range(len(evid_index))]
Mikael Boden's avatar
Mikael Boden committed
967
    for (e, ndx) in list(evid_index.items()):
Mikael Boden's avatar
Mikael Boden committed
968 969 970 971
        evid_code[ndx] = e

    # Get GO definitions
    terms = readOBOFile(obofile)
Mikael Boden's avatar
Mikael Boden committed
972
    print(("Read %d GO definitions" % len(terms)))
Mikael Boden's avatar
Mikael Boden committed
973 974 975 976 977 978 979 980 981 982 983 984

    # re-open, now with the aim of copying info
    src = open(annotFile, 'r')
    dst = open(destFile, 'w')
    # STEP 1: header info
    dst.write("%d\t%d\t%d\t%d\t%d\n" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index)))
    for code_str in gene_code:
        dst.write(code_str+"\n")
    for code_str in term_code:
        dst.write(code_str+"\n")
    for e_str in evid_code:
        dst.write(e_str+'\n')
Mikael Boden's avatar
Mikael Boden committed
985
    print(("Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())))
Mikael Boden's avatar
Mikael Boden committed
986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021

    # STEP 2: write annotations
    annot_offset = dst.tell()
    prev_gene = None
    concat_terms = {}
    cnt = 0
    for line in src:
        cnt += 1
        #if cnt > 100000:
        #    break
        if line.startswith('!'):
            continue
        (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if not (taxas and ((taxa == taxas) or (taxa in taxas))): # The gene does NOT belong to a nominated taxon
            continue
        if gene != prev_gene: # new gene is found
            if prev_gene != None:
                # write prev data
                s = pack('IIH', encode(prev_gene, gene_code), taxa, len(concat_terms))
                dst.write(s)
                for t in concat_terms:
                    (o, q, e) = concat_terms[t]
                    s = pack('?BI', q, evid_index[e], encode(t, term_code))
                    dst.write(s)
            # re-init
            prev_gene = gene
            concat_terms = {}
        concat_terms[term[3:]] = (onto, qual, evid)
    if len(concat_terms) > 0:
        # write data in buffer
        s = pack('IIH', encode(prev_gene, gene_code), taxa, len(concat_terms))
        dst.write(s)
        for t in concat_terms:
            (o, q, e) = concat_terms[t]
            s = pack('?BI', q, evid_index[e], encode(t, term_code))
            dst.write(s)
Mikael Boden's avatar
Mikael Boden committed
1022
    print(("Wrote GO annotations, now at @%d" % dst.tell()))
Mikael Boden's avatar
Mikael Boden committed
1023 1024 1025

    # Next, the ontology definition...
    obo_offset = dst.tell() # remember the position where the OBO starts
Mikael Boden's avatar
Mikael Boden committed
1026
    sorted_terms = sorted(iter(terms.items()), key=operator.itemgetter(0))
Mikael Boden's avatar
Mikael Boden committed
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038
    for [t, _] in sorted_terms:
        (term_name, term_onto, term_is) = terms[t]
        s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
        dst.write(s)
        for (sup_term, sup_rel) in term_is:
            try:
                index = onto_rel.index(sup_rel)
            except ValueError:
                index = 9
            s = pack('BI', index, encode(sup_term[3:], term_code))
            dst.write(s)
        dst.write(term_name + '\n')
Mikael Boden's avatar
Mikael Boden committed
1039
    print(("Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())))
Mikael Boden's avatar
Mikael Boden committed
1040 1041 1042 1043 1044

    # Finally, write the offsets to quickly access annotations and definitions, resp
    dst.write(pack('II', annot_offset, obo_offset))
    # done, close
    dst.close()
Mikael Boden's avatar
Mikael Boden committed
1045
    print(("Completed at", time.asctime()))
Mikael Boden's avatar
Mikael Boden committed
1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094

def reduceAnnotFile(annotfile, newannotfile, include_genes, annotfile_column_gene = 1):
    """ Reduce size of annotation file by specifying the genes of interest:
    annotfile: name of annotation file, e.g. 'goa_uniprot_all.gaf'
    newannotfile: name of new, reduced-size annotation file, e.g. 'goa_uniprot_some.gaf'
    include_genes: set/list of gene names to include
    Optionally, specify what column in the annotation file that contains name of gene product:
    """
    print(("Started at", time.asctime()))
    # open annotation file to analyse and index data
    if annotfile.endswith(".gz"):
        src = gzip.open(annotfile, 'rt')
        dst = gzip.open(newannotfile, 'wt')
    else:
        src = open(annotfile, 'rt')
        dst = open(newannotfile, 'wt')
    cnt0 = 0
    cnt1 = 0
    for line in src:
        if line.startswith('!'):
            continue
        cnt0 += 1
        fields = line.strip().split('\t')
        gene = fields[annotfile_column_gene]
        if gene in include_genes:
            dst.write(line)
            cnt1 += 1
    src.close()
    dst.close()
    print(("Read %d annotations; wrote %d annotations" % (cnt0, cnt1)))

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: godata <COMMAND> where <COMMAND> is one of the following', \
        "\n\tinclude <gaf-file-src> <gaf-file-dest> <gene-txt-file> where",\
        "\n\t\t<gaf-file-src> is the original annotation file",\
        "\n\t\t<gaf-file-dest> is the new, reduced-size annotation file",\
        "\n\t\t<gene-txt-file> lists gene names to include in new gaf-file-dest")
        sys.exit(1)
    if sys.argv[1].upper() == 'INCLUDE' and len(sys.argv) > 4:
        f = open(sys.argv[4])
        include_genes = set()
        for line in f:
            include_genes.add(line.strip())
        reduceAnnotFile(sys.argv[2], sys.argv[3], include_genes)
    else:
        print('Unknown command \"' + sys.argv[1] + '\" or arguments:'),
        for arg in sys.argv[2:]:
            print(arg),