sym.py 13.9 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
"""
Module symbol is for defining alphabets (of symbols), and
for storing and operating on symbols and tuples (ordered or
unordered).
"""
import os

# ------------------ Alphabet ------------------

class Alphabet(object):
    """ Defines an immutable biological alphabet (e.g. the alphabet for DNA is AGCT)
    that can be used to create sequences (see sequence.py).
    We use alphabets to define "tuple" tables, where entries are keyed by combinations
    of symbols of an alphabet (see class TupleStore below).
    Alphabets are used to define probability distributions for stochastic events
    (see prob.py). """

    def __init__(self, symbolString):
        """ Construct an alphabet from a string of symbols. Lower case characters
        will be converted to upper case, repeated characters are ignored.
        Example of constructing the DNA alphabet:
        >>> alpha = Alphabet('ACGTttga')
        >>> alpha.symbols
        ('A', 'C', 'G', 'T') """

        # Add each symbol to the symbols list, one at a time, and ignore doubles (could use "set" here...)
        _symbols = [] # create a temporary list
        for s in symbolString:
            if not str(s).upper()[0] in _symbols:
                _symbols.append(str(s).upper()[0])
        _symbols.sort() # we put them in alphabetical (one canonical) order
        # OK done extracting, put them in place
        self.symbols = tuple(_symbols); # create the immutable tuple from the extracted list
        self.length = len(self.symbols)
        self.annotations = {}

    def __str__(self):
        return str(self.symbols)

    def __len__(self):
        return len(self.symbols)

    def __iter__(self):
        return self.symbols.__iter__()

    def __getitem__(self, ndx):
        """ Retrieve the symbol(s) at the specified index (or slice of indices) """
        return self.symbols[ndx]

    def __contains__(self, sym):
        """ Check if the given symbol is a member of the alphabet. """
        return sym in self.symbols

    def index(self, sym):
        """ Retrieve the index of the given symbol in the alphabet. """
        # If the symbol is valid, use the tuple's index function
        if sym in self.symbols:
            syms = self.symbols
            return syms.index(sym)
        else:
            raise RuntimeError('Symbol %s is not indexed by alphabet %s' % (sym, str(self.symbols)))

    def __eq__(self, rhs):
        """ Test if the rhs alphabet is equal to ours. """
        if rhs == None:
            return False
        if len(rhs) != len(self):
            return False
        # OK we know they're same size...
        for sym in self.symbols:
            if not sym in rhs:
                return False
        return True

    def isSubsetOf(self, alpha2):
        """ Test if this alphabet is a subset of alpha2. """
        for sym in self.symbols:
            if not alpha2.isValidSymbol(sym):
                return False
        return True

    def isSupersetOf(self, alpha2):
        """ Test if this alphabet is a superset of alpha2. """
        return alpha2.isSubsetOf(self)

    def annotateSym(self, label, sym, value):
        try:
            lookup = self.annotations[label]
        except KeyError:
            lookup = self.annotations[label] = {}
        lookup[sym] = value

    def annotateAll(self, label, symdictOrFilename):
        if isinstance(symdictOrFilename, str): # we assume it is a filename
            fh = open(symdictOrFilename)
            string = fh.read()
            d = {}
            for line in string.splitlines():
                if len(line.strip()) == 0:
                    continue
                sections = line.split()
                symstr, value = sections[0:2]
                for sym in symstr:
                    d[sym] = value
            fh.close()
        else: # we assume it is a dictionary
            d = symdictOrFilename
        for sym in d:
            self.annotateSym(label, sym, d[sym])

    def getAnnotation(self, label, sym):
        try:
            lookup = self.annotations[label]
            return lookup[sym]
        except KeyError:
            return None


""" Below we declare alphabets that are going to be available when
this module is imported """
Bool_Alphabet = Alphabet('TF')
DNA_Alphabet = Alphabet('ACGT')
DNA_Alphabet_wN = Alphabet('ACGTN')
Mikael Boden's avatar
Mikael Boden committed
124
RNA_Alphabet_wN = Alphabet('ACGUN')
Mikael Boden's avatar
Mikael Boden committed
125 126 127
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Mikael Boden's avatar
Mikael Boden committed
128
Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
Mikael Boden's avatar
Mikael Boden committed
129
Protein_wGAP  = Alphabet('ACDEFGHIKLMNPQRSTVWY-')
Mikael Boden's avatar
Mikael Boden committed
130 131 132
DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC')

Mikael Boden's avatar
Mikael Boden committed
133 134
predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
                   'DNA': DNA_Alphabet,
Mikael Boden's avatar
Mikael Boden committed
135
                   'RNA': RNA_Alphabet,
Mikael Boden's avatar
Mikael Boden committed
136 137
                   'DNAwN': RNA_Alphabet_wN,
                   'RNAwN': DNA_Alphabet_wN,
Mikael Boden's avatar
Mikael Boden committed
138
                   'Protein': Protein_Alphabet,
Mikael Boden's avatar
Mikael Boden committed
139 140
                   'ProteinwX': Protein_wX,
                   'ProteinwSTOP' : Protein_wSTOP,
141
                   'ProteinwGAP': Protein_wGAP,
Mikael Boden's avatar
Mikael Boden committed
142 143
                   'DSSP_Alphabet' : DSSP_Alphabet,
                   'DSSP3_Alphabet' : DSSP3_Alphabet}
Mikael Boden's avatar
Mikael Boden committed
144 145
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
146 147
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX', 'ProteinwSTOP',
                  'ProteinwGAP', 'DSSP_Alphabet', 'DSSP3_Alphabet']
Mikael Boden's avatar
Mikael Boden committed
148 149 150
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet.annotateAll('html-color', {
#orange*/
'G': "#F5A259",
#green*/
'N':"#00f900",
'Q':"#00f900",
'S': "#00f900",
'T': "#00f900",
#red*/
'K': "#f62f00",
'R': "#f62f00",
#blue/purple*/
'A':"#92b2f3",
'I': "#92b2f3",
'L': "#92b2f3",
'M': "#92b2f3",
'V': "#92b2f3",
'W': "#92b2f3",
'F': "#92b2f3",
#yellow*/
'P': "#FFFB00",
#pink*/
'C':"#F59692",
#aqua*/
'H': "#04B2B3",
'Y': "#04B2B3",
#purple*/
'D':"#CE64CB",
'E':"#CE64CB"})
Mikael Boden's avatar
Mikael Boden committed
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330

# ------------------ Substitution Matrix ------------------

class TupleStore(dict):
    """ Internal utility class that can be used for associating
    a value with ordered n-tuples (n=1..N).
    Read/write functions are defined for instances of this class.
    """

    def __init__(self, alphas=None, entries=None, sparse=True):
        """
        Manage entries keyed by symbol-tuples with values of arbitrary type.
        If alphas is None, the alphabet(s) are inferred from the provided entries.
        If entries is None, all entries are defined by possible combinations of symbols from specified alphabets,
        and are assumed to be None until specified. Either alphas or entries must be supplied.
        If sparse is True, a sparse memory-saving encoding is used, if false, a time-saving, more flexible encoding is used.
        >>> matrix = TupleStore({'AA': 2, 'AW': -3, 'WW': 4, 'AR': -1})
        >>> matrix[('A', 'W')]
        -3
        >>> matrix['AR']
        -1
        """
        assert sparse, "Currently only sparse encoding is implemented."
        assert alphas or entries, "Either alphabets or entries (from which alphabets can be inferred) must be supplied."
        self.sparse = sparse         # sparse encoding if true
        if alphas == None:
            self.alphas = None       # need to figure out alphabet from supplied entries
            self.keylen = None       # tuple length not known yet
        elif type(alphas) is Alphabet:
            self.alphas = tuple ([ alphas ]) # make it into a tuple
            self.keylen = 1          # tuple length 1
        else:
            self.alphas = alphas     # alphabets are supplied
            self.keylen = len(alphas)# length of tuples is the same as the number alphabets

        # Check if entries are supplied to the constructor
        if entries == None:
            self.entries = entries = {}
        elif type(entries) is dict:
            raise RuntimeError("When specified, entries must be a dictionary")
        # Check length of tuples, must be the same for all
        for entry in entries:
            if self.keylen == None:
                self.keylen = len(entry)
            elif self.keylen != len(entry):
                raise RuntimeError("All entries must have the same number of symbols")

        # go through each position in tuples, to check what alphabet is right
        myalphas = []                   # my suggestions from entries (need to be subsets of specified)
        for idx in range(self.keylen):
            symset = set()              # we collect all symbols in position idx here
            for key in entries:
                symset.add(key[idx])
            myalpha = Alphabet(symset)
            myalphas.append(myalpha)
            if self.alphas != None:     # if specified it needs to be a superset of that we constructed
                if not self.alphas[idx].isSupersetOf(myalpha):
                    raise RuntimeError("Specified alphabet is not compatible with specified entries")

        if self.alphas == None:     # if not specified to constructor use those we found
            self.alphas = tuple(myalphas)

        for key in entries:
            self[key] = entries[key]

    def _isValid(self, symkey):
        for idx in range(self.keylen):
            if not symkey[idx] in self.alphas[idx]:
                return False
        return True

    def __setitem__(self, symkey, value):
        assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
        assert self._isValid(symkey), "Invalid symbol in entry"
        self.entries[symkey] = value

    def __getitem__(self, symkey):
        """ Return the score matching the given symbols together."""
        assert self.keylen == len(symkey), "Entries must be of the same length"
        try:
            return self.entries[symkey]
        except KeyError:
            return None

    def __iadd__(self, symkey, ivalue):
        assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
        assert self._isValid(symkey), "Invalid symbol in entry"
        try:
            self.entries[symkey] += ivalue
        except KeyError:
            self.entries[symkey] = ivalue

    def __isub__(self, symkey, ivalue):
        assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
        assert self._isValid(symkey), "Invalid symbol in entry"
        try:
            self.entries[symkey] -= ivalue
        except KeyError:
            self.entries[symkey] = -ivalue

    def getAll(self, symkey=None):
        """ Return the values matching the given symbols together.
        symkey: tuple (or list) of symbols or None (symcount symbol); if tuple is None, all entries are iterated over.
        """
        if symkey == None:
            symkey = []
            for idx in range(self.keylen):
                symkey.append(None)
        else:
            assert self.keylen == len(symkey), "Entries must be of the same length"
        for idx in range(self.keylen):
            if symkey[idx] != None:
                if not symkey[idx] in self.alphas[idx]:
                    raise RuntimeError("Invalid entry: must be symbols from specified alphabet or None")
        return TupleEntries(self, symkey)

    def __iter__(self):
        return TupleEntries(self, tuple([None for _ in range(self.keylen)]))

    def items(self, sort = False):
        """ In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
        If sort is True, entries are sorted in descending order of value.
        Note that this function should NOT be used for big (>5 variables) tables."""
        ret = []
        for s in self.entries:
            if self[s] != None:
                ret.append((s, self[s]))
        if sort:
            return sorted(ret, key=lambda v: v[1], reverse=True)
        return ret

class TupleEntries(object):
    """ Iterator class for multiple entries in a tuple store.
    """
    def __init__(self, tuplestore, symkey):
        self.tuplestore = tuplestore
        self.symkey = symkey
        self.symcount = []
        self.indices = []
        for ndx in range(tuplestore.keylen):
            if symkey[ndx] == None:
                self.indices.append(ndx)
                self.symcount.append(0)        # start at this index to alter symbol
            else:
                self.symcount.append(None)     # do not alter this symbol
        self.nextIsLast = False

    def __iter__(self):
        return self

Mikael Boden's avatar
Mikael Boden committed
331
    def __next__(self):
Mikael Boden's avatar
Mikael Boden committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
        """ Step through sequence of entries, either
        (if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
        (if sparse) with calls to tuple store based on all possible symbol combinations."""

        if self.nextIsLast:
            raise StopIteration

        mykey = [] # construct current combination from known and unspecified symbols
        for ndx in range(self.tuplestore.keylen):
            if (self.symkey[ndx] == None):
                sym = self.tuplestore.alphas[ndx][self.symcount[ndx]]
                mykey.append(sym)
            else:
                mykey.append(self.symkey[ndx])

        # decide which ndx that should be increased (only one)
        self.nextIsLast = True # assume this is the last round (all counters are re-set)
        for ndx in self.indices:
            if self.symcount[ndx] == len(self.tuplestore.alphas[ndx]) - 1: # if we just entered the last symbol of this alphabet
                self.symcount[ndx] = 0                  # reset count here
            else:
                self.symcount[ndx] = self.symcount[ndx] + 1
                self.nextIsLast = False
                break

        return tuple(mykey)