sequence.py 59.7 KB
Newer Older
Mikael Boden's avatar
Mikael Boden committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
"""
Module *** sequence ***

This module depends on the following modules

sym -- defines an alphabet
prob -- defines structures to hold probabilities (prob also depends on sym)

This module incorporates classes for

Sequence -- names and defines a sequence of symbols; computes various transformations and pairwise alignments
Alignment -- defines a multiple sequence alignment; computes stats for use in substitution matrices
SubstMatrix -- substitution matrix class to support alignment methods
Regexp -- defines patterns as regular expressions for textual pattern matching in sequences
PWM -- defines a weight matrix that can score any site in actual sequences

Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services

Mikael Boden's avatar
Mikael Boden committed
20 21 22
This code has been adapted to Python 3.5 in 2017

This code has gone through many updates and has benefited from kind contributions of course participants.
Mikael Boden's avatar
Mikael Boden committed
23 24 25 26 27 28 29 30 31 32 33 34 35
Please keep suggestions coming!
Email: m.boden@uq.edu.au
"""

import string, sys, re, math, os, array
import numpy
from webservice import *
from sym import *
from prob import *

# Sequence ------------------****

class Sequence(object):
Mikael Boden's avatar
Mikael Boden committed
36 37
    """ A biological sequence. Stores the sequence itself (as a compact array), 
    the alphabet (i.e., type of sequence it is), and optionally a name and further 
Mikael Boden's avatar
Mikael Boden committed
38
    information. """
Mikael Boden's avatar
Mikael Boden committed
39 40
    
    sequence = None # The array of symbols that make up the sequence 
Mikael Boden's avatar
Mikael Boden committed
41 42 43 44 45
    alphabet = None # The alphabet from which symbols come
    name =     None # The name (identifier) of a sequence
    info =     None # Other information (free text; e.g. annotations)
    length =   None # The number of symbols that the sequence is composed of
    gappy =    None # True if the sequence has "gaps", i.e. positions that represent deletions relative another sequence
Mikael Boden's avatar
Mikael Boden committed
46
    
Mikael Boden's avatar
Mikael Boden committed
47 48 49 50 51 52 53 54 55 56 57 58
    def __init__(self, sequence, alphabet = None, name = '', info = '', gappy = False):
        """ Create a sequence with the sequence data. Specifying the alphabet,
        name and other information about the sequence are all optional.
        The sequence data is immutable (stored as a string).
        Example:
        >>> myseq = Sequence('MVSAKKVPAIAMSFGVSF')
        will create a sequence with no name, and assign one of the predefined
        alphabets on the basis of what symbols were used.
        >>> myseq.alphabet.symbols
        will output the standard protein alphabet:
        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'] """
Mikael Boden's avatar
Mikael Boden committed
59
        
Mikael Boden's avatar
Mikael Boden committed
60
        self.sequence = sequence
Mikael Boden's avatar
Mikael Boden committed
61
        
Mikael Boden's avatar
Mikael Boden committed
62
        # Assign an alphabet
Mikael Boden's avatar
Mikael Boden committed
63
        # If no alphabet is provided, attempts to identify the alphabet from sequence
Mikael Boden's avatar
Mikael Boden committed
64 65 66 67 68 69 70 71 72 73 74
        self.alphabet = None
        if not alphabet is None:
            for sym in self.sequence:
                if not sym in alphabet and (sym != '-' or not gappy):  # error check: bail out
                    raise RuntimeError('Invalid symbol: %c in sequence %s' % (sym, name))
            self.alphabet = alphabet
        else:
            for alphaName in preferredOrder:
                alpha = predefAlphabets[alphaName]
                valid = True
                for sym in self.sequence:
Mikael Boden's avatar
Mikael Boden committed
75
                    if not sym in alpha and (sym != '-' or not gappy):  
Mikael Boden's avatar
Mikael Boden committed
76 77 78 79 80 81 82
                        valid = False
                        break
                if valid:
                    self.alphabet = alpha
                    break
            if self.alphabet is None:
                raise RuntimeError('Could not identify alphabet from sequence: %s' % name)
Mikael Boden's avatar
Mikael Boden committed
83
        
Mikael Boden's avatar
Mikael Boden committed
84 85 86 87 88
        # Store other information
        self.name = name
        self.info = info
        self.length = len(self.sequence)
        self.gappy = gappy
Mikael Boden's avatar
Mikael Boden committed
89
        
Mikael Boden's avatar
Mikael Boden committed
90 91 92
    def __len__(self):
        """ Defines what the "len" operator returns for an instance of Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
Mikael Boden's avatar
Mikael Boden committed
93
        >>> print (len(seq))
Mikael Boden's avatar
Mikael Boden committed
94 95 96 97 98 99 100 101 102 103
        9
        """
        return len(self.sequence)

    def __str__(self):
        """ Defines what should be printed when the print statement is used on a Sequence instance """
        str = self.name + ': '
        for sym in self:
            str += sym
        return str
Mikael Boden's avatar
Mikael Boden committed
104
    
Mikael Boden's avatar
Mikael Boden committed
105 106 107 108
    def __iter__(self):
        """ Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
        >>> seq = Sequence('AGGAT', DNA_Alphabet)
        >>> for sym in seq:
Mikael Boden's avatar
Mikael Boden committed
109
                print (sym)
Mikael Boden's avatar
Mikael Boden committed
110
        will print A, G, G, A, T (each on a separate row)
Mikael Boden's avatar
Mikael Boden committed
111
        """ 
Mikael Boden's avatar
Mikael Boden committed
112 113
        tsyms = tuple(self.sequence)
        return tsyms.__iter__()
Mikael Boden's avatar
Mikael Boden committed
114
    
Mikael Boden's avatar
Mikael Boden committed
115 116 117
    def __contains__(self, item):
        """ Defines what is returned when the "in" operator is used on a Sequence, e.g.
        >>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
Mikael Boden's avatar
Mikael Boden committed
118
        >>> print ('T' in seq)
Mikael Boden's avatar
Mikael Boden committed
119
        True
Mikael Boden's avatar
Mikael Boden committed
120 121
            which is equivalent to 
        >>> print (seq.__contains__('T'))
Mikael Boden's avatar
Mikael Boden committed
122
        True
Mikael Boden's avatar
Mikael Boden committed
123
        >>> print ('X' in seq)
Mikael Boden's avatar
Mikael Boden committed
124
        False
Mikael Boden's avatar
Mikael Boden committed
125
        """ 
Mikael Boden's avatar
Mikael Boden committed
126 127 128 129
        for sym in self.sequence:
            if sym == item:
                return True
        return False
Mikael Boden's avatar
Mikael Boden committed
130
        
Mikael Boden's avatar
Mikael Boden committed
131 132
    def __getitem__(self, ndx):
        """ Retrieve a specified index (or a "slice" of indices) of the sequence data.
Mikael Boden's avatar
Mikael Boden committed
133
            Calling self.__getitem__(3) is equivalent to self[3] 
Mikael Boden's avatar
Mikael Boden committed
134 135
        """
        if type(ndx) is slice:
Mikael Boden's avatar
Mikael Boden committed
136
            return ''.join(self.sequence[ndx])
Mikael Boden's avatar
Mikael Boden committed
137 138
        else:
            return self.sequence[ndx]
Mikael Boden's avatar
Mikael Boden committed
139
        
Mikael Boden's avatar
Mikael Boden committed
140 141
    def writeFasta(self):
        """ Write one sequence in FASTA format to a string and return it. """
Mikael Boden's avatar
Mikael Boden committed
142 143 144 145
        if parseDefline(self.info)[0] == self.name: # this sequence was previously "parsed" and info should hold the original header
            fasta = '>' + self.info + '\n'
        else:
            fasta = '>' + self.name + ' ' + self.info + '\n'
Mikael Boden's avatar
Mikael Boden committed
146 147
        data = ''.join(self.sequence)
        nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1))
Mikael Boden's avatar
Mikael Boden committed
148 149 150 151
        for i in range(nlines):
            lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
            fasta += lineofseq
        return fasta
Mikael Boden's avatar
Mikael Boden committed
152
    
Mikael Boden's avatar
Mikael Boden committed
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
    def count(self, findme = None):
        """ Get the number of occurrences of specified symbol findme OR
            if findme = None, return a dictionary of counts of all symbols in alphabet """
        if findme != None:
            cnt = 0
            for sym in self.sequence:
                if findme == sym:
                    cnt = cnt + 1
            return cnt
        else:
            symbolCounts = {}
            for symbol in self.alphabet:
                symbolCounts[symbol] = self.count(symbol)
            return symbolCounts

Mikael Boden's avatar
Mikael Boden committed
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
    def getDegapped(self):
        """ Create the sequence excluding gaps, and provide the corresponding indices for the gapped version, e.g.
        >>> gappy = Sequence('AC--TA-GA', DNA_Alphabet, name = 'myseq', gappy = True)
        >>> degapped, indices = gappy.getDegapped()
        >>> print(degapped)
            myseq: ACTAGA
        >>> print(indices)
            [0, 1, 4, 5, 7, 8]
        """
        idxs = []
        newseq = []
        for i in range(len(self.sequence)):
            if not self.sequence[i] == '-':
                newseq.append(self.sequence[i])
                idxs.append(i)
        return Sequence(newseq, self.alphabet, self.name, self.info, gappy = False), idxs

    def find(self, findme, gappy = False):
Mikael Boden's avatar
Mikael Boden committed
186
        """ Find the position of the specified symbol or sub-sequence """
Mikael Boden's avatar
Mikael Boden committed
187 188 189 190 191 192
        if gappy == False or self.gappy == False:
            return ''.join(self.sequence).find(findme)
        else: # if the sequence is gappy AND the function is called with gappy = True THEN run the find on the de-gapped sequence
            degapped, idxs = self.getDegapped()
            idx = ''.join(degapped).find(findme)
            return idxs[idx] if idx >= 0 else -1
Mikael Boden's avatar
Mikael Boden committed
193 194 195 196 197

"""
Below are some useful methods for loading data from strings and files.
Recognize the FASTA format (nothing fancy).
"""
Mikael Boden's avatar
Mikael Boden committed
198
def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defline = True):
Mikael Boden's avatar
Mikael Boden committed
199 200 201 202 203 204
    """ Read the given string as FASTA formatted data and return the list of
        sequences contained within it.
        If alphabet is specified, use it, if None (default) then guess it.
        If ignore is False, errors cause the method to fail.
        If ignore is True, errors will disregard sequence.
        If gappy is False (default), sequence cannot contain gaps,
Mikael Boden's avatar
Mikael Boden committed
205 206
        if True gaps are accepted and included in the resulting sequences.
        If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
Mikael Boden's avatar
Mikael Boden committed
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
    seqlist = []    # list of sequences contained in the string
    seqname = None  # name of *current* sequence
    seqinfo = None
    seqdata = []    # sequence data for *current* sequence
    for line in string.splitlines():    # read every line
        if len(line) == 0:              # ignore empty lines
            continue
        if line[0] == '>':  # start of new sequence
            if seqname:     # check if we've got one current
                try:
                    current = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
                    seqlist.append(current)
                except RuntimeError as errmsg:
                    if not ignore:
                        raise RuntimeError(errmsg)
            # now collect data about the new sequence
            seqinfo = line[1:].split() # skip first char
            if len(seqinfo) > 0:
                try:
Mikael Boden's avatar
Mikael Boden committed
226 227 228
                    if parse_defline:
                        parsed = parseDefline(seqinfo[0])
                        seqname = parsed[0]
229 230
                        seqinfo = line[1:]
                    else: # we are not parsing the sequence name so no need to duplicate it in the info
Mikael Boden's avatar
Mikael Boden committed
231
                        seqname = seqinfo[0]
232 233 234 235 236 237 238
                        if len(seqinfo) > 0: # more than a name
                            edited_info = ''
                            for infopart in seqinfo[1:]:
                                edited_info += infopart + ' '
                            seqinfo = edited_info
                        else:
                            seqinfo = ''
Mikael Boden's avatar
Mikael Boden committed
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
                except IndexError as errmsg:
                    if not ignore:
                        raise RuntimeError(errmsg)
            else:
                seqname = ''
                seqinfo = ''
            seqdata = []
        else:               # we assume this is (more) data for current
            cleanline = line.split()
            for thisline in cleanline:
                seqdata.extend(tuple(thisline.strip('*')))
    # we're done reading the file, but the last sequence remains
    if seqname:
        try:
            lastseq = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
            seqlist.append(lastseq)
        except RuntimeError as errmsg:
            if not ignore:
                raise RuntimeError(errmsg)
    return seqlist

def parseDefline(string):
    """ Parse the FASTA defline (see http://en.wikipedia.org/wiki/FASTA_format)
        GenBank, EMBL, etc                gi|gi-number|gb|accession|locus
        SWISS-PROT, TrEMBL                sp|accession|name
        ...
        Return a tuple with
        [0] primary search key, e.g. UniProt accession, Genbank GI
        [1] secondary search key, e.g. UniProt name, Genbank accession
        [2] source, e.g. 'sp' (SwissProt/UniProt), 'tr' (TrEMBL), 'gb' (Genbank)
    """
    if len(string) == 0: return ('', '', '', '')
    s = string.split()[0]
Mikael Boden's avatar
Mikael Boden committed
272
    if re.match("^sp\|[A-Z][A-Z0-9]*\|\S+", s):            arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
Mikael Boden's avatar
Mikael Boden committed
273
    elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
Mikael Boden's avatar
Mikael Boden committed
274 275 276 277
    elif re.match("^gi\|[0-9]*\|\S+\|\S+", s):               arg = s.split('|');  return (arg[1], arg[3], arg[0], arg[2])
    elif re.match("gb\|\S+\|\S+", s):                        arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("emb\|\S+\|\S+", s):                       arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    elif re.match("^refseq\|\S+\|\S+", s):                   arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
Mikael Boden's avatar
Mikael Boden committed
278
    elif re.match("[A-Z][A-Z0-9]*\|\S+", s):            arg = s.split('|');  return (arg[0], arg[1], 'UniProt', '') # assume this is UniProt
Mikael Boden's avatar
Mikael Boden committed
279 280
    else: return (s, '', '', '')

Mikael Boden's avatar
Mikael Boden committed
281
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False, parse_defline = True):
Mikael Boden's avatar
Mikael Boden committed
282 283 284 285 286 287
    """ Read the given FASTA formatted file and return the list of sequences
        contained within it. Note that if alphabet is NOT specified, it will take a
        separate guess for each sequence.
        If ignore is False, errors cause the method to fail.
        If ignore is True, errors will disregard sequence.
        If gappy is False (default), sequence cannot contain gaps,
Mikael Boden's avatar
Mikael Boden committed
288 289
        if True gaps are accepted and included in the resulting sequences.
        If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
Mikael Boden's avatar
Mikael Boden committed
290 291 292 293 294 295 296 297
    fh = open(filename)
    seqlist = []
    batch = '' # a batch of rows including one or more complete FASTA entries
    rowcnt = 0
    for row in fh:
        row = row.strip()
        if len(row) > 0:
            if row.startswith('>') and rowcnt > 0:
Mikael Boden's avatar
Mikael Boden committed
298
                more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
Mikael Boden's avatar
Mikael Boden committed
299 300 301 302 303 304 305
                if len(more) > 0:
                    seqlist.extend(more)
                batch = ''
                rowcnt = 0
            batch += row + '\n'
            rowcnt += 1
    if len(batch) > 0:
Mikael Boden's avatar
Mikael Boden committed
306
        more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
Mikael Boden's avatar
Mikael Boden committed
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
        if len(more) > 0:
            seqlist.extend(more)
    fh.close()
    return seqlist

def writeFastaFile(filename, seqs):
    """ Write the specified sequences to a FASTA file. """
    fh = open(filename, 'w')
    for seq in seqs:
        fh.write(seq.writeFasta())
    fh.close()

def getMarkov(seqs, order = 0):
    """ Retrieve the Markov stats for a set of sequences. """
    myseqs = seqs
    if seqs is Sequence:
        myseqs = list([seqs])
    myalpha = None
    for seq in myseqs:
        if myalpha == None:
            myalpha = seq.alphabet
        else:
            if seq.alphabet != myalpha:
                raise RuntimeError('Sequence ' + seq.name + ' uses an invalid alphabet ')
Mikael Boden's avatar
Mikael Boden committed
331
    jp = Joint([myalpha for _ in range(order)])
Mikael Boden's avatar
Mikael Boden committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
    for seq in myseqs:
        for i in range(len(seq) - order):
            sub = seq[i:i + order + 1]
            jp.observe(sub)
    return jp

def getCount(seqs, findme = None):
    if findme != None:
        cnt = 0
        for seq in seqs:
            cnt += seq.count(findme)
        return cnt
    else:
        if len(seqs) > 0:
            alpha = seqs[0].alphabet
            patcnt = {}
            for a in alpha:
                patcnt[a] = getCount(seqs, a)
        return patcnt

# Alignment ------------------

class Alignment():
    """ A sequence alignment class. Stores two or more sequences of equal length where
Mikael Boden's avatar
Mikael Boden committed
356
    one symbol is gap '-' 
Mikael Boden's avatar
Mikael Boden committed
357
    Example usage:
Mikael Boden's avatar
Mikael Boden committed
358 359
    >>> seqs = [Sequence('THIS-LI-NE-', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
    >>> print (Alignment(seqs))
Mikael Boden's avatar
Mikael Boden committed
360
     THIS-LI-NE-
Mikael Boden's avatar
Mikael Boden committed
361
     --ISALIGNED """
Mikael Boden's avatar
Mikael Boden committed
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389

    alignlen = None
    seqs = None
    alphabet = None

    def __init__(self, seqs):
        self.alignlen = -1
        self.seqs = seqs
        self.alphabet = None
        for s in seqs:
            if self.alignlen == -1:
                self.alignlen = len(s)
            elif self.alignlen != len(s):
                raise RuntimeError("Alignment invalid: different lengths")
            if self.alphabet != None and self.alphabet != s.alphabet:
                raise RuntimeError("Alignment invalid: different alphabets")
            self.alphabet = s.alphabet

    def getnamelen(self):
        namelen = 0
        for seq in self.seqs:
            namelen = max(len(seq.name), namelen)
        return namelen

    def __len__(self):
        """ Defines what the "len" operator returns for an instance of Alignment, e.g.
        >>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
        >>> aln = Alignment(seqs)
Mikael Boden's avatar
Mikael Boden committed
390
        >>> print(len(aln))
Mikael Boden's avatar
Mikael Boden committed
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417
        2
        """
        return len(self.seqs)

    def getSize(self):
        """ Returns the size of an alignment in terms of number of columns """
        return self.alignlen

    def __str__(self):
        string = ''
        namelen = self.getnamelen()
        for seq in self.seqs:
            string += seq.name.ljust(namelen+1)
            for sym in seq:
                string += sym
            string += '\n'
        return string

    def __getitem__(self, ndx):
        return self.seqs[ndx]

    def writeClustal(self, filename = None):
        """ Write the alignment to a string or file using the Clustal file format. """
        symbolsPerLine = 60
        maxNameLength =  self.getnamelen() + 1
        string = ''
        wholeRows = self.alignlen / symbolsPerLine
Mikael Boden's avatar
Mikael Boden committed
418
        for i in range(int(wholeRows)):
Mikael Boden's avatar
Mikael Boden committed
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477
            for j in range(len(self.seqs)):
                string += self.seqs[j].name.ljust(maxNameLength) + ' '
                string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
            string += '\n'
        # Possible last row
        lastRowLength = self.alignlen - wholeRows*symbolsPerLine
        if lastRowLength > 0:
            for j in range(len(self.seqs)):
                if maxNameLength > 0:
                    string += self.seqs[j].name.ljust(maxNameLength) + ' '
                string += self.seqs[j][-lastRowLength:] + '\n'
        if filename != None:
            fh = open(filename, 'w')
            fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
            fh.write(string)
            fh.close()
            return
        return string

    def getProfile(self, pseudo = 0.0, countGaps = True):
        """ Determine the probability matrix from the alignment, assuming
        that each position is independent of all others. """
        p = IndepJoint([self.alphabet for _ in range(self.alignlen)], pseudo)
        for seq in self.seqs:
            p.observe(seq, 1, countGaps = countGaps)
        return p

    def getConsensus(self):
        """ Construct a consensus sequence. """
        syms = []
        for col in range(self.alignlen):
            d = Distrib(self.alphabet)
            for seq in self.seqs:
                if seq[col] in self.alphabet:
                    d.observe(seq[col])
            syms.append(d.getmax())
        return Sequence(syms)

    def getConsensusForColumn(self, colidx):
        symcnt = {}
        for seq in self.seqs:
            mysym = seq[colidx]
            try:
                symcnt[mysym] += 1
            except:
                symcnt[mysym] = 1
        consensus = None
        maxcnt = 0
        for mysym in symcnt:
            if symcnt[mysym] > maxcnt:
                maxcnt = symcnt[mysym]
                consensus = mysym
        return consensus

    def displayConsensus(self, theta1 = 0.2, theta2 = 0.05, lowercase = True):
        """ Display a table with rows for each alignment column, showing
            column index, entropy, number of gaps, and symbols in order of decreasing probability.
            theta1 is the threshold for displaying symbols in upper case,
            theta2 is the threshold for showing symbols at all, and in lower case. """
Mikael Boden's avatar
Mikael Boden committed
478 479
        print(("Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)))
        print(("Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)))
Mikael Boden's avatar
Mikael Boden committed
480 481 482 483 484 485 486 487
        for col in range(self.alignlen):
            d = Distrib(self.alphabet)
            gaps = 0
            for seq in self.seqs:
                if seq[col] in self.alphabet:
                    d.observe(seq[col])
                else:
                    gaps += 1
Mikael Boden's avatar
Mikael Boden committed
488
            print(((col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,))
Mikael Boden's avatar
Mikael Boden committed
489 490 491
            symprobs = d.getProbsort()
            (_, maxprob) = symprobs[0]
            if maxprob >= theta1:
Mikael Boden's avatar
Mikael Boden committed
492
                print(("%d\tTRUE\t" % int(maxprob * 100),))
Mikael Boden's avatar
Mikael Boden committed
493
            else:
Mikael Boden's avatar
Mikael Boden committed
494
                print(("%d\t\t" % int(maxprob * 100),))
Mikael Boden's avatar
Mikael Boden committed
495 496
            for (sym, prob) in symprobs:
                if prob >= theta1:
Mikael Boden's avatar
Mikael Boden committed
497
                    print((sym, "%d%%" % int(prob * 100),))
Mikael Boden's avatar
Mikael Boden committed
498
                elif prob >= theta2 and lowercase:
Mikael Boden's avatar
Mikael Boden committed
499
                    print((sym.lower(), "%d%%" % int(prob * 100),))
Mikael Boden's avatar
Mikael Boden committed
500
                elif prob >= theta2:
Mikael Boden's avatar
Mikael Boden committed
501 502
                    print((sym, "%d%%" % int(prob * 100),))
            print()
Mikael Boden's avatar
Mikael Boden committed
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627

    def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
        """ Display a table with rows for each alignment column, showing
            column index, entropy, number of gaps, and symbols in order of decreasing probability.
            theta1 is the threshold for displaying symbols in upper case,
            theta2 is the threshold for showing symbols at all, and in lower case. """
        filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
        f = open(filename, 'w')
        f.write("Alignment of %d sequences, with %d columns\n" % (len(self.seqs), self.alignlen))
        if compact:
            f.write("Column\tConserv\tVariab\tAll (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
        else:
            f.write("Column\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
        countrow = 0
        for col in range(self.alignlen):
            countrow += 1
            if myseq[col] == '-':
                continue
            alist = list(self.alphabet)
            alist.append('-')
            gapalphabet = Alphabet(alist)
            d_gap = Distrib(gapalphabet)
            d_nogap = Distrib(self.alphabet)
            for seq in self.seqs:
                if seq[col] in gapalphabet:
                    d_gap.observe(seq[col])
                if seq[col] in self.alphabet:
                    d_nogap.observe(seq[col])
            f.write("%d\t" % (col + 1))
            symprobs_nogap = d_nogap.getProbsort()
            symprobs_gap = d_gap.getProbsort()
            (maxsym, maxprob) = symprobs_nogap[0]
            if compact:
                if maxprob >= theta1:
                    f.write("%c\t" % maxsym)
                else:
                    f.write("\t")
                    for (sym, prob) in symprobs_gap:
                        if prob >= theta2 and lowercase:
                            f.write("%c" % sym.lower())
                        elif prob >= theta2:
                            f.write("%c" % sym)
                f.write("\t")
            else:
                if maxprob >= theta1:
                    f.write("%d\t" % int(maxprob * 100))
                else:
                    f.write("%d\t\t" % int(maxprob * 100))
            for (sym, prob) in symprobs_gap:
                if prob >= theta1:
                    f.write("%c %d%% " % (sym, int(prob * 100)))
                elif prob >= theta2 and lowercase:
                    f.write("%c %d%% " % (sym.lower(), int(prob * 100)))
                elif prob >= theta2:
                    f.write("%c %d%% " % (sym, int(prob * 100)))
            f.write('\n')
        f.close()

    def calcBackground(self):
        """ Count the proportion of each amino acid's occurrence in the
            alignment, and return as a probability distribution. """
        p = Distrib(self.alphabet)
        for seq in self.seqs:
            for sym in seq:
                if sym in self.alphabet: # ignore "gaps"
                    p.observe(sym)
        return p

    def calcSubstMatrix(self, background = None):
        """ Return a substitutionMatrix whose fg are based on this un-gapped
        multiple sequence alignment. Scores are given in half-bits. """
        # Get a list of the amino acids
        aminoAcids = self.alphabet.symbols
        columns = self.alignlen                   # Length of sequences in alignment
        numSeqs = len(self.seqs)                  # Number of sequences in alignment
        seqPairs = (numSeqs* (numSeqs - 1) ) / 2  # Number of pairs of sequences in ungapped alignment
        aaPairs = seqPairs * columns              # Number of pairs of amino acids in ungapped alignment
        # For each pair of amino acids, calculate the proportion of all aligned
        # amino acids in this alignment which are made up of that pair
        # (i.e., q[ab] = fab / aaPairs, where fab is the number of times
        #  a and b are aligned in this alignment)
        # See page 122 in Understanding Bioinformatics.
        q = {}
        for i in range( len(aminoAcids) ):
            a = aminoAcids[i]
            for j in range(i, len(aminoAcids)):
                b = aminoAcids[j]
                # Count the number of times each pair of amino acids is aligned
                fab = 0
                for column in range(columns):
                    # Count number of each amino acid in each column
                    col = [seq[column] for seq in self.seqs]
                    if a == b:
                        # Number of ways of pairing up n occurrences of amino
                        # acid a is n*(n-1)/2
                        cnt = col.count(a)
                        fab += cnt * (cnt-1)/2
                    else:
                        # Number of ways of pairing up n & m occurrences of
                        # amino acids a & b is n*m
                        fab += col.count(a)*col.count(b)
                # Calculate proportion of all aligned pairs of amino acids
                q[a+b] = q[b+a] = float(fab) / aaPairs
                if q[a+b] == 0:   # This is so we don't end up doing log(0)
                    q[a+b] = q[b+a] = 0.001
        # Background frequency calculation if required
        p = background or self.calcBackground()
        # Calculate log-odds ratio for each pair of amino acids
        s = SubstMatrix(self.alphabet)
        for a in aminoAcids:
            for b in aminoAcids:
                # Calculate random chance probabilitity (eab)
                if a == b:
                    eab = p[a]**2
                else:
                    eab = 2*p[a]*p[b]
                if eab == 0:
                    eab = 0.001
                # Calculate final score to be set in the substitution matrix
                odds = q[a+b] / eab
                sab = math.log(odds, 2) # log_2 transform
                sab = sab * 2 # units in half bits
                s.set(a, b, int(round(sab)))
        return s

Mikael Boden's avatar
Mikael Boden committed
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672
    def outliers(self, cap = None):
        """
        Score the extent to which each sequence in the alignment is an outlier
        :param cap: the number of sequences that need to share a the state of a position for it to be optimally aligned
        :return: a tuple of two lists, each with a score for each sequences, in order of the alignment;
        the first list contains an entropy-based score accumulated over the whole sequence;
        the second list has a gap-continuity score (the greatest entropy-based score collated for a single, continuous gap, most probably a "deletion");
        the third list has a character-continuity score (the greatest entropy-based score collated for a single, continuous character string, most probably an "insertion");
        for all three scores, higher means outlier, zero means it is optimally aligned
        """
        nseqs = len(self.seqs)
        if not cap:
            cap = nseqs
        gapmat = numpy.zeros((nseqs, self.alignlen))
        ngaps = numpy.zeros((self.alignlen))
        entscore = [0 for _ in range(nseqs)] # cumulative entropy based score
        gapscore = [0 for _ in range(nseqs)] # highest gap score
        insscore = [0 for _ in range(nseqs)] # highest insert score
        for c in range(self.alignlen):
            for r in range(nseqs):
                gapmat[r, c] = 1 if self.seqs[r][c] == '-' else 0
                ngaps[c] += gapmat[r, c]
        for r in range(nseqs):
            curgap = 0 # current gap score (cumulative from previous non-gap position)
            curchr = 0 # current insertion score (cumulative from previous gap position)
            in_gap = False
            for c in range(self.alignlen):
                agree_cnt = ngaps[c] if gapmat[r, c] == 1 else (nseqs - ngaps[c])
                logent = math.log(math.log(agree_cnt, nseqs) + 0.000001) if agree_cnt < cap else 0.0
                if gapmat[r, c] == 1:
                    if not in_gap:
                        curgap = 0
                    curgap -= logent
                    if curgap > gapscore[r]:
                        gapscore[r] = curgap
                else: # gapmat[r, c] == 0, i.e. character
                    if in_gap: # first character in a string
                        curchr = 0
                    curchr -= logent
                    if curchr > insscore[r]:
                        insscore[r] = curchr
                entscore[r] -= logent
                in_gap = gapmat[r, c] == 1
        return entscore, gapscore, insscore

Mikael Boden's avatar
Mikael Boden committed
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727
    def calcDistances(self, measure, a=1.0):
        """ Calculate the evolutionary distance between all pairs of sequences
        in this alignment, using the given measure. Measure can be one of
        'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
        given, then the parameter a must also be specified (or else it will use
        the default value of 1.0).
        Definitions of each distance metric are found in Zvelebil and Baum p268-276.
        These are mostly intended for DNA, but adapted for protein (as below).
        Note however that there are alternative distance matrices for proteins (p276).
        """
        measure = measure.lower()
        if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
            raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
        a = float(a)
        if len(self.alphabet) == 4:
            oneless = 3
            alphalen = 4
        elif len(self.alphabet) == 20:
            oneless = 19
            alphalen = 20
        else:
            raise RuntimeError('Invalid sequence alphabet: %s' % str(self.alphabet))
        distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
        # Loop through each pair of sequences
        for i in range(len(self.seqs)):
            for j in range(i + 1, len(self.seqs)):
                seqA = self.seqs[i]
                seqB = self.seqs[j]
                # Calculate the fractional distance (p) first
                # The two sequences of interest are in seqA and seqB
                L = 0
                D = 0
                for k in range(self.alignlen):
                    # For every non-gapped column, put to L
                    # For every non-gapped column where the sequences are
                    # different, put to D
                    if seqA[k] != '-' and seqB[k] != '-':
                        L += 1
                        if seqA[k] != seqB[k]:
                            D += 1
                p = float(D)/L
                # Now calculate the specified measure based on p
                if measure == 'fractional':
                    dist = p
                elif measure == 'poisson':
                    dist = -math.log(1-p)
                elif measure == 'jc':
                    dist = -(float(oneless)/alphalen)*math.log(1 - (float(alphalen)/oneless)*p)
                elif measure == 'k2p':
                    dist = (float(oneless)/alphalen)*a*((1 - (float(alphalen)/oneless)*p)**(-1/a) - 1)
                else: # measure == 'gamma'
                    dist = a*((1-p)**(-1/a) - 1)
                distmat[i, j] = distmat[j, i] = dist
        return distmat

728
    def writeHTML(self, filename = None, col_start = None, col_end = None):
Mikael Boden's avatar
Mikael Boden committed
729
        """ Generate HTML that displays the alignment in color.
Mikael Boden's avatar
Mikael Boden committed
730 731 732
            Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
            and that each symbol maps to a text string naming the color, e.g. 'blue'
        """
733 734
        col_start = col_start or 0
        col_end = col_end or self.alignlen
Mikael Boden's avatar
Mikael Boden committed
735
        html = '''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n'''
736
        html += '''<p style="font-size:12px">\n'''
Mikael Boden's avatar
Mikael Boden committed
737
        maxNameLength =  self.getnamelen()
Mikael Boden's avatar
Mikael Boden committed
738
        html += ''.ljust(maxNameLength) + ' '
Mikael Boden's avatar
Mikael Boden committed
739
        for i in range(self.alignlen - 1):
740
            if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
741
                html += str(i/10+1)[0]
742
            elif (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
743
                html += ' '
744 745
#        html += '%s\n' % (col_end)
        html += '\n'
Mikael Boden's avatar
Mikael Boden committed
746

Mikael Boden's avatar
Mikael Boden committed
747
        if self.alignlen > 10:
Mikael Boden's avatar
Mikael Boden committed
748
            html += ''.ljust(maxNameLength) + ' '
Mikael Boden's avatar
Mikael Boden committed
749
            for i in range(self.alignlen - 1):
750
                if (i+1) % 10 == 0 and (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
751 752
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][(index * -1) + 1 ] if (len(str(i / 10 + 1).split('.')[0]) > 1) else '0'
753
                elif (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
754 755 756 757 758 759
                    html += ' '
            html += '\n'

        if self.alignlen > 100:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
760
                if (i+1) % 10 == 0 and i >= 99  and (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
761 762
                    index = len(str(i/10 + 1).split('.')[0])
                    html += str(i / 10 + 1).split('.')[0][-1] if (len(str(i / 10 + 1).split('.')[0]) >2) else '0'
763
                elif (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
764 765 766 767 768 769
                    html += ' '
            html += '\n'

        if self.alignlen > 1000:
            html += ''.ljust(maxNameLength) + ' '
            for i in range(self.alignlen - 1):
770
                if (i+1) % 10 == 0  and (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
771
                    html += '0' if (len(str(i / 10 + 1).split('.')[0]) > 2) else ' '
772
                elif (i >= col_start and i < col_end):
Mikael Boden's avatar
Mikael Boden committed
773 774 775
                    html += ' '
            html += '\n'
        for seq in self.seqs:
Mikael Boden's avatar
Mikael Boden committed
776
            html += seq.name.ljust(maxNameLength) + ' '
777
            for sym in seq[col_start:col_end]:
Mikael Boden's avatar
Mikael Boden committed
778 779 780 781 782
                color = self.alphabet.getAnnotation('html-color', sym)
                if not color:
                    color = 'white'
                html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
            html += '\n'
783
        html += '</p></pre></body></html>'
Mikael Boden's avatar
Mikael Boden committed
784
        if filename:
Mikael Boden's avatar
Mikael Boden committed
785 786 787 788
            fh = open(filename, 'w')
            fh.write(html)
            fh.close()
        return html
Mikael Boden's avatar
Mikael Boden committed
789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852

def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
    """ Display a table with rows for each alignment column, showing
        column index, entropy, number of gaps, and symbols in order of decreasing probability.
        theta1 is the percent threshold for consensus (when achieved, all other symbols are ignored)
        theta2 is the percent threshold for inclusion (symbols below are ignored).
        countgaps, if true, count gaps (default false).
        consensus, if true, always print the consensus symbol.
        filename is name of file to save the output to (default stdout)."""
    if filename == None:
        f = sys.stdout
    else:
        filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
        f = open(filename, 'w')
    if consensus:
        f.write("Alignment of %d sequences, with %d columns\n" % (len(aln.seqs), aln.alignlen))
        f.write("Consensus>=%.2f;Inclusion>=%.2f)\n" % (theta1, theta2))
    for col in range(aln.alignlen):
        # collect probabilities for column, with or without gap
        myalpha = aln.alphabet
        if countgaps:
            alist = list(aln.alphabet)
            alist.append('-')
            myalpha = Alphabet(alist)
        d = Distrib(myalpha)
        for seq in aln.seqs:
            if seq[col] in myalpha:
                d.observe(seq[col])
        symprobs = d.getProbsort() # the symbols sorted by probability
        ninclusions = 0
        for (s, p) in symprobs:
            if p >= theta2:
                ninclusions += 1
            else:
                break
        if consensus or ninclusions > 1:
            f.write("%d " % (col + 1))
        (maxs, maxp) = symprobs[0]
#        if maxp >= theta1 or consensus:
#            f.write("%c" % maxs)
        for (s, p) in symprobs[1:]:
            if p >= theta2:
                f.write("%c" % s)
        f.write("; ")
    f.write('\n')
    f.close()

def alignGlobal(seqA, seqB, substMatrix, gap = -1):
    """ Align seqA with seqB using the Needleman-Wunsch
    (global) algorithm. subsMatrix is the substitution matrix to use and
    gap is the linear gap penalty to use. """
    lenA, lenB = len(seqA), len(seqB)
    # Create the scoring matrix (S)
    S = numpy.zeros((lenA + 1, lenB + 1))
    # Fill the first row and column of S with multiples of the gap penalty
    for i in range(lenA + 1):
        S[i, 0] = i * gap
    for j in range(lenB + 1):
        S[0, j] = j * gap
    # Calculate the optimum score at each location in the matrix S
    # (where the score represents the best possible score for an alignment
    #  that ends at sequence indices i and j, for A and B, resp.)
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
Mikael Boden's avatar
Mikael Boden committed
853
            match  = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
Mikael Boden's avatar
Mikael Boden committed
854 855 856
            fromTop = S[i-1, j  ] + gap 
            fromLeft = S[i  , j-1] + gap 
            S[i, j] = max([match, fromTop, fromLeft])
Mikael Boden's avatar
Mikael Boden committed
857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911
    # Traceback the optimal alignment
    alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
    alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
    # Start at the end (bottom-right corner of S)
    i = lenA
    j = lenB
    # Stop when we hit the beginning of at least one sequence
    while i > 0 and j > 0:
        if S[i, j] == S[i-1, j] + gap:
            # Got here by a gap in sequence B (go up)
            alignA = seqA[i-1] + alignA
            alignB = '-' + alignB
            i -= 1
        elif S[i, j] == S[i, j-1] + gap:
            # Got here by a gap in sequence A (go left)
            alignA = '-' + alignA
            alignB = seqB[j-1] + alignB
            j -= 1
        else:
            # Got here by aligning the bases (go diagonally)
            alignA = seqA[i-1] + alignA
            alignB = seqB[j-1] + alignB
            i -= 1
            j -= 1
    # Fill in the rest of the alignment if it begins with gaps
    # (i.e., traceback all the way to S[0, 0])
    while i > 0:
        # Go up
        alignA = seqA[i-1] + alignA
        alignB = '-' + alignB
        i -= 1
    while j > 0:
        # Go left
        alignA = '-' + alignA
        alignB = seqB[j-1] + alignB
        j -= 1
    return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])

def alignLocal(seqA, seqB, substMatrix, gap = -1):
    """ Align seqA with seqB using the Smith-Waterman
    (local) algorithm. subsMatrix is the substitution matrix to use and
    gap is the linear gap penalty to use. """
    lenA, lenB = len(seqA), len(seqB)
    # Create the scoring matrix (S)
    S = numpy.zeros((lenA + 1, lenB + 1))
    # Fill the first row and column of S with multiples of the gap penalty
    for i in range(lenA + 1):
        S[i, 0] = 0  # Local: init 0
    for j in range(lenB + 1):
        S[0, j] = 0  # Local: init 0
    # Calculate the optimum score at each location in the matrix S
    # (where the score represents the best possible score for an alignment
    #  that ends at sequence indices i and j, for A and B, resp.)
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
Mikael Boden's avatar
Mikael Boden committed
912
            match  = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
Mikael Boden's avatar
Mikael Boden committed
913 914 915
            fromTop = S[i-1, j  ] + gap 
            fromLeft = S[i  , j-1] + gap 
            S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
Mikael Boden's avatar
Mikael Boden committed
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
    # Trace back the optimal alignment
    alignA = ''
    alignB = ''
    # Local: start at the cell which has the highest score; find it
    i = 0
    j = 0
    for ii in range(1, lenA + 1):
        for jj in range(1, lenB + 1):
            if S[ii, jj] > S[i, j]:
                i = ii
                j = jj

    # Stop when we hit the end of a sequence
    # Local: also stop when we hit a score 0
    while i > 0 and j > 0 and S[i, j] > 0:
        if S[i, j] == S[i-1, j] + gap:
            # Got here by a gap in sequence B (go up)
            alignA = seqA[i-1] + alignA
            alignB = '-' + alignB
            i -= 1
        elif S[i, j] == S[i, j-1] + gap:
            # Got here by a gap in sequence A (go left)
            alignA = "-" + alignA
            alignB = seqB[j-1] + alignB
            j -= 1
        else:
            # Got here by aligning the bases (go diagonally)
            alignA = seqA[i-1] + alignA
            alignB = seqB[j-1] + alignB
            i -= 1
            j -= 1
    return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])

def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
    """ Triplet-wise align this sequence with sequences seqB and seqC,
    using the Needleman-Wunsch (global) algorithm. subsMatrix is the
    substitution matrix to use and gap is the linear gap penalty to use. """

    lenA, lenB, lenC = [s.length for s in [seqA, seqB, seqC]]

    # Create the 3D scoring matrix
    traceback = numpy.zeros((lenA+1, lenB+1, lenC+1))
    # Fill the first row (in each dimension) with multiples of the gap penalty
    S = numpy.zeros((lenA+1, lenB+1, lenC+1))
    for i in range(lenA+1):
        S[i,0,0] = i * gap
    for j in range(lenB+1):
        S[0,j,0] = j * gap
    for k in range(lenC+1):
        S[0,0,k] = k * gap
    # Calculate the optimum __getitem__ at each location in the matrix
    for i in range(1, lenA+1):
        for j in range(1, lenB+1):
            for k in range(1, lenC+1):
                # Scored using sum-of-pairs
Mikael Boden's avatar
Mikael Boden committed
971 972 973 974 975 976
                matchABC = S[i-1, j-1, k-1] + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1]) \
                           + subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1]) \
                           + subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
                matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
                matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
                matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1])
Mikael Boden's avatar
Mikael Boden committed
977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086
                gapAB = S[i, j, k-1] + 3*gap
                gapBC = S[i-1, j, k] + 3*gap
                gapAC = S[i, j-1, k] + 3*gap
                # Use maximum of the 7 options for this location
                S[i, j, k] = max([matchABC, matchAB, matchBC, matchAC, gapAB, gapBC, gapAC])
                # Remember which one was max., for the traceback
                if S[i, j, k] == matchABC:
                    traceback[i, j, k] = 0 #"matchABC"
                elif S[i, j, k] == matchBC:
                    traceback[i, j, k] = 1 #"matchBC"
                elif S[i, j, k] == matchAC:
                    traceback[i, j, k] = 2 #"matchAC"
                elif S[i, j, k] == matchAB:
                    traceback[i, j, k] = 3 #"matchAB"
                elif S[i, j, k] == gapAB:
                    traceback[i, j, k] = 4 #"gapAB"
                elif S[i, j, k] == gapBC:
                    traceback[i, j, k] = 5 #"gapBC"
                elif S[i, j, k] == gapAC:
                    traceback[i, j, k] = 6 #"gapAC"

    # Traceback the optimal alignment
    alignA = ""
    alignB = ""
    alignC = ""
    # Start at the end
    i = lenA
    j = lenB
    k = lenC
    # Stop when we hit the end of all but one sequence
    while (i>0 and j>0) or (j>0 and k>0) or (i>0 and k>0):
        if traceback[i, j, k] == 0: #"matchABC":
            alignA = seqA[i-1] + alignA
            alignB = seqB[j-1] + alignB
            alignC = seqC[k-1] + alignC
            i -= 1
            j -= 1
            k -= 1
        elif traceback[i, j, k] == 3: #"matchAB":
            alignA = seqA[i-1] + alignA
            alignB = seqB[j-1] + alignB
            alignC = "-" + alignC
            i -= 1
            j -= 1
        elif traceback[i, j, k] == 2: #"matchAC":
            alignA = seqA[i-1] + alignA
            alignB = "-" + alignB
            alignC = seqC[k-1] + alignC
            i -= 1
            k -= 1
        elif traceback[i, j, k] == 1: #"matchBC":
            alignA = "-" + alignA
            alignB = seqB[j-1] + alignB
            alignC = seqC[k-1] + alignC
            j -= 1
            k -= 1
        elif traceback[i, j, k] == 4: #"gapAB":
            alignA = "-" + alignA
            alignB = "-" + alignB
            alignC = seqC[k-1] + alignC
            k -= 1
        elif traceback[i, j, k] == 6: #"gapAC":
            alignA = "-" + alignA
            alignB = seqB[j-1] + alignB
            alignC = "-" + alignC
            j -= 1
        elif traceback[i, j, k] == 5: #"gapBC":
            alignA = seqA[i-1] + alignA
            alignB = "-" + alignB
            alignC = "-" + alignC
            i -= 1
    # Fill in the rest of the alignment if it begins with gaps
    # (i.e., traceback all the way to S[0, 0, 0])
    while i > 0:
        alignA = seqA[i-1] + alignA
        alignB = "-" + alignB
        alignC = "-" + alignC
        i -= 1
    while j > 0:
        alignA = "-" + alignA
        alignB = seqB[j-1] + alignB
        alignC = "-" + alignC
        j -= 1
    while k > 0:
        alignA = "-" + alignA
        alignB = "-" + alignB
        alignC = seqC[k-1] + alignC
        k -= 1

    return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True),
                      Sequence(alignB, seqB.alphabet, seqB.name, gappy = True),
                      Sequence(alignC, seqC.alphabet, seqC.name, gappy = True)])

def readClustal(string, alphabet):
    """ Read a ClustalW2 alignment in the given string and return as an
    Alignment object. """
    seqs = {} # sequence data
    for line in string.splitlines():
        if line.startswith('CLUSTAL') or line.startswith('STOCKHOLM') \
           or line.startswith('#'):
            continue
        if len(line.strip()) == 0:
            continue
        if line[0] == ' ' or '*' in line or ':' in line:
            continue
        sections = line.split()
        name, seqstr = sections[0:2]
        index = name.find('/')
        if index >= 0:
            name = name[0:index]
Mikael Boden's avatar
Mikael Boden committed
1087
        if name in seqs:
Mikael Boden's avatar
Mikael Boden committed
1088 1089 1090 1091
            seqs[name] += seqstr
        else:
            seqs[name] = seqstr
    sequences = []
Mikael Boden's avatar
Mikael Boden committed
1092
    for name, seqstr in list(seqs.items()):
Mikael Boden's avatar
Mikael Boden committed
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
        sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
    return Alignment(sequences)

def readClustalFile(filename, alphabet):
    """ Read a ClustalW2 alignment file and return an Alignment object
    containing the alignment. """
    fh = open(filename)
    data = fh.read()
    fh.close()
    aln = readClustal(data, alphabet)
    return aln

# Substitution Matrix ------------------

class SubstMatrix():

    scoremat = None
    alphabet = None

    def __init__(self, alphabet):
        self.alphabet = alphabet
        self.scoremat = {}

    def setScores(self, scoremat):
        """ Set all scores in one go.
            scoremat is a (sym1, sym2)-keyed dictionary of scores. """
        self.scoremat = scoremat

    def _getkey(self, sym1, sym2):
        """ Construct canonical (unordered) key for two symbols """
        if sym1 <= sym2:
            return tuple([sym1, sym2])
        else:
            return tuple([sym2, sym1])

    def set(self, sym1, sym2, score):
        """ Add a score to the substitution matrix """
        self.scoremat[self._getkey(sym1, sym2)] = score

    def get(self, sym1, sym2):
        return self.scoremat[self._getkey(sym1, sym2)]

    def __str__(self):
        symbols = self.alphabet.symbols # what symbols are in the alphabet
        i = len(symbols)
        string = ''
        for a in symbols:
            string += a + ' '
            for b in symbols[:len(symbols)-i+1]:
                score = self.scoremat[self._getkey(a, b)]
                if score != None:
                    string += str(score).rjust(3) + ' '
                else:
                    string += "?".rjust(3) + ' '
            string += '\n'
            i -= 1
        string += '    ' + '   '.join(self.alphabet.symbols)
        return string

    def writeFile(self, filename):
        """ Write this substitution matrix to the given file. """
        fh = open(filename, 'w')
        file = ''
        for key in self.scoremat:
            file += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
        fh.write(file)
        fh.close()


def readSubstMatrix(filename, alphabet):
    """ Read in the substitution matrix stored in the given file. """
    mat = SubstMatrix(alphabet)
    fh = open(filename, 'r')
    data = fh.read()
    fh.close()
    lines = data.splitlines()
    for line in lines:
        if len(line.strip()) == 0:
            continue
        symbols, score = line.split(':')
        score = int(score)
        mat.set(symbols[0], symbols[1], score)
    return mat

#import os
#os.chdir('/Users/mikael/workspace/binf/data/')  # set to the directory where you keep your files
#BLOSUM62 = readSubstMatrix('blosum62.matrix', Protein_Alphabet)

# Motifs -------------------

class Regexp(object):

    """ A class that defines a sequence pattern in terms of a
    given regular expression, with . indicating any symbol and square brackets
    indicating a selection. See standard regexp definitions for more. """

    def __init__(self, pattern):
        """ Create a new consensus sequence with the given pattern. """
        try:
            self.pattern = pattern
            self.regex = re.compile(pattern)
        except:
            raise RuntimeError('invalid consensus sequence given: %s' % pattern)

    def __str__(self):
        return self.pattern

1200
    def search(self, sequence, gappy = False):
Mikael Boden's avatar
Mikael Boden committed
1201 1202
        """ Find matches to the motif in the specified sequence. Returns a list
        of triples, of the form (position, matched string, score). Note that
Mikael Boden's avatar
Mikael Boden committed
1203
        the score is always 1.0 because a regexp either matches
Mikael Boden's avatar
Mikael Boden committed
1204 1205 1206
        or doesn't. """
        if not type(sequence) is Sequence:
            sequence = Sequence(sequence)
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
        if gappy == False or sequence.gappy == False:
            sequenceString = sequence[:]
            results = []
            for match in self.regex.finditer(sequenceString):
                results.append((match.start(), match.group(), 1.0))
            return results
        else:  # if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
            degapped, idxs = sequence.getDegapped()
            results = []
            for match in self.regex.finditer(''.join(degapped)):
                results.append((idxs[match.start()], match.group(), 1.0))
            return results
Mikael Boden's avatar
Mikael Boden committed
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286

class PWM(object):

    """ A position weight matrix. """

    def __init__(self, foreground, background = None, start = 0, end = None, pseudo = 0.0):
        """ Create a new PWM from the given probability matrix/ces.
        foreground: can be either an Alignment, a list of Distrib's or an instance of IndepJoint.
        background: must be a Distrib instance or None (in which case a uniform background will be used)
        Specify only a section of the matrix to use with start and end. """
        if isinstance(foreground, Alignment):
            foreground = foreground.getProfile(pseudo = pseudo)
        if isinstance(foreground, IndepJoint):
            foreground = foreground.store
        self.start = start
        self.end = end or len(foreground)
        self.length = self.end - self.start
        self.alphabet = foreground[self.start].alpha
        if False in [ col.alpha == self.alphabet for col in foreground[self.start + 1 : self.end] ]:
            raise RuntimeError("All positions need to be based on the same alphabet")
        self.symbols = self.alphabet.symbols
        # Set foreground probabilities from given alignment
        self.m = numpy.zeros((len(self.symbols), self.length))
        self.fg = foreground[self.start:self.end]
        self.bg = background or Distrib(self.alphabet, 1.0) # specified background or uniform
        if not self.alphabet == self.bg.alpha:
            raise RuntimeError("Background needs to use the same alphabet as the foreground")
        p = self.bg.prob()
        for i in range(self.length):
            q = self.fg[i].prob()
            for j in range(len(self.alphabet)):
                self.m[j][i] = self.logme(q[j], p[j])

    def __len__(self):
        return self.length

    def getRC(self, swap = [('A', 'T'), ('C', 'G')] ):
        """ Get the reverse complement of the current PWM.
            Use for DNA sequences with default params.
        """
        new_fg = self.fg[::-1]  # backwards
        for s in swap:
            new_fg = [d.swapxcopy(s[0], s[1]) for d in new_fg]
        return PWM(new_fg, self.bg)

    MIN_VALUE = 0.00000000001

    def logme(self, fg, bg):
        if fg > self.MIN_VALUE and bg > self.MIN_VALUE:
            ratio = fg / bg
            return math.log(ratio)
        # if not, one of fg and bg is practically zero
        if fg > self.MIN_VALUE: # bg is zero
            return math.log(fg / self.MIN_VALUE)
        else: # fg is zero
            return math.log(self.MIN_VALUE)

    def getMatrix(self):
        return self.m

    def __str__(self):
        str = ''
        for j in range(len(self.alphabet)):
            str += "%s\t%s\n" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
        return str

    def display(self, format = 'COLUMN'):
        if format == 'COLUMN':
Mikael Boden's avatar
Mikael Boden committed
1287
            print((" \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))))
Mikael Boden's avatar
Mikael Boden committed
1288
            for j in range(len(self.alphabet)):
Mikael Boden's avatar
Mikael Boden committed
1289
                print(("%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
Mikael Boden's avatar
Mikael Boden committed
1290 1291
        elif format == 'JASPAR':
            for j in range(len(self.alphabet)):
Mikael Boden's avatar
Mikael Boden committed
1292
                print(("%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))))
Mikael Boden's avatar
Mikael Boden committed
1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335

    def search(self, sequence, lowerBound=0):
        """ Find matches to the motif in a specified sequence. Returns a list
        of  results as triples: (position, matched string, score).
        The optional argument lowerBound specifies a lower bound on reported
        scores. """
        results = []
        for i in range(len(sequence)-self.length+1):
            subseq = sequence[i:i + self.length]
            ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
            score = 0.0
            for w in range(len(ndxseq)):
                score += self.m[ ndxseq[w] ][ w ]
            if score > lowerBound:
                results.append((i, subseq, score))
        return results

    def maxscore(self, sequence):
        """ Find matches to the motif in a specified sequence.
            Returns the maximum score found in the sequence and its index as a tuple:
            (maxscore, maxindex) """
        maxscore = None
        maxindex = None
        for i in range(len(sequence)-self.length+1):
            subseq = sequence[i:i + self.length]
            ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
            score = 0.0
            for w in range(len(ndxseq)):
                score += self.m[ ndxseq[w] ][ w ]
            if maxscore == None:
                maxscore = score
                maxindex = i
            elif maxscore < score:
                maxscore = score
                maxindex = i
        return (maxscore, maxindex)

# Web Service Functions -------------------

def getSequence(id, database = 'uniprotkb', start=None, end=None):
    """ Get the sequence identified by the given ID from the given database
    (e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
    object. An error is caused if the sequence ID is not found. If start and
Mikael Boden's avatar
Mikael Boden committed
1336
    end are given, then only that section of the sequence is returned.
Mikael Boden's avatar
Mikael Boden committed
1337 1338 1339 1340 1341 1342 1343
    Note: more flexible search options are supported by using webservice.fetch
    directly."""

    MAX_TRY = 5

    for i in range(MAX_TRY):
        try:
Mikael Boden's avatar
Mikael Boden committed
1344
            fastaData = fetch(id, database)
Mikael Boden's avatar
Mikael Boden committed
1345 1346 1347 1348
            seq = readFasta(fastaData)[0]
            break
        except:
            from time import sleep
Mikael Boden's avatar
Mikael Boden committed
1349
            print(('Failed on {i}th try for id {id}'.format(i=i, id=id)))
Mikael Boden's avatar
Mikael Boden committed
1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424
            sleep(0.1)
    try:
        return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
    except:
        raise RuntimeError('An error occurred while retrieving the specified sequence: %s (maybe the ID doesn\'t exist)' % id)

def searchSequences(query, database='uniprot'):
    """ Search for sequences matching the given query in the given database
    (must be 'uniprot'), and return a list of sequence IDs. """
    ids = search(query, limit = None)
    return ids

def runClustal(sequences, method='slow'):
    """ Run a ClustalOmega alignment of the given list of Sequence objects.
    Return an Alignment object. Method should be one of 'fast' or 'slow'. """
    alpha = None
    for seq in sequences:
        if alpha == None:
            alpha = seq.alphabet
        elif alpha != seq.alphabet:
            raise RuntimeError("Invalid alphabet: " + str(seq.alphabet) + ". Not compatible with " + str(alpha))
    serviceName = 'clustalo'
    resultType = 'aln-clustal'
    fastaSeqs = ''.join([seq.writeFasta() for seq in sequences])
    params = {'alignment': method.lower(), 'sequence': fastaSeqs}
    service = EBI(serviceName)
    result = service.submit(params, resultType)
    alignment = readClustal(result, alpha)
    return alignment

def createTree(alignment, type):
    """ Run a ClustalW 2 phylogeny tree creation of either a 'Neighbour-joining'
    or 'UPGMA' type tree from the given multiple sequence Alignment object. """
    if not type in ['Neighbour-joining', 'UPGMA']:
        raise RuntimeError('type must be either \'Neighbour-joining\' or \'UPGMA\'.')
    serviceName = 'clustalw2_phylogeny'
    resultType = 'tree'
    output = 'dist'
    clustalAln = alignment.writeClustal()
    params = {'tree': output, 'sequence': clustalAln, 'clustering': type, 'tossgaps': 'true'}
    service = EBI(serviceName)
    tree = service.submit(params, resultType)
    return tree

def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
    """ Run a BLAST search of nucleotide mouse databases using the given
    sequence as a query. Return a list of matched sequence IDs, in descending
    order of similarity to query sequence.
    program: either blastn (nucleotide) or blastp (protein)
    database: many available, e.g. uniprotkb, pdb (protein); em_rel, nrnl1 (EMBL nucleotide, non-redundant resp)
        (for protein see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-protein.html#database)
        (for nucleotide see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-nucleotide.html#database)
    exp: E-value threshold (select only hits that have a better E-value than this)
    """
    if sequence.alphabet == predefAlphabets['DNA']:
        stype = 'dna'
    elif sequence.alphabet == predefAlphabets['RNA']:
        stype = 'rna'
    else:
        stype = 'protein'
    serviceName = 'ncbiblast'
    resultTypes = ['ids', 'out'] # request
    fastaSeq = sequence.writeFasta()
    databases = [database]
    params = {'program': program, 'database': databases, 'sequence': fastaSeq,
              'stype': stype, 'exp': exp}
    service = EBI(serviceName)
    idsData, output = service.submit(params, resultTypes)
    ids=[]
    for id in idsData.splitlines():
        if len(id) > 0:
            ids.append(id.split(':')[1])
    return ids

if __name__ == '__main__':
Mikael Boden's avatar
Mikael Boden committed
1425 1426 1427 1428 1429 1430 1431 1432 1433 1434
    aln = readClustalFile('/Users/mikael/simhome/ASR/gappy.aln', Protein_Alphabet)
    x, g, i = aln.outliers()
    for s in range(len(aln)):
        print(aln[s].name, x[s], g[s], i[s])
        ngs, idxs = aln[s].getDegapped()
        print('\t', ngs, idxs)
        idx = aln[s].find('FFVK', gappy = True)
        if idx >= 0:
            print('\t', aln[s].sequence[idx:])
    print(('Read', len(aln), 'sequences'))