Commit 3ff492de authored by Mikael Boden

release 2016.1

from math import log, exp
import sys

MAXIT = 100        # maximum number of iterations for the continued fraction
EPS = 3.0e-7       # relative accuracy
FPMIN = 1.0e-300   # number near the smallest representable float

# Lanczos approximation coefficients for the log gamma function
gamma_c = [76.18009172947146,
           -86.50532032941677,
           24.01409824083091,
           -1.231739572450155,
           0.1208650973866179e-2,
           -0.5395239384953e-5]

def log_binomial_ncdf(N, k, p):
    """
    Log of one minus the cumulative distribution function of the binomial dist.
    The binomial density gives the probability of k successes in N independent
    trials, each with probability p of success.
    """
    if k == 0:
        return 0
    else:
        return log_betai(k, N - k + 1, p)
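# Example (values are approximate): log_binomial_ncdf(100, 60, 0.5) returns
# roughly -3.56, i.e. the probability of 60 or more successes in 100 fair
# coin flips is about exp(-3.56) ~ 0.028.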

def betai(a, b, x):
    """
    Incomplete beta function
    """
    if x < 0 or x > 1: die("Bad x=`" + str(x) + "' in routine betai")
    if x == 0 or x == 1:
        bt = 0
    else:
        bt = exp(gammaln(a+b) - gammaln(a) - gammaln(b) + a*log(x) + b*log(1-x))
    thresh = (a+1) / (a+b+2.0)
    if x < thresh:
        return bt * betacf(a, b, x) / a
    else:
        return 1.0 - bt * betacf(b, a, 1.0-x) / b

def log_betai(a, b, x):
    """
    Log incomplete beta function
    """
    if x < 0 or x > 1: die("Bad x=`" + str(x) + "' in routine log_betai")
    if x == 0 or x == 1:
        log_bt = -1e300 # log(0)
    else:
        log_bt = gammaln(a+b) - gammaln(a) - gammaln(b) + a*log(x) + b*log(1.0-x)
    thresh = (a+1.0) / (a+b+2.0)
    if x < thresh:
        return log_bt + log(betacf(a, b, x) / a)
    else:
        return log(1.0 - exp(log_bt) * betacf(b, a, 1.0-x) / b)

def betacf(a, b, x):
    """
    Evaluates the continued fraction for the incomplete beta function
    by the modified Lentz's method. Used by betai and log_betai.
    """
    qab = a + b
    qap = a + 1.0
    qam = a - 1.0
    c = 1.0
    d = 1.0 - qab*x/qap
    if abs(d) < FPMIN: d = FPMIN
    d = 1.0/d
    h = d
    for m in range(1, MAXIT+1):
        m2 = 2.0*m
        # even step of the recurrence
        aa = m*(b-m)*x / ((qam+m2)*(a+m2))
        d = 1.0 + aa*d
        if abs(d) < FPMIN: d = FPMIN
        c = 1.0 + aa/c
        if abs(c) < FPMIN: c = FPMIN
        d = 1.0/d
        h *= d*c
        # odd step of the recurrence
        aa = -(a+m)*(qab+m)*x / ((a+m2)*(qap+m2))
        d = 1.0 + aa*d
        if abs(d) < FPMIN: d = FPMIN
        c = 1.0 + aa/c
        if abs(c) < FPMIN: c = FPMIN
        d = 1.0/d
        delta = d*c
        h *= delta
        if abs(delta - 1.0) < EPS: break
    else: # m never exceeds MAXIT in Python, so warn via for-else when no break occurred
        print >> sys.stderr, "a or b too big or MAXIT too small in betacf"
    return h

def gammaln(x):
    """
    Compute the log gamma function using the Lanczos approximation
    """
    xx = x
    s = 1.000000000190015
    for i in range(0, 6):
        xx += 1
        s += gamma_c[i]/xx
    res = ((x+0.5) * log(x+5.5)) - (x+5.5) + log(2.5066282746310005*s/x)
    if res >= 0:
        return res
    else:
        return 0 # avoid roundoff error
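# Sanity check: gamma(6) = 5! = 120, so gammaln(6) should be close to
# log(120) ~ 4.7875.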

def die(string):
    """ Print an error message and abort (callers such as betai expect no return) """
    print >> sys.stderr, string
    sys.exit(1)
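
As an illustration (not part of the commit), the routines above can be used to compute the tail probability of a binomial test in log space. A minimal sketch, assuming the file is importable as a module named stats (the actual file name is not shown in this diff):

from math import exp
import stats
# log-probability of 12 or more successes in 1000 trials with p = 0.005
log_pvalue = stats.log_binomial_ncdf(1000, 12, 0.005)
print "log p-value:", log_pvalue, "p-value:", exp(log_pvalue)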

import sym

class RCDict(dict):
    """ Class that extends a standard dictionary to accept only fixed-length DNA symbol strings as keys.
        Additionally, it maps the reverse complement to the same value. """
    def __init__(self, alpha = sym.DNA_Alphabet):
        """ Initialise a reverse-complement dictionary to accept strings of a given alphabet (DNA by default) """
        super(RCDict, self).__init__()
        self.alpha = alpha
        self.length = None
    def __setitem__(self, key, value):
        """ Set the value for a key.
            Checks that (a) if previous keys have been used, the new key has the same length, and
            (b) the key consists only of valid symbols. """
        if self.length is None:
            self.length = len(key)
        elif len(key) != self.length:
            raise RuntimeError("Invalid key: " + str(key))
        for i in range(len(key)):
            if not key[i] in self.alpha:
                raise RuntimeError("Invalid symbol in key: " + str(key[i]))
        super(RCDict, self).__setitem__(self.canonical(key), value)
    def canonical(self, key):
        """ Figures out the canonical key (original or its reverse complement).
            Note that if you use an alphabet other than DNA you may need to modify this code. """
        if self.length is None:
            return key
        alpha = self.alpha
        rcindx = [0 for _ in range(self.length)]
        fwindx = [alpha.index(sym) for sym in key]
        undecided = True
        for forw in range(self.length):
            backw = self.length - forw - 1
            rcindx[forw] = 3 - fwindx[backw] # formula for complementing A <-> T, C <-> G
            if undecided and rcindx[forw] > fwindx[forw]: # RC is "higher" than original
                return key
            undecided = rcindx[forw] == fwindx[forw]
        return ''.join([alpha.symbols[indx] for indx in rcindx])
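    # Example: canonical('TTG') returns 'CAA', its reverse complement, since
    # 'CAA' precedes 'TTG' in the A < C < G < T ordering; canonical('CAA')
    # also returns 'CAA'.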
    def __getitem__(self, key):
        """ Retrieve the value associated with a specified key. """
        return super(RCDict, self).__getitem__(self.canonical(key))
    def getSum(self, IUPAC_key):
        """ Retrieve the sum of all the entries that match the specified IUPAC key. """
        # TODO
        pass
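
As an illustration (not part of the commit), a k-mer and its reverse complement share a single entry in an RCDict:

counts = RCDict()              # uses the DNA alphabet by default
counts['AAC'] = 1
print counts['GTT']            # prints 1; 'GTT' is the reverse complement of 'AAC'
print counts.canonical('GTT')  # prints 'AAC'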
'''
A module to enable experimentation with various methods for predicting properties
assigned to sequence elements, e.g. secondary structure of proteins.
A neural net wrapper class is provided.
A couple of example applications are found at the end of this module.
'''
import numpy
import sym
import prob
import sequence
import ml

def slidewin(seq, winsize):
    """ Produce a list of sub-sequences of a given length from a complete sequence """
    subseqs = []
    for i in range(len(seq) - winsize + 1):
        subseqs.append(seq[i : i + winsize])
    return subseqs
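# Example: slidewin('ABCDE', 3) returns ['ABC', 'BCD', 'CDE'].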

def _onehotIndex(alpha, sym):
    """ Create array with "one-hot" bit codes (only adding "ones" to an all-"zero" array) """
    symlen = len(sym)
    alphalen = len(alpha)
    indices = [ alpha.index(sym[i]) + (i * alphalen) for i in range(symlen) ]
    return indices
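# Example: with the 4-symbol DNA alphabet (A, C, G, T),
# _onehotIndex(sym.DNA_Alphabet, 'CA') returns [1, 4]: 'C' has index 1 in the
# first block of 4 columns, and 'A' has index 0 in the second block (0 + 1*4 = 4).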

class SeqNN():
    """ A neural net wrapper for multinomial classification of sequence input """

    def __init__(self, inp_len, inp_alpha, outp_alpha, nhidden, cascade = 0):
        """ Construct a neural net with numeric inputs and outputs
            depending on alphabets used for inputs and outputs.
            inp_len: number of symbols to use as input
            inp_alpha: input alphabet
            outp_alpha: output alphabet (defines number of classes)
            nhidden: number of "hidden" nodes in the net
            cascade: if non-zero, the number of positions to feed into a cascaded
                structure-to-structure NN (also the number of hidden nodes of this NN)
        """
        self.nn1 = ml.NN(inp_len * len(inp_alpha), nhidden, len(outp_alpha)) # primary neural net
        self.nn2 = None
        self.cascade = cascade
        if cascade > 0:
            self.nn2 = ml.NN(cascade * len(outp_alpha), cascade, len(outp_alpha)) # cascaded neural net
        self.inp_len = inp_len
        self.inp_alpha = inp_alpha
        self.outp_alpha = outp_alpha
    def _encodeseq(self, seqs, targets = None):
        """ Convert a list of sequences into numeric input suitable as input to NN. """
        try:
            len(seqs[0]) # if this does not throw an error, it is a multi-input already
        except TypeError:
            seqs = [ seqs ]
            targets = [ targets ]
        totlen = 0
        alpha = None
        for seq in seqs:
            if not alpha:
                alpha = seq.alphabet
            totlen += len(seq) - self.inp_len + 1
        im = numpy.zeros((totlen, self.inp_len * len(alpha)))
        if targets:
            om = numpy.zeros((totlen, len(self.outp_alpha)))
        row = 0
        for i in range(len(seqs)):
            subseqs = slidewin(seqs[i], self.inp_len)
            if targets:
                # Note how we remove the targets at the ends of the sequence
                subtarg = targets[i][self.inp_len // 2 : -self.inp_len // 2 + 1] # integer division
            for k in range(len(subseqs)):
                im[row, _onehotIndex(alpha, subseqs[k])] = 1
                if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
                row += 1
        print "There are", row, "entries in data set"
        if targets:
            return im, om
        else:
            return im, None
    def observeAll(self, seqs, targets, eta = 0.1, niter = 1):
        """ Train a classifier to map from all possible windows to the target symbols.
            Decompose each sequence into all full-width sub-sequences. Map each sub-sequence
            to the target symbol for the symbol in the centre of the sub-sequence. """
        assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
        im, om = self._encodeseq(seqs, targets)
        for i in range(niter): # train first NN
            rmse = self.nn1.train(im, om, eta = eta, niter = 1)
            print i, ":", rmse
        if not self.cascade: # if there's no cascaded NN, finish here
            return rmse
        nn1seqs = [] # a list of new SS sequences ...
        for seq in seqs: # ... based on AA sequences
            nn1seq = self.predict(seq, useCascade = False) # construct a new sequence which consists of SS predictions
            nn1seqs.append(nn1seq)
        im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
        for i in range(niter): # train cascaded NN
            rmse = self.nn2.train(im, om, eta = eta, niter = 1)
            print i, ":", rmse
        return rmse
    def testAll(self, seqs, targets):
        """ Test the neural network on the specified sequences and target sequences.
            Returns a confusion matrix with the predictions. """
        assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
        if not self.cascade:
            im, om = self._encodeseq(seqs, targets)
            cm = self.nn1.test(im, om)
            return cm
        else:
            nn1seqs = []
            for seq in seqs:
                nn1seq = self.predict(seq, useCascade = False)
                nn1seqs.append(nn1seq)
            im, om = self._encodeseq(nn1seqs, targets)
            cm = self.nn2.test(im, om)
            return cm
    def predict(self, inpseq, useCascade = True):
        """ Classify each symbol in a sequence.
            Return the predictions as a list of symbols. """
        W = self.nn1.ninput // len(self.inp_alpha) # window width of the primary NN
        if useCascade and self.cascade:
            nn1seq = self.predict(inpseq, useCascade = False)
            subseqs = slidewin(nn1seq, self.cascade)
            predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
            for i in range(len(subseqs)): # for each input sub-sequence of the cascaded NN
                input = numpy.zeros(self.cascade * len(self.outp_alpha))
                input[_onehotIndex(self.outp_alpha, subseqs[i])] = 1
                outvec = self.nn2.feedforward(input)
                d = prob.Distrib(self.outp_alpha)
                for k in range(len(outvec)):
                    d.observe(self.outp_alpha[k], outvec[k])
                predsyms[i + self.cascade // 2] = d.getmax() # use the symbol with the highest probability
            return sequence.Sequence(predsyms, self.outp_alpha)
        else: # only predict using the first NN
            subseqs = slidewin(inpseq, W)
            predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
            for i in range(len(subseqs)): # for each input sub-sequence of the primary NN
                input = numpy.zeros(self.inp_len * len(self.inp_alpha))
                input[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
                outvec = self.nn1.feedforward(input)
                d = prob.Distrib(self.outp_alpha)
                for k in range(len(outvec)):
                    d.observe(self.outp_alpha[k], outvec[k])
                predsyms[i + W // 2] = d.getmax() # use the symbol with the highest probability
            return sequence.Sequence(predsyms, self.outp_alpha)
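
As an illustration (not part of the commit), a secondary-structure predictor could be set up as follows. A minimal sketch, assuming sym.Protein_Alphabet exists, that sym.Alphabet('HEC') builds a three-symbol helix/strand/coil alphabet, and that the training and test data are lists of sequence.Sequence objects:

inp_alpha = sym.Protein_Alphabet
outp_alpha = sym.Alphabet('HEC')
nn = SeqNN(inp_len = 15, inp_alpha = inp_alpha, outp_alpha = outp_alpha, nhidden = 30, cascade = 15)
nn.observeAll(train_seqs, train_targs, eta = 0.01, niter = 20) # train primary and cascaded NN
cm = nn.testAll(test_seqs, test_targs)                         # confusion matrix over the test set
pred = nn.predict(test_seqs[0])                                # per-residue H/E/C labels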