From 06f131e6ea5d3bb85f20b16fa1fe26ba06250c97 Mon Sep 17 00:00:00 2001 From: Mikael Boden Date: Fri, 17 Aug 2018 09:01:34 +1000 Subject: [PATCH] markov_chain --- prob.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ sequence.py | 7 +++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/prob.py b/prob.py index 9ba859e..4f35c70 100644 --- a/prob.py +++ b/prob.py @@ -718,3 +718,54 @@ class NaiveBayes(): prob *= condprob[i][key[i]] or 0.0 out.observe(outsym, prob) return out + +class MarkovChain(): + """ Markov Chain in a very simple form + """ + + def __init__(self, alpha, order = 1): + self.order = order + self.startsym = '^' + self.endsym = '\$' + self.alpha = Alphabet(alpha.symbols + tuple([self.startsym, self.endsym])) + self.transit = TupleStore([self.alpha for _ in range(order)]) # transition probs, i.e. given key (prev state/s) what is the prob of current state + + def _terminate(self, unterm_seq): + """ Terminate sequence with start and end symbols """ + term_seq = [self.startsym for _ in range(self.order)] + term_seq.extend(unterm_seq) + term_seq.append(self.endsym) + return term_seq + + def _getpairs(self, term_seq): + """ Return a tuple of all (tuple) Markov pairs from a sequence """ + ret = [] + for i in range(len(term_seq) - self.order): + past = tuple(term_seq[i:i + self.order]) + present = term_seq[i + self.order] + ret.append(tuple([past, present])) + return tuple(ret) + + def observe(self, wholeseq): + """ Set parameters by counting transitions """ + myseq = self._terminate(wholeseq) + for (past, present) in self._getpairs(myseq): + d = self.transit[past] + if not d: # no distrib + d = Distrib(self.alpha) + self.transit[past] = d + d.observe(present) + + def __getitem__(self, wholeseq): + """ Determine the log probability of a given sequence """ + myseq = self._terminate(wholeseq) + logp = 0 + for (past, present) in self._getpairs(myseq): + d = self.transit[past] + if not d: + return None + p = d[present] + if p == 0: + return None + logp += math.log(p) + return logp \ No newline at end of file diff --git a/sequence.py b/sequence.py index 73f5b46..2676731 100644 --- a/sequence.py +++ b/sequence.py @@ -139,7 +139,10 @@ class Sequence(object): def writeFasta(self): """ Write one sequence in FASTA format to a string and return it. """ - fasta = '>' + self.name + ' ' + self.info + '\n' + if parseDefline(self.info)[0] == self.name: # this sequence was previously "parsed" and info should hold the original header + fasta = '>' + self.info + '\n' + else: + fasta = '>' + self.name + ' ' + self.info + '\n' data = ''.join(self.sequence) nlines = int(math.ceil((len(self.sequence) - 1) / 60 + 1)) for i in range(nlines): @@ -290,7 +293,7 @@ def getMarkov(seqs, order = 0): else: if seq.alphabet != myalpha: raise RuntimeError('Sequence ' + seq.name + ' uses an invalid alphabet ') - jp = Joint([myalpha for _ in range(order + 1)]) + jp = Joint([myalpha for _ in range(order)]) for seq in myseqs: for i in range(len(seq) - order): sub = seq[i:i + order + 1] -- 2.18.1