Commit 39b93af9 authored by Mikael Boden

ASR_added

parent ea1b47e1
This diff is collapsed.
......@@ -203,9 +203,19 @@ class Distrib():
maxprob = self[sym]
return maxsym
def getProb(self):
    """ List every symbol of the alphabet together with its probability.

    Returns a list of (sym, prob) tuples that follows the order of self.alpha. """
    pairs = []
    for symbol in self.alpha:
        pairs.append((symbol, self.prob(symbol)))
    return pairs
def getBits(self):
    """ Scale each symbol's probability by the information content of the distribution.

    The entropy H (in bits) is accumulated over the values given by self.prob();
    the information content is log2(size of self.alpha) - H.
    Returns a list of (sym, bits) tuples in the order of self.alpha, where bits
    is the symbol's probability multiplied by that information content. """
    # NOTE(review): assumes self.prob() without an argument yields one
    # probability per symbol -- confirm against the definition of prob.
    entropy = 0
    for p in self.prob():
        # log2 is undefined for p <= 0, so a placeholder factor stands in;
        # with p == 0 the whole term is zero regardless of the placeholder.
        log_p = math.log2(p) if p > 0 else 999
        entropy += -p * log_p
    information = math.log2(len(self.alpha)) - entropy
    bits = []
    for symbol in self.alpha:
        bits.append((symbol, self.prob(symbol) * information))
    return bits
def getsort(self):
    """ Return the list of symbols, in order of their probability.

    The order is whatever self.getProbsort() produces; only the symbol of each
    (sym, prob) tuple is kept. """
    # getProbsort() is consulted exactly once; building the list a second time
    # from a fresh call only repeated the work and discarded the first result.
    return [sym for (sym, _) in self.getProbsort()]
def getProbsort(self):
......@@ -1089,3 +1099,18 @@ def lgamma(x):
y += 1
ser += (cof[j] / y)
return (-tmp + math.log(2.5066282746310005 * ser / x))
import sequence as seq
# Demo executed only when the module is run as a script: four short DNA
# sequences are shown to a MarkovChain built over seq.DNA_Alphabet, and the
# contents of its `transit` mapping are printed.
# NOTE(review): indentation is not preserved in this view, so whether the
# second loop sits inside or after the first cannot be confirmed from here.
if __name__ == '__main__':
myseqs = [seq.Sequence('TCCTAGCCCC'),
seq.Sequence('GCCGCCCCCA'),
seq.Sequence('ATCCGCCCGG'),
seq.Sequence('CCCCCGCCTT')]
mymc = MarkovChain(seq.DNA_Alphabet)
for myseq in myseqs:
# Echo the sequence and its length, then let the chain observe it.
print(myseq, len(myseq))
mymc.observe(myseq)
# Print every key of the transit mapping next to its associated value.
for t in mymc.transit:
print(t, mymc.transit[t])
......@@ -1344,6 +1344,61 @@ class PWM(object):
maxindex = i
return (maxscore, maxindex)
def readPWMs(filename, format='MEME'):
    """ Read position weight matrices from a MEME-style motif file.

    The parser works line by line. Keyword lines (MEME, ALPHABET, MOTIF, URL)
    are handled directly. Any other non-blank line that is not a '#' comment
    opens a section, and the lines that follow are treated as that section's
    data until a blank line closes it (a URL line also closes a matrix).
    Only 'Background' and 'letter-probability matrix' sections carry data that
    is kept; the data lines of any other section are skipped.

    filename: path of the motif file to read
    format: accepted for interface compatibility; it is not consulted
    returns: a dictionary mapping each motif name to a tuple
        (foreground, background). foreground is a list with one Distrib per
        matrix row; background is the Distrib that was current when the motif
        was stored (None if no 'Background' section had been read).
        An empty dictionary is returned if the file holds no MOTIF line, and
        None is returned if the ALPHABET line cannot be interpreted.
    """
    # The context manager releases the handle on every exit path, including
    # the early return taken for a malformed header.
    with open(filename, 'rt') as fh:
        lines = fh.read().splitlines()

    matrix_header = 'letter-probability matrix'
    # nsites may be missing or be the last field on the header line, so the
    # pattern must not insist on anything following the digits.
    nsites_pattern = re.compile(r'nsites=\s*([0-9]+)')

    alphabet = None
    background = None
    motif = None
    foreground = []
    nsites = 1
    section = None      # header line of the section currently being read
    collection = {}

    for line in lines:
        myline = line.strip()
        words = myline.split()

        if section is None:
            # Blank lines and comments carry no information between sections.
            if not words or myline.startswith('#'):
                continue
            has_value = len(words) > 1
            if has_value and myline.startswith(('MEME', 'URL')):
                # Version and URL lines are recognised but nothing is kept.
                continue
            if has_value and myline.startswith('ALPHABET'):
                try:
                    alphabet = Alphabet(words[-1])
                except Exception:
                    print('Error in format: ' + line)
                    return None
                continue
            if has_value and myline.startswith('MOTIF'):
                if motif is not None:
                    # A previous motif is complete: store it before starting anew.
                    collection[motif] = (foreground, background)
                    foreground = []
                motif = words[1]
                continue
            # Anything else opens a section whose data lines follow.
            section = myline
            if section.startswith(matrix_header):
                found = nsites_pattern.search(section)
                # Without an nsites field, fall back to a weight of 1 rather
                # than inheriting the count of the preceding motif.
                nsites = int(found.group(1)) if found else 1
        elif not words:
            # A blank line terminates the current section.
            section = None
        elif section.startswith('Background'):
            background = Distrib(alphabet)
            # The line alternates symbol and frequency: A 0.3 C 0.2 ...
            for sym, freq in zip(words[::2], words[1::2]):
                background.observe(sym, float(freq))
        elif section.startswith(matrix_header):
            if myline.startswith('URL'):
                # A URL line placed directly after the rows ends the matrix;
                # it must not be mistaken for a row of probabilities.
                section = None
                continue
            row = Distrib(alphabet)
            # Each probability is weighted by nsites before it is observed.
            for sym, prob in zip(alphabet.symbols, words):
                row.observe(sym, float(prob) * nsites)
            foreground.append(row)

    # Store the last motif; a file without any MOTIF line yields no entry.
    if motif is not None:
        collection[motif] = (foreground, background)
    return collection
# Web Service Functions -------------------
def getSequence(id, database = 'uniprotkb', start=None, end=None):
......@@ -1438,7 +1493,7 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
ids.append(id.split(':')[1])
return ids
if __name__ == '__main__':
if __name__ == '__main__1':
aln = readClustalFile('/Users/mikael/simhome/ASR/gappy.aln', Protein_Alphabet)
x, g, i = aln.outliers()
for s in range(len(aln)):
......@@ -1449,3 +1504,10 @@ if __name__ == '__main__':
if idx >= 0:
print('\t', aln[s].sequence[idx:])
print(('Read', len(aln), 'sequences'))
if __name__ == '__main__':
    # Demo: load a motif collection and list, for every motif, the entries of
    # its foreground (the first element of the stored tuple).
    meme_path = '/Users/mikael/meme-5.4.1/motif_databases/PROKARYOTE/collectf.meme'
    motifs = readPWMs(meme_path)
    for name in motifs:
        print(name)
        foreground = motifs[name][0]
        for fg in foreground:
            print('\t', fg)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment