Commit 9889428a authored by Mikael Boden's avatar Mikael Boden

added_Poisson

parent e1db8f84
......@@ -64,13 +64,13 @@ class GtfFile:
See http://genome.ucsc.edu/FAQ/FAQformat#format1
"""
def __init__(self, entries):
def __init__(self, entries, filter_feature = None):
"""
Create a GtfFile instance.
:param entries: an iterable of entries or a filename
"""
if isinstance(entries, str): # filename
self.chroms = readGtfFile(entries)
self.chroms = readGtfFile(entries, filter_feature)
else:
self.chroms = dict()
for entry in entries:
......@@ -157,8 +157,24 @@ class GtfFile:
elif len(all) == 0: return None
else: return next(iter(all))
def readGtfFile(filename):
def getIndex(self, group_attribute):
"""
Create a dictionary for a specified attribute in the group list, e.g. "gene_id", "gene_name" or "transcript_id"
:param group_attribute:
:return: a dictionary keyed by the values of the nominated group_attribute,
pointing to the entry in the chromosome interval-tree
"""
d = {}
for entry in self:
if group_attribute in entry.attr: # is the entry indexed with the attribute
if not entry.attr[group_attribute] in d: # only add the first entry for this particular value
d[entry.attr[group_attribute]] = entry
return d
def readGtfFile(filename, filter_feature = None):
""" Read a GTF/GFF file.
filename: name of file
filter_feature: name of feature to be selected, all others ignored; None means anything
"""
f = open(filename)
row = 0
......@@ -180,6 +196,8 @@ def readGtfFile(filename):
seqname = words[0]
source = words[1]
feature = words[2]
if filter_feature and filter_feature != feature:
continue
start = int(words[3])
end = int(words[4])
score = None
......
......@@ -5,7 +5,6 @@ Uses and depends on "Alphabet" that is used to define discrete random variables.
'''
import random
from sym import *
from copy import deepcopy
import math
#################################################################################################
......@@ -957,3 +956,132 @@ class HMM():
fh.write(html)
fh.close()
return html
#################################################################################################
# The univariate Gaussian density function.
#################################################################################################
class Gaussian():
mu = None # mean
sigma = None # standard deviation
sigmaSquared = None # variance
def ROOT_2PI(self):
return (2 * math.pi) ** 2
def LOG_ROOT_2PI(self):
return 0.5 * (math.log(2) + math.log(math.pi))
def __init__(self, mean, variance):
""" Creates a univariate Gaussian distribution with the given fixed mean and variance. """
self.mu = mean
self.sigmaSquared = variance;
self.sigma = math.sqrt(variance);
self.normConst = self.sigma * self.ROOT_2PI();
self.logNormConst = (0.5 * math.log(variance)) + self.LOG_ROOT_2PI();
def __str__(self):
return "<" + "%5.3f" % self.mu + u"\u00B1%5.3f" % self.sigma + ">"
def getDensity(self, x):
""" Returns the density of this Gaussian distribution at the given value. """
return (math.exp(-math.pow((x - self.mu), 2) / (2 * self.sigmaSquared)) / self.normConst);
def __getitem__(self, value):
""" Get the probability density of a specified value for this Gaussian """
return self.getDensity(value)
def getMean(self):
return self.mu
def getVariance(self):
return self.sigmaSquared
def getLogDensity(self, x):
""" Returns the natural log of the density of this Gaussian distribution at the given value. """
return (-math.pow((x - self.mu), 2) / (2 * self.sigmaSquared)) - self.logNormConst;
def sample(self):
""" Returns a value sampled from this Gaussian distribution. The implementation uses the Box - Muller transformation
[G.E.P.Box and M.E.Muller(1958) "A note on the generation of random normal deviates". Ann.Math.Stat 29: 610 - 611]. """
U = random.random() # get a random value between 0 and 1
V = random.random() # get a random value between 0 and 1
return (self.mu + (self.sigma * math.sin(2 * math.pi * V) * math.sqrt((-2 * math.log(U)))))
def estimate(samples, count = None):
""" Create a density based on the specified samples. Optionally, provide an iterable with the corresponding counts/weights. """
mean = 0
if count == None:
for i in range(len(samples)):
mean += samples[i] / len(samples)
diff = 0
for i in range(len(samples)):
diff += (mean - samples[i]) * (mean - samples[i]);
if (diff == 0):
return None
return Gaussian(mean, diff / len(samples))
elif len(count) == len(samples):
totcnt = 0
for i in range(len(samples)):
mean += samples[i] * count[i]
totcnt += count[i]
mean /= totcnt
diff = 0
for i in range(len(samples)):
diff += (mean - samples[i]) * (mean - samples[i]) * count[i];
if (diff == 0):
return None
return Gaussian(mean, diff / totcnt)
#################################################################################################
# The univariate Poisson distribution
#################################################################################################
class Poisson():
def __init__(self, LAMBDA):
"""
* Define a Poisson distribution
* @ param lambda the average number of events per interval
"""
self.LAMBDA = LAMBDA
def p(self, k):
"""
* The probability mass function
* @param k the number of events in the interval
* @return the probability of k events
"""
return math.exp(k * math.log(self.LAMBDA) - self.LAMBDA - lgamma(k + 1))
def cdf(self, k):
"""
* The cumulative probability function.
* The implementation calls the PMF for all values i from 0 to floor(k)
* @param k the number of events in the interval
* @return the cumulative probability of k events
* https://en.wikipedia.org/wiki/Poisson_distribution
"""
sum = 0
for i in range(k + 1):
sum += self.p(i)
if (sum >= 1.0): # happens only due to poor numerical precision
return 1.0
return sum
def lgamma(x):
"""
* Returns an approximation of the log of the Gamma function of x. Laczos
* Approximation Reference: Numerical Recipes in C
* http://www.library.cornell.edu/nr/cbookcpdf.html
"""
cof = [ 76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5 ]
y = x
tmp = x + 5.5
tmp -= ((x + 0.5) * math.log(tmp))
ser = 1.000000000190015
for j in range(len(cof)):
y += 1
ser += (cof[j] / y)
return (-tmp + math.log(2.5066282746310005 * ser / x))
......@@ -195,14 +195,15 @@ class Sequence(object):
Below are some useful methods for loading data from strings and files.
Recognize the FASTA format (nothing fancy).
"""
def readFasta(string, alphabet = None, ignore = False, gappy = False):
def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defline = True):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it.
If alphabet is specified, use it, if None (default) then guess it.
If ignore is False, errors cause the method to fail.
If ignore is True, errors will disregard sequence.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
if True gaps are accepted and included in the resulting sequences.
If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
seqlist = [] # list of sequences contained in the string
seqname = None # name of *current* sequence
seqinfo = None
......@@ -222,8 +223,11 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False):
seqinfo = line[1:].split() # skip first char
if len(seqinfo) > 0:
try:
parsed = parseDefline(seqinfo[0])
seqname = parsed[0]
if parse_defline:
parsed = parseDefline(seqinfo[0])
seqname = parsed[0]
else:
seqname = seqinfo[0]
seqinfo = line[1:]
except IndexError as errmsg:
if not ignore:
......@@ -266,14 +270,15 @@ def parseDefline(string):
elif re.match("^refseq\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
else: return (s, '', '', '')
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False, parse_defline = True):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. Note that if alphabet is NOT specified, it will take a
separate guess for each sequence.
If ignore is False, errors cause the method to fail.
If ignore is True, errors will disregard sequence.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
if True gaps are accepted and included in the resulting sequences.
If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
fh = open(filename)
seqlist = []
batch = '' # a batch of rows including one or more complete FASTA entries
......@@ -282,7 +287,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
row = row.strip()
if len(row) > 0:
if row.startswith('>') and rowcnt > 0:
more = readFasta(batch, alphabet, ignore, gappy)
more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
if len(more) > 0:
seqlist.extend(more)
batch = ''
......@@ -290,7 +295,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
batch += row + '\n'
rowcnt += 1
if len(batch) > 0:
more = readFasta(batch, alphabet, ignore, gappy)
more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
if len(more) > 0:
seqlist.extend(more)
fh.close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment