Commit 3ff492de authored by Mikael Boden

release 2016.1

from math import log, exp
import sys
MAXIT = 100
EPS = 3.0e-7
FPMIN = 1.0e-300
gamma_c = [76.18009172947146,
-86.50532032941677,
24.01409824083091,
-1.231739572450155,
0.1208650973866179e-2,
-0.5395239384953e-5]
def log_binomial_ncdf(N, k, p):
"""
Log of one minus the cumulative distribution function of the binomial dist.
The binomial density gives the probability of k successes in N independent
trials each with probability p of success.
"""
if (k==0):
return 0
else:
return log_betai(k, N-k+1, p)
def betai (a, b, x):
"""
Incomplete beta function
"""
if (x<0 or x>1): die("Bad x=`" + str(x) + "' in routine betai")
if (x==0 or x==1):
bt = 0
else:
bt = exp(gammaln(a+b)-gammaln(a)-gammaln(b)+a*log(x)+b*log(1-x))
thresh = (a+1)/(a+b+2.0)
if (x<thresh):
return(bt*betacf(a,b,x)/a)
else:
return(1.0-bt*betacf(b,a,1.0-x)/b)
def log_betai(a, b, x):
"""
log incomplete beta function
"""
    if (x<0 or x>1): die("Bad x=`" + str(x) + "' in routine log_betai")
if (x==0 or x==1):
log_bt = -1e300 # log(0)
else:
log_bt = gammaln(a+b)-gammaln(a)-gammaln(b)+a*log(x)+b*log(1.0-x)
thresh = (a+1.0)/(a+b+2.0)
if (x<thresh):
return(log_bt + log(betacf(a,b,x)/a))
else:
return(log(1.0 - exp(log_bt)*betacf(b,a,1.0-x)/b))
def betacf(a, b, x):
"""
used by betai
"""
qab = a+b
qap = a+1.0
qam = a-1.0
c = 1.0
d = 1.0-qab*x/qap
if (abs(d) < FPMIN): d = FPMIN
d = 1.0/d
h = d
for m in range(1, MAXIT+1):
m2 = 2.0*m
aa = m*(b-m)*x/((qam+m2)*(a+m2))
d=1.0+aa*d
if (abs(d) < FPMIN): d=FPMIN
c=1.0+aa/c
if (abs(c) < FPMIN): c=FPMIN
d = 1.0/d
h *= d*c
aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2))
d=1.0+aa*d
if (abs(d) < FPMIN): d=FPMIN
c=1.0+aa/c
if (abs(c) < FPMIN): c=FPMIN
d = 1.0/d
delta = d*c
h *= delta
if (abs(delta-1.0) < EPS): break
    else: # for-else: the loop finished without converging (no break occurred)
        print >> sys.stderr, ("a or b too big or MAXIT too small in betacf")
return h
def gammaln(x):
"""
Compute log gamma function
"""
xx = x
s = 1.000000000190015
for i in range(0, 6):
xx += 1
s += gamma_c[i]/xx
res = ((x+0.5) * log(x+5.5)) - (x+5.5) + log(2.5066282746310005*s/x)
if (res >= 0):
return res
else:
return 0 # avoid roundoff error
def die(string):
    print >> sys.stderr, string
    sys.exit(1)
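# A quick sanity check of the routines above (illustrative only). The exact
# binomial tail P(X >= 2) for X ~ Bin(10, 0.5) is 1 - 11/1024 = 0.9892578125,
# and the regularised incomplete beta gives the same tail, since
# I_p(k, N-k+1) = P(X >= k):
# >>> round(exp(log_binomial_ncdf(10, 2, 0.5)), 7)
# 0.9892578
# >>> round(betai(2, 9, 0.5), 7)
# 0.9892578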
import math
import numpy as np
class GeneExpression:
dataset = '' # name of data set (if any)
genes = {} # a dictionary of gene names to profile matrix index
matrix = None # a numpy two-dim array holding all expression values
headers = [] # the names of the samples/experiments, e.g. GSM123
default_value_if_null = None # Default value to use if entry is not set (e.g. addSamples may not add values for all genes)
    def __init__(self, datasetname='', headerlist=None, genedict=None):
""" Create a gene expression data set.
        The class stores gene names and their associated profiles (in which values correspond to "samples").
        It also stores headers (the names of the samples, i.e. experiments).
Data should be provided as
(0) a name of the set
(1) a list of sample names (headerlist; must agree with the number of values in each gene profile)
(2) a gene name dictionary where values contain the expression profile (genedict; profile is an iterable with the same number of elements)
For example
>>> g = GeneExpression("MySet", ['Sample1', 'Sample2'], {'G1': [0.13, 1.23], 'G2': [4.1, -0.9], 'G3': [2.1, -2.1]})
"""
        self.dataset = datasetname
        genedict = genedict or {}     # avoid the shared-mutable-default pitfall
        headerlist = headerlist or []
        self.genes = {}
ndx = 0
for gene in genedict:
self.genes[gene] = ndx
ndx += 1
self.matrix = self._createMatrix(genedict)
if len(self.matrix) == 0:
nsamples = 0
else:
nsamples = len(self.matrix[0])
if isinstance(headerlist, str):
headerlist = [headerlist]
        if headerlist and len(headerlist) != nsamples:
            raise RuntimeError("The number of headers (%d) is not equal to the number of samples (%d)" % (len(headerlist), nsamples))
        self.headers = headerlist or ['S%d' % cnt for cnt in range(nsamples)]
def _createMatrix(self, genedict):
""" Internal method for constructing a numpy matrix from a gene-profile dictionary. """
ngenes = len(self.genes)
allow_new_genes = False
if ngenes == 0: # if instance is empty, include all genes in dict
ngenes = len(genedict)
allow_new_genes = True
nsamples = 0
for gene in genedict:
profile = genedict[gene]
try:
actual = len(profile)
except TypeError:
actual = 1
genedict[gene] = [profile]
if nsamples == 0:
nsamples = actual
elif nsamples != actual:
raise RuntimeError("Each gene must have the same number of samples (see %s)" % gene)
matrix = np.empty((ngenes, nsamples))
matrix[:] = self.default_value_if_null
ndx = 0
for gene in genedict:
try:
ndx = self.genes[gene]
matrix[ndx] = genedict[gene]
            except KeyError: # no match in current gene list
if allow_new_genes:
matrix[ndx] = genedict[gene]
self.genes[gene] = ndx
ndx += 1
return matrix
def getHeaders(self, indices = None):
""" Retrieve headers (names of experiments/samples).
If indices is None (default), all headers are returned, e.g.
>>> g.getHeaders()
['Sample1', 'Sample2']
If indices is a single integer, the header for the corresponding entry is returned, e.g.
>>> g.getHeaders(1)
'Sample2'
If indices is an iterable of integers (multiple indices), the list of corresponding headers is returned, e.g.
>>> g.getHeaders([1,0])
['Sample2', 'Sample1']
"""
        if indices is None:
            return self.headers
        elif isinstance(indices, (int, slice)):
            return self.headers[indices]
else:
ret = []
for index in indices:
ret.append(self.headers[index])
return ret
def getGenes(self, names = None):
""" Retrieve applicable gene-profile entries.
If names is None (default), all gene names are returned, e.g.
>>> g.getGenes()
['G1', 'G2', 'G3']
If names is a single string, the profile for the corresponding entry is returned, e.g.
>>> g.getGenes('G2')
array([ 4.1, -0.9])
If names is an iterable of strings (multiple gene names), a dictionary with gene name as key and profile as value is returned.
>>> g.getGenes(['G3','G2'])
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
"""
        if names is None:
return self.genes.keys()
elif isinstance(names, str):
return self.matrix[self.genes[names],:]
else:
ret = {}
for name in names:
ret[name] = self.matrix[self.genes[name],:]
return ret
def __getitem__(self, ndx):
""" Retrieve a specified sample (or a "slice" of samples) for all genes, e.g.
>>> g[0:2]
array([[ 2.1 , -2.1 ],
[ 4.1 , -0.9 ],
[ 0.13, 1.23]])
Note that the order of rows/genes is NOT necessarily the same as that used for inserting the data.
"""
return self.matrix[:,ndx]
def getHeaderIndex(self, headers):
""" Find the index of the named experiment.
Raises a ValueError if not in list. """
if isinstance(headers, str):
return self.headers.index(headers)
else:
return [self.headers.index(header) for header in headers]
def getSamples(self, samples):
"""Construct a gene dictionary including only samples in specified indices, e.g.
>>> g.getSamples(0)
{'G1': 0.13, 'G2': 4.0999999999999996, 'G3': 2.1000000000000001}
>>> g.getSamples('Sample2')
{'G1': 1.23, 'G2': -0.90000000000000002, 'G3': -2.1000000000000001}
>>> g.getSamples(['Sample2','Sample1'])
{'G1': array([ 1.23, 0.13]),
'G2': array([-0.9, 4.1]),
'G3': array([-2.1, 2.1])}
"""
try:
index = self.getHeaderIndex(samples)
        except ValueError: # samples given by index rather than by name
            index = samples
mygenes = {}
for (name, ndx) in self.genes.items():
mygenes[name] = self.matrix[ndx, index]
return mygenes
def sort(self, sample, descending=True):
"""Get a list of gene names, sorted by order of value in specified sample, e.g.
>>> g.sort(0)
['G2', 'G3', 'G1']
Then retrieve actual genes using e.g.
>>> g.getGenes('G2')
        array([ 4.1, -0.9])
"""
try:
index = self.getHeaderIndex(sample)
sort_ndx = np.nan_to_num(self.matrix[:,index]).argsort()
        except ValueError: # sample given by index rather than by name
sort_ndx = np.nan_to_num(self.matrix[:,sample]).argsort()
name_tuples = sorted(self.genes.items(), key=lambda v: v[1]) # put all gene names in order of the matrix of profiles
names = []
if descending:
for (name, index) in [name_tuples[index] for index in sort_ndx[::-1]]: # reverse the order
names.append(name)
else:
for (name, index) in [name_tuples[index] for index in sort_ndx]: # maintain order
names.append(name)
return names
def addSamples(self, headerlist, genedict):
"""Add a sample or multiple samples to the current data set.
genedict is a dictionary with the same keys as the current gene set.
Only values for genes in the current set will be added (others are ignored).
>>> g.addSamples('Sample3', {'G1': 3.4, 'G2': -3.0})
"""
newmat = self._createMatrix(genedict)
nsamples = len(newmat[0])
        if headerlist is not None:
if isinstance(headerlist, str):
headerlist = [headerlist]
if len(headerlist) != nsamples:
raise RuntimeError("The number of headers (%d) is not equal to the number of samples (%d)" % (len(headerlist), nsamples))
if len(self.matrix) == 0:
self.matrix = newmat
else:
self.matrix = np.hstack((self.matrix, newmat))
        self.headers.extend(headerlist or ['S%d' % (cnt + len(self.headers)) for cnt in range(nsamples)])
def getRatio(self, index1, index2):
""" Get the ratio of two samples in the data set (index1 and index2).
Creates and returns a gene dictionary with the corresponding ratios. """
mygenes = {}
mdiv = self.matrix[:, index1] / self.matrix[:, index2]
for (name, ndx) in self.genes.items():
mygenes[name] = mdiv[ndx]
return mygenes
def getLogRatio(self, index1, index2):
""" Get the log2-transformed ratio of two samples (index1 and index2)
Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes = {}
mlr = np.log2(self.matrix[:, index1] / self.matrix[:, index2])
for (name, ndx) in self.genes.items():
mygenes[name] = mlr[ndx]
return mygenes
def getPearson(self, probeID):
""" Given a probe identifier, returns a gene/probe dictionary:
identifiers to correlation coefficients with the specified probe. """
index = self.genes[probeID]
profile = self.matrix[index, :]
mygenes = {}
for (name, ndx) in self.genes.items():
other = self.matrix[ndx, :]
mygenes[name] = pearson(profile, other)
return mygenes
def writeGEOFile(self, filename):
""" Save data as a truncated GEO SOFT file named filename. """
line = '^DATASET = ' + self.dataset + '\n'
line += '!dataset_table_begin\nID_REF\tIDENTIFIER\t'
for header in self.headers:
line += header + '\t'
line += '\n'
for gene in self.genes:
line += gene + '\t' + gene + '\t'
index = self.genes[gene]
for value in self.matrix[index, :]:
line += format(value, '5.3f') + '\t'
line += '\n'
line += '!dataset_table_end\n'
fh = open(filename, 'w')
fh.write(line)
fh.close()
def getZScore(self, index):
""" Get the Z-score of each expression value.
index can be a list of indices (for which the z-score is computed independently).
Important: assumes that values are normally distributed.
For example use log-transformed ratios. """
# Calculate mean and standard deviation of the list of values
mu = np.mean(self.matrix[:, index], axis=0)
sd = np.std(self.matrix[:, index], axis=0)
# Calculate Z-score for the given column for each gene
zscore = (self.matrix[:, index] - mu) / sd
mygenes = {}
for (name, ndx) in self.genes.items():
try:
mygenes[name] = zscore[ndx, :]
except IndexError:
mygenes[name] = zscore[ndx]
# Return the dictionary of Z-scores
return mygenes
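    # A minimal sketch of the intended z-score workflow (illustrative; 'mydata.soft'
    # is a placeholder file name). Ratios are not normally distributed, so build a
    # data set of log-ratios first and take z-scores of those:
    # >>> ge = readGEOFile('mydata.soft')
    # >>> lr = GeneExpression('my_logratios')
    # >>> lr.addSamples('T1/T0', ge.getLogRatio(1, 0))
    # >>> z = lr.getZScore(0)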
# Utility functions
def readGEOFile(filename, id_column=0):
"""Read a Gene Expression Omnibus file; return a GeneExpression instance.
    id_column indicates which field of each row should be taken as the
    gene identifier.
"""
fh = open(filename, "rU")
manylines = fh.read()
fh.close()
# If True, ignore genes with null samples; if False, use default value
ignore_gene_if_null = False
default_value_if_null = None # Default value to use if entry is null and not ignored
# Indicates whether we're reading the data section or metadata
data_rows = False
cnt_data = 0
cnt_null = 0
dataset = '' # name of dataset
headers = [] # list of headers
genes = {} # dict with gene-name as key, expression profile as a list of floats
for line in manylines.splitlines():
if line.startswith('^DATASET'):
dataset = line.split('= ')[1]
continue
if line.startswith('!dataset_table_begin'):
data_rows = True
continue
if line.startswith('!dataset_table_end'):
data_rows = False
continue
if line.startswith('!') or line.startswith('#') \
or line.startswith('^'):
continue
if len(line.strip()) == 0:
continue
if data_rows:
cnt_data += 1
ignore = False
name = line.split('\t')[id_column]
# Ignore control probes
if name.startswith("AFFX"):
continue
if (cnt_data == 1): # First line contains the headers
headers = line.split('\t')
else:
values = []
cnt_word = 0
for word in line.split('\t'):
cnt_word += 1
if cnt_word <= (id_column + 1):
continue
if word == 'null':
cnt_null += 1
if ignore_gene_if_null:
ignore = True
break
else:
word = default_value_if_null
                    try:
                        if word is None:
                            values.append(None)
                        else:
                            values.append(float(word))
                    except ValueError: # ignore values that are not "float"
                        continue
if ignore:
pass
elif not name in genes:
genes[name] = values
if len(genes) == 0:
raise RuntimeError('No data in file')
print 'Data set %s contains %d entries' % (dataset, len(genes))
if cnt_null > 0:
print 'Data set has %d null-values' % (cnt_null)
return GeneExpression(dataset, headers[2:], genes)
# ------------------ Helpful Extra Functions ------------------
def pearson(X, Y):
""" Pearson correlation coefficient (r).
    Note that we use the population standard deviation (divide by n), NOT the
    sample standard deviation (divide by n-1); see http://en.wikipedia.org/wiki/Standard_deviation. """
    if len(X) != len(Y):
        raise RuntimeError('vectors are of uneven length')
    n = len(X)
    Xmu = np.mean(X)
    Xvar = np.var(X)
    Ymu = np.mean(Y)
    Yvar = np.var(Y)
    total = 0.0 # sum of element-wise products (avoids shadowing the built-in "sum")
    for i in range(n):
        total += (X[i] * Y[i])
    if n == 0 or Xvar == 0 or Yvar == 0:
        return 0
    return (total - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
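# Quick check of pearson (illustrative only): with the population variance used
# consistently in numerator and denominator, it agrees with numpy's corrcoef.
# >>> pearson([1, 2, 3, 4], [2, 4, 6, 8])
# 1.0
# >>> abs(pearson([1, 2, 3, 4], [1, 3, 2, 4]) - np.corrcoef([1, 2, 3, 4], [1, 3, 2, 4])[0][1]) < 1e-9
# True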
# ------------------- Example ---------------------
ge3716 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS3716.soft')
ratio = GeneExpression('GDS3716_ratio')
ratio.addSamples('S1_ER+/Healthy', ge3716.getRatio( 33, 0))
ratio.addSamples('S2_ER+/Healthy', ge3716.getRatio( 34, 1))
ratio.addSamples('S3_ER+/Healthy', ge3716.getRatio( 35, 2))
ratio.addSamples('S4_ER+/Healthy', ge3716.getRatio( 36, 3))
ratio.addSamples('S5_ER+/Healthy', ge3716.getRatio( 37, 4))
ratio.addSamples('S6_ER+/Healthy', ge3716.getRatio( 38, 5))
ratio.addSamples('S7_ER+/Healthy', ge3716.getRatio( 39, 6))
ratio.addSamples('S8_ER+/Healthy', ge3716.getRatio( 40, 7))
ratio.addSamples('S9_ER+/Healthy', ge3716.getRatio( 41, 8))
ratio.addSamples('S1_ER-/Healthy', ge3716.getRatio( 24, 9))
ratio.addSamples('S2_ER-/Healthy', ge3716.getRatio( 25, 10))
ratio.addSamples('S3_ER-/Healthy', ge3716.getRatio( 26, 11))
ratio.addSamples('S4_ER-/Healthy', ge3716.getRatio( 27, 12))
ratio.addSamples('S5_ER-/Healthy', ge3716.getRatio( 28, 13))
ratio.addSamples('S6_ER-/Healthy', ge3716.getRatio( 29, 14))
ratio.addSamples('S7_ER-/Healthy', ge3716.getRatio( 30, 15))
ratio.addSamples('S8_ER-/Healthy', ge3716.getRatio( 31, 16))
ratio.addSamples('S9_ER-/Healthy', ge3716.getRatio( 32, 17))
ratio.writeGEOFile('/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft')
print ge3716.getHeaders()
z = ratio.getZScore(0) # NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
ge38 = readGEOFile('/Users/mikael/workspace/COSC2000/GDS38.soft', id_column = 1)
cln2_profile = ge38.getGenes('CLN2')
pcorr = ge38.getPearson('CLN2')
gp = GeneExpression('Ex3', 'PC_CLN2', pcorr)
sorted_genes = gp.sort('PC_CLN2', True) # avoid shadowing the built-in "sorted"
print sorted_genes[0], ge38.getGenes(sorted_genes[0])
print sorted_genes[1], ge38.getGenes(sorted_genes[1])
"""
Motif discovery using Gibbs sampling
@author: mikael
"""
import math
import random
import sym
import prob
import sequence
class GibbsMotif():
"""
A class for discovering linear motifs in sequence data.
    Uses Gibbs sampling (Lawrence et al., Science 262:208-214, 1993).
Also see http://bayesweb.wadsworth.org/gibbs/content.html which has info
on "site sampling", "motif sampling", "recursive sampling" and "centroid
sampling". The first is implemented (roughly) below.
"""
def __init__(self, seqs, length, alignment = None):
""" Construct a "discovery" session by providing the sequences that will be used.
seqs: sequences in which the motif is sought
length: length of sought pattern (W)
alignment: positions in each sequence for the initial alignment (use only if the alignment
has been determined from a previous run).
"""
self.seqs = seqs
self.length = length # length of motif 1..W
seqs = self.seqs
self.alphabet = None
k = 0
for s in seqs:
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Sequences invalid: different alphabets")
self.alphabet = s.alphabet
if alignment:
if alignment[k] < 0 or alignment[k] >= len(s):
raise RuntimeError("Initial alignment invalid: does not match sequence " + s.name)
k += 1
""" Initialise parameters that are part of the setup (below) """
self.alignment = alignment or [ random.randint(0, len(s) - length) for s in seqs ] # starting positions defining alignment
def discover(self, pseudocount = None, niter = None):
""" Find the most probable common pattern represented by a
position weight matrix (PWM), based on W+1 distributions
pseudocount: the distribution used for pseudo-counts (default is uniform)
niter: number of iterations (if None, 100*N is used; where N is number of seqs).
"""
""" Initialise parameters necessary for the discovery run (below) """
N = len(self.seqs) # number of sequences 1..N
seqs = self.seqs
W = self.length # motif width
""" background that will be used as pseudo-counts """
pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
""" q: the foreground distribution (specifying the W distributions in aligned columns)
p: the background distribution (for non-aligned positions in all sequences) """
q = [ prob.Distrib(self.alphabet, pseudocount) for _ in range(W) ]
p = prob.Distrib(self.alphabet, pseudocount)
a = self.alignment
new_z = random.randint(0, N-1) # pick a random sequence to withhold
for k in range(N):
if k != new_z:
k_len = len(seqs[k]) # length of current seq
offset = 0
for i in range(k_len):
if i >= a[k] and i < a[k] + W: # within pattern
q[offset].observe(seqs[k][i])
offset += 1
else: # outside pattern
p.observe(seqs[k][i])
""" Main loop: predictive update step THEN sampling step, repeat... """
niter = niter or 100 * N # use specified number of iterations or default
for round in range(niter):
""" Predictive update step:
                One of the N sequences is chosen at random: z.
We will not use it in the profile, nor background so we
exclude it from our counts. """
prev_z = new_z
new_z = random.randint(0, N - 1)
            # q's and p's are updated from current a's and all sequences except z,
            # which is the same as using the old q's and p's and subtracting z's contributions...
offset = 0
for i in range(len(seqs[new_z])):
if i >= a[new_z] and i < a[new_z] + W: # within pattern
q[offset].observe(seqs[new_z][i], -1) # subtract the count
offset += 1
else: # outside pattern
p.observe(seqs[new_z][i], -1) # subtract the count
# ... and add back the previous and now updated z
offset = 0
for i in range(len(seqs[prev_z])):
if i >= a[prev_z] and i < a[prev_z] + W: # within pattern
q[offset].observe(seqs[prev_z][i], +1) # add the count
offset += 1
else: # outside pattern
p.observe(seqs[prev_z][i], +1) # add the count
""" Sampling step:
Consider each position x in z as a match: find a weight Ax """
z_len = len(seqs[new_z]) # length of seq z
A = [ 0.0 for _ in range(z_len) ]
Asum = 0.0
for x in range(z_len - W + 1): # look at all starts for a W-wide pattern
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[new_z][x+w]]
Qx *= q[w][seqs[new_z][x+w]]
try:
A[x] = Qx / Px
except ZeroDivisionError:
pass
Asum += A[x]
for x in range(z_len - W + 1): # score all starts for a W-wide pattern
A[x] /= Asum # normalise so that all Ax's sum to 1.0
# Pick the next a[z], with a probability proportional to Ax
pick = random.random() # any value between 0 and 1
cumul = 0.0 # cumulative probability
for x in range(z_len - W + 1): # check starts for a W-wide pattern
cumul += A[x]
if pick <= cumul: # check if our random pick is smaller than the cumulative prob
a[new_z] = x
break
""" Evaluate data log-likelihood """
if round % 100 == 0: # but only every 100th round
LL = 0.0
for k in range(N):
Pk = 1.0; Qk = 1.0
for w in range(W):
Pk *= p[seqs[k][a[k]+w]]
Qk *= q[w][seqs[k][a[k]+w]]
try:
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
# end main for-loop
self.q = q
self.p = p
self.alignment = a
return q
def getForeground(self):
""" Return the probability distributions for columns in the discovered alignment. """
return self.q
def getBackground(self):
""" Return the probability distributions for the background used in the discovery. """
return self.p
def getAlignment(seqs, motif, background):
""" Retrieve the best alignment (positions) in provided sequences defined by the specified
motif params.
seqs: sequence data
motif: the foreground distribution (specifying the W distributions in aligned columns)
background: the background distribution (for non-aligned positions in all sequences)
Note that this is similar but not the same as the stochastically selected alignment that
is kept while training. It can be implemented using a PWM constructed from a previous session.
Note also that this alignment can be used as input to continue an earlier discovery session
when motif distributions had been saved. """
N = len(seqs)
q = motif
p = background
W = len(q)
a = [0 for _ in range(N)] # start positions unknown
for k in range(N):
k_len = len(seqs[k]) # length of seq k
Amax = None
xmax = 0
for x in range(k_len - W + 1):
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[k][x+w]]
Qx *= q[w][seqs[k][x+w]]
            try:
                Atmp = math.log(Qx / Px)
            except ZeroDivisionError:
                continue # background probability was zero; skip this start position
            if Amax is None or Amax < Atmp:
                Amax = Atmp
                xmax = x
a[k] = xmax
return a
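# A minimal usage sketch (assumptions flagged): 'myseqs.fa' is a placeholder file
# name, and readFastaFile / Protein_Alphabet are assumed to be provided by this
# package's sequence and sym modules. Discover a width-8 motif, then recover the
# best (non-stochastic) alignment from the trained distributions:
# >>> seqs = sequence.readFastaFile('myseqs.fa', sym.Protein_Alphabet)
# >>> gm = GibbsMotif(seqs, 8)
# >>> q = gm.discover()
# >>> a = getAlignment(seqs, q, gm.getBackground())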
class GibbsAlign():
""" A class for performing ungapped sequence alignment.
    Uses Gibbs sampling (Lawrence et al., Science 262:208-214, 1993).
"""
def __init__(self, seqs, length, alignment = None):
""" Construct a "discover" session by providing the sequences that will be aligned.
seqs: sequences that will be aligned
length: maximum length of alignment (must be equal or greater than max sequence length)
alignment: positions in each sequence for the initial alignment (use only if the alignment
has been determined from a previous run).
"""
self.seqs = seqs
        self.length = length # maximum length of the (ungapped) alignment
seqs = self.seqs
self.alphabet = None
k = 0
for s in seqs:
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Sequences invalid: different alphabets")
self.alphabet = s.alphabet
if alignment:
if alignment[k] < 0 or alignment[k] >= len(s):
raise RuntimeError("Initial alignment invalid: does not match sequence " + s.name)
k += 1
""" Initialise parameters that are part of the setup (below) """
self.alignment = alignment or [ random.randint(0, length - len(s)) for s in seqs ] # starting offsets defining alignment
def discover(self, pseudocount = None, niter = None):
""" Find the most probable common pattern represented by a
position weight matrix (PWM), based on W+1 distributions
pseudocount: the distribution used for pseudo-counts (default is uniform)
niter: number of iterations (if None, 100*N is used; where N is number of seqs).
"""
""" Initialise parameters necessary for the discovery run (below) """
N = len(self.seqs) # number of sequences 1..N
seqs = self.seqs
W = self.length # alignment width
""" background that will be used as pseudo-counts """
pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
""" q: the foreground distribution (specifying the W distributions in aligned columns)
p: the background distribution (for non-aligned positions in all sequences) """
q = [ prob.Distrib(self.alphabet, pseudocount) for _ in range(W) ]
p = prob.Distrib(self.alphabet, pseudocount)
a = self.alignment
new_z = random.randint(0, N-1) # pick a random sequence to withhold
for k in range(N):
if k != new_z:
k_len = len(seqs[k]) # length of current seq
offset = 0
for i in range(k_len):
if i >= a[k] and i < a[k] + W: # within pattern
q[offset].observe(seqs[k][i])
offset += 1
else: # outside pattern
p.observe(seqs[k][i])
""" Main loop: predictive update step THEN sampling step, repeat... """
niter = niter or 100 * N # use specified number of iterations or default
for round in range(niter):
""" Predictive update step:
                One of the N sequences is chosen at random: z.
We will not use it in the profile, nor background so we
exclude it from our counts. """
prev_z = new_z
new_z = random.randint(0, N - 1)
            # q's and p's are updated from current a's and all sequences except z,
            # which is the same as using the old q's and p's and subtracting z's contributions...
offset = 0
for i in range(len(seqs[new_z])):
if i >= a[new_z] and i < a[new_z] + W: # within pattern
q[offset].observe(seqs[new_z][i], -1) # subtract the count
offset += 1
else: # outside pattern
p.observe(seqs[new_z][i], -1) # subtract the count
# ... and add back the previous and now updated z
offset = 0
for i in range(len(seqs[prev_z])):
if i >= a[prev_z] and i < a[prev_z] + W: # within pattern
q[offset].observe(seqs[prev_z][i], +1) # add the count
offset += 1
else: # outside pattern
p.observe(seqs[prev_z][i], +1) # add the count
""" Sampling step:
Consider each position x in z as a match: find a weight Ax """
z_len = len(seqs[new_z]) # length of seq z
A = [ 0.0 for _ in range(z_len) ]
Asum = 0.0
for x in range(z_len - W + 1): # look at all starts for a W-wide pattern
Px = 1.0; Qx = 1.0
for w in range(W):
Px *= p[seqs[new_z][x+w]]
Qx *= q[w][seqs[new_z][x+w]]
try:
A[x] = Qx / Px
except ZeroDivisionError:
pass
Asum += A[x]
for x in range(z_len - W + 1): # score all starts for a W-wide pattern
A[x] /= Asum # normalise so that all Ax's sum to 1.0
# Pick the next a[z], with a probability proportional to Ax
pick = random.random() # any value between 0 and 1
cumul = 0.0 # cumulative probability
for x in range(z_len - W + 1): # check starts for a W-wide pattern
cumul += A[x]
if pick <= cumul: # check if our random pick is smaller than the cumulative prob
a[new_z] = x
break
""" Evaluate data log-likelihood """
if round % 100 == 0: # but only every 100th round
LL = 0.0
for k in range(N):
Pk = 1.0; Qk = 1.0
for w in range(W):
Pk *= p[seqs[k][a[k]+w]]
Qk *= q[w][seqs[k][a[k]+w]]
try:
LL += math.log(Qk / Pk)
except ZeroDivisionError:
pass
print "LL @ %5d=\t%5.2f" % (round, LL)
# end main for-loop
self.q = q
self.p = p
self.alignment = a
return q
def getForeground(self):
""" Return the probability distributions for columns in the discovered alignment. """
return self.q
def getBackground(self):
""" Return the probability distributions for the background used in the discovery. """
return self.p
'''
Created on Jul 12, 2012, amended April 2015
Module for managing Gene Ontology data, in particular gene:terms
annotations and term definitions
It can be used on files you can download from geneontology.org.
The class GO is constructed from:
- annotation file which is (usually) specific to the species of interest
- OBO file which defines the GO terms and their relationships
e.g.
> go = GO('gene_association.goa_ref_human', 'go-basic.obo')
Internal data structures are created so that you can query
- what are the terms of my gene (or genes)? Use getTerms
- what are the genes of my term? Use getGenes
- what terms occur amongst my genes, ranked by their absolute count? Use getGOReport without background
- what terms are statistically enriched in my genes, relative to a background set of genes? Use getGOReport with background
The class BinGO works with a compact (memory saving) binary format that aggregates information from an annotation
file and an OBO file. Therefore, you first need to construct this binary file, using writeBitFile.
Subsequently you can construct instances of BinGO and query terms and genes, roughly in the manner identified above for GO.
@author: mikael
'''
from struct import pack, unpack, calcsize, error
import operator
import sys
import time
import os
import stats
# Character codes used by binary format to identify ontology
onto_codes = {
'P': 'Biological process',
'F': 'Molecular function',
'C': 'Cellular component'}
# Labels for edges in the ontology graph, index is used in binary format
onto_rel = ['is_a', 'isect', 'part_of', 'has_part', 'regulates']
# Evidence codes assigned to annotations, an index is assigned when creating binary file and is stored in its header
evid_codes = { # Experimental Evidence Codes
'EXP': 'Inferred from Experiment',
'IDA': 'Inferred from Direct Assay',
'IPI': 'Inferred from Physical Interaction',
'IMP': 'Inferred from Mutant Phenotype',
'IGI': 'Inferred from Genetic Interaction',
'IEP': 'Inferred from Expression Pattern',
#Computational Analysis Evidence Codes
'ISS': 'Inferred from Sequence or Structural Similarity',
'ISO': 'Inferred from Sequence Orthology',
'ISA': 'Inferred from Sequence Alignment',
'ISM': 'Inferred from Sequence Model',
'IGC': 'Inferred from Genomic Context',
'IBA': 'Inferred from Biological aspect of Ancestor',
'IBD': 'Inferred from Biological aspect of Descendant',
'IKR': 'Inferred from Key Residues',
'IRD': 'Inferred from Rapid Divergence',
    'RCA': 'Inferred from Reviewed Computational Analysis',
'TAS': 'Traceable Author Statement',
'NAS': 'Non-traceable Author Statement',
#Curator Statement Evidence Codes
'IC': 'Inferred by Curator',
'ND': 'No biological Data available',
#Automatically-assigned Evidence Codes
'IEA': 'Inferred from Electronic Annotation',
#Obsolete Evidence Codes
'NR': 'Not Recorded'}
class GO():
""" Classical interface for working with GO terms usually within the same species and when memory is not a major issue.
Implementations are relatively efficient (for Python at least).
Major functions:
__init__: construct instance of GO session from an annotation file and an OBO file (geneontology.org)
getTerms: get GO terms from gene or genes (transitively or not)
getGenes: get genes that are annotated with given term or terms
getGOReport: perform basic gene set enrichment
"""
# Structures to hold all data relevant to session
annots = {} # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
termdefs = {} # definitions: termdefs[term] = (onto, set((term, rel)), name)
children = {} # redundant, parent-to-child structure: children[term] = set((term, rel))
def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8)):
""" Start GO session with specified data loaded:
annotfile: name of annotation file, e.g.'gene_association.tair'
OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
Optionally, specify what columns in the annotation file that contains in order:
gene, symb, qual, term, evid, onto. Note that index starts at 0 NOT 1.
        (The default seems to work for most annotation files, but sometimes, if you wish to cross-reference
        say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
        go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8)) )
"""
print "Started at", time.asctime()
# Get GO definitions
terms = readOBOFile(obofile)
for term in terms:
(term_name, term_onto, term_is) = terms[term]
self.termdefs[term] = (term_onto, term_is, term_name)
self.children[term] = set()
for term in self.termdefs:
(term_onto, term_is, term_name) = self.termdefs[term]
for (parent, prel) in term_is:
try:
cset = self.children[parent]
cset.add((term, prel))
except KeyError:
pass
print "Read %d GO definitions" % len(terms)
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_cnt = 0
cnt = 0
for line in src:
cnt += 1
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
try:
(taxa_q, terms_map) = self.annots[gene]
terms_map[term] = (evid, qual != 'NOT')
except KeyError: # not a previously encountered gene
gene_cnt += 1
terms_map = {term: (evid, qual != 'NOT')}
self.annots[gene] = (taxa, terms_map)
src.close()
print "Read annotations for %d genes" % gene_cnt
def _makeIntoList(self, id_or_ids):
        if not isinstance(id_or_ids, (list, set, tuple)):
return [id_or_ids]
return id_or_ids
def getTerms(self, genes_or_gene, evid = None, onto = None, include_more_general = True):
""" Retrieve all terms for a gene or a set/list/tuple of genes.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is true, terms that are transitively related are included.
With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
When only one gene is provided, the result is simply a set of terms.
"""
        if not isinstance(genes_or_gene, (list, set, tuple)):
return self.getTerms4Gene(genes_or_gene, evid, onto, include_more_general)
else:
return self.getTerms4Genes(genes_or_gene, evid, onto, include_more_general)
def getTerms4Genes(self, genes, evid = None, onto = None, include_more_general = True):
""" Retrieve all GO terms for a given set/list/tuple of genes.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is True (default) then transitively related terms are included.
With multiple genes provided in query, the result is a map, keyed by gene (each identifying a set of terms).
"""
gomap = {} # gene to GO terms map
genes = self._makeIntoList(genes)
for gene in genes:
gomap[gene] = self.getTerms4Gene(gene, evid, onto, include_more_general)
return gomap
def getTerms4Gene(self, gene, evid = None, onto = None, include_more_general = True):
""" Retrieve all GO terms for a given (single) gene.
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
If onto(logy) is specified the method includes only entries from specified ontology ('P', 'F' or 'C').
If include_more_general is True (default) then transitively related terms are included
When only one gene is provided, the result is simply a set of terms.
"""
direct = set()
# STEP 1: Find all terms directly associated with specified genes
try:
(taxa, terms_map) = self.annots[gene]
for term in terms_map:
(term_evid, term_qual) = terms_map[term]
if (evid == None or evid == term_evid) and term_qual:
direct.add(term)
except KeyError:
return set() # gene was not found, hence no annotations for it
# STEP 2: Find terms associated with (indirect) parents of terms from STEP 1
indirect = set()
if include_more_general:
for term in direct:
parents = self.getParents(term, include_more_general)
for parent in parents:
indirect.add(parent)
return direct.union(indirect)
def getGenes(self, terms_or_term, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified term or terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
If multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
With a single term provided, a set of genes is returned.
"""
        if not isinstance(terms_or_term, (list, set, tuple)):
return self.getGenes4Term(terms_or_term, evid, taxa, rel, include_more_specific)
else:
return self.getGenes4Terms(terms_or_term, evid, taxa, rel, include_more_specific)
def getGenes4Terms(self, terms, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
Since multiple terms are provided, a map is returned keyed by term (each identifying set of genes).
"""
gomap = {} # term to genes map
terms = self._makeIntoList(terms)
for term in terms:
gomap[term] = self.getGenes4Term(term, evid, taxa, rel, include_more_specific)
return gomap
def getGenes4Term(self, term, evid = None, taxa = None, rel = None, include_more_specific = False):
""" Retrieve all genes that are annotated with specified term or terms,
qualified by evidence, taxa and relation type, e.g. "is_a".
With a single term provided, a set of genes is returned.
"""
genes = self._getGenes4Term(term, evid, taxa, rel)
if include_more_specific:
            terms = self.getChildren(term, rel, True) # recursively includes more specific terms
for t in terms:
tgenes = self._getGenes4Term(t, evid, taxa, rel)
for g in tgenes:
genes.add(g)
return genes
def _getGenes4Term(self, term, evid = None, taxa = None, rel = None):
""" Retrieve all genes that are annotated with specified term, and qualified by evidence, taxa etc. """
genes = set()
# Scour through all genes
for gene in self.annots: # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
(qtaxa, qterms) = self.annots[gene]
if taxa == None or taxa == qtaxa:
for qterm in qterms:
if qterm != term:
continue
(qevid, qqual) = qterms[term]
if (evid == None or evid == qevid) and qqual:
genes.add(gene)
break
return genes
def getChildren(self, parent_term_id_or_ids, rel = None, include_more_specific = False):
""" Retrieve all direct children of the given (parent) term.
"""
parent_terms = self._makeIntoList(parent_term_id_or_ids)
cset = set()
for parent in parent_terms:
# definitions: children[term] = set((term, relation), ...)
current = self.children[parent]
for (child_term, child_rel) in current:
if rel == None or rel == child_rel:
cset.add(child_term)
if len(cset) > 0 and include_more_specific:
grandkids = self.getChildren(cset, rel, True)
for grandkid in grandkids:
cset.add(grandkid)
return cset
def getParents(self, child_term_id, include_more_general = True):
""" Retrieve all parents of the given term, transitively or not.
"""
direct = set() # all GO terms which are parents to given term
try:
(onto_ch, terms_ch, name_ch) = self.termdefs[child_term_id]
for (parent_id, parent_rel) in terms_ch:
(onto_pa, terms_pa, name_pa) = self.termdefs[parent_id]
direct.add(parent_id)
if (include_more_general):
parents = self.getParents(parent_id, True)
for parent in parents:
direct.add(parent)
except KeyError:
pass # term was not found, possibly throw error?
return direct
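    # Illustrative example using real GO relationships: GO:0000001 (mitochondrion
    # inheritance) has the direct is_a parents GO:0048308 and GO:0048311, so
    # >>> go.getParents('GO:0000001', False)
    # set(['GO:0048308', 'GO:0048311'])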
def getTermdef(self, term_id):
""" Retrieve information about a given term:
ontology, parent terms, and name as a tuple.
"""
try:
(onto_ch, terms_set, term_name) = self.termdefs[term_id]
return (onto_ch, terms_set, term_name)
except KeyError:
return ('Unknown', 'Unknown', 'Unknown')
def getAllAnnots(self):
""" Retrieve all annotated gene products """
return self.annots.keys()
def getAllBackground(self, positives = [], taxa = None, evid = None, include_more_general = False):
""" Retrieve all genes and terms that are annotated but not in a list of positives (gene products).
"""
# (taxa, terms[term] = (evid, T/F))
bg_genes = set()
bg_list = []
        for gene in self.annots:
            if gene not in positives:
bg_genes.add(gene)
(qtaxa, qterms) = self.annots[gene]
if taxa == None or qtaxa == taxa:
for t in qterms:
(qevid, qqual) = qterms[t]
if (evid == None or qevid == evid) and qqual:
bg_list.append(t)
if include_more_general:
for parent in self.getParents(t, True):
bg_list.append(parent)
return (bg_genes, bg_list)
def getCountReport(self, positives, threshold = None, include_more_general = True):
""" For a set of named gene products (positives) this method determines the counts of GO terms.
        Returns a list of tuples (GO_Term_ID[str], Count[int], Term_description[str], Ontology[str]) sorted by count.
        positives: names of gene products
        threshold: the count that must be reached for term to be reported (default is 0)
        include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
"""
fg_list = [] # all terms, with multiple copies for counting
fg_map = self.getTerms4Genes(positives, include_more_general = include_more_general) #
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(positives)
if threshold == None:
threshold = 0 # include all terms
for t in term_set:
cnt = fg_list.count(t)
if cnt >= threshold:
term_cnt[t] = cnt
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1], defin[2], defin[0]))
return ret
def getEnrichmentReport(self, positives, background = None, evid = None, threshold = None, include_more_general = True):
""" For a set of named gene products (positives) this method determines the enrichment of GO terms.
Each GO term is also assigned an enrichment p-value (on basis of provided background, or on basis of all annotated genes, if not provided).
        Note that using the full set of annotated genes as background can be computationally expensive.
        Returns a list of tuples (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str], Ontology[str]).
        E-value is a Bonferroni-corrected p-value.
positives: names of gene products
background: names of gene products (or None if all annotated gene products should be used; default)
threshold: E-value that must be reached for term to be reported (default is 0.05)
If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
"""
# Process foreground: find terms of genes
fg_list = [] # all terms, with multiple copies for counting
fg_map = self.getTerms4Genes(positives, evid = evid, include_more_general = include_more_general) #
for fg_gene in fg_map:
for t in fg_map[fg_gene]:
fg_list.append(t)
nPos = len(positives)
# Process background: find terms of genes
bg_list = []
if background == None: # need to use the full set
background = self.annots.keys()
negatives = set(background).difference(set(positives)) # remove the positives from the background to create genuine negatives
nNeg = len(negatives)
bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
for bg_gene in bg_map:
for t in bg_map[bg_gene]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
if threshold == None:
threshold = 0.05
for t in term_set:
fg_hit = fg_list.count(t) # number of foreground genes WITH GO term (number of terms in the list for the collective set of foreground genes)
bg_hit = bg_list.count(t) # number of background genes WITH GO term (number of terms in the list for the collective set of background genes)
fg_nohit = nPos - fg_hit # total number of genes in foreground minus that number of hits
bg_nohit = nNeg - bg_hit # total number of genes in background minus that number of hits
pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False) # one-tailed FET
evalue = pvalue * len(term_set) # Bonferroni correction
if evalue <= threshold: # check if significance req is fulfilled
term_cnt[t] = (fg_hit, fg_hit + bg_hit, evalue)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1][2], t[1][0], t[1][1], defin[2], defin[0]))
return ret
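# A minimal usage sketch for the GO class (file names as in the module docstring;
# 'mygenes' is a placeholder list of gene identifiers):
# >>> go = GO('gene_association.goa_ref_human', 'go-basic.obo')
# >>> terms = go.getTerms(mygenes)               # terms annotated to my genes
# >>> report = go.getEnrichmentReport(mygenes)   # terms enriched relative to all annotated genes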
class BinGO():
# Structures to hold all data relevant to session, all keys are "encoded"
annots = {} # annotations: annots[gene] = (taxa, terms[term] = (evid, T/F))
termdefs = {} # definitions: termdefs[term] = (onto, terms[term] = relation, name)
# Codes for encoding and decoding
gene_code = None
term_code = None
evid_code = None
# indices
annot_index = {}
# Files
f = None
def __init__(self, filename, taxa = None):
""" The binary file contains all the data and will initialise
gene annotations (annots) and term definitions (termdefs)
and the encoding/decoding keys. """
        self.annot_index = {}  # per-instance state, so separate BinGO sessions don't share data
        self.termdefs = {}
        self.f = self._readBitFile(filename, taxa = taxa)
def _decodeGeneIDs(self, gene_codes):
        if not isinstance(gene_codes, (list, set, tuple)):
gene_codes = [gene_codes]
ids = []
for i in gene_codes:
s = decode(i, self.gene_code)
ids.append(s)
return ids
def _encodeGeneIDs(self, gene_names):
        if not isinstance(gene_names, (list, set, tuple)):
gene_names = [gene_names]
ids = []
for i in gene_names:
y = encode(i, self.gene_code)
ids.append(y)
return ids
def _getGeneEntry(self, gene):
peek = self.annot_index[gene]
self.f.seek(peek, 0)
buf = self.f.read(calcsize('IIH'))
(gene_int, taxa_int, nterms) = unpack('IIH', buf)
buf = self.f.read(nterms * calcsize('?BI'))
terms_dict = {}
for pos in range(0, len(buf) - 1, calcsize('?BI')):
(qual_bool, evid_int, term_int) = unpack('?BI', buf[pos:pos+calcsize('?BI')])
terms_dict[term_int] = (evid_int, qual_bool)
return (taxa_int, terms_dict)
def _getSuperTerms(self, term, rel = None):
""" Recursively compute the transitive closure. """
found = set()
try:
(_, closure, _) = self.termdefs[term]
for (t, r) in closure.items():
if (not rel) or r == rel:
found.add(t)
found.update(self._getSuperTerms(t, rel))
except KeyError:
print 'Could not find GO:%s' % (''.join(decode(term, self.term_code)))
return found
def _getChildTerms(self, term, rel = None):
found = set()
for (child, termdef) in self.termdefs.items():
(_, parents_dict, _) = termdef
try:
myrel = parents_dict[term]
if rel == myrel or not rel: found.add(child)
except KeyError:
pass
return found
def _getSpecificTerms(self, term, rel = None):
direct = self._getChildTerms(term, rel)
found = set()
for t in direct:
found.add(t)
found.update(self._getSpecificTerms(t, rel))
return found
def getTerms(self, genes, evid = None, onto = None, include_more_general = True):
"""
Retrieve all GO terms for a given set of genes (or single gene).
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If include_more_general is True (default) then transitively related terms are included
"""
mymap = dict()
# STEP 1: Find all terms directly associated with specified genes
direct = set() # all GO terms (encoded)
ids = self._encodeGeneIDs(genes)
for i in ids:
gene_name = ''.join(decode(i, self.gene_code))
mymap[gene_name] = set()
try:
(taxa, terms) = self._getGeneEntry(i)
for (term, evid_and_qual) in terms.items():
                    (term_evid, term_qual) = evid_and_qual
                    if term_qual and (evid is None or self.evid_code[term_evid] == evid): # positive annotation, matching the evidence filter (if any)
                        direct.add(term)
                        mymap[gene_name].add(term)
except KeyError:
pass
#print 'Failed to find annotations for gene %s' % gene_name
if include_more_general:
# STEP 2: Find the transitive closure of each term identified, store as a dictionary
indirect = {}
for t in direct:
                if t not in indirect:
indirect[t] = set(self._getSuperTerms(t))
# STEP 3: compile and return results
for gene in mymap:
term_ids = mymap[gene]
all_ids = set(term_ids)
if include_more_general:
for term_id in term_ids:
all_ids.update(indirect[term_id])
mymap[gene] = set()
for term_enc in all_ids:
mymap[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
return mymap
def getAllGenes(self):
names = []
for g in self._decodeGeneIDs(self.annot_index.keys()):
names.append(''.join(g))
return names
def getGenes(self, terms, evid = None, taxa = None, rel = None, include_more_specific = True):
""" Retrieve all genes that are annotated with specified terms, and qualified by evidence, taxa etc. """
""" TODO: Debug--suspect this implementation is incorrect. """
term_ids = set()
for t in terms:
term_ids.add(encode(t[3:], self.term_code))
# STEP 1 (optional): determine more specific terms to be included in query
if include_more_specific:
myterms = set()
for t in term_ids:
myterms.add(t)
children = self._getSpecificTerms(t, rel)
myterms.update(children)
term_ids = myterms
# STEP 2: identify genes with those terms
found = {}
for g in self.annot_index:
            gene_name = ''.join(decode(g, self.gene_code)) # join: decode returns a list of characters
(mytaxa, tdict) = self._getGeneEntry(g)
if not taxa or taxa == mytaxa:
                for annot_term in tdict.keys():
                    (evid_int, qual_bool) = tdict[annot_term]
                    if not qual_bool: # negated (NOT) annotation
                        continue
                    if evid is not None and self.evid_code[evid_int] != evid:
                        continue
                    if annot_term in term_ids: # compare encoded term IDs, not the raw 'GO:...' strings
                        try:
                            added = found[gene_name]
                            added.add(annot_term)
                        except KeyError:
                            found[gene_name] = set([annot_term])
# STEP 3: compile and return results
for gene in found:
term_ids = found[gene]
all_ids = set(term_ids)
found[gene] = set()
for term_enc in all_ids:
found[gene].add('GO:'+''.join(decode(term_enc, self.term_code)))
return found
def getTermdef(self, term):
term_id = encode(term[3:], self.term_code)
try:
(onto_ch, terms_dict, name_peek) = self.termdefs[term_id]
self.f.seek(name_peek, 0)
term_name = self.f.readline()
return (onto_codes[onto_ch], terms_dict, term_name)
except KeyError:
return ('Unknown', 'Unknown', 'Unknown')
def _readBitFile(self, filename, taxa, termnames = False):
        f = open(filename, 'rb') # binary mode: the file mixes packed structs and text lines
# STEP 1: header info
ngene_code = None
nterm_code = None
nevid_code = None
ngene_cnt = 0
nterm_cnt = 0
nevid_cnt = 0
header = True
total_gene_cnt = None
current_gene_cnt = 0
current_terms_cnt = 0
annot_offset = 0
obo_offset = 0
        while True: # exits via break on reaching the footer, or on a read error
if not ngene_code:
line = f.readline()
fields = line.split()
total_gene_cnt = int(fields[0])
total_terms_cnt = int(fields[1])
ngene_code = int(fields[2])
nterm_code = int(fields[3])
nevid_code = int(fields[4])
self.gene_code = ['' for _ in range(ngene_code)]
self.term_code = ['' for _ in range(nterm_code)]
self.evid_code = ['' for _ in range(nevid_code)]
elif ngene_cnt < ngene_code:
line = f.readline()
self.gene_code[ngene_cnt] = line.strip()
ngene_cnt += 1
elif nterm_cnt < nterm_code:
line = f.readline()
self.term_code[nterm_cnt] = line.strip()
nterm_cnt += 1
elif nevid_cnt < nevid_code:
line = f.readline()
self.evid_code[nevid_cnt] = line.strip()
nevid_cnt += 1
else: # we're not in the header
if header: offset = f.tell()
header = False
try:
if current_gene_cnt < total_gene_cnt: # we are reading gene:terms annotations
peek = f.tell()
buf = f.read(calcsize('IIH'))
(gene_int, taxa_int, nterms) = unpack('IIH', buf)
current_gene_cnt += 1
                        if (not taxa) or (taxa_int == taxa) or (isinstance(taxa, (list, set, tuple)) and taxa_int in taxa):
self.annot_index[gene_int] = peek
bufsize = calcsize('?BI')
f.read(nterms * bufsize)
elif current_terms_cnt < total_terms_cnt: # we are reading term definitions (term is_a term, term, term, ...)
buf = f.read(calcsize('IcH'))
(term_int, onto_ch, nterms) = unpack('IcH', buf)
current_terms_cnt += 1
bufsize = calcsize('BI')
buf = f.read(nterms * bufsize)
terms_dict = {}
for pos in range(0, len(buf) - 1, bufsize):
(rel_ndx, sup_int) = unpack('BI', buf[pos:pos+bufsize])
terms_dict[sup_int] = rel_ndx
name_peek = f.tell()
f.readline() # skip putting name in memory, instead refer to the position in the file
self.termdefs[term_int] = (onto_ch, terms_dict, name_peek)
else:
buf = f.read(calcsize('II'))
(annot_offset, obo_offset) = unpack('II', buf)
break
                except error as inst:
                    print "Problem reading binary file: ", inst, "at gene ", current_gene_cnt, "at definition ", current_terms_cnt, "at", f.tell()
                    sys.exit(3)
print "Read %d genes and %d term definitions" % (current_gene_cnt, current_terms_cnt)
print "Annotations start at", annot_offset, "\nDefinitions start at", obo_offset
return f
#FIXME: write code to perform test of taxa enrichment
def getGOReport_byScore(self, gene_score_map, negatives_score_map = {}, include_more_general = True, descending_order = True):
""" Generate a complete GO term report for a set of genes with associated scores.
Uses the Wilcoxon Ranksum test for each GO term to assign a p-value,
indicating the enrichment of term to "top" genes in descending order by score (by default).
"""
fg_map = self.getTerms(gene_score_map.keys(), include_more_general = include_more_general)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
term_set = set(fg_list)
term_pval = {}
if len(negatives_score_map) > 0:
bg_map = self.getTerms(negatives_score_map.keys(), include_more_general = include_more_general)
for t in term_set:
pos = []
neg = []
for gene in gene_score_map:
annot = fg_map[gene]
                if annot is not None:
if t in annot:
pos.append(gene_score_map[gene])
else:
neg.append(gene_score_map[gene])
if len(pos) > 0 and len(neg) > 0:
if descending_order:
p = stats.getRSpval(neg, pos)
else:
p = stats.getRSpval(pos, neg)
if len(negatives_score_map) > 0 and p <= 0.05:
mpos = pos # scores of foreground genes with matching GO term
mneg = [] # scores of background genes with matching GO terms
for gene in negatives_score_map:
annot = bg_map[gene]
                        if annot is not None:
if t in annot:
mneg.append(negatives_score_map[gene])
if len(mneg) > 0:
if descending_order:
p2 = stats.getRSpval(mneg, mpos)
else:
p2 = stats.getRSpval(mpos, mneg)
else:
p2 = 0.0
term_pval[t] = (p, p2)
else:
term_pval[t] = (p, 1.0)
sorted_pval = sorted(term_pval.items(), key=lambda v: v[1][0], reverse=False)
ret = []
for t in sorted_pval:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
ret.append((t[0], t[1][0], t[1][1], defin[2].strip(), defin[0]))
return ret
def getGOReport(self, positives, background = None, taxa = None, include_more_general = True):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = self.getTerms(pos, include_more_general = include_more_general)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = self.getTerms(neg, include_more_general = include_more_general)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, fg_hit + bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = self.getTermdef(t[0])
if defin == None:
print 'Could not find definition of %s' % t[0]
else:
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin[2], defin[0]))
else:
ret.append((t[0], t[1], defin[2], defin[0]))
return ret
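# A minimal usage sketch for the binary workflow (per the module docstring; file
# names are placeholders): aggregate an annotation file and an OBO file into a
# bit file once, then query it cheaply.
# >>> writeBitFile('gene_association.goa_ref_human', 'go-basic.obo', 'human.bgo')
# >>> bgo = BinGO('human.bgo')
# >>> terms = bgo.getTerms(['MYC_HUMAN'])        # placeholder gene product ID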
def encode(code_me, encode_strings):
code = 0
accum = 1
try:
for pos in range(len(code_me)):
codelen = len(encode_strings[pos])
for i in range(codelen):
if encode_strings[pos][i] == code_me[pos]:
code += accum * i
accum *= codelen
break
except IndexError as e:
print e, code_me
return code
def decode(code, encode_strings):
    npos = len(encode_strings)
    accum = [1 for _ in range(npos)]
    string = [] # fallback in case decoding fails below
    try:
        for pos in range(1, npos): accum[pos] = accum[pos - 1] * len(encode_strings[pos - 1])
        indices = [-1 for _ in range(npos)]
        for pos in range(npos - 1, -1, -1): # go backwards, start at last (most significant) position
            indices[pos] = code / accum[pos] # integer division (Python 2)
            code -= accum[pos] * indices[pos]
        string = [encode_strings[pos][indices[pos]] for pos in range(len(encode_strings))]
    except IndexError as e:
        print e, code
    return string
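# Round-trip check of the fixed-radix encoding (illustrative only): with the
# per-position alphabets 'ABC' and '0123', 'B2' encodes to 1*1 + 2*3 = 7.
# >>> encode('B2', ['ABC', '0123'])
# 7
# >>> decode(7, ['ABC', '0123'])
# ['B', '2']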
def _extractAnnotFields(line, columns = (1,2,3,4,6,8)):
""" Extract appropriate details from annotation file. This typically follows:
1. DB
Database from which entry has been taken.
Example: PDB
2. DB_Object_ID
A unique identifier in the DB for the item being annotated.
Here: PDB ID and chain ID of the PDB entry.
Example: 2EKB_A
3. DB_Object_Symbol
Here: PDB ID and chain ID of the PDB entry.
        Example: 2EKB_A
4. Qualifiers
This column is used for flags that modify the interpretation of an annotation.
This field may be equal to: NOT, colocalizes_with, contributes_to,
NOT|contributes_to, NOT|colocalizes_with
Example: NOT
5. GO Identifier
The GO identifier for the term attributed to the DB_Object_ID.
Example: GO:0005625
6. DB:Reference
A single reference cited to support an annotation.
Where an annotation cannot reference a paper, this field will contain
a GO_REF identifier. See section 8 and
http://www.geneontology.org/doc/GO.references
for an explanation of the reference types used.
Example: PMID:9058808
7. Evidence
One of either EXP, IMP, IC, IGI, IPI, ISS, IDA, IEP, IEA, TAS, NAS,
NR, ND or RCA.
Example: TAS
9. Aspect
One of the three ontologies: P (biological process), F (molecular function)
or C (cellular component).
Example: P
        In columns, specify the index (0-based, NOT 1-based) of the gene, symb, qual, term, evid and onto fields.
"""
fields = line.strip().split('\t')
gene = fields[columns[0]]
symb = fields[columns[1]]
qual = fields[columns[2]]
term = fields[columns[3]]
    if not term.startswith('GO:'):
        term = None
        raise RuntimeError('No GO term on line: ' + line)
evid = fields[columns[4]]
    if evid not in evid_codes:
evid = None
onto = fields[columns[5]]
    if onto not in onto_codes:
onto = None
taxa_idx = line.find('taxon:')
if taxa_idx == -1:
taxa = None
else:
taxa = line[taxa_idx:]
taxa = taxa.split('\t')
taxa_spec = taxa[0].split(':')
taxa = int(taxa_spec[len(taxa_spec) - 1]) # pick last taxon ID
return (gene, symb, qual, term, evid, onto, taxa)
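# Example (illustrative only; fields are tab-separated in real GAF files):
#   UniProtKB  P63166  SUMO1  <qual>  GO:0005634  PMID:9058808  IDA  ...  C  ...  taxon:10090
# With the default columns (1,2,3,4,6,8) this would be parsed as
#   gene='P63166', symb='SUMO1', term='GO:0005634', evid='IDA', onto='C', taxa=10090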
def readOBOFile(obofile):
"""
http://www.geneontology.org/GO.format.obo-1_2.shtml
"""
src = open(obofile, 'r')
terms = {}
in_term_def = False
in_type_def = False
for line in src:
if in_term_def:
if line.startswith('id: '):
term_id = line[4:14]
term_is = set()
elif line.startswith('name: '):
term_name = line[6:].strip()
elif line.startswith('def: '):
# Note this is a multi-line field, delimited by "'s
pass
elif line.startswith('namespace: '):
if line[11] == 'b': term_onto = 'P'
elif line[11] == 'm': term_onto = 'F'
elif line[11] == 'c': term_onto = 'C'
elif line.startswith('is_a: '):
term_is.add((line[6:16], 'is_a'))
elif line.startswith('relationship: '):
fields = line.split()
term_is.add((fields[2], fields[1]))
elif line.startswith('intersection_of: '):
fields = line.split()
if fields[1].startswith('GO:'):
term_is.add((fields[1], 'isect'))
else:
term_is.add((fields[2], fields[1]))
elif line.startswith('is_obsolete: '):
in_term_def = False # ignore this entry
if line.startswith('[Term]'):
if in_term_def: # already defining one, stash it before moving on to the next...
terms[term_id] = (term_name, term_onto, term_is)
elif in_type_def:
in_type_def = False
in_term_def = True
if line.startswith('[Typedef]'):
if in_term_def: # already defining one, stash it before moving on to the next...
in_term_def= False
in_type_def = True
if in_term_def: # defining one, stash it
terms[term_id] = (term_name, term_onto, term_is)
return terms
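# An abridged example of the [Term] stanza format parsed above:
#   [Term]
#   id: GO:0000001
#   name: mitochondrion inheritance
#   namespace: biological_process
#   is_a: GO:0048308 ! organelle inheritance
# which would be stored as
#   terms['GO:0000001'] = ('mitochondrion inheritance', 'P', set([('GO:0048308', 'is_a')]))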
def writeBitFile(annotFile, obofile, destFile, taxas = None):
print "Started at", time.asctime()
# open annotation file to analyse and index data
src = open(annotFile, 'r')
gene_index = [{} for _ in range(6)] # count different characters in different positions
term_index = [{} for _ in range(7)] # count different characters in different positions
evid_index = {}
gene_cnt = 0
cnt = 0
prev_gene = None
for line in src:
cnt += 1
#if cnt > 100000:
# break
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if taxas and not ((taxa == taxas) or (taxa in taxas)): # the gene does NOT belong to a nominated taxon
            continue
        if gene != prev_gene: # not the same gene as the previous line
            gene_cnt += 1
            try:
                evid_index[evid]
            except KeyError: # a new evidence code
                evid_index[evid] = len(evid_index)
pos = 0
for ch in gene[0:6]:
try:
gene_index[pos][ch]
except KeyError: # no match
gene_index[pos][ch] = len(gene_index[pos])
pos += 1
pos = 0
for ch in term[3:10]:
try:
term_index[pos][ch]
except KeyError: # no match
term_index[pos][ch] = len(term_index[pos])
pos += 1
prev_gene = gene
src.close()
print "Read annotations for %d genes" % gene_cnt
gene_code = ['' for _ in range(6)]
term_code = ['' for _ in range(7)]
for d in range(len(gene_index)):
arr = ['?' for _ in gene_index[d]]
for (ch, index) in gene_index[d].items():
arr[index] = ch
gene_code[d] = ''.join(arr)
for d in range(len(term_index)):
arr = ['?' for _ in term_index[d]]
        for (ch, index) in term_index[d].items():
arr[index] = ch
term_code[d] = ''.join(arr)
evid_code = ['' for _ in range(len(evid_index))]
for (e, ndx) in evid_index.items():
evid_code[ndx] = e
# Get GO definitions
terms = readOBOFile(obofile)
print "Read %d GO definitions" % len(terms)
# re-open, now with the aim of copying info
src = open(annotFile, 'r')
dst = open(destFile, 'w')
# STEP 1: header info
dst.write("%d\t%d\t%d\t%d\t%d\n" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index)))
for code_str in gene_code:
dst.write(code_str+"\n")
for code_str in term_code:
dst.write(code_str+"\n")
for e_str in evid_code:
dst.write(e_str+'\n')
print "Wrote header %d\t%d\t%d\t%d\t%d, now at @%d" % (gene_cnt, len(terms), len(gene_code), len(term_code), len(evid_index), dst.tell())
# STEP 2: write annotations
    annot_offset = dst.tell()
    prev_gene = None
    prev_taxa = None # taxon of the previous line's gene (written with its record)
    concat_terms = {}
cnt = 0
for line in src:
cnt += 1
#if cnt > 100000:
# break
if line.startswith('!'):
continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line)
        if taxas and not ((taxa == taxas) or (taxa in taxas)): # The gene does NOT belong to a nominated taxon
            continue
        if gene != prev_gene: # a new gene is found
            if prev_gene != None:
                # write data for the previous gene, using its own taxon
                s = pack('IIH', encode(prev_gene, gene_code), prev_taxa, len(concat_terms))
                dst.write(s)
                for t in concat_terms:
                    (o, q, e) = concat_terms[t]
                    s = pack('?BI', q, evid_index[e], encode(t, term_code))
                    dst.write(s)
            # re-init
            prev_gene = gene
            prev_taxa = taxa
            concat_terms = {}
        concat_terms[term[3:]] = (onto, qual, evid)
    if len(concat_terms) > 0:
        # write data for the last gene in the buffer
        s = pack('IIH', encode(prev_gene, gene_code), prev_taxa, len(concat_terms))
dst.write(s)
for t in concat_terms:
(o, q, e) = concat_terms[t]
s = pack('?BI', q, evid_index[e], encode(t, term_code))
dst.write(s)
print "Wrote GO annotations, now at @%d" % dst.tell()
# Next, the ontology definition...
obo_offset = dst.tell() # remember the position where the OBO starts
    sorted_terms = sorted(terms.items(), key=operator.itemgetter(0))
for [t, _] in sorted_terms:
(term_name, term_onto, term_is) = terms[t]
s = pack('IcH', encode(t[3:], term_code), term_onto, len(term_is))
dst.write(s)
for (sup_term, sup_rel) in term_is:
try:
index = onto_rel.index(sup_rel)
except ValueError:
index = 9
s = pack('BI', index, encode(sup_term[3:], term_code))
dst.write(s)
dst.write(term_name + '\n')
print "Wrote %d GO definitions, now at @%d" % (len(sorted_terms), dst.tell())
# Finally, write the offsets to quickly access annotations and definitions, resp
dst.write(pack('II', annot_offset, obo_offset))
# done, close
dst.close()
print "Completed at", time.asctime()
###################################################
# This module is a supplement to the Python guide #
# Version 2.2016.1 (8/3/2016) #
###################################################
'''
This module contains code that can help solve bioinformatics problems.
See the accompanying Python guide for more explanations and examples.
Alphabet is a class that defines valid symbols that we then use to make up valid
biological sequences. Note that we also define variables corresponding to
DNA, RNA and Protein sequences that can be used directly.
Sequence defines basic parts and operations on biological sequences.
Alignment defines an alignment of sequences (how symbols in different sequences line
up when placed on top of one another). Alignment methods should generate instances of this class.
SubstMatrix defines a substitution matrix, i.e. a scoring system for performing
alignments. You can read these from files or construct them manually.
GeneProfile defines parts and operations for gene expression profiles. Essentially,
the class will help to index expression data by gene name (rows) and by sample name (columns).
There are several methods not tied to a particular class because they construct new instances,
e.g. reading from file, retrieving from the internet, creating an alignment from sequences etc.
You need to have numpy installed (see http://www.numpy.org/).
Should work with Python v2.6-2.7 (see http://www.python.org/).
Has not been written to work with Python v3 and later--but this should be easy to do.
The code may contain bugs--please report to m.boden@uq.edu.au
'''
import math, numpy, urllib, urllib2
###############################################################################
# Alphabet #
###############################################################################
class Alphabet():
""" A minimal class for alphabets """
def __init__(self, symbolString):
self.symbols = symbolString
    def __len__(self): # implements the "len" operator, e.g. "len(Alphabet('XYZ'))" results in 3
return len(self.symbols)
def __contains__(self, sym): # implements the "in" operator, e.g. "'A' in Alphabet('ACGT')" results in True
return sym in self.symbols
def __iter__(self): # method that allows us to iterate over all symbols, e.g. "for sym in Alphabet('ACGT'): print sym" prints A, C, G and T on separate lines
tsyms = tuple(self.symbols)
return tsyms.__iter__()
def __getitem__(self, ndx):
""" Retrieve the symbol(s) at the specified index (or slice of indices) """
return self.symbols[ndx]
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
return self.symbols.index(sym)
def __str__(self):
return self.symbols
""" Below we declare alphabet variables that are going to be available when
this module is imported """
DNA_Alphabet = Alphabet('ACGT')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
###############################################################################
# Sequence #
###############################################################################
class Sequence():
""" A biological sequence class. Stores the sequence itself,
the alphabet and a name.
Usage:
>>> seq1 = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC')
>>> print seq1
ABC: ACGGGAGAGG
>>> 'C' in seq1
True
>>> for sym in seq1:
... print sym
"""
def __init__(self, sequence, alphabet, name = '', gappy = False, annot = ''):
""" Construct a sequence from a string, an alphabet (gappy or not) and a name.
The parameter gappy is for sequences when used in alignments, which means that '-' is allowed. """
for sym in sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: ' + sym)
self.sequence = sequence
self.alphabet = alphabet
self.name = name
self.gappy = gappy
self.annot = annot # some annotation, e.g. species
def __len__(self): # the "len" operator
return len(self.sequence)
def __iter__(self): # method that allows us to iterate over a sequence
tsyms = tuple(self.sequence)
return tsyms.__iter__()
def __contains__(self, item): # test for membership (the "in" operator)
for sym in self.sequence:
if sym == item:
return True
return False
def __getitem__(self, ndx): # [ndx] operator (retrieve a specified index (or a "slice" of indices) of the sequence data.
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.annot + '\n'
data = self.sequence
nlines = (len(self.sequence) - 1) / 60 + 1
for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
return fasta
def __str__(self): # "pretty" print sequence
str = self.name + ': '
for sym in self:
str += sym
return str
def count(self, findme):
""" Get the number of occurrences of specified symbol """
cnt = 0
for sym in self.sequence:
if findme == sym:
cnt = cnt + 1
return cnt
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.find(findme)
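def _demo_writeFasta():
    """ A small sketch (not called by this module) of FASTA output. """
    seq = Sequence('ACGGGAGAGG', DNA_Alphabet, 'ABC', annot = 'demo')
    print seq.writeFasta() # ">ABC demo" followed by the sequence data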
###############################################################################
# Alignment #
###############################################################################
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'. The number of columns in the alignment is given by alignlen.
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
THIS-LI-NE-
--ISALIGNED """
def __init__(self, seqs):
self.alphabet = None
self.alignlen = -1
self.seqs = seqs
self.namelen = 0
for s in seqs:
if self.alphabet == None:
self.alphabet = s.alphabet
elif self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: contains a mix of alphabets")
if self.alignlen == -1:
self.alignlen = len(s)
elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: lengths vary")
self.namelen = max(len(s.name), self.namelen)
def __str__(self):
string = ''
for seq in self.seqs:
string += seq.name.ljust(self.namelen+1)
for sym in seq:
string += sym
string += '\n'
return string
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment: the number of sequences. """
return len(self.seqs)
def __getitem__(self, ndx):
return self.seqs[ndx]
def calcDistances(self, measure, a=1.0):
""" Calculate the evolutionary distance between all pairs of sequences
in this alignment, using the given measure. Measure can be one of
'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
given, then the parameter a must also be specified (or else it will use
the default value of 1.0).
Definitions of each distance metric are found in Zvelebil and Baum p268-276.
These are mostly intended for DNA, but adapted for protein (as below).
Note however that there are alternative distance matrices for proteins (p276).
"""
measure = measure.lower()
if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
a = float(a)
distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
# Loop through each pair of sequences
for i in range(len(self.seqs)):
for j in range(i + 1, len(self.seqs)):
seqA = self.seqs[i]
seqB = self.seqs[j]
# Calculate the fractional distance (p) first
# The two sequences of interest are in seqA and seqB
L = 0
D = 0
for k in range(self.alignlen):
# For every non-gapped column, put to L
# For every non-gapped column where the sequences are
# different, put to D
if seqA[k] != '-' and seqB[k] != '-':
L += 1
if seqA[k] != seqB[k]:
D += 1
p = float(D)/L
# Now calculate the specified measure based on p
if measure == 'fractional':
dist = p
else:
raise RuntimeError('Not implemented: %s' % measure)
distmat[i, j] = distmat[j, i] = dist
return distmat
def writeClustal(self):
""" Write the alignment to a string using the Clustal file format. """
symbolsPerLine = 60
maxNameLength = self.namelen + 1
mystring = ''
wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows):
for j in range(len(self.seqs)):
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
mystring += '\n'
# Possible last row
lastRowLength = self.alignlen - wholeRows*symbolsPerLine
if lastRowLength > 0:
for j in range(len(self.seqs)):
if maxNameLength > 0:
mystring += self.seqs[j].name.ljust(maxNameLength) + ' '
mystring += self.seqs[j][-lastRowLength:] + '\n'
return mystring
def writeHTML(self, filename):
""" Generate HTML that displays the alignment in colour.
"""
fh = open(filename, 'w')
fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += str(i/10+1)[-1]
else:
html += ' '
html += '%s\n' % (self.alignlen)
fh.write(html)
if self.alignlen > 10:
html = ''.ljust(self.namelen) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0'
else:
html += ' '
html += '\n'
fh.write(html)
if len(self.alphabet) <= 5: # DNA or RNA
colours = {'A':'green','C':'orange','G':'red','T':'#66bbff','U':'#66bbff'}
else: # amino acids
colours = {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'}
for seq in self.seqs:
html = seq.name.ljust(self.namelen) + ' '
for sym in seq:
try:
colour = colours[sym]
except KeyError:
colour = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (colour, sym)
html += '\n'
fh.write(html)
fh.write('</pre></body></html>\n')
fh.close()
def scoreAlignment(self, substmat = None, gap = -1):
"""Score the alignment using a substitution matrix (substmat).
If the alignment consists of more than two sequences, the minimum
score of each column is used.
If substmat is not specified (None), the count of matches is returned.
"""
nseqs = len(self.seqs)
total = 0
for pos in range(self.alignlen):
            minscore = None
for i in range(nseqs):
for j in range(i+1, nseqs):
gap_here = self.seqs[i][pos] == '-' or self.seqs[j][pos] == '-'
score = 0
if substmat == None:
if self.seqs[i][pos] == self.seqs[j][pos]:
score = 1
else: # we have a substitution matrix
if gap_here:
score = gap
else:
score = substmat.get(self.seqs[i][pos], self.seqs[j][pos])
                    if minscore == None:
                        minscore = score
                    elif minscore > score:
                        minscore = score
            total += minscore
return total
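def _demo_scoreAlignment():
    """ A minimal sketch (not called by this module): score a toy alignment
        by match count (no substitution matrix supplied). """
    seqs = [Sequence('AC-GT', DNA_Alphabet, 'a', gappy = True),
            Sequence('ACAGT', DNA_Alphabet, 'b', gappy = True)]
    print Alignment(seqs).scoreAlignment() # 4 matching columns score 1 each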
###############################################################################
# Methods to create instances of Alignment #
###############################################################################
def align(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Needleman-Wunsch
(global) algorithm. substMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
stringA, stringB = seqA.sequence, seqB.sequence
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S) and a matrix for traceback
S = numpy.zeros((lenA + 1, lenB + 1))
Traceback = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = i * gap
for j in range(lenB + 1):
S[0, j] = j * gap
# Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(stringA[i-1], stringB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
Traceback[i, j] = numpy.argmax([match, delete, insert])
S[i, j] = max([match, delete, insert])
# Trace back the optimal alignment
alignA = ''
alignB = ''
# Start at the end
i = lenA
j = lenB
# Stop when we hit the end of a sequence
while i > 0 and j > 0:
if Traceback[i, j] == 1:
# Got here by a gap in sequence B (go up)
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif Traceback[i, j] == 2:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = stringB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = stringA[i-1] + alignA
alignB = stringB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., trace back all the way to S[0, 0])
while i > 0:
# Go up
alignA = stringA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = stringB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
###############################################################################
# SubstMatrix #
###############################################################################
class SubstMatrix():
""" Create a substitution matrix for an alphabet.
Example usage:
>>> sm = SubstMatrix(DNA_Alphabet)
>>> for a in DNA_Alphabet:
... for b in DNA_Alphabet:
... if a > b:
... sm.set(a, b, -1)
... elif a == b:
... sm.set(a, b, +1)
...
>>> print sm
A 1
C -1 1
G -1 -1 1
T -1 -1 -1 1
A C G T
>>> sm.get('C', 'T')
-1
"""
def __init__(self, alphabet, scoremat = None):
self.scoremat = scoremat or {} # start with empty dictionary
self.alphabet = alphabet
def _getkey(self, sym1, sym2):
""" Construct canonical (unordered) key for two symbols """
if sym1 <= sym2:
return tuple([sym1, sym2])
else:
return tuple([sym2, sym1])
def set(self, sym1, sym2, score):
""" Add a score to the substitution matrix """
self.scoremat[self._getkey(sym1, sym2)] = score
def get(self, sym1, sym2):
return self.scoremat[self._getkey(sym1, sym2)]
def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols)
string = ''
for a in symbols:
string += a + ' '
for b in symbols[:len(symbols)-i+1]:
score = self.scoremat[self._getkey(a, b)]
if score != None:
string += str(score).rjust(3) + ' '
else:
string += "?".rjust(3) + ' '
string += '\n'
i -= 1
string += ' ' + ' '.join(self.alphabet.symbols)
return string
def writeFile(self, filename):
""" Write this substitution matrix to the given file. """
fh = open(filename, 'w')
        contents = ''
        for key in self.scoremat:
            contents += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
        fh.write(contents)
fh.close()
###############################################################################
# Below are some useful methods for loading data from strings and files. #
# They recognize the FASTA and Clustal formats (nothing fancy). #
###############################################################################
def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet)
fh = open(filename, 'r')
data = fh.read()
fh.close()
lines = data.splitlines()
for line in lines:
if len(line.strip()) == 0:
continue
symbols, score = line.split(':')
score = int(score)
mat.set(symbols[0], symbols[1], score)
return mat
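def _demo_substmatrix_file():
    """ A minimal sketch (not called by this module): write a substitution
        matrix to a file and read it back. 'demo.matrix' is a made-up name. """
    sm = SubstMatrix(DNA_Alphabet)
    for a in DNA_Alphabet:
        for b in DNA_Alphabet:
            sm.set(a, b, +1 if a == b else -1)
    sm.writeFile('demo.matrix')
    sm2 = readSubstMatrix('demo.matrix', DNA_Alphabet)
    print sm2.get('A', 'A'), sm2.get('A', 'C') # 1 -1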
def readFastaString(string, alphabet, gappy = False):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it. """
seqlist = [] # list of sequences contained in the string
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines
continue
if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current
current = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
seqlist.append(current)
# now collect data about the new sequence
parts = line[1:].split() # skip first char
seqname = '' # name of *current* sequence
seqannot = '' # annotation of *current* sequence
if len(parts) > 0: seqname = parts[0]
if len(parts) > 1: seqannot = line[len(seqname) + 2:] # the rest of the line
seqdata = []
else: # we assume this is (more) data for current
cleanline = line.split()
for thisline in cleanline:
seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains
if seqname:
lastseq = Sequence(''.join(seqdata), alphabet, seqname, gappy, seqannot)
seqlist.append(lastseq)
return seqlist
def readFastaFile(filename, alphabet, gappy = False):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. """
fh = open(filename)
data = fh.read()
fh.close()
seqlist = readFastaString(data, alphabet, gappy)
return seqlist
def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """
fh = open(filename, 'w')
for seq in seqs:
fh.write(seq.writeFasta())
fh.close()
def readClustalString(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an
Alignment object. """
seqs = {} # sequence data
for line in string.splitlines():
if line.startswith('CLUSTAL') or line.startswith('STOCKHOLM') \
or line.startswith('#'):
continue
if len(line.strip()) == 0:
continue
if line[0] == ' ' or '*' in line or ':' in line:
continue
sections = line.split()
name, seq = sections[0:2]
        if name in seqs:
seqs[name] += seq
else:
seqs[name] = seq
sequences = []
for name, seq in seqs.items():
sequences.append(Sequence(seq, alphabet, name, gappy = True))
return Alignment(sequences)
def readClustalFile(filename, alphabet):
""" Read a ClustalW2 alignment file and return an Alignment object
containing the alignment. """
fh = open(filename)
data = fh.read()
fh.close()
aln = readClustalString(data, alphabet)
return aln
def writeClustalFile(filename, aln):
""" Write the specified alignment to a Clustal file. """
fh = open(filename, 'w')
fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
fh.write(aln.writeClustal())
fh.close()
###############################################################################
# GeneProfile #
###############################################################################
class GeneProfile():
""" A class for gene expression data.
Example usage:
>>> gp = GeneProfile('MyMicroarray', ['Exp1', 'Exp2'])
>>> gp['gene1'] = [0.1, 0.5]
>>> gp['gene2'] = [2, 1]
>>> gp.getSample('Exp2')
{'gene1': [0.5], 'gene2': [1.0]}
"""
def __init__(self, dataset_name='', sample_names=[], profiles = None):
""" Create a gene profile set. """
self.name = dataset_name
self.samples = sample_names
self.genes = profiles or {} # dictionary for storing all gene--measurement pairs
def __setitem__(self, name, probevalues):
if len(probevalues) == len(self.samples):
self.genes[name] = [float(y) for y in probevalues]
else:
raise RuntimeError('Invalid number of measurements for probe ' + name)
def __getitem__(self, name):
return self.genes[name]
def getSorted(self, index, descending=True):
"""Get a list of (gene, value) tuples in descending order by value"""
key_fn = lambda v: v[1][index]
return sorted(self.genes.items(), key=key_fn, reverse=descending)
def addSample(self, sample_name, sample_dict):
"""Add a sample to the current data set.
sample_dict is a dictionary with the same keys as the current gene set.
Only values for genes in the current set will be added. """
        self.samples.extend(sample_name) # note: sample_name is expected to be a list of sample names
if not self.genes:
self.genes = sample_dict
else:
for gene in self.genes:
values = sample_dict[gene]
if values:
self.genes[gene].extend([float(y) for y in values])
else:
self.genes[gene].extend([0.0 for _ in sample_name])
return self.genes
def getSample(self, sample_name):
"""Construct a gene dictionary including only named samples. """
mygenes = {}
if isinstance(sample_name, str): # a single sample-name
mysamples = [sample_name]
else: # a list of sample-names
mysamples = sample_name
for gene in self.genes:
mygenes[gene] = []
for name in mysamples:
mygenes[gene].append(self.genes[gene][self.samples.index(name)])
return mygenes
def getRatio(self, sample1, sample2):
"""Get the ratio of two samples in the data set. """
mygenes = {}
index1 = self.samples.index(sample1)
index2 = self.samples.index(sample2)
for gene in self.genes:
mygenes[gene] = []
mygenes[gene].append(self.genes[gene][index1] / self.genes[gene][index2])
return mygenes
    def __str__(self):
        """ Return the data as a truncated GEO SOFT formatted string. """
        line = '^DATASET = ' + self.name + '\n'
        line += '!dataset_table_begin\nID_REF\t'
        for header in self.samples:
            line += header + '\t'
        line += '\n'
        for gene in self.genes:
            line += gene + '\t'
            values = self.genes[gene]
            for value in values:
                line += format(value, '5.3f') + '\t'
            line += '\n'
        line += '!dataset_table_end\n'
        return line
def writeGeoFile(self, filename):
fh = open(filename, 'w')
fh.write(str(self))
fh.close()
def getLog(genedict, base=2):
"""Get the log-transformed value of a sample/column. """
mygenes = {}
for gene in genedict:
mygenes[gene] = []
for sample in genedict[gene]:
mygenes[gene].append(math.log(sample, base))
return mygenes
def readGeoFile(filename, id_column = 0):
""" Read a Gene Expression Omnibus SOFT file. """
dataset = None
fh = open(filename, "rU")
manylines = fh.read()
fh.close()
data_rows = False # Indicates whether we're reading the data section or metadata
name = 'Unknown'
cnt_data = 0
for line in manylines.splitlines():
if line.startswith('^DATASET'):
name = line.split('= ')[1]
continue
        if line.startswith('!dataset_table_begin'):
            data_rows = True
        if line.startswith('!dataset_table_end'):
            data_rows = False
if len(line.strip()) == 0 or line.startswith('!') or line.startswith('#') or line.startswith('^'):
continue
if data_rows:
cnt_data += 1
if (cnt_data == 1): # First line contains the headers
headers = line.split('\t')
dataset = GeneProfile(name, headers[2:]) # Create the data set
continue
ignore = (dataset == None) # ignore the row if the dataset is not initialised
id = line.split('\t')[id_column]
values = []
cnt_word = 0
for word in line.split('\t'):
cnt_word += 1
if cnt_word <= (id_column + 1): # ignore the gene names
continue
if word == 'null':
ignore = True # ignore gene if a value is null
continue
            try:
                values.append(float(word))
            except ValueError: # skip values that cannot be parsed as floats
                continue
if not ignore:
dataset[id] = tuple(values)
print 'Data set %s contains %d genes' % (name, len(dataset.genes))
return dataset
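def _demo_geneprofile():
    """ A minimal sketch (not called by this module) combining GeneProfile
        operations: per-gene ratios between samples and log-transformed values. """
    gp = GeneProfile('Demo', ['Exp1', 'Exp2'])
    gp['gene1'] = [4.0, 2.0]
    gp['gene2'] = [1.0, 8.0]
    print gp.getRatio('Exp1', 'Exp2')  # {'gene1': [2.0], 'gene2': [0.125]}
    print getLog(gp.getSample('Exp2')) # {'gene1': [1.0], 'gene2': [3.0]}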
###############################################################################
# Web service methods that find data in online databases.
# Our implementations are mainly serviced by EBI.
###############################################################################
def getSequence(entryId, dbName, alphabet):
""" Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' (Uniprot Accession) or 'SUMO1_MOUSE' (Uniprot Identifier)
dbName: name of db e.g. 'uniprotkb', 'pdb' or 'refseqn'.
See: http://www.uniprot.org/faq/28. """
url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db=' +\
dbName + '&format=fasta&id=' + entryId
try:
data = urllib2.urlopen(url).read()
return readFastaString(data, alphabet)[0]
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def searchSequences(query, dbName = 'uniprot'):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
url = 'http://www.uniprot.org/' + dbName + '/?format=list&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
return data.splitlines()
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
return words
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
return
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC'):
"""
Map identifiers between databases (based on UniProtKB;
see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the abbreviation for the identifier FROM which to idmap
to: the abbreviation for the identifier TO which to idmap
Returns a dictionary with key (from) -> value (to).
ACC is Uniprot Accession (e.g. 'P42813').
"""
url = 'http://www.uniprot.org/mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : 'tab',
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
d[pair[0]] = pair[1]
return d
else:
return dict()
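# Example usage (not run here; requires network access). The returned
# dictionary maps each input identifier to its counterpart, e.g.
#   >>> idmap(['P63166'], frm = 'ACC', to = 'P_REFSEQ_AC')
# would map the UniProt accession to a RefSeq protein identifier.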
###############################################################################
# Gene Ontology services.
# See http://www.ebi.ac.uk/QuickGO/WebServices.html for more info
###############################################################################
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = 'http://www.ebi.ac.uk/QuickGO/GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces
if field in entry.keys(): # check if we need field
if entry[field] == None: # check if assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, db='UniProtKB'):
"""
Retrieve all GO terms for a given set of genes (or single gene).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'.
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
    if not isinstance(genes, (list, set, tuple)):
        genes = [genes] # if 'genes' is a single gene, we make a single item list
    gomap = dict()
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&protein='
for gene in genes:
terms = set() # empty result set
url = uri + gene.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore header row
values = row.split('\t')
if len(values) >= 7:
terms.add(values[6]) # add term to result set
            gomap[gene] = list(terms) # make a list of the set
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
    if len(genes) == 1:
        return gomap[genes[0]]
    else:
        return gomap
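# Example usage (not run here; requires network access):
#   >>> getGOTerms('P63166')             # list of GO terms for one gene
#   >>> getGOTerms(['P63166', 'P63165']) # dictionary of gene -> list of terms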
def getGenes(goterms, db='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms
(or single GO term).
db: use specified database, e.g. 'UniProtKB', 'UniGene',
or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=gene name, value=list of unique
terms) OR in the case of a single gene as a list of unique terms.
"""
    if not isinstance(goterms, (list, set, tuple)):
        goterms = [goterms]
    gomap = dict()
if taxo == None:
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&term='
else:
uri = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&db='+db+'&tax='+\
str(taxo)+'&term='
for goterm in goterms:
genes = set() # start with empty result set
url = uri + goterm.strip() # Construct URL
try: # Get the entry: fill in the fields specified below
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1]) # add gene name to result set
            gomap[goterm] = list(genes)
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
    if len(goterms) == 1:
        return gomap[goterms[0]]
    else:
        return gomap
###############################################################################
# PhyloTree #
###############################################################################
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
    def __init__(self, root):
        """ Create a tree from a node that is "root" in the tree."""
        self.root = root
        self.aln = None # the alignment associated via putAlignment (if any)
def putAlignment(self, aln):
""" Associate the tree with a set of sequences/alignment.
Involves assigning the sequence to the leaf nodes. """
self.aln = aln
self.root._assignAlignment(aln)
def __str__(self):
""" Produce a printable representation of the tree, specifically the root of the tree. """
return str(self.root)
def strSequences(self, start = None, end = None):
""" Produce a sequence representation of the tree, specifically the root of the tree.
Specify the start and end positions in the alignment for the sequence to be printed
(if None the min and max positions will be used). """
if self.aln != None:
my_start = start or 0
my_end = end or self.aln.alignlen
return self.root._printSequences(my_start, my_end)
def findLabel(self, label):
""" Retrieve/return the node with the specified label.
Returns None if not found."""
return self.root._findLabel(label)
def getDescendantsOf(self, node, transitive = False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
transitive indicates if only the direct descendants (False) or if all descendants
should be returned.
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
def getAncestorsOf(self, node, transitive = False):
""" Retrieve and return the ancestor (transitive=False) or
ancestors (transitive=True) of a specified node.
Node can be the label or the instance.
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
myroot = myroot.right
if found and transitive:
return branching
            elif found and len(branching) > 0:
                return branching[-1]
return None
def parsimony(self):
""" Solve the "small parsimony problem",
i.e. find the sequences on each of the internal nodes.
See Jones and Pevzner, p. 368 and onwards, for details. """
self.root._forwardParsimony(self.aln) # setup and compute scores for all nodes
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
###############################################################################
# PhyloNode #
###############################################################################
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, label = ''):
""" Initialise an initially unlinked node.
Populate fields left and right to link it with other nodes.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, backleft and backright for maximum parsimony. """
self.left = None
self.right = None
self.data = None
self.label = label
self.dist = None
        self.sequence = None # the sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
self.seqscores = None # The scores propagated from leaves via children
self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = str(self.left)
if self.right:
right = str(self.right)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.label != None:
label = str(self.label)
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = self.left._printSequences(start, end)
if self.right:
right = self.right._printSequences(start, end)
if self.dist:
dist = ':' + str(self.dist)
if self.sequence != None:
label = "".join(self.sequence[start:end]) + ""
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _findLabel(self, label):
""" Find a node by label at this node or in any descendants (recursively). """
if self.label == label:
return self
else:
if self.left:
foundLeft = self.left._findLabel(label)
if foundLeft:
return foundLeft
if self.right:
return self.right._findLabel(label)
return None
def _propagateDistance(self, parent_dist):
""" Convert absolute distances to relative.
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
def _assignAlignment(self, aln):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self.sequence = None
if self.left != None:
self.left._assignAlignment(aln)
if self.right != None:
self.right._assignAlignment(aln)
for seq in aln.seqs:
if seq.name == self.label:
self.sequence = seq
break
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen):
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0
best_symb_right = 0
for a_left in range(len(aln.alphabet)):
score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a_left
best_score_left = score
for a_right in range(len(aln.alphabet)):
score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a_right
best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right
else:
self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
return self.seqscores
def _backwardParsimony(self, aln, seq = None):
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
self.sequence = Sequence(currbuf, aln.alphabet, self.label, gappy = True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
col += 1
self.left._backwardParsimony(aln, Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
self.right._backwardParsimony(aln, Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
return self.sequence
def getSequence(self):
""" Get the sequence for the node. Return None if no sequence is assigned.
Requires that an alignment is associated with the tree, and that sequence names match node labels.
If the explored node is not a leaf, the sequence can be determined by parsimony. """
if self.sequence != None: # a sequence has been assigned
return self.sequence
elif self.seqscores != None: # inferred by parsimony but not yet assigned
            return None # determine the most parsimonious sequence; not yet implemented
def isAncestorOf(self, node, transitive = True):
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
        elif transitive:
            if self.left:
                statusLeft = self.left.isAncestorOf(node, transitive)
                if statusLeft: return True
            if self.right:
                return self.right.isAncestorOf(node, transitive)
            return False
        else:
            return False
def getDescendants(self, transitive = False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
if not transitive:
return children
else:
grandchildren = []
for c in children:
d = c.getDescendants(transitive)
if d:
grandchildren.extend(d)
children.extend(grandchildren)
return children
###############################################################################
# Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
# Methods for processing files of trees on the Newick format
###############################################################################
def runUPGMA(aln, measure, absoluteDistances = False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned."""
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single one below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
            if closest_dist == None or dist < closest_dist:
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
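def _demo_upgma():
    """ A minimal sketch (not called by this module): cluster three aligned
        sequences into a rooted tree using UPGMA and fractional distances. """
    seqs = [Sequence('ACGT', DNA_Alphabet, 'a', gappy = True),
            Sequence('ACGA', DNA_Alphabet, 'b', gappy = True),
            Sequence('TTGA', DNA_Alphabet, 'c', gappy = True)]
    tree = runUPGMA(Alignment(seqs), 'fractional')
    print tree # a Newick-style string; a and b should be grouped first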
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
def _findComma(string, level = 0):
""" Find first comma at specified level of embedding """
mylevel = 0
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
return i
return -1
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
y = tail.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def parseNewick(string):
""" Main method for parsing a Newick string into a (phylogenetic) tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns an instance of a PhyloTree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return PhyloTree(parseNewickNode(string))
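def _demo_parseNewick():
    """ A minimal sketch (not called by this module) of Newick parsing. """
    tree = parseNewick('((a:0.1,b:0.2)ab:0.3,c:0.4);')
    print tree.findLabel('ab') # prints the sub-tree rooted at node 'ab'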
def readNewickFile(filename):
""" Read file on Newick format.
Returns an instance of a PhyloTree."""
f = open(filename)
string = ''.join(f)
return parseNewick(string)
def writeNewickFile(filename, tree):
""" Write the specified tree to a Newick file. """
fh = open(filename, 'w')
    fh.write(str(tree))
fh.close()
###############################################################################
# Below is code that will be run if the module is "run", and not just "imported".
###############################################################################
if __name__=='__main__':
x = Sequence('ACTGA', DNA_Alphabet, 'x')
print "Sequence", x, "is constructed from the symbols", x.alphabet.symbols
print "( There are", x.count('A'), "occurrences of the symbol 'A' in", x.sequence, ")"
y = Sequence('TACGA', DNA_Alphabet, 'y')
print "Sequence", y, "is constructed from the symbols", y.alphabet.symbols
print
print "( The sub-sequence 'CG' starts at index", y.find('CG'), "of", y.sequence, ")"
print
sm = SubstMatrix(DNA_Alphabet)
for a in DNA_Alphabet:
for b in DNA_Alphabet:
if a==b:
sm.set(a, b, +2) # match
else:
sm.set(a, b, -1) # mismatch
print "Below is a substitution matrix for the alphabet", DNA_Alphabet.symbols
print sm
print
aln = align(x, y, sm, -2)
print "Below is the alignment between x and y"
print aln
import numpy
import numpy.random
import math
import random
class NN():
"""
A basic implementation of a standard, multi-layer, feed-forward neural network
and back-propagation learning.
"""
def __init__(self, nInput, nHidden, nOutput):
""" Constructs a neural network and initializes its weights to small random values.
nInput Number of input nodes
nHidden Number of hidden nodes
nOutput Number of output nodes
"""
self.ninput = nInput
self.hidden = numpy.empty(nHidden) # hidden nodes
self.output = numpy.empty(nOutput) # output nodes
self.w_hid = numpy.random.randn(nHidden, nInput) # weights in -> hid
self.b_hid = numpy.random.randn(nHidden) # biases hidden layer
self.w_out = numpy.random.randn(nOutput, nHidden) # weights hid -> out
self.b_out = numpy.random.randn(nOutput) # biases output layer
print "Constructed NN with %d inputs, %d hidden and %d output nodes." % (self.ninput, len(self.hidden), len(self.output))
def writeFile(self, filename):
""" Save NN to a file. """
f = open(filename, 'w')
f.write(str(self.ninput)+'\n')
f.write(str(len(self.hidden))+'\n')
f.write(str(len(self.output))+'\n')
for row in self.w_hid:
for w in row:
f.write(str(w)+'\n')
for b in self.b_hid:
f.write(str(b)+'\n')
for row in self.w_out:
for w in row:
f.write(str(w)+'\n')
for b in self.b_out:
f.write(str(b)+'\n')
f.close()
def _fLogistic(self, net):
""" The logistic output function.
Computes the output value of a node given the summed incoming activation,
values bounded between 0 and 1.
net: The summed incoming activation. """
return 1.0 / (1.0 + numpy.exp(-net))
def _fSoftmax(self, net):
""" The softmax output function.
Computes the output value of a node given the summed incoming activation,
values bounded between 0 and 1, where all add to 1.0.
net: The summed incoming activation for each output (must be the full layer). """
        tmp = numpy.exp(net)
        total = numpy.sum(tmp)
        out = tmp / total
        return out
def _fprimeLogistic(self, y):
""" The derivative of the logistic output function.
y: The value by which the gradient is determined.
returns the gradient at output y. """
return y * (1.0 - y)
def feedforward(self, input):
""" Computes the output values of the output nodes in the network given input values.
input: the one-dim array of input values
returns the one-dim array of computed output values. """
# compute the activation of each hidden node (depends on supplied input values)
self.hidden = self._fLogistic(self.w_hid.dot(input) + self.b_hid)
# compute the activation of each output node (depends on hidden node activations computed above)
if len(self.output) == 1:
self.output = self._fLogistic(self.w_out.dot(self.hidden) + self.b_out)
else:
self.output = self._fSoftmax(self.w_out.dot(self.hidden) + self.b_out)
return self.output
def test(self, inputs, targets):
""" Create a confusion matrix for all predictions with known target classes. """
cm = numpy.zeros((len(self.output), len(self.output))) # confusion matrix
for p in range(len(inputs)):
input = inputs[p]
target = targets[p]
# present the input and calculate the outputs
output = self.feedforward(input)
# which class?
c_targ = maxIndex(target)
c_pred = maxIndex(output)
cm[c_targ, c_pred] += 1
return cm
def train(self, input, target, eta = 0.1, niter = 1, shuffle = True):
""" Adapts weights in the network given the values that should appear at the output (target)
when the input has been presented. The procedure is known as error back-propagation.
This implementation is "online" rather than "batched", that is, the change is not based
on the gradient of the global error, merely the local, pattern-specific error.
input: The input pattern(s): either a single one-dim array, or an array of patterns
target: The desired output values (one per pattern if multiple are given)
niter: The number of iterations (epochs) over the training patterns (default 1)
eta: The learning rate, always between 0 and 1, typically a small value (default 0.1)
shuffle: If true, input rows are shuffled before training (reduces bias imposed by order
in online training)
returns an error value (the root-mean-squared-error). """
try:
len(input[0])
multi_input = input
multi_targ = target
except TypeError:
multi_input = [ input ]
multi_targ = [ target ]
for i in range(niter):
mse = 0.0
entries = range(len(multi_input))
if shuffle:
random.shuffle(entries)
for p in entries:
input = multi_input[p]
target = multi_targ[p]
# present the input and calculate the outputs
self.feedforward(input)
# compute the error of output nodes (explicit target is available -- so quite simple)
# also, calculate the root-mean-squared-error to indicate progress
dif_out = (target - self.output)
if len(self.output) == 1:
err_out = dif_out * self._fprimeLogistic(self.output)
else:
err_out = dif_out #* self._fprimeSoftmax(self.output)
# compute the error of hidden nodes (indirect contribution to error at output layer)
err_hid = self.w_out.T.dot(err_out) * self._fprimeLogistic(self.hidden)
# change weights according to errors
self.w_out += numpy.outer(err_out, self.hidden) * eta
self.b_out += err_out * eta
self.w_hid += numpy.outer(err_hid, input) * eta
self.b_hid += err_hid * eta
if i == niter - 1: # last round
mse += float(numpy.mean(numpy.square(dif_out)))
return math.sqrt(mse / len(entries)) # Root of mean squared error (RMSE)
def readNNFile(filename):
""" Load a NN from a file. """
f = open(filename, 'r')
nInput = int(f.readline())
nHidden = int(f.readline())
nOutput = int(f.readline())
nn = NN(nInput, nHidden, nOutput)
for i in range(nHidden):
for j in range(nInput):
nn.w_hid[i, j] = float(f.readline())
for i in range(nHidden):
nn.b_hid[i] = float(f.readline())
for i in range(nOutput):
for j in range(nHidden):
nn.w_out[i, j] = float(f.readline())
for i in range(nOutput):
nn.b_out[i] = float(f.readline())
f.close()
return nn
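# A minimal usage sketch (not part of the original module): train the network above
# on the XOR problem. The data and names here are illustrative; convergence depends
# on the random initial weights, so more iterations or a re-run may be needed.
def _example_xor():
    nn = NN(2, 2, 1) # 2 inputs, 2 hidden nodes, 1 output
    inputs = numpy.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    targets = [0.0, 1.0, 1.0, 0.0]
    rmse = nn.train(inputs, targets, eta = 0.5, niter = 5000) # online back-propagation
    print "RMSE after training:", rmse
    for inp in inputs:
        print inp, "->", nn.feedforward(inp) # outputs should approach the targets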
def maxIndex(output):
    """ Figure out the index of the largest value in the specified array/list. """
    if len(output) > 1: # multi-class
        best = 0
        for i in range(len(output)):
            if output[i] > output[best]:
                best = i
    else: # two-class, single output 0/1
        best = int(round(output[0]))
    return best
def Qk(cm, alpha):
""" Compute the Q accuracy from a confusion matrix (see test method above) """
Q = {}
for a in alpha:
i = alpha.index(a)
Q[a] = (cm[i, i] / numpy.sum(cm[i])) * 100
tp = 0; pos = 0
for a in alpha:
i = alpha.index(a)
tp += cm[i, i]
pos += sum(cm[i])
return (float(tp) / float(pos)) * 100, Q
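# A small illustrative check (hand-made numbers, not from any data set): Qk on a
# two-class confusion matrix of the kind produced by NN.test above, with rows as
# target classes and columns as predicted classes.
def _example_qk():
    cm = numpy.array([[8.0, 2.0],  # 8 of 10 'A's predicted correctly
                      [1.0, 9.0]]) # 9 of 10 'B's predicted correctly
    overall, Q = Qk(cm, ['A', 'B'])
    print "Overall accuracy: %.1f%%" % overall # (8+9)/20 = 85.0%
    print "Per-class accuracy:", Q             # {'A': 80.0, 'B': 90.0}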
def readDenseDataFile(filename):
""" Read data from file for training a neural network.
The file follows the "dense" row-format:
<i1> <i2> ... <im> | <o1> ... <on>
where ix are m input values and ox are n output values """
# first check format
ninputs = None
noutputs = None
nexamples = 0
f = open(filename)
cnt = 0
for row in f:
cnt += 1
inp, outp = row.split('|')
indata = [ float(token) for token in inp.split() ]
if ninputs:
if len(indata) != ninputs:
raise RuntimeError('Error reading file: Invalid input at row %d' % cnt)
ninputs = len(indata)
outdata = [ float(token) for token in outp.split() ]
if noutputs:
if len(outdata) != noutputs:
raise RuntimeError('Error reading file: Invalid output at row %d' % cnt)
noutputs = len(outdata)
f.close()
nexamples = cnt
inm = numpy.zeros((nexamples, ninputs))
outm = numpy.zeros((nexamples, noutputs))
f = open(filename)
cnt = 0
for row in f:
inp, outp = row.split('|')
inm[cnt] = [ float(token) for token in inp.split() ]
outm[cnt] = [ float(token) for token in outp.split() ]
cnt += 1
f.close()
return inm, outm
def fGaussian(x, mu = 0.0, sigma2 = 1.0):
""" Gaussian PDF for numpy arrays """
num = (x - mu) ** 2
den = 2 * sigma2
expon = numpy.exp(-num/den)
return expon / numpy.sqrt(2.0 * numpy.pi * sigma2)
class KMeans():
"""
K-means clustering is a special case of Expectation-Maximization (EM).
    In K-means clustering we consider samples
    x1,...,xn labeled with z1,...,zn, with each xt a vector in R^D and zt \in {1,...,K}.
    In other words, zt is a class label, or cluster label, for the data point xt.
    We can define a K-means probability model as follows, where N(mu, I) denotes the
    D-dimensional Gaussian distribution with mean mu \in R^D and the
    identity covariance matrix:
        theta = <mu_1,...,mu_K>, mu_k \in R^D
        P(x1,...,xn, z1,...,zn) = PROD_t P(zt) P(xt|zt) = PROD_t (1/K) N(mu_zt, I)(xt)
    We now consider the optimization problem defined for this model:
        (mu_1,...,mu_K)* = argmin_{mu} min_z SUM_t || mu_zt - xt ||^2
The optimization problem defines K-means clustering (under quadratic distortion).
This problem is non-convex and in fact is NP-hard. The K-means algorithm is coordinate
descent applied to this objective and is equivalent to EM under the above probability
model. The K-means clustering algorithm can be written as follows where we specify a
typical initialization step.
1. Initialize mu_z to be equal to a randomly selected point xt.
2. Repeat the following until (z1, . . . zn) stops changing.
(a) zt := argmin_z || mu_z - xt || ^2
(b) Nz := |{t: zt = z}|
(c) mu_z := 1 / Nz SUM_t:zt=z xt
In words, the K-means algorithm first assigns a class center mu_z for each class z.
It then repeatedly classifies each point xt as belonging to the class whose center is
nearest xt and then recomputes the class centers to be the mean of the point placed in that class.
Because it is a coordinate descent algorithm, the sum of squares of the difference
between each point and its class center is reduced by each update. This implies that the
classification must eventually stabilize.
The procedure terminates when the class labels stop changing.
"""
def __init__(self, data):
""" Construct a K-means classifier using the provided data.
data: a two-dim numpy array, with one row corresponding to a data point.
If training is not performed, the provided data is used as the "means". """
assert len(data) > 0, "Data must be supplied"
self.data = data
self.means = data.copy()
self.samplelen = len(data[0])
self.vars = numpy.empty((len(data), self.samplelen))
def classify(self, sample):
assert len(sample) == self.samplelen, "Sample vector has invalid length: " + str(len(sample))
sqrdist = numpy.sum((self.means - sample) ** 2, 1)
return sqrdist.argmin(0)
def train(self, K):
data = self.data
N = len(data)
clusters = numpy.zeros((N, 1))
self.means = self.data[numpy.random.randint(N, size = K),:] # pick K random samples
while True:
previous = clusters.copy()
""" Compute cluster memberships GIVEN means """
kdist = numpy.empty((len(data), K))
for i in range(K):
kdist[:,i] = numpy.sum((data - self.means[i]) ** 2, 1)
clusters[:,0] = kdist.argmin(1)
nsame = numpy.sum(previous[:,0] == clusters[:,0])
if nsame == N:
break
""" Compute means GIVEN cluster memberships """
for i in range(K):
members = data[clusters[:,0] == i]
self.means[i] = members.mean(0) # mean over rows per column
def eucdist(v1, v2):
diff = 0
for i in range(len(v1)):
diff += (v1[i] - v2[i])**2
return math.sqrt(diff)
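# A minimal usage sketch of the K-means algorithm described above (made-up points):
# cluster six 2-D points into two groups. Initial means are picked at random, so
# cluster indices may swap between runs, and an unlucky draw may need a re-run.
def _example_kmeans():
    data = numpy.array([[0.0, 0.1], [0.2, 0.0], [0.1, 0.2],
                        [5.0, 5.1], [5.2, 4.9], [4.9, 5.0]])
    km = KMeans(data)
    km.train(2) # coordinate descent until cluster memberships stop changing
    print "Cluster means:", km.means
    print "[0, 0] belongs to cluster", km.classify(numpy.array([0.0, 0.0]))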
'''
Module with methods and classes for phylogeny.
@author: mikael
'''
import sequence # needed below: _backwardParsimony constructs sequence.Sequence objects
class PhyloTree:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of single, bifurcating rooted tree by UPGMA.
Known issues: Binary only; Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def __init__(self, root):
""" Create a tree from a node that is "root" in the tree."""
self.root = root
def putAlignment(self, aln):
""" Associate the tree with a set of sequences/alignment.
Involves assigning the sequence to the leaf nodes. """
self.aln = aln
self.root._assignAlignment(aln)
def __str__(self):
""" Produce a printable representation of the tree, specifically the root of the tree. """
return str(self.root)
def strSequences(self, start = None, end = None):
""" Produce a sequence representation of the tree, specifically the root of the tree.
Specify the start and end positions in the alignment for the sequence to be printed
(if None the min and max positions will be used). """
if self.aln != None:
my_start = start or 0
my_end = end or self.aln.alignlen
return self.root._printSequences(my_start, my_end)
def findLabel(self, label):
""" Retrieve/return the node with the specified label.
Returns None if not found."""
return self.root._findLabel(label)
def getDescendantsOf(self, node, transitive = False):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
transitive indicates if only the direct descendants (False) or if all descendants
should be returned.
If node does not exist, None is returned.
If node has no descendants, an empty list will be returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
return node.getDescendants(transitive)
return None
def getAncestorsOf(self, node, transitive = False):
""" Retrieve and return the ancestor (transitive=False) or
ancestors (transitive=True) of a specified node.
Node can be the label or the instance.
If node does not exist, None is returned.
If node is the root of the tree, None is returned."""
if not isinstance(node, PhyloNode):
            node = self.findLabel(node)
if node:
myroot = self.root
found = False
branching = []
while not found and myroot != None:
branching.append(myroot)
if myroot.left == node or myroot.right == node:
found = True
break
if myroot.left:
if myroot.left.isAncestorOf(node, transitive = True):
myroot = myroot.left
else: # must be right branch then...
myroot = myroot.right
else: # must be right branch then...
myroot = myroot.right
if found and transitive:
return branching
elif found and len(branching) > 0:
return branching[len(branching)-1]
return None
def parsimony(self):
""" Solve the "small parsimony problem",
i.e. find the sequences on each of the internal nodes.
See Jones and Pevzner, p. 368 and onwards, for details. """
self.root._forwardParsimony(self.aln) # setup and compute scores for all nodes
self.root._backwardParsimony(self.aln) # use scores to determine sequences
return self.root.getSequence() # return the sequence found at the root
class PhyloNode:
""" A class for a node in a rooted, binary (bifurcating) tree.
Contains pointers to descendants/daughters (left and right),
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def __init__(self, label = ''):
""" Initialise an initially unlinked node.
Populate fields left and right to link it with other nodes.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, backleft and backright for maximum parsimony. """
self.left = None
self.right = None
self.data = None
self.label = label
self.dist = None
        self.sequence = None # The sequence after an alignment has been mapped (leaf) or the most parsimonious sequence (ancestral)
self.seqscores = None # The scores propagated from leaves via children
self.backleft = None # Pointers back to left child: what symbol rendered current/parent symbols
self.backright = None # Pointers back to right child: what symbol rendered current/parent symbols
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = str(self.left)
if self.right:
right = str(self.right)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.label != None:
label = str(self.label)
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _printSequences(self, start, end):
""" Returns string with node (incl descendants) in a Newick style. """
left = right = label = dist = ''
if self.left:
left = self.left._printSequences(start, end)
if self.right:
right = self.right._printSequences(start, end)
if self.dist:
dist = ':' + str(self.dist)
if self.sequence != None:
label = "".join(self.sequence[start:end]) + ""
if not self.left and not self.right:
return label + dist
else:
return '(' + left + ',' + right + ')' + label + dist
else: # there is no label
if not self.left and self.right:
return ','+right
elif self.left and not self.right:
return left+','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
def _findLabel(self, label):
""" Find a node by label at this node or in any descendants (recursively). """
if self.label == label:
return self
else:
if self.left:
foundLeft = self.left._findLabel(label)
if foundLeft:
return foundLeft
if self.right:
return self.right._findLabel(label)
return None
def _propagateDistance(self, parent_dist):
""" Convert absolute distances to relative.
The only parameter is the absolute distance to the parent of this node. """
travelled = self.dist # absolute distance to this node
self.dist = parent_dist - self.dist # relative distance to this node
if self.left != None: # if there is a child node...
self.left._propagateDistance(travelled) # pass absolute distance to this node
if self.right != None:
self.right._propagateDistance(travelled)
def _assignAlignment(self, aln):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self.sequence = None
if self.left != None:
self.left._assignAlignment(aln)
if self.right != None:
self.right._assignAlignment(aln)
for seq in aln.seqs:
if seq.name == self.label:
self.sequence = seq
break
def _forwardParsimony(self, aln):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if self.sequence == None: # no sequence has been assigned
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
scoresleft = scoresright = None
if self.left != None:
scoresleft = self.left._forwardParsimony(aln)
if self.right != None:
scoresright = self.right._forwardParsimony(aln)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
#Project "Substitution weights" should focus on this line of code
self.seqscores = [[0 for _ in aln.alphabet] for col in range(aln.alignlen)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self.backleft = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen):
for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0
best_symb_right = 0
for a_left in range(len(aln.alphabet)):
score = (scoresleft[col][a_left] + (1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left:
best_symb_left = a_left
best_score_left = score
for a_right in range(len(aln.alphabet)):
score = (scoresright[col][a_right] + (1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right:
best_symb_right = a_right
best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right
else:
self.seqscores = [[0 if a==sym else 999999 for a in aln.alphabet] for sym in self.sequence] # if we want to weight scores, this would need to change
return self.seqscores
def _backwardParsimony(self, aln, seq = None):
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if self.sequence == None: # no sequence has been assigned
leftbuf = []
rightbuf = []
if self.left == None and self.right == None: # no children, so terminal, cannot propagate scores
raise RuntimeError("No sequence assigned to leaf node:", self.label)
if seq == None: # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf = []
for col in range(aln.alignlen):
min_score = 999999
min_symb = None
left_symb = None
right_symb = None
for a_parent in range(len(aln.alphabet)):
if self.seqscores[col][a_parent] < min_score:
min_score = self.seqscores[col][a_parent]
min_symb = a_parent
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
currbuf.append(aln.alphabet[min_symb])
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy = True)
else: # Non-root, but not leaf
self.sequence = seq
col = 0
for sym_parent in self.sequence:
a_parent = aln.alphabet.index(sym_parent)
left_symb = self.backleft[col][a_parent]
right_symb = self.backright[col][a_parent]
leftbuf.append(aln.alphabet[left_symb])
rightbuf.append(aln.alphabet[right_symb])
col += 1
self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy = True))
self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy = True))
return self.sequence
def getSequence(self):
""" Get the sequence for the node. Return None if no sequence is assigned.
Requires that an alignment is associated with the tree, and that sequence names match node labels.
If the explored node is not a leaf, the sequence can be determined by parsimony. """
if self.sequence != None: # a sequence has been assigned
return self.sequence
elif self.seqscores != None: # inferred by parsimony but not yet assigned
            return None # determine the most parsimonious sequence, not yet implemented
def isAncestorOf(self, node, transitive = True):
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if node == self.left or node == self.right:
return True
elif transitive:
if self.left:
statusLeft = self.left.isAncestorOf(node, transitive)
if statusLeft: return True
if self.right:
return self.right.isAncestorOf(node, transitive)
else:
return False
def getDescendants(self, transitive = False):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children = []
if self.left:
children.append(self.left)
if self.right:
children.append(self.right)
if not transitive:
return children
else:
grandchildren = []
for c in children:
d = c.getDescendants(transitive)
if d:
grandchildren.extend(d)
children.extend(grandchildren)
return children
""" ----------------------------------------------------------------------------------------
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def runUPGMA(aln, measure, absoluteDistances = False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned."""
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
nodes = [PhyloNode(seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
nodes[i].sequence = aln.seqs[i]
nodes[i].dist = 0.0
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[_getkey(nodes[i], nodes[j])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single one below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
for pair in D: # check all pairs which should be merged
dist = D[pair]
if dist < closest_dist or closest_dist == None:
closest_dist = dist
closest_pair = pair
# So we know the closest, now we need to merge...
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(_getkey(x, y)) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(_getkey(x, w)) # retrieve and remove distance from D: x to w
dyw = D.pop(_getkey(y, w)) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[_getkey(z, w)] = dz[w]# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z.left = x # link the phylogenetic tree
z.right = y
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
y._propagateDistance(z.dist) # convert absolute distances to relative by recursing down right path
z.dist = 0.0 # root z is at distance 0 from merged x and y
return PhyloTree(z) # make it to tree, return
def _getkey(node1, node2):
""" Construct canonical (unordered) key for two symbols """
if node1 <= node2:
return tuple([node1, node2])
else:
return tuple([node2, node1])
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def _findComma(string, level = 0):
""" Find first comma at specified level of embedding """
mylevel = 0
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
return i
return -1
def parseNewickNode(string):
""" Utility function that recursively parses embedded string using Newick format. """
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
node = PhyloNode(y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
comma = _findComma(embed)
if comma == -1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
left = embed[0:comma].strip()
right = embed[comma + 1:].strip()
y = tail.split(':')
node = PhyloNode(y[0]) #node is an instance of the PhyloNode() class
if len(y) >= 2:
node.dist = float(y[1])
node.left = parseNewickNode(left)
node.right = parseNewickNode(right)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def parseNewick(string):
""" Main method for parsing a Newick string into a (phylogenetic) tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns an instance of a PhyloTree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return PhyloTree(parseNewickNode(string))
def readNewick(filename):
""" Read file on Newick format.
Returns an instance of a PhyloTree."""
f = open(filename)
string = ''.join(f)
return parseNewick(string)
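# A short usage sketch: parse a made-up Newick string, print it back, and query
# node relationships. Labels and distances here are illustrative only.
def _example_newick():
    tree = parseNewick('((A:0.1,B:0.2)ab:0.3,C:0.5)root;')
    print tree                   # the tree, again in Newick style
    a = tree.findLabel('A')
    print tree.getAncestorsOf(a) # the direct ancestor, labelled "ab"
    print [n.label for n in tree.getDescendantsOf(tree.root, transitive = True)]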
'''
Module for classes and functions that are representing and processing basic probabilities.
Uses and depends on "Alphabet" that is used to define discrete random variables.
'''
import random
from sym import *
from copy import deepcopy
import math
#################################################################################################
# Generic utility functions
#################################################################################################
def _getMeTuple(alphas, key):
    """ Handy function that resolves which entries are being referred to in the case
        of written wildcards etc.
        Example y = _getMeTuple([DNA_Alphabet, Protein_Alphabet], '*R') gives y = (None, 'R')
        alphas: the alphabets
        key: the string or tuple that specifies entries (may include '*' and '-' signifying any symbol) """
    assert len(key) == len(alphas), "Entry invalid"
    if not type(key) is tuple:
        entries = []
        for ndx in range(len(alphas)):
            if key[ndx] == '*' or key[ndx] == '-':
                entries.append(None)
            else:
                entries.append(key[ndx])
        return tuple(entries)
    else:
        return key
#################################################################################################
# Distrib class
#################################################################################################
class Distrib():
""" A class for a discrete probability distribution, defined over a specified "Alphabet"
TODO: Fix pseudo counts
Exclude from counts, specify in constructor,
include only when computing probabilities by standard formula (n_a + pseudo_a * N^(1/2)) / (N + N^(1/2))
Exclude from filesaves, include with filereads (optional)
"""
def __init__(self, alpha, pseudo = 0.0):
""" Construct a new distribution for a specified alphabet, using an optional pseudo-count.
alpha: alphabet
pseudo: either a single "count" that applies to all symbols, OR a distribution/dictionary with counts.
"""
self.pseudo = pseudo or 0.0
self.alpha = alpha
self.cnt = [0.0 for _ in alpha]
try: # assume pseudo is a dictionary or a Distrib itself
self.tot = 0
symndx = 0
for sym in alpha:
cnt = float(pseudo[sym])
self.cnt[symndx] = cnt
self.tot = self.tot + cnt
symndx += 1
except TypeError: # assume pseudo is a single count for each symbol
self.cnt = [float(self.pseudo) for _ in alpha]
self.tot = float(self.pseudo) * len(alpha) # track total counts (for efficiency)
def observe(self, sym, cntme = 1.0):
""" Make an observation of a symbol
sym: symbol that is being observed
cntme: number/weight of observation (default is 1)
"""
ndx = self.alpha.symbols.index(sym)
self.cnt[ndx] = self.cnt[ndx] + cntme
self.tot = self.tot + cntme
return
def reset(self):
""" Re-set the counts of this distribution. Pseudo-counts are re-applied. """
try:
self.tot = 0
symndx = 0
for sym in self.alpha: # assume it is a Distribution
cnt = float(self.pseudo[sym])
self.cnt[symndx] = cnt
self.tot = self.tot + cnt
symndx += 1
except TypeError: # assume pseudo is a single count for each symbol
self.cnt = [float(self.pseudo) for _ in self.alpha]
self.tot = float(self.pseudo) * len(self.alpha) # track total counts (for efficiency)
def reduce(self, new_alpha):
""" Create new distribution from self, using (smaller) alphabet new_alpha. """
d = Distrib(new_alpha, self.pseudo)
for sym in new_alpha:
d.observe(sym, self.cnt[self.alpha.index(sym)])
return d
def count(self, sym = None):
""" Return the absolute count(s) of the distribution
or the count for a specified symbol. """
if sym != None:
ndx = self.alpha.symbols.index(sym)
return self.cnt[ndx]
else:
d = {}
index = 0
for a in self.alpha:
d[a] = self.cnt[index]
index += 1
return d
def add(self, distrib):
""" Add the counts for the provided distribution to the present. """
for i in range(len(self.cnt)):
cnt = distrib.count(self.alpha[i])
self.cnt[i] += cnt
self.tot += cnt
def subtract(self, distrib):
""" Subtract the counts for the provided distribution from the present. """
for i in range(len(self.cnt)):
cnt = distrib.count(self.alpha[i])
self.cnt[i] -= cnt
self.tot -= cnt
def getSymbols(self):
return self.alpha.symbols
def __getitem__(self, sym):
""" Retrieve the probability of a symbol (ascertained by counts incl pseudo-counts) """
if self.tot > 0.0:
return self.count(sym) / self.tot
else:
return 1.0 / len(self.alpha) # uniform
def prob(self, sym = None):
""" Retrieve the probability of a symbol OR the probabilities of all symbols
(listed in order of the alphabet index). """
if sym != None:
return self.__getitem__(sym)
elif self.tot > 0:
return [ s / self.tot for s in self.cnt ]
else:
return [ 1.0 / len(self.alpha) for _ in self.cnt ]
def __iter__(self):
        return iter(self.alpha)
def __str__(self):
""" Return a readable representation of the distribution """
str = '< '
for s in self.alpha:
str += (s + ("=%4.2f " % self[s]))
return str + ' >'
def swap(self, sym1, sym2):
""" Swap the entries for specified symbols. Useful for reverse complement etc.
Note that changes are made to the current instance. Use swapxcopy if you
want to leave this instance intact. """
sym1ndx = self.alpha.index(sym1)
sym2ndx = self.alpha.index(sym2)
tmpcnt = self.cnt[sym1ndx]
self.cnt[sym1ndx] = self.cnt[sym2ndx]
self.cnt[sym2ndx] = tmpcnt
def swapxcopy(self, sym1, sym2):
""" Create a new instance with swapped entries for specified symbols.
Useful for reverse complement etc.
Note that changes are NOT made to the current instance.
Use swap if you want to modify this instance. """
newdist = Distrib(self.alpha, self.count())
newdist.swap(sym1, sym2)
return newdist
def writeDistrib(self, filename = None):
""" Write the distribution to a file or string.
Note that the total number of counts is also saved, e.g.
* 1000 """
str = ''
for s in self.alpha:
str += (s + ("\t%f\n" % self[s]))
str += "*\t%d\n" % self.tot
if filename != None:
fh = open(filename, 'w')
fh.write(str)
fh.close()
return str
def generate(self):
""" Generate and return a symbol from the distribution using assigned probabilities. """
alpha = self.alpha
p = random.random() # get a random value between 0 and 1
q = 0.0
for sym in alpha: # pick a symbol with a frequency proportional to its probability
q = q + self[sym]
if p < q:
return sym
        return alpha[len(alpha)-1] # fallback for floating-point rounding: return the last symbol
def getmax(self):
""" Generate the symbol with the largest probability. """
maxprob = 0.0
maxsym = None
for sym in self.alpha:
if self[sym] > maxprob or maxprob == 0.0:
maxsym = sym
maxprob = self[sym]
return maxsym
def getsort(self):
""" Return the list of symbols, in order of their probability. """
symlist = [sym for (sym, _) in self.getProbsort()]
return symlist
def getProbsort(self):
""" Return the list of symbol-probability pairs, in order of their probability. """
s = [(sym, self.prob(sym)) for sym in self.alpha]
ss = sorted(s, key=lambda y: y[1], reverse=True)
return ss
def divergence(self, distrib2):
""" Calculate the Kullback-Leibler divergence between two discrete distributions.
Note that when self.prob(x) is 0, the divergence for x is 0.
When distrib2.prob(x) is 0, it is replaced by 0.0001.
"""
assert self.alpha == distrib2.alpha
        sum = 0.0
for sym in self.alpha:
if self[sym] > 0:
if distrib2[sym] > 0:
sum += math.log(self[sym] / distrib2[sym]) * self[sym]
else:
sum += math.log(self[sym] / 0.0001) * self[sym]
return sum
def entropy(self):
""" Calculate the information (Shannon) entropy of the distribution.
Note that the base is the size of the alphabet, so maximum entropy is by definition 1.
Also note that if the probability is exactly zero, it is replaced by a small value to
avoid numerical issues with the logarithm. """
sum = 0.0
base = len(self.alpha)
for sym in self.alpha:
p = self.__getitem__(sym)
if p == 0:
p = 0.000001
sum += p * math.log(p, base)
return -sum
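# A brief usage sketch (assumes DNA_Alphabet is provided by the sym module, as used
# elsewhere in this file): count symbols, then query the resulting probabilities.
def _example_distrib():
    d = Distrib(DNA_Alphabet, pseudo = 1.0) # one pseudo-count per symbol
    for s in 'ACGGGT':
        d.observe(s)
    print d          # < A=0.20 C=0.20 G=0.40 T=0.20 > with the pseudo-counts above
    print d['G']     # probability of 'G', here 0.4
    print d.getmax() # the most probable symbol, here 'G'
    print d.entropy()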
def writeDistribs(distribs, filename):
""" Write a list/set of distributions to a single file. """
str = ''
k = 0
for d in distribs:
str += "[%d]\n%s" % (k, d.writeDistrib())
k += 1
fh = open(filename, 'w')
fh.write(str)
fh.close()
def _readDistrib(linelist):
""" Extract distribution from a pre-processed list if strings. """
symstr = ''
d = {}
for line in linelist:
line = line.strip()
if len(line) == 0 or line.startswith('#'):
continue
sections = line.split()
sym, value = sections[0:2]
if len(sym) == 1:
if sym != '*':
symstr += sym
else:
raise RuntimeError("Invalid symbol in distribution: " + sym)
try:
d[sym] = float(value)
except ValueError:
raise RuntimeError("Invalid value in distribution for symbol " + sym + ": " + value)
if len(d) == 0:
return None
alpha = Alphabet(symstr)
if '*' in d.keys(): # tot provided
for sym in d:
if sym != '*':
d[sym] = d[sym] * d['*']
distrib = Distrib(alpha, d)
return distrib
def readDistribs(filename):
""" Load a list of distributions from file.
Note that if a row contains '* <number>' then it is assumed that each probability
associated with the specific distribution is based on <number> counts. """
fh = open(filename)
string = fh.read()
distlist = []
linelist = []
for line in string.splitlines():
line = line.strip()
if line.startswith('['):
if len(linelist) != 0:
distlist.append(_readDistrib(linelist))
linelist = []
elif len(line) == 0 or line.startswith('#'):
pass # comment or blank line --> ignore
else:
linelist.append(line)
# end for-loop, reading the file
if len(linelist) != 0:
distlist.append(_readDistrib(linelist))
fh.close()
return distlist
def readDistrib(filename):
""" Load a distribution from file.
Note that if a row contains '* <number>' then it is assumed that each probability
is based on <number> counts. """
dlist = readDistribs(filename)
if len(dlist) > 0: # if at least one distribution was in the file...
return dlist[0] # return the first
import re
def _readMultiCount(linelist, format = 'JASPAR'):
    """ Extract a list of Distribs from a pre-processed list of count rows (JASPAR formats). """
ncol = 0
symcount = {}
if format == 'JASPAR2010':
for line in linelist:
line = line.strip()
if len(line) > 0:
name = line.split()[0]
counts = []
for txt in re.findall(r'\w+', line):
try:
y = float(txt)
counts.append(y)
except ValueError:
pass # ignore non-numeric entries
if len(counts) != ncol and ncol != 0:
raise RuntimeError('Invalid row in file: ' + line)
ncol = len(counts)
if len(name) == 1: # proper symbol
symcount[name] = counts
alpha = Alphabet(''.join(symcount.keys()))
distribs = []
for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount])
distribs.append(Distrib(alpha, d))
elif format == 'JASPAR':
alpha_str = 'ACGT'
alpha = Alphabet(alpha_str)
cnt = 0
for sym in alpha_str:
line = linelist[cnt].strip()
counts = []
for txt in re.findall(r'\w+', line):
try:
y = float(txt)
counts.append(y)
except ValueError:
pass # ignore non-numeric entries
if len(counts) != ncol and ncol != 0:
raise RuntimeError('Invalid row in file: ' + line)
ncol = len(counts)
symcount[sym] = counts
cnt += 1
distribs = []
for col in range(ncol):
d = dict([(sym, symcount[sym][col]) for sym in symcount])
distribs.append(Distrib(alpha, d))
else:
raise RuntimeError('Unsupported format: ' + format)
return distribs
def readMultiCounts(filename, format = 'JASPAR'):
""" Read a file of raw counts for multiple distributions over the same set of symbols
for (possibly) multiple (named) entries.
filename: name of file
format: format of file, default is 'JASPAR' exemplified below
>MA0001.1 SEP4
0 3 79 40 66 48 65 11 65 0
94 75 4 3 1 2 5 2 3 3
1 0 3 4 1 0 5 3 28 88
2 19 11 50 29 47 22 81 1 6
returns a dictionary of Distrib's, key:ed by entry name (e.g. MA001.1)
"""
fh = open(filename)
linelist = []
entryname = ''
entries = {}
for row in fh:
row = row.strip()
if len(row) < 1: continue
if row.startswith('>'):
if len(linelist) > 0:
entries[entryname] = _readMultiCount(linelist, format=format)
linelist = []
entryname = row[1:].split()[0]
else:
linelist.append(row)
if len(linelist) > 0:
entries[entryname] = _readMultiCount(linelist, format=format)
fh.close()
return entries
def readMultiCount(filename, format = 'JASPAR'):
""" Read a file of raw counts for multiple distributions over the same set of symbols.
filename: name of file
format: format of file, default is 'JASPAR' exemplified below
0 3 79 40 66 48 65 11 65 0
94 75 4 3 1 2 5 2 3 3
1 0 3 4 1 0 5 3 28 88
2 19 11 50 29 47 22 81 1 6
returns a list of Distrib's
"""
d = readMultiCounts(filename, format=format)
if len(d) > 0:
return d.values()[0]
#################################################################################################
# Joint class
#################################################################################################
class Joint(object):
""" A joint probability class.
The JP is represented as a distribution over n-tuples where n is the number of variables.
        Variables can be over any defined alphabet. The size of each alphabet determines the
number of entries in the table (with probs that add up to 1.0) """
def __init__(self, alphas):
""" A distribution of n-tuples.
alphas: Alphabet(s) over which the distribution is defined
"""
if type(alphas) is Alphabet:
self.alphas = tuple( [alphas] )
elif type(alphas) is tuple:
self.alphas = alphas
else:
self.alphas = tuple( alphas )
self.store = TupleStore(self.alphas)
self.totalCnt = 0
def getN(self):
""" Retrieve the number of distributions/random variables. """
return len(self.alphas)
def __iter__(self):
return self.store.__iter__()
def reset(self):
""" Re-set the counts of this joint distribution. Pseudo-counts are re-applied. """
for entry in self.store:
self.store[entry] = None
self.totalCnt = 0
def observe(self, key, cnt = 1):
""" Make an observation of a tuple/key
key: tuple that is being observed
cnt: number/weight of observation (default is 1)
"""
key = _getMeTuple(self.alphas, key)
if not None in key:
score = self.store[key]
if (score == None):
score = 0
self.totalCnt += cnt
self.store[key] = score + cnt
else: # there are wildcards in the key
allkeys = [mykey for mykey in self.store.getAll(key)]
mycnt = float(cnt)/float(len(allkeys))
self.totalCnt += cnt
for mykey in allkeys:
score = self.store[mykey]
if (score == None):
score = 0
self.store[mykey] = score + mycnt
return
def count(self, key):
""" Return the absolute count that is used for the joint probability table. """
key = _getMeTuple(self.alphas, key)
score = self.store[key]
if (score == None):
score = 0.0
for match in self.store.getAll(key):
y = self.store[match]
if y != None:
score += y
return score
def __getitem__(self, key):
""" Determine and return the probability of a specified expression of the n-tuple
which can involve "wildcards"
Note that no assumptions are made regarding independence. """
key = _getMeTuple(self.alphas, key)
score = self.store[key]
if (score == None):
score = 0.0
for match in self.store.getAll(key):
y = self.store[match]
if y != None:
score += y
if self.totalCnt == 0:
return 0.0
return float(score) / float(self.totalCnt)
def __str__(self):
""" Return a textual representation of the JP. """
str = '< '
if self.totalCnt == 0.0:
return str + 'None >'
for s in self.store:
if self[s] == None:
y = 0.0
else:
y = self[s]
str += (''.join(s) + ("=%4.2f " % y))
return str + ' >'
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of probability.
Note that this function should NOT be used for big (>5 variables) tables."""
if self.totalCnt == 0.0:
return []
ret = []
for s in self.store:
if self[s] != None:
ret.append((s, self[s]))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
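# A short usage sketch (assumes DNA_Alphabet from sym): a joint distribution over
# two DNA positions, queried with and without wildcards.
def _example_joint():
    j = Joint([DNA_Alphabet, DNA_Alphabet])
    j.observe('AC')
    j.observe('AG')
    j.observe('TC', cnt = 2)
    print j['AC'] # 1 of 4 observations: 0.25
    print j['A*'] # marginal over the second position: 0.5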
class IndepJoint(Joint):
def __init__(self, alphas, pseudo = 0.0):
""" A distribution of n-tuples.
All positions are assumed to be independent.
alphas: Alphabet(s) over which the distribution is defined
"""
self.pseudo = pseudo
if type(alphas) is Alphabet:
self.alphas = tuple( [alphas] )
elif type(alphas) is tuple:
self.alphas = alphas
else:
self.alphas = tuple( alphas )
self.store = [Distrib(alpha, pseudo) for alpha in self.alphas]
def getN(self):
""" Retrieve the number of distributions/random variables. """
return len(self.alphas)
def __iter__(self):
return TupleStore(self.alphas).__iter__()
def reset(self):
""" Re-set the counts of each distribution. Pseudo-counts are re-applied. """
self.store = [Distrib(alpha, self.pseudo) for alpha in self.alphas]
def observe(self, key, cnt = 1, countGaps = True):
""" Make an observation of a tuple/key
key: tuple that is being observed
cnt: number/weight of observation (default is 1)
"""
assert len(key) == len(self.store), "Number of symbols must agree with the number of positions"
for i in range(len(self.store)):
subkey = key[i]
if subkey == '-' and countGaps == False:
continue
if subkey == '*' or subkey == '-':
for sym in self.alphas[i]:
score = self.store[i][sym]
if (score == None):
score = 0
self.store[i].observe(sym, float(cnt)/float(len(self.alphas[i])))
else:
score = self.store[i][subkey]
if (score == None):
score = 0
self.store[i].observe(subkey, cnt)
def __getitem__(self, key):
""" Determine and return the probability of a specified expression of the n-tuple
which can involve "wildcards"
Note that variables are assumed to be independent. """
assert len(key) == len(self.store), "Number of symbols must agree with the number of positions"
prob = 1.0
for i in range(len(self.store)):
mykey = key[i]
if mykey == '*' or mykey == '-':
pass # same as multiplying with 1.0 (all symbols possible)
else:
prob *= self.store[i][mykey]
return prob
def get(self, sym, pos):
""" Retrieve the probability of a specific symbol at a specified position. """
mystore = self.store[pos]
return mystore[sym]
def getColumn(self, column, count = False):
""" Retrieve all the probabilities (or counts) for a specified position.
Returns values as a dictionary, with symbol as key."""
d = {}
for a in self.alphas[column]:
if count: # absolute count
d[a] = self.store[column].count(a)
else: # probability
d[a] = self.store[column][a]
return d
def getRow(self, sym, count = False):
""" Retrieve the probabilities (or counts) for a specific symbol over all columns/positions.
Returns a list of values in the order of the variables/alphabets supplied to the constructor. """
d = []
for store in self.store:
if count: # absolute count
d.append(store.count(sym))
else: # probability
d.append(store[sym])
return d
def getMatrix(self, count = False):
""" Retrieve the full matrix of probabilities (or counts) """
d = {}
for a in self.alphas[0]:
d[a] = self.getRow(a, count)
return d
def displayMatrix(self, count = False):
""" Pretty-print matrix """
print " \t%s" % (''.join("\t%5d" % (i + 1) for i in range(len(self.alphas))))
for a in self.alphas[0]:
if count:
print "%s\t%s" % (a, ''.join("\t%5d" % (y) for y in self.getRow(a, True)))
else:
print "%s\t%s" % (a, ''.join("\t%5.3f" % (y) for y in self.getRow(a)))
def __str__(self):
""" Text representation of the table. Note that size is an issue so big tables
will not be retrieved and displayed. """
        if len(self.alphas) > 5:
return '< ... too large to process ... >'
tstore = TupleStore(self.alphas)
str = '< '
for key in tstore:
p = 1.0
for i in range(len(self.store)):
value = self.store[i][key[i]]
if value != None and value != 0.0:
p *= value
else:
                    p = 0
                    break
str += (''.join(key) + ("=%4.2f " % p))
return str + ' >'
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of probability.
Note that this function should NOT be used for big (>5 variables) tables."""
tstore = TupleStore(self.alphas)
ret = []
for key in tstore:
p = 1.0
for i in range(len(self.store)):
value = self.store[i][key[i]]
if value != None and value != 0.0:
p *= value
else:
                    p = 0
                    break
if p > 0.0:
ret.append((key, p))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
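# A sketch of IndepJoint on column-wise data (assumes DNA_Alphabet from sym): each
# position is its own Distrib, so probabilities multiply across positions.
def _example_indepjoint():
    ij = IndepJoint([DNA_Alphabet] * 3, pseudo = 1.0)
    for seq in ['ACG', 'ACT', 'GCT']:
        ij.observe(seq)
    print ij.getColumn(0) # per-symbol probabilities at position 0
    print ij['AC*']       # product of per-position probabilities; '*' contributes 1.0
    ij.displayMatrix()    # pretty-print the full position-by-symbol matrix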
class NaiveBayes():
""" NaiveBayes implements a classifier: a model defined over a class variable
and conditional on a list of discrete feature variables.
Note that feature variables are assumed to be independent. """
def __init__(self, inputs, output, pseudo_input = 0.0, pseudo_output = 0.0):
""" Initialise a classifier.
inputs: list of alphabets that define the values that input variables can take.
output: alphabet that defines the possible values the output variable takes
pseudo_input: pseudo-count used for each input variable (default is 0.0)
pseudo_output: pseudo-count used for the output variable (default is 0.0) """
if type(inputs) is Alphabet:
self.inputs = tuple( [inputs] )
elif type(inputs) is tuple:
self.inputs = inputs
else:
self.inputs = tuple( inputs )
self.condprobs = {} # store conditional probabilities as a dictionary (class is key)
for outsym in output: # GIVEN the class
# for each input variable initialise a conditional probability
self.condprobs[outsym] = [ Distrib(input, pseudo_input) for input in self.inputs ]
self.classprob = Distrib(output, pseudo_output) # the class prior
def observe(self, inpseq, outsym):
""" Record an observation of an input sequence of feature values that belongs to a class.
inpseq: sequence/list of feature values, e.g. 'ATG'
outsym: the class assigned to these feature values. """
condprob = self.condprobs[outsym]
for i in range(len(inpseq)):
condprob[i].observe(inpseq[i])
self.classprob.observe(outsym)
def __getitem__(self, key):
""" Determine and return the class probability GIVEN a specified n-tuple of feature values
The class probability is given as an instance of Distrib. """
out = Distrib(self.classprob.alpha)
for outsym in self.classprob.getSymbols():
condprob = self.condprobs[outsym]
prob = self.classprob[outsym]
for i in range(len(key)):
prob *= condprob[i][key[i]] or 0.0
out.observe(outsym, prob)
return out
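# A compact usage sketch with toy data (Alphabet and DNA_Alphabet from sym):
# classify 3-mers into two made-up classes '+' and '-'.
def _example_naivebayes():
    output = Alphabet('+-')
    nb = NaiveBayes([DNA_Alphabet] * 3, output, pseudo_input = 1.0, pseudo_output = 1.0)
    for word in ['ATG', 'ATC', 'AAG']:
        nb.observe(word, '+')
    for word in ['CCT', 'GCT']:
        nb.observe(word, '-')
    posterior = nb['ATG'] # a Distrib over the classes, given the features
    print posterior.getmax(), posterior['+']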
import sym
class RCDict(dict):
""" Class that extends a standard dictionary to accept only fixed-length DNA symbol strings as keys.
Additionally, it maps the reverse complement to the same value. """
def __init__(self, alpha = sym.DNA_Alphabet):
""" Initialise a reverse-complement dictionary to accept strings of a given alphabet (DNA by default) """
self.alpha = alpha
self.length = None
def __setitem__(self, key, value):
""" Set the value for a key.
Checks to see that if
(a) previous keys have been used that the length is the same, and
(b) the key consists only of valid symbols. """
if self.length == None:
self.length = len(key)
elif len(key) != self.length:
raise RuntimeError("Invalid key: " + str(key))
for i in range(len(key)):
if not key[i] in sym.DNA_Alphabet:
raise RuntimeError("Invalid symbol in key: " + str(key[i]))
super(RCDict, self).__setitem__(self.canonical(key), value)
def canonical(self, key):
""" Figures out the canonical key (original or its reverse complement).
        Note that if you use an alphabet other than DNA you may need to modify this code. """
if self.length == None:
return key
alpha = self.alpha
rcindx = [0 for _ in range(self.length)]
fwindx = [alpha.index(sym) for sym in key]
undecided = True
for forw in range(self.length):
backw = self.length - forw - 1
rcindx[forw] = 3 - fwindx[backw] # formula for converting A <-> T, C <-> G
if undecided and rcindx[forw] > fwindx[forw]: # RC is "higher" than original
return key
undecided = rcindx[forw] == fwindx[forw]
return ''.join([alpha.symbols[indx] for indx in rcindx])
def __getitem__(self, key):
""" Retrieve the value associated with a specified key. """
return super(RCDict, self).__getitem__(self.canonical(key))
def getSum(self, IUPAC_key):
""" Retrieve the sum of all the entries that match the specified IUPAC key. """
# TODO
pass
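# A short usage sketch: a key and its reverse complement map to the same entry,
# so counting k-mers on one strand covers both.
def _example_rcdict():
    counts = RCDict()
    counts['AAC'] = 1
    print counts['GTT']           # 1 -- 'GTT' is the reverse complement of 'AAC'
    print counts.canonical('GTT') # 'AAC'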
"""
This python module reads in sam files from RNA-seq experiment and processes
them and RNA-seq data1
"""
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import itertools
import operator
import math
from scipy import stats
from numpy import array, empty
import scipy.cluster.hierarchy as sch
def sam_reader(filename):
"""Mandatory fields are QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL
for more info http://samtools.github.io/hts-specs/SAMv1.pdf """
data=[]
f= open(filename,'r')
for row in f:
if row.startswith('@'): # skip the header
pass
else:
info=row.strip().split('\t')
data.append(info)
return data
def base_percentages(reads):
"reports base percentage %A,%T,%C,%G "
all_seqs=[]
for read in reads:
seq=read[9]
        seq=list(seq) # split the read sequence into single bases
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
return freqs
def numberofreads(reads):
"""Incremented for every sequence-containing line in the sam file, regardless of whether it represents an alignment.
    For some files this is not actually the number of reads; indeed, this may be a poor name for this stat."""
return len(reads)
def mapped_reads(reads,paired_end=True):
"""If duplicate tracking was enabled via -D, then this attempts to recapitulate the number of unique, mapped, probe-id's in the original sam file. It is multiplied by 2 for paired-end data with duplicate read id's.
The idea is that if you divide this by the number of reads in the fastq you aligned (possibly from the output of fastq-stats),
you will get an accurate "percentage of reads aligned" statistic.
"mapped" is something with a non-negative position, and a "non-asterisk" cigar string."""
    mapped_ids=[]
    store_reads=[]
    for read in reads:
        if int(read[3])>0 and read[5]!='*': # mapped: positive POS and a real CIGAR string
            mapped_ids.append(read[0])
            store_reads.append(read)
    mapped=set(mapped_ids)
    if paired_end==True:
        mapped=2*len(mapped) # two reads share each id in paired-end data
    else:
        mapped=len(mapped)
    print "number of mapped reads",mapped
    return store_reads
def mappedBases(mapped_reads):
"""Total number of mapped bases in sam file"""
seq=""
for read in mapped_reads:
seq=seq+read[9]
return len(seq)
def forward(mapped_reads):
    """The lines in the sam file that were aligned to the "forward" strand
    (FLAG bit 0x10 not set). No accounting is done on duplicates."""
    forward=[read for read in mapped_reads if int(read[1]) & 16 == 0]
    return forward
def reverse(mapped_reads):
    """The lines in the sam file that were aligned to the "reverse" strand
    (FLAG bit 0x10 set). No accounting is done on duplicates."""
    reverse=[read for read in mapped_reads if int(read[1]) & 16 != 0]
    return reverse
########Qualities and STATS
def subgroups(mapped_reads):
"""form groups p<1e-3 one group,1e-3<=p<1e-2 one group,1e-2<=p<1 one group a total of three groups"""
group1=[]
group2=[]
group3=[]
for read in mapped_reads:
if int(read[4])>29:
group1.append(read)
elif int(read[4])<=29 and int(read[4])>17:
group2.append(read)
elif int(read[4])<=17:
group3.append(read)
else:
pass
print len(group1),"in p<1e-3 group"
print len(group2),"in 1e-3<=p<1e-2 group"
print len(group3),"in 1e-2<=p<1 group"
return group1,group2,group3
def dinuc_freq(mapped_reads):
"reports dinucleotide composition using p(Rho) statistics for overrepresentation"
all_seqs=[]
for read in mapped_reads:
seq=read[9]
        seq=list(seq) # split the read sequence into single bases
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
nucs=counts.keys()
freqs={}
for nuc in nucs:
freqs[nuc]=float(counts[nuc])/sum(counts.values())
all_seqs=[]
for read in mapped_reads:
seq=read[9]
        seq=[seq[i:i+2] for i in range(0,len(seq)-1,2)] # non-overlapping dinucleotides; drops any odd trailing base
for nuc in seq:
all_seqs.append(nuc)
counts=dict(Counter(all_seqs))
dinucs=counts.keys()
dinuc_counts={}
    for i in dinucs:
        val=float(counts[i])/sum(counts.values())
        dinuc_counts[i]=val/(freqs[i[0]]*freqs[i[1]]) # rho: observed frequency over that expected under independence
return dinuc_counts
def PercentReadsAligned(group1,group2,group3,numfastq):
"""Provide a list of mapped_reads and the number of reads in the fastq file"""
mapped_reads=group1+group2+group3
Mapped=len(mapped_reads)/float(numfastq)
Unmapped=1-float(Mapped)
## print "Mapping stats"
## print"p<1e-3", len(group1)/float(numfastq)
## print"1e-3<=p<1e-2",len(group2)/float(numfastq)
## print "1e-2<=p<1",len(group3)/float(numfastq)
## print "Unmapped",Unmapped
labels="p<1e-3","1e-3<=p<1e-2","1e-2<=p<1","Unmapped"
x=[len(group1)/float(numfastq),len(group2)/float(numfastq),len(group3)/float(numfastq),Unmapped]
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.pie(x,labels=labels,autopct='%1.1f%%', shadow=True)
plt.title('Mapping stats')
plt.show()
return Mapped
def length_stats(group1,group2,group3):
"""returns basic stats relating to the lengths of the reads
Calculations are based on the the length of the (possibly hard-clipped) sequence in the sam file."""
reads=[group1,group2,group3]
data=[]
for i in range(0,len(reads)):
lengths=[]
for read in reads[i]:
if int(read[8])<0:
length=-1*int(read[8])
else:
length=int(read[8])
lengths.append(length)
        mean_len=np.mean(lengths)
        print "group"+str(i+1)+" mean",mean_len
        max_len=np.max(lengths)
        print "group"+str(i+1)+" max length",max_len
        min_len=np.min(lengths)
        print "group"+str(i+1)+" min length",min_len
        data.append(["group"+str(i+1),mean_len,max_len,min_len])
return data
def plot_length_distrib(group,name):
"""distribution of lengths of all the sam reads"""
lengths=[]
for read in group:
if int(read[8])<0:
length=-1*int(read[8])
else:
length=int(read[8])
lengths.append(length)
##Visualize length distribution
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
n, bins, patches = plt.hist(lengths,100, normed=0, facecolor='g')
plt.xlabel("lengths")
plt.ylabel("number of mapped reads")
plt.title(name)
plt.show()
def inv_logit(p):
    """Convert a Phred-scaled quality (e.g. MAPQ) back to an error probability."""
    return 10**(-p/10.0) # float division: an integer p would otherwise floor-divide in Python 2
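# Worked example: a MAPQ of 30 corresponds to an error probability of
# inv_logit(30) = 10**(-30/10.0) = 1e-3, the boundary used by subgroups() above.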
def plot_base_composition(reads,sym):
"reports nucelotide frequencies at each position in the sam sequences"
#DNA_Alphabet=["A","C","T","G","N"]
all_nucs=[]
for read in reads:
nucs={}#dictionary to store nucleotide data
seq=read[9]
for i in range(0,len(seq)):
nucs[str(i+1)]=seq[i]
all_nucs.append(nucs)
all_items=[]
counts=[]
pos=range(1,len(seq)+1)
for dicts in all_nucs:
for item in dicts.items():
all_items.append(item)
all_items.sort(key=operator.itemgetter(0))
groups= [map(operator.itemgetter(1),list(group)) for key, group in itertools.groupby(all_items, operator.itemgetter(0))]
for group in groups:
counts.append(group.count(sym))
print counts
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(pos,counts,facecolor='g')
plt.xlabel("Position")
plt.ylabel("number of mapped reads")
plt.title(sym)
plt.show()
return counts
#####################################################
#Transcript reader
def raw_count_reader(filename):
"""Count the raw counts in the file"""
data={}
f= open(filename,'r')
for row in f:
if row.startswith('t1'): # skip the header
pass
else:
info=row.strip().split('\t')
data[info[0]]=[int(info[1]),int(info[2]),int(info[3]),int(info[4]),float(info[5])] #t1,rept1,t10,rept10,len
return data
#####Normalisation methods
def get_RPKM(data,num_map1,num_map2,num_map3,num_map4):
"""provide number of mapped reads for the two groups of interest and raw count data .This method provides length normalisation to prevent length and total count bias"""
all_rpkms=[];final={}
for i,s,ii,ss,v in data.values():
rpkms=[]
num_mapped_reads=[num_map1,num_map2,num_map3,num_map4]
vals=[i,s,ii,ss]
lengths=[v,v,v,v]
for n in range(0,len(vals)):
if vals[n]==0:
rpkm=0
rpkms.append(rpkm)
else:
#perform RPKM calc
                rpkm= float(vals[n])/(lengths[n]*(float(num_mapped_reads[n])/10**6)) # reads per kilobase per million mapped reads (length assumed in kb)
rpkms.append(rpkm)
all_rpkms.append(rpkms)
#return gene names and rpkms
for i in range(0,len(data.keys())):
final[data.keys()[i]]=[float(all_rpkms[i][0]),float(all_rpkms[i][1]),float(all_rpkms[i][2]),float(all_rpkms[i][3])]
return final
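# Hedged worked example of the RPKM calculation above: count / (length * mappedReads/10^6).
# Whether this is strictly "per kilobase" depends on the units of the length
# column in the input; the example assumes lengths are given in kilobases.
def _example_rpkm():
    """Illustrative only: 100 reads on a 1.5 kb transcript, 2 million mapped reads."""
    count, length_kb, num_mapped = 100.0, 1.5, 2.0e6
    return count / (length_kb * (num_mapped / 10**6)) # 100 / (1.5 * 2) = 33.33...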
def write_RPKM_data(RPKM_data,filename):
"""write RPKM data to a file"""
f=open(filename,'w')
for i in range(0,len(RPKM_data)):
f.write("%s\t%d\t%d\t%d\t%d\n"%(RPKM_data.keys()[i],int(RPKM_data.values()[i][0]),int(RPKM_data.values()[i][1]),int(RPKM_data.values()[i][2]),int(RPKM_data.values()[i][3])))
f.close()
###############Visualize replicates to determine degree of biological variation
def pearson_def(x, y):
"""Pearson correlation coefficient R"""
assert len(x) == len(y)
n = len(x)
assert n > 0
avg_x = np.mean(x)
avg_y = np.mean(y)
diffprod = 0
xdiff2 = 0
ydiff2 = 0
for idx in range(n):
xdiff = x[idx] - avg_x
ydiff = y[idx] - avg_y
diffprod += xdiff * ydiff
xdiff2 += xdiff * xdiff
ydiff2 += ydiff * ydiff
return diffprod / math.sqrt(xdiff2 * ydiff2)
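# Hedged usage sketch for pearson_def: two perfectly linearly related vectors
# give r = 1, hence a coefficient of determination R^2 = 1.
def _example_pearson():
    """Illustrative only: Pearson r for a small, made-up pair of vectors."""
    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.0, 6.0, 8.0]
    r = pearson_def(x, y)
    return r, r**2 # (1.0, 1.0)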
def plotreprpkm(rpkm_data,timepoint):
"""plot showing level of agreement between technical replicates for RPKM between replicates and plots coefficient of determination"""
one=[]
two=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][0]))
two.append(int(rpkm_data.values()[i][1]))
else:
for i in range(0,len(rpkm_data.values())):
one.append(int(rpkm_data.values()[i][2]))
two.append(int(rpkm_data.values()[i][3]))
plt.plot(one,two,'o')
pcc=pearson_def(one,two)
R2=pcc**2
name="""Technical Replicates
R2="""+str(R2)
m,b= np.polyfit(one,two,1)
plt.figure(1, figsize=(8,8))
plt.plot(one, np.array(one)*m +b,'r-')
plt.text(3000, max(two)-1000,name , fontsize=12)
plt.xlabel("RPKM replicate 1")
plt.ylabel("RPKM replicate 2")
plt.title(timepoint)
plt.show()
def plotMAreprpkm(rpkm_data,timepoint):
"""MA Plot of log(RPKM) vs Average log(RPKM) of replicates"""
m=[]
a=[]
if timepoint=="t1":
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][0]+1)-np.log2(rpkm_data.values()[i][1]+1)
x=(np.log2(rpkm_data.values()[i][0]+1)+np.log2(rpkm_data.values()[i][1]+1))/2
m.append(y)
a.append(x)
else:
for i in range(0,len(rpkm_data.values())):
y=np.log2(rpkm_data.values()[i][2]+1)-np.log2(rpkm_data.values()[i][3]+1)
x=(np.log2(rpkm_data.values()[i][2]+1)+np.log2(rpkm_data.values()[i][3]+1))/2
m.append(y)
a.append(x)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(a,m,'o')
plt.axhline(np.mean(m)+1.96*np.std(m),color="green",label="avg diff +1.96(std diff)")
plt.axhline(np.mean(m)-1.96*np.std(m),color="green",label="avg diff -1.96(std diff)")
plt.xlabel("Average log(RPKM) of replicates")
plt.ylabel("Difference in log(RPKM) of replicates")
plt.legend(loc="lower right")
plt.title(timepoint)
plt.show()
def get_cv(data1,condition):
    """Return a per-gene variability measure for the replicate pair at the given time point.
    Note: this computes (mean+1)/(std+1), the reciprocal of the conventional
    coefficient of variation, smoothed by +1 to avoid division by zero."""
    cvs=[]
if condition=="t1":
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][0],data1.values()[i][1]])
std=np.std([data1.values()[i][0],data1.values()[i][1]])
if mean==0.0 and std==0.0:
pass
else:
cv=float(mean+1)/(std+1)
cvs.append(cv)
else:
for i in range(0,len(data1.values())):
mean = np.mean([data1.values()[i][2],data1.values()[i][3]])
std=np.std([data1.values()[i][2],data1.values()[i][3]])
if mean==0.0 and std==0.0:
pass
else:
cv=float(mean+1)/(std+1)
cvs.append(cv)
return cvs
def get_boxplots(norm,original):
"""distribution of the coeficient of variation across samples (replicates) normalised using the methods provided"""
    bp=plt.boxplot([norm,original],notch=False, patch_artist=True)
    bp['boxes'][0].set(color="red")  # RPKM
    bp['boxes'][1].set(color="blue") # raw counts
plt.ylabel("coefficient of variation")
plt.xlabel("Methods")
my_xticks = ['RPKM','raw counts']
x=[1,2]
plt.xticks(x,my_xticks)
plt.ylim(0,400)
plt.show()
def plotavg_cv(norm,original):
"""distribution of the coeficient of variation across samples (replicates) normalised using the methods provided"""
x=[1,2]
y=[np.mean(norm),np.mean(original)]
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.bar(x[0],y[0],color="red",label="RPKM")
plt.bar(x[1],y[1],color="blue",label="Raw counts")
plt.ylabel("Average coefficient of variation")
plt.xlabel("Methods")
ax.xaxis.set_ticklabels([])
plt.legend(loc="upper right")
plt.show()
def plotMA(rpkm_data,cutoff=[-1.5,1.5]):
"""Produce MA plot using logfold as cutoff"""
logfc=[]
avg_rpkm=[]
sig_logfc=[]
sig_avg_rpkm=[]
logfc2=[]
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss in rpkm_data.values():
fc=np.log2(float(s+1)/(i+1))
if fc<cutoff[0] or fc>cutoff[1]:
sig_logfc.append(fc)
            sig_avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2) # parenthesised so the mean of the two log values is taken
else:
logfc.append(fc)
            avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2)
for i,ii,s,ss in rpkm_data.values():
fc2=np.log2(float(ss+1)/(ii+1))
if fc2<cutoff[0] or fc2>cutoff[1]:
sig_logfc2.append(fc2)
            sig_avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
else:
logfc2.append(fc2)
            avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(avg_rpkm,logfc,'o',color="blue",label="rep1")
plt.plot(avg_rpkm2,logfc2,'x',color="blue",label="rep2")
plt.plot(sig_avg_rpkm,sig_logfc,'o',color="red",label="sig rep1")
plt.plot(sig_avg_rpkm2,sig_logfc2,'x',color="red",label="sig rep2")
plt.axhline(cutoff[0],color="orange")
plt.axhline(cutoff[1],color="orange")
plt.ylabel("Fold Change (log2)")
plt.xlabel("Average RPKM (log2)")
plt.title("MA plot")
plt.legend(loc="upper left")
plt.show()
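# Hedged worked sketch of the M and A values plotted above: for counts 3 and 15,
# M = log2(16) - log2(4) = 2 and A = (log2(16) + log2(4)) / 2 = 3.
def _example_ma_values():
    """Illustrative only: M (log fold change) and A (average log RPKM) for one gene."""
    i, s = 3.0, 15.0
    m = np.log2(s + 1) - np.log2(i + 1)       # = 2.0
    a = (np.log2(s + 1) + np.log2(i + 1)) / 2 # = 3.0
    return m, a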
def plotMA_pval(rpkm_data,cutoff=0.05):
"""Produce MA plot using the pvalue as cutoff"""
logfc=[]
avg_rpkm=[]
sig_logfc=[]
sig_avg_rpkm=[]
logfc2=[]
avg_rpkm2=[]
sig_logfc2=[]
sig_avg_rpkm2=[]
for i,ii,s,ss,pval in rpkm_data.values():
fc=np.log2(float(s+1)/(i+1))
if float(pval)<cutoff:
sig_logfc.append(fc)
            sig_avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2) # parenthesised so the mean of the two log values is taken
else:
logfc.append(fc)
            avg_rpkm.append((np.log2(s+1)+np.log2(i+1))/2)
for i,ii,s,ss,pval in rpkm_data.values():
fc2=np.log2(float(ss+1)/(ii+1))
if float(pval)<cutoff:
sig_logfc2.append(fc2)
            sig_avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
else:
logfc2.append(fc2)
            avg_rpkm2.append((np.log2(ss+1)+np.log2(ii+1))/2)
plt.figure(1, figsize=(8,8))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])
plt.plot(avg_rpkm,logfc,'o',color="blue",label="rep1")
    plt.plot(avg_rpkm2,logfc2,'x',color="blue",label="rep2") # 'x' marker for rep2, consistent with plotMA
plt.plot(sig_avg_rpkm,sig_logfc,'o',color="red",label="sig rep1")
plt.plot(sig_avg_rpkm2,sig_logfc2,'x',color="red",label="sig rep2")
plt.ylabel("Fold Change (log2)")
plt.xlabel("Average RPKM (log2)")
plt.title("MA plot")
plt.legend(loc="upper left")
plt.show()
#####DE expression statistical test (T-Test, ANOVA and FDR)
def Welcht(rpkm):
    """Perform Welch's t-test (one-tailed) on the two replicate pairs."""
    ts=[]
    result={}
    for i,ii,s,ss in rpkm.values():
        sd1=np.std([i,ii])
        sd2=np.std([s,ss])
        # Welch's statistic: difference of means over sqrt(var1/n1 + var2/n2), with n = 2 replicates per group
        t=(np.mean([s,ss])-np.mean([i,ii]))/math.sqrt((sd1**2)/2+(sd2**2)/2)
        ts.append(t)
    pvals=[]
    for t in ts:
        pval = stats.t.sf(np.abs(t), 2-1)
        if np.isnan(pval): # NaN never compares equal to itself, so test with np.isnan
            pval = 1
        pvals.append(pval)
corr_pvals=correct_pvalues_for_multiple_testing(pvals, correction_type = "Benjamini-Hochberg")
for i in range(0,len(rpkm.values())):
result[rpkm.keys()[i]]=[rpkm.values()[i][0],rpkm.values()[i][1],rpkm.values()[i][2],rpkm.values()[i][3],corr_pvals[i]]
return result
def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-Hochberg"):
"""
consistent with R print correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05, 0.069, 0.07, 0.071, 0.09, 0.1])
"""
pvalues = array(pvalues)
n = float(pvalues.shape[0])
new_pvalues = empty(n)
if correction_type == "Bonferroni":
new_pvalues = n * pvalues
elif correction_type == "Bonferroni-Holm":
values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
values.sort()
for rank, vals in enumerate(values):
pvalue, i = vals
new_pvalues[i] = (n-rank) * pvalue
elif correction_type == "Benjamini-Hochberg":
values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
values.sort()
values.reverse()
new_values = []
for i, vals in enumerate(values):
rank = n - i
pvalue, index = vals
new_values.append((n/rank) * pvalue)
for i in xrange(0, int(n)-1):
if new_values[i] < new_values[i+1]:
new_values[i+1] = new_values[i]
for i, vals in enumerate(values):
pvalue, index = vals
new_pvalues[index] = new_values[i]
return new_pvalues
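# Hedged worked sketch of the Benjamini-Hochberg branch above: each ascending
# p-value is scaled by n/rank, then a running minimum taken from the largest
# rank downward enforces monotonicity.
def _example_bh():
    """Illustrative only: adjust three made-up p-values."""
    raw = [0.01, 0.04, 0.03]
    # ascending: 0.01 (rank 1), 0.03 (rank 2), 0.04 (rank 3)
    # scaled by n/rank: 0.03, 0.045, 0.04; monotonicity gives 0.03, 0.04, 0.04
    return correct_pvalues_for_multiple_testing(raw, correction_type="Benjamini-Hochberg")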
#### Run hierarchical clustering on the correlation matrix (of differentially expressed genes) - coexpression
def cluster_data(data_matrix,genenames,timepoint):
"One replicates at a specific time point"
D = np.zeros([np.shape(data_matrix)[0],1])
##generate a distance matrix
for i in range(np.shape(data_matrix)[0]):
for j in range(1):
            D[i,j] = abs(data_matrix[i] - data_matrix[j])**2 #squared distance (other metrics could be used here)
labels=list('' for i in range(np.shape(data_matrix)[0]))
for i in range(np.shape(data_matrix)[0]):
labels[i]=str(i)+","+str(genenames[i])
fig=plt.figure(1, figsize=(17,8))
linked = sch.linkage(D, method='centroid')
    dend = sch.dendrogram(linked, orientation='right',labels=labels) # sets the orientation with the root at the right
plt.title(timepoint)
    fig.savefig(timepoint+'dendrogram.png')
return dend['ivl']
def heatmap_cluster(data_matrix,timepoint):
"""Produces a heatmap of the clustered count data"""
D = np.zeros([np.shape(data_matrix)[0],np.shape(data_matrix)[0]])
for i in range(np.shape(data_matrix)[0]):
for j in range(np.shape(data_matrix)[0]):
            D[i,j] = abs(data_matrix[i] - data_matrix[j])**2 #squared distance (other metrics could be used here)
fig = plt.figure()
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
linked = sch.linkage(D, method='centroid')
    dend = sch.dendrogram(linked, orientation='right') # sets the orientation with the root at the right
axdendro.set_xticks([])
axdendro.set_yticks([])
#plot distance matrix
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = dend['leaves']
D=D[index,:]
D=D[:,index]
im = axmatrix.matshow(D, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])
#plot color bar
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
fig.colorbar(im, cax=axcolor)
#display the heatmap
fig.savefig(timepoint+'heatmap.png')
'''
Module that provides methods and classes for working with genome sequence data.
For instance,
- BED files
- 2bit genome sequence files
'''
def overlap(chromLoc1, chromLoc2):
""" Check if chromosome locations described by tuples
(chrom, chromStart, chromEnd) overlap.
If so return the number of positions that overlap.
Return 0 in case of NO overlap.
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
maxWidth = max(maxWidth, 1)
centre1 = chromLoc1[1] + halfWidth1
centre2 = chromLoc2[1] + halfWidth2
diffCentres = abs(centre1 - centre2)
if diffCentres + minWidth < maxWidth: # one fragment encompasses the other
return minWidth * 2
else:
return max(0, halfWidth1 + halfWidth2 - diffCentres)
else:
return 0
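# Hedged worked example for overlap(): two fragments on chr1 sharing 50
# positions (coordinates are made up).
def _example_overlap():
    """Illustrative only: overlap of ('chr1',100,200) and ('chr1',150,250)."""
    return overlap(('chr1', 100, 200), ('chr1', 150, 250)) # 50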
def distance(chromLoc1, chromLoc2, minimum = True):
""" Check the distance between two locations described by tuples
(chrom, chromStart, chromEnd).
If chromLoc1 is BEFORE chromLoc2 then the distance is positive, else negative.
If not on same chromosome return None.
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
maxWidth = max(maxWidth, 1)
centre1 = chromLoc1[1] + halfWidth1
centre2 = chromLoc2[1] + halfWidth2
diffCentres = abs(centre1 - centre2)
if not minimum:
return centre2 - centre1
if diffCentres + minWidth < maxWidth: # one fragment encompasses the other
return 0
elif halfWidth1 + halfWidth2 - diffCentres > 0: # fragments overlap A-to-B or B-to-A
return 0
else:
loc1_is_1st = chromLoc2[1] - chromLoc1[2]
loc1_is_2nd = chromLoc1[1] - chromLoc2[2]
if loc1_is_1st > loc1_is_2nd:
return loc1_is_1st
else:
return -loc1_is_2nd
else:
return None
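# Hedged worked example for distance(): the first location ends 200 bp before
# the second begins, so the minimum distance is +200 (made-up coordinates).
def _example_distance():
    """Illustrative only: minimum distance between two non-overlapping locations."""
    return distance(('chr1', 100, 200), ('chr1', 400, 500)) # 200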
class BedEntry():
def __init__(self, chrom, chromStart, chromEnd):
self.chrom = chrom
self.chromStart = chromStart
self.chromEnd = chromEnd
self.blockCount = None
self.usestrand = False
self.name = ''
def addOption(self,
name = None,
score = None,
strand = None,
thickStart = None,
thickEnd = None,
itemRgb = None,
blockCount = None,
blockSizes = None,
blockStarts = None,
signalValue = None,
pValue = None,
qValue = None,
peak = None,
tags = None,
summit = None,
fold = None,
fdr = None,
zscore = None,
bg = None):
if name: self.name = name
if score: self.score = score
if strand:
self.strand = strand
self.usestrand = True # use reverse complement when sequence is requested from genome
if thickStart: self.thickStart = thickStart
if thickEnd: self.thickEnd = thickEnd
if itemRgb: self.itemRgb = [int(color) for color in itemRgb.split(',')]
if blockCount:
self.blockCount = max(0, blockCount)
if blockCount > 0:
self.blockSizes = [int(sizeword) for sizeword in blockSizes.split(',')]
self.blockStarts = [int(startword) for startword in blockStarts.split(',')]
if len(self.blockSizes) != blockCount or len(self.blockStarts) != blockCount:
raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self))
if signalValue: self.signalValue = signalValue
if pValue: self.pValue = pValue
if qValue: self.qValue = qValue
if peak: self.peak = peak
if tags: self.tags = tags
if summit: self.summit = summit
if fold: self.fold = fold
if fdr: self.fdr = fdr
if bg: self.bg = bg
if zscore: self.zscore = zscore
def __str__(self):
return str((self.chrom, self.chromStart, self.chromEnd))
def __getitem__(self, i):
if self.blockCount:
return (self.chrom, self.blockStarts[i], self.blockStarts[i] + self.blockSizes[i])
def __iter__(self):
if self.blockCount:
for i in range(self.blockCount):
if self.blockSizes[i] > 0:
yield (self.chrom, self.blockStarts[i], self.blockStarts[i] + self.blockSizes[i])
def __len__(self):
return self.blockCount
def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None):
""" Retrieve the genomic location for BED entry, or sequence if genome is provided
genome: a dictionary with keys for sequence names, e.g. 'chr1', 'chrX', etc, and values with indexed/sliceable strings
fixedwidth: the width of the location/sequence if the width in the BED entry is ignored, and only its centre is used
usesummit: centre a fixedwidth window around an assigned "summit"
        useshift: centre a fixedwidth window around a shifted centre point, e.g. useshift=-125 will shift the centre point 125bp upstream,
                  to, say, capture a fixedwidth=350bp window with 350/2-125=50bp downstream
"""
otherstrand = False
if (self.usestrand):
if (self.strand == '-'):
otherstrand = True
if (otherstrand == False):
end = self.chromEnd
start = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = start + (self.chromEnd - self.chromStart) / 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre + useshift
if fixedwidth: # we need to re-calculate start and end
if genome:
end = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
else:
end = mycentre + (mywidth / 2)
start = max(0, mycentre - (mywidth / 2))
else: # other strand
start = self.chromEnd
end = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre - useshift # shift is reversed on other strand
if fixedwidth: # we need to re-calculate start and end
end = max(0, mycentre - (mywidth / 2))
if genome:
start = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
else:
start = mycentre + (mywidth / 2)
if genome: # refer to the genome sequence
return genome[self.chrom][start : end]
else:
return (self.chrom, start, end)
def setwidth(self, fixedwidth = None, usesummit = False):
if fixedwidth:
if usesummit:
diff = self.summit - fixedwidth / 2
else:
diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2
            self.chromStart += diff
            self.chromEnd = self.chromStart + fixedwidth # re-centred window of the requested width
return (self.chrom, self.chromStart, self.chromEnd)
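# Hedged usage sketch for BedEntry: build an entry, attach optional fields, and
# retrieve a fixed-width window around its centre (coordinates are made up).
def _example_bedentry():
    """Illustrative only: a 100 bp window centred on a 60 bp entry."""
    entry = BedEntry('chr1', 1000, 1060)
    entry.addOption(name='peak1', score=200.0, strand='+')
    return entry.loc(fixedwidth=100) # ('chr1', 980, 1080)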
class BedFile():
""" Read BED file.
See http://genome.ucsc.edu/FAQ/FAQformat#format1
The first three required BED fields are (part of all supported sub-formats):
chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
The 9 additional optional BED fields are (part of sub-format "Optional"):
name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.
    score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray).
strand - Defines the strand - either '+' or '-'.
thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
blockCount - The number of blocks (exons) in the BED line.
blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
ENCODE also defines broadpeaks and narrowpeaks format (part of our "Peaks" sub-format):
name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.
score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were '0' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
strand - +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
signalValue - Measurement of overall (usually, average) enrichment for the region.
pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
MACS also defines a "summit" peaks format (part of our "Summit" sub-format)
    It contains the peak summit location for every peak. The 5th column in this file is the summit height of fragment pileup.
In addition to the required three, the following fields follow:
length [redundant, ignored]
summit summit height of fragment pileup
tags
pValue [-10*log10(pvalue)]
fold [enrichment]
FDR [%; optional]
"CCAT" BED-like file format:
chromosome,
peakcenter [converted to summit],
regionstart,
regionend,
tags [tagcount],
bg [bgcount],
zscore,
fdr
"""
def __init__(self, entries, format = 'Limited'):
if isinstance(entries, str): # filename
self.rows = self._read(entries, format)
else:
self.rows = entries
self.format = format
self.indices = self._createIndices()
def _read(self, filename, format = 'Limited'):
""" Read a BED file.
format: specifies the format of the file,
"Limited", e.g.
chr22 1000 5000
chr22 2000 6000
"Optional", e.g.
track name=pairedReads description="Clone Paired Reads" useScore=1
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
...
(also handles the Limited + score format)
"Peaks", e.g.
chr1 569780 569930 . 0 . 19 6.07811 -1 -1
chr1 713300 713450 . 0 . 54 49.1167 -1 -1
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
# d = 130
chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
chr1 8250 8671 422 286 46 145.84 11.68 0.51
chr1 36382 36984 603 405 46 315.23 27.05 0.24
"CCAT", e.g.
chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
"Cropped", e.g.
chr1 851602 10
chr1 921184 18
chr1 931838 9
"""
f = open(filename)
row = 0
acceptHeaderRows = 1
headerRow = None
rows = []
for line in f:
row += 1
words = line.strip().split()
if len(words) == 0:
continue # ignore empty lines
if words[0].strip().startswith('#'):
continue # comment
if words[0].strip().startswith('browser'):
continue # ignore
if words[0].strip().startswith('track'):
continue # ignore
try:
chrom = words[0]
if format.lower().startswith('ccat'):
chromStart = int(words[2])
chromEnd = int(words[3])
else: # all other standard BED formats
chromStart = int(words[1])
chromEnd = int(words[2])
entry = BedEntry(chrom, chromStart, chromEnd)
if format.lower().startswith('opt'):
if len(words) >= 12:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11])
elif len(words) >= 9:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8])
elif len(words) >= 6:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5])
elif len(words) >= 5:
entry.addOption(name = words[3], score = float(words[4]))
elif len(words) >= 4:
entry.addOption(name = words[3])
else:
entry.addOption(name = '.', score = int(words[3]), strand = '.')
elif format.lower().startswith('bed6'):
entry.addOption(name=words[3], score=float(words[4]), strand=words[5])
elif format.lower().startswith('strand'):
if len(words) >= 4: # properly formatted
entry.addOption(strand = words[3])
if len(words) >= 5:
entry.addOption(name = words[4])
elif format.lower().startswith('peak'):
if len(words) >= 10: # narrowpeaks
entry.addOption(name = words[3], score = int(words[4]), strand = words[5], signalValue = float(words[6]), pValue = float(words[7]), qValue = float(words[8]), peak = int(words[9]))
else: # broadpeaks
entry.addOption(name = words[3], score = int(words[4]), strand = words[5], signalValue = float(words[6]), pValue = float(words[7]), qValue = float(words[8]))
elif format.lower().startswith('summit'):
if len(words) >= 9:
entry.addOption(summit = int(words[4]), tags = int(words[5]), pValue = float(words[6]), fold = float(words[7]), fdr = float(words[8]))
else:
entry.addOption(summit = int(words[4]), tags = int(words[5]), pValue = float(words[6]), fold = float(words[7]))
elif format.lower().startswith('ccat'):
entry.addOption(summit = int(words[1]) - entry.chromStart, tags = int(words[4]), bg = int(words[5]), zscore = float(words[6]), fdr = float(words[7]), name = '.', score = int(words[4]), strand = '.')
elif format.lower().startswith('crop'):
entry.addOption(score = int(words[2]), name = '.', strand = '.')
entry.chromEnd = entry.chromStart + 1
rows.append(entry)
except RuntimeError as e:
if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' % (row, str(e)))
else:
headerRow = words
acceptHeaderRows -= 1 # count down the number of header rows that can occur
f.close()
return rows
def __iter__(self):
return self.rows.__iter__()
def __getslice__(self, i, j):
return self.rows.__getslice__(i, j)
def __getitem__(self, i):
return self.rows[i]
def __len__(self):
return len(self.rows)
def _createIndices(self):
index_start = {}
index_centre = {}
index_end = {}
index_name = {}
for i in range(len(self.rows)):
row = self.rows[i]
if not index_start.has_key(row.chrom): # seeing chromosome entry first time
index_start[row.chrom] = []
if not index_centre.has_key(row.chrom): # seeing chromosome entry first time
index_centre[row.chrom] = []
if not index_end.has_key(row.chrom): # seeing chromosome entry first time
index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
if row.name:
index_name[row.name] = row
for chr in index_start:
index_start[chr].sort()
index_centre[chr].sort()
index_end[chr].sort()
return (index_start, index_centre, index_end, index_name)
def __contains__(self, elem):
""" Test for containment: does the specified elem overlap with at least one of the BED entries.
The method performs a binary search. """
try:
if isinstance(elem, BedEntry):
elem = elem.loc()
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = 0
if d == 0:
return True
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
if delta == 0:
return False
except KeyError:
return False
def match(self, elem, name):
""" Test for containment: does the specified elem overlap with at least one of the BED entries
that has the nominated name (label)."""
try:
if isinstance(elem, BedEntry):
elem = elem.loc()
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = 0
if d == 0:
delta = 0
while d == 0:
if entry.name == name:
return True
delta += 1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
delta = -1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
while d == 0:
if entry.name == name:
return True
delta -= 1
entry = self.rows[entries[inspect + delta][2]]
d = distance(entry.loc(), elem, minimum = True)
return False
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
if delta == 0:
return False
except KeyError:
return False
def findByName(self, myname):
""" Find the unique entry with the specified name.
Note that if the name is not unique, the last entry with the name will be returned.
"""
return self.indices[3][myname]
def closest(self, myloc, minimum = True):
""" Find the closest entry in the current BedFile to a given location.
Return a tuple with the absolute distance and the entry that is closest.
If several entries are closest, then any of the closest entries are returned.
If no location is found on the same chromosome, the tuple None, None is returned.
minimum: if True, use minimum distance, if False, use centre to centre distance.
"""
mindist = None
minentry = None
try:
if isinstance(myloc, BedEntry):
myloc = myloc.loc()
if minimum:
entries = self.indices[0][myloc[0]] # use start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = True)
if mindist == None:
mindist = abs(d)
minentry = entry
elif abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# we may have missed the closest, so need to look around this point
                for i_dn in range(inspect + 1, len(entries)): # look downstream, since the binary search may have stopped just short of the closest entry
entry = self.rows[entries[i_dn][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
elif abs(d) > mindist:
break
# also need to investigate upstream, doing so by using end index
entries = self.indices[2][myloc[0]] # use end index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# we may have missed the closest, so need to look around this point
                for i_up in range(inspect - 1, 0, -1): # look upstream, since the binary search may have stopped just short of the closest entry
entry = self.rows[entries[i_up][2]]
d = distance(entry.loc(), myloc, minimum = True)
if abs(d) < mindist:
mindist = abs(d)
minentry = entry
elif abs(d) > mindist:
break
return (mindist, minentry)
else: # minimum == False, i.e. use centre-to-centre distance
entries = self.indices[1][myloc[0]] # use centre index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), myloc, minimum = False)
if mindist == None:
mindist = abs(d)
minentry = entry
elif abs(d) < mindist:
mindist = abs(d)
minentry = entry
if d == 0:
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
inspect -= delta
# at bottom of search
return (mindist, minentry)
except KeyError:
return None
def merge(self, usestrand = False):
""" Collapse entries that overlap to create a new BedFile.
If usestrand is True, then strands are considered exclusively.
If usestrand is False, the strand info is ignored when overlap is checked.
When entries are merged, the options assigned to the last entry are retained, others are ignored.
"""
starts = self.indices[0]
rows = self.rows
newrows = []
for c in starts: # chromosome
earliest_start = None
latest_end = None
for e in starts[c]:
idx = e[2]
strand = rows[idx].strand
if not usestrand or strand == '+':
start = rows[idx].chromStart
end = rows[idx].chromEnd
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
earliest_start = start
latest_end = end
earliest_start = min(earliest_start, start)
latest_end = max(latest_end, end)
# the last entry on the chromosome
            if not usestrand or earliest_start: # strand info may have been overwritten, so only check that there is an entry to write
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
if usestrand:
earliest_start = None
latest_end = None
for e in starts[c]:
idx = e[2]
strand = rows[idx].strand
if strand == '-':
start = rows[idx].chromStart
end = rows[idx].chromEnd
if not earliest_start: # not yet initialised
earliest_start = start
latest_end = end
else:
if start > latest_end: # new entry
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
earliest_start = start
latest_end = end
earliest_start = min(earliest_start, start)
latest_end = max(latest_end, end)
# the last entry on the chromosome
                if earliest_start: # strand info could've been overwritten so check only that there is an entry to be written
entry = BedEntry(c, earliest_start, latest_end)
if self.format == 'Peaks':
entry.addOption(name = rows[idx].name, score = rows[idx].score, signalValue = rows[idx].signalValue, strand = rows[idx].strand, pValue = rows[idx].pValue)
elif self.format == 'Strand':
entry.addOption(name = rows[idx].name, strand = rows[idx].strand)
newrows.append(entry)
return BedFile(newrows, format = self.format)
def write(self, filename, format = 'BED6'):
""" Save the data
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
"""
f = open(filename, 'w')
for row in self.__iter__():
if self.format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif self.format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close()
def readBedFile(filename, format = 'Limited'):
""" Read a BED file.
format: specifies the format of the file,
"Limited", e.g.
chr22 1000 5000
chr22 2000 6000
"Optional", e.g.
track name=pairedReads description="Clone Paired Reads" useScore=1
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
...
(also handles the Limited + score format)
"Peaks", e.g.
chr1 569780 569930 . 0 . 19 6.07811 -1 -1
chr1 713300 713450 . 0 . 54 49.1167 -1 -1
"Strand", e.g.
chr4 185772359 185772424 -
chr18 20513381 20513401 +
also supports a 5th label field
chr5 20611949 20611949 + ENSG00000251629_20611949
chr3 42187863 42187863 - ENSG00000234562_42187863
"Summit", e.g.
# d = 130
chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
chr1 8250 8671 422 286 46 145.84 11.68 0.51
chr1 36382 36984 603 405 46 315.23 27.05 0.24
"CCAT", e.g.
chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
"Cropped", e.g.
chr1 851602 10
chr1 921184 18
chr1 931838 9
"""
return BedFile(filename, format)
def writeBedFile(entries, filename, format = 'BED6'):
""" Save the BED entries to a BED file.
format - the format to use for WRITING, currently only BED6 ('Optional' 6-col format) is supported.
"""
f = open(filename, 'w')
for row in entries:
if format == 'Peaks':
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f.write("%s %d %d %s %d %s %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited':
f.write("%s %d %d" % (row.chrom, row.chromStart, row.chromEnd))
else:
f.write("%s %d %d %s %d %s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close()
def uniteBed(bed1, bed2):
if bed1.format != bed2.format:
raise RuntimeError('BEDs are of different formats')
rows = []
rows.extend(bed1.rows)
rows.extend(bed2.rows)
return BedFile(rows, bed1.format)
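# Hedged usage sketch: BedFile also accepts an in-memory list of entries, so
# containment and closest-entry queries can be tried without a file on disk
# (coordinates are made up).
def _example_bedfile():
    """Illustrative only: build a small BedFile and query it."""
    entries = [BedEntry('chr1', 100, 200), BedEntry('chr1', 400, 500)]
    bed = BedFile(entries, format='Limited')
    hit = ('chr1', 150, 160) in bed # True: overlaps the first entry
    dist, nearest = bed.closest(('chr1', 300, 310)) # (90, the chr1:400-500 entry)
    return hit, dist, nearest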
"""
This following code is a modified version of twobitreader (which is under Perl Artistic License 2.0).
As per license restrictions, the code below indicates what has been modified in relation to the
standard version (retrieved from https://bitbucket.org/thesylex/twobitreader on 16 May 2012).
No warranty is provided, express or implied
Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
from array import array
from bisect import bisect_right
from errno import ENOENT, EACCES
from os import R_OK, access
try:
from os import strerror
except ImportError:
strerror = lambda x: 'strerror not supported'
from os.path import exists
from itertools import izip
def true_long_type():
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
"""
for type_ in ['L', 'I']:
test_array = array(type_, [0])
long_size = test_array.itemsize
if long_size == 4: return type_
raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG")
LONG = true_long_type()
def byte_to_bases(x):
"""convert one byte to the four bases it encodes"""
c = (x >> 4) & 0xf
f = x & 0xf
cc = (c >> 2) & 0x3
cf = c & 0x3
fc = (f >> 2) & 0x3
ff = f & 0x3
return map(bits_to_base, (cc, cf, fc, ff))
def bits_to_base(x):
"""convert integer representation of two bits to correct base"""
    if x == 0: return 'T'
    if x == 1: return 'C'
    if x == 2: return 'A'
    if x == 3: return 'G'
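# Hedged sketch of the 2-bit decoding above: the byte 0x1B is 00011011 in
# binary, which decodes two bits at a time (T=00, C=01, A=10, G=11) to TCAG,
# matching the worked example in the format specification further below.
def _example_twobit_decoding():
    """Illustrative only: decode one byte into its four bases."""
    return byte_to_bases(0x1B) # ['T', 'C', 'A', 'G']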
def base_to_bin(x):
"""
provided for user convenience
convert a nucleotide to its bit representation
"""
if x == 'T': return '00'
if x == 'C': return '01'
if x == 'A': return '10'
if x == 'G': return '11'
def create_byte_table():
"""create BYTE_TABLE"""
d = {}
for x in xrange(2**8):
d[x] = byte_to_bases(x)
return d
def split16(x):
"""
split a 16-bit number into integer representation
    of its coarse and fine parts in binary representation
"""
c = (x >> 8) & 0xff
f = x & 0xff
return c, f
def create_twobyte_table():
"""create TWOBYTE_TABLE"""
d = {}
for x in xrange(2**16):
c, f = split16(x)
d[x] = byte_to_bases(c) + byte_to_bases(f)
return d
BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table()
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
"""
takes in a iterable of longs and converts them to bases in a char array
returns a ctypes string buffer
"""
longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size)
dna = array('c', 'N' * longs_len)
# translate from 32-bit blocks to bytes
    # this method ensures correct endianness (byteswap as needed)
bytes = array('B')
bytes.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i: i = array_size
dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]:
dna[i:i + 4] = array('c', BYTE_TABLE[byte])
i += 4
# last block
last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna
class TwoBitFile(dict):
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
def __init__(self, foo):
super(TwoBitFile, self).__init__()
if not exists(foo):
raise IOError(ENOENT, strerror(ENOENT), foo)
if not access(foo, R_OK):
raise IOError(EACCES, strerror(EACCES), foo)
self._filename = foo
self._file_handle = open(foo, 'rb')
self._load_header()
self._load_index()
for name, offset in self._offset_dict.iteritems():
self[name] = TwoBitSequence(self._file_handle, offset,
self._byteswapped)
return
def _load_header(self):
file_handle = self._file_handle
header = array(LONG)
header.fromfile(file_handle, 4)
# check signature -- must be 0x1A412743
# if not, swap bytes
byteswapped = False
(signature, version, sequence_count, reserved) = header
if not signature == 0x1A412743:
byteswapped = True
header.byteswap()
(signature2, version, sequence_count, reserved) = header
if not signature2 == 0x1A412743:
raise TwoBitFileError('Signature in header should be 0x1A412743'
+ ', instead found 0x%X' % signature)
if not version == 0:
raise TwoBitFileError('File version in header should be 0.')
if not reserved == 0:
raise TwoBitFileError('Reserved field in header should be 0.')
self._byteswapped = byteswapped
self._sequence_count = sequence_count
def _load_index(self):
file_handle = self._file_handle
byteswapped = self._byteswapped
remaining = self._sequence_count
sequence_offsets = []
file_handle.seek(16)
while True:
if remaining == 0: break
name_size = array('B')
name_size.fromfile(file_handle, 1)
if byteswapped: name_size.byteswap()
name = array('c')
if byteswapped: name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG)
offset.fromfile(file_handle, 1)
if byteswapped: offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0]))
remaining -= 1
self._sequence_offsets = sequence_offsets
self._offset_dict = dict(sequence_offsets)
def sequence_sizes(self):
"""returns a dictionary with the sizes of each sequence"""
d = {}
file_handle = self._file_handle
byteswapped = self._byteswapped
for name, offset in self._offset_dict.iteritems():
file_handle.seek(offset)
dna_size = array(LONG)
dna_size.fromfile(file_handle, 1)
if byteswapped: dna_size.byteswap()
d[name] = dna_size[0]
return d
class TwoBitSequence(object):
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
"""
def __init__(self, file_handle, offset, byteswapped=False):
self._file_handle = file_handle
self._original_offset = offset
self._byteswapped = byteswapped
file_handle.seek(offset)
header = array(LONG)
header.fromfile(file_handle, 2)
if byteswapped: header.byteswap()
dna_size, n_block_count = header
self._dna_size = dna_size
self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments
n_block_starts = array(LONG)
n_block_sizes = array(LONG)
n_block_starts.fromfile(file_handle, n_block_count)
if byteswapped: n_block_starts.byteswap()
n_block_sizes.fromfile(file_handle, n_block_count)
if byteswapped: n_block_sizes.byteswap()
self._n_block_starts = n_block_starts
self._n_block_sizes= n_block_sizes
mask_rawc = array(LONG)
mask_rawc.fromfile(file_handle, 1)
if byteswapped: mask_rawc.byteswap()
mask_block_count = mask_rawc[0]
mask_block_starts = array(LONG)
mask_block_starts.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_starts.byteswap()
mask_block_sizes = array(LONG)
mask_block_sizes.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_sizes.byteswap()
self._mask_block_starts = mask_block_starts
self._mask_block_sizes = mask_block_sizes
file_handle.read(4)
self._offset = file_handle.tell()
def __len__(self):
return self._dna_size
def __getslice__(self, min_, max_=None):
return self.get_slice(min_, max_)
def get_slice(self, min_, max_=None):
"""
get_slice returns only a sub-sequence
"""
        # handle missing and negative coordinates
        dna_size = self._dna_size
        if max_ is None: # no upper bound given: take the rest of the sequence
            max_ = dna_size
        if max_ < 0:
            if max_ < -dna_size: raise IndexError('index out of range')
            max_ = dna_size + 1 + max_
        if min_ < 0:
            if min_ < -dna_size: raise IndexError('index out of range')
            min_ = dna_size + 1 + min_
        # Find out if the reverse complement is sought
        reverse = False # assume not RC
        if min_ > max_:
            reverse = True
            min_, max_ = max_, min_
if max_ == 0: return ''
# load all the data
if max_ > dna_size: max_ = dna_size
file_handle = self._file_handle
byteswapped = self._byteswapped
n_block_starts = self._n_block_starts
n_block_sizes = self._n_block_sizes
mask_block_starts = self._mask_block_starts
mask_block_sizes = self._mask_block_sizes
offset = self._offset
packed_dna_size = self._packed_dna_size
# region_size is how many bases the region is
        region_size = max_ - min_
# start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0
start_block = min_ / 16
end_block = max_ / 16
# don't read past seq end
if end_block >= packed_dna_size: end_block = packed_dna_size - 1
# +1 we still need to read block
blocks_to_read = end_block - start_block + 1
# jump directly to desired file location
local_offset = offset + start_block * 4
file_handle.seek(local_offset)
# note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset = min_ % 16
last_base_offset = max_ % 16
fourbyte_dna = array(LONG)
fourbyte_dna.fromfile(file_handle, blocks_to_read)
if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size)
for start, size in izip(n_block_starts, n_block_sizes):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', 'N'*(end-start))
lower = str.lower
first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region))
for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
        if len(string_as_array) != max_ - min_:
            raise RuntimeError("Sequence was not the expected length")
if reverse:
return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring()
def reverseComplement(self, dna):
""" Return a new sequence: the reverse complement of this sequence. """
newseq=''
symbols={'A':'T','C':'G','T':'A','G':'C','a':'t','c':'g','t':'a','g':'c','n':'n','N':'N'} # reverse complement dictionary
for symbol in dna[::-1]:
newsymbol=symbols[symbol] # uses the reverse complement symbols in dictionary
newseq+=newsymbol
return newseq # returns RC sequences
def __str__(self):
"""
returns the entire chromosome
"""
return self.__getslice__(0, None)
class TwoBitFileError(StandardError):
"""
Base exception for TwoBit module
"""
def __init__(self, msg):
errtext = 'Invalid 2-bit file. ' + msg
        super(TwoBitFileError, self).__init__(errtext)
def print_specification():
"""
    Returns the twoBit file format specification (sourced from the Internet).
    This is only here for reference.
"""
return """
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
randomly accessible format. The two bit files contain masking information as \
well as the DNA itself. The file begins with a 16 byte header containing the \
following fields:
signature - the number 0x1A412743 in the architecture of the machine that \
created the file.
version - zero for now. Readers should abort if they see a version number \
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given, \
the reader program should byte swap the signature and see if the swapped \
version matches. If so all multiple-byte entities in the file will need to be \
byte-swapped. This enables these binary files to be used unchanged on \
different architectures.
The header is followed by a file index. There is one entry in the index for \
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length \
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown \
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A, \
11 - G. The first base is in the most significant 2 bits byte, and the last \
base in the least significant 2 bits, so that the sequence TCAG would be \
represented as 00011011. The packedDna field will be padded with 0 bits as \
necessary so that it takes an even multiple of 32 bit in the file, as this \
improves i/o performance on some machines.
.nib files
"""
"""
Module *** sequence ***
This module depends on the following modules
sym -- defines an alphabet
prob -- defines structures to hold probabilities (prob also depends on sym)
This module incorporates classes for
Sequence -- names and defines a sequence of symbols; computes various transformations and pairwise alignments
Alignment -- defines a multiple sequence alignment; computes stats for use in substitution matrices
SubstMatrix -- substitution matrix class to support alignment methods
Regexp -- defines patterns as regular expressions for textual pattern matching in sequences
PWM -- defines a weight matrix that can score any site in actual sequences
Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services
This code has gone through many updates and has benefitted from kind contributions of course participants.
Please keep suggestions coming!
Email: m.boden@uq.edu.au
"""
import string, sys, re, math, os, array
import numpy
from webservice import *
from sym import *
from prob import *
# Sequence ------------------****
class Sequence(object):
""" A biological sequence. Stores the sequence itself (as a compact array),
the alphabet (i.e., type of sequence it is), and optionally a name and further
information. """
sequence = None # The array of symbols that make up the sequence
alphabet = None # The alphabet from which symbols come
name = None # The name (identifier) of a sequence
info = None # Other information (free text; e.g. annotations)
length = None # The number of symbols that the sequence is composed of
gappy = None # True if the sequence has "gaps", i.e. positions that represent deletions relative another sequence
def __init__(self, sequence, alphabet = None, name = '', info = '', gappy = False):
""" Create a sequence with the sequence data. Specifying the alphabet,
name and other information about the sequence are all optional.
The sequence data is immutable (stored as a string).
Example:
>>> myseq = Sequence('MVSAKKVPAIAMSFGVSF')
will create a sequence with no name, and assign one of the predefined
alphabets on the basis of what symbols were used.
>>> myseq.alphabet.symbols
will output the standard protein alphabet:
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
try: # convert sequence data into a compact array representation
self.sequence = array.array('c', ''.join([s.upper() for s in sequence]))
except TypeError:
raise RuntimeError('Sequence data is not specified correctly: must be iterable')
# Assign an alphabet
self.alphabet = None
if not alphabet is None:
for sym in self.sequence:
if not sym in alphabet and (sym != '-' or not gappy): # error check: bail out
raise RuntimeError('Invalid symbol: %c in sequence %s' % (sym, name))
self.alphabet = alphabet
else:
for alphaName in preferredOrder:
alpha = predefAlphabets[alphaName]
valid = True
for sym in self.sequence:
if not sym in alpha and (sym != '-' or not gappy):
valid = False
break
if valid:
self.alphabet = alpha
break
if self.alphabet is None:
raise RuntimeError('Could not identify alphabet from sequence: %s' % name)
# Store other information
self.name = name
self.info = info
self.length = len(self.sequence)
self.gappy = gappy
def __len__(self):
""" Defines what the "len" operator returns for an instance of Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print len(seq)
9
"""
return len(self.sequence)
def __str__(self):
""" Defines what should be printed when the print statement is used on a Sequence instance """
        text = self.name + ': ' # use a local name that does not shadow the builtin str
        for sym in self:
            text += sym
        return text
def __iter__(self):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq:
print sym
will print A, G, G, A, T (each on a separate row)
"""
tsyms = tuple(self.sequence)
return tsyms.__iter__()
def __contains__(self, item):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print 'T' in seq
True
which is equivalent to
>>> print seq.__contains__('T')
True
>>> print 'X' in seq
False
"""
for sym in self.sequence:
if sym == item:
return True
return False
def __getitem__(self, ndx):
""" Retrieve a specified index (or a "slice" of indices) of the sequence data.
Calling self.__getitem__(3) is equivalent to self[3]
"""
if type(ndx) is slice:
return self.sequence[ndx].tostring()
else:
return self.sequence[ndx]
def writeFasta(self):
""" Write one sequence in FASTA format to a string and return it. """
fasta = '>' + self.name + ' ' + self.info + '\n'
data = self.sequence.tostring()
nlines = (len(self.sequence) - 1) / 60 + 1
for i in range(nlines):
lineofseq = ''.join(data[i*60 : (i+1)*60]) + '\n'
fasta += lineofseq
return fasta
def count(self, findme = None):
""" Get the number of occurrences of specified symbol findme OR
if findme = None, return a dictionary of counts of all symbols in alphabet """
if findme != None:
cnt = 0
for sym in self.sequence:
if findme == sym:
cnt = cnt + 1
return cnt
else:
symbolCounts = {}
for symbol in self.alphabet:
symbolCounts[symbol] = self.count(symbol)
return symbolCounts
def find(self, findme):
""" Find the position of the specified symbol or sub-sequence """
return self.sequence.tostring().find(findme)
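# Hedged usage sketch for Sequence: the alphabet is guessed from the symbols
# (assuming the predefined DNA alphabet from the sym module is available).
def _example_sequence():
    """Illustrative only: basic Sequence operations on a made-up DNA string."""
    seq = Sequence('ACGGTAGGA', name='demo')
    return len(seq), seq.count('G'), seq.find('GTA') # (9, 4, 3)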
"""
Below are some useful methods for loading data from strings and files.
Recognize the FASTA format (nothing fancy).
"""
def readFasta(string, alphabet = None, ignore = False, gappy = False):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it.
If alphabet is specified, use it, if None (default) then guess it.
If ignore is False, errors cause the method to fail.
If ignore is True, the offending sequence is disregarded.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
seqlist = [] # list of sequences contained in the string
seqname = None # name of *current* sequence
seqinfo = None
seqdata = [] # sequence data for *current* sequence
for line in string.splitlines(): # read every line
if len(line) == 0: # ignore empty lines
continue
if line[0] == '>': # start of new sequence
if seqname: # check if we've got one current
try:
current = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
seqlist.append(current)
except RuntimeError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
# now collect data about the new sequence
seqinfo = line[1:].split() # skip first char
if len(seqinfo) > 0:
try:
parsed = parseDefline(seqinfo[0])
seqname = parsed[0]
seqinfo = line[1:]
except IndexError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
else:
seqname = ''
seqinfo = ''
seqdata = []
else: # we assume this is (more) data for current
cleanline = line.split()
for thisline in cleanline:
seqdata.extend(tuple(thisline.strip('*')))
# we're done reading the file, but the last sequence remains
if seqname:
try:
lastseq = Sequence(seqdata, alphabet, seqname, seqinfo, gappy)
seqlist.append(lastseq)
except RuntimeError as errmsg:
if not ignore:
raise RuntimeError(errmsg)
return seqlist
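# Usage sketch (illustrative): parse a small FASTA-formatted string.
# >>> seqs = readFasta('>seq1\nMKVL\n>seq2\nACDE\n')
# >>> [s.name for s in seqs]
# ['seq1', 'seq2']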
def parseDefline(string):
""" Parse the FASTA defline (see http://en.wikipedia.org/wiki/FASTA_format)
GenBank, EMBL, etc gi|gi-number|gb|accession|locus
SWISS-PROT, TrEMBL sp|accession|name
...
Return a tuple with
[0] primary search key, e.g. UniProt accession, Genbank GI
[1] secondary search key, e.g. UniProt name, Genbank accession
[2] source, e.g. 'sp' (SwissProt/UniProt), 'tr' (TrEMBL), 'gb' (Genbank)
[3] the database tag for 'gi' entries (e.g. 'gb'), otherwise ''
"""
if len(string) == 0: return ('', '', '', '')
s = string.split()[0]
if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^tr\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2])
elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^refseq\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
else: return (s, '', '', '')
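# Example (illustrative): a UniProt/SwissProt defline
# >>> parseDefline('sp|P63166|SUMO1_HUMAN')
# ('P63166', 'SUMO1_HUMAN', 'sp', '')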
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. Note that if alphabet is NOT specified, it will take a
separate guess for each sequence.
If ignore is False, errors cause the method to fail.
If ignore is True, the offending sequence is disregarded.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
fh = open(filename)
seqlist = []
batch = '' # a batch of rows including one or more complete FASTA entries
rowcnt = 0
for row in fh:
row = row.strip()
if len(row) > 0:
if row.startswith('>') and rowcnt > 0:
more = readFasta(batch, alphabet, ignore, gappy)
if len(more) > 0:
seqlist.extend(more)
batch = ''
rowcnt = 0
batch += row + '\n'
rowcnt += 1
if len(batch) > 0:
more = readFasta(batch, alphabet, ignore, gappy)
if len(more) > 0:
seqlist.extend(more)
fh.close()
return seqlist
def writeFastaFile(filename, seqs):
""" Write the specified sequences to a FASTA file. """
fh = open(filename, 'w')
for seq in seqs:
fh.write(seq.writeFasta())
fh.close()
def getMarkov(seqs, order = 0):
""" Retrieve the Markov stats for a set of sequences. """
myseqs = seqs
if isinstance(seqs, Sequence): # a single sequence was given; wrap it in a list
myseqs = [seqs]
myalpha = None
for seq in myseqs:
if myalpha == None:
myalpha = seq.alphabet
else:
if seq.alphabet != myalpha:
raise RuntimeError('Sequence ' + seq.name + ' uses a different alphabet from the rest')
jp = Joint([myalpha for _ in range(order + 1)])
for seq in myseqs:
for i in range(len(seq) - order):
sub = seq[i:i + order + 1]
jp.observe(sub)
return jp
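# Usage sketch (illustrative; assumes all sequences share one alphabet, e.g. DNA):
# >>> jp = getMarkov(seqs, order = 1) # Joint over all consecutive symbol pairs
# order = 0 gives plain symbol frequencies; order = k counts all (k+1)-mers.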
def getCount(seqs, findme = None):
""" Get the total number of occurrences of findme across all sequences OR,
if findme is None, a dictionary of such counts for every symbol in the alphabet. """
if findme != None:
cnt = 0
for seq in seqs:
cnt += seq.count(findme)
return cnt
else:
if len(seqs) > 0:
alpha = seqs[0].alphabet
patcnt = {}
for a in alpha:
patcnt[a] = getCount(seqs, a)
return patcnt
# Alignment ------------------
class Alignment():
""" A sequence alignment class. Stores two or more sequences of equal length where
one symbol is gap '-'
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print Alignment(seqs)
THIS-LI-NE-
--ISALIGNED
"""
alignlen = None
seqs = None
alphabet = None
def __init__(self, seqs):
self.alignlen = -1
self.seqs = seqs
self.alphabet = None
for s in seqs:
if self.alignlen == -1:
self.alignlen = len(s)
elif self.alignlen != len(s):
raise RuntimeError("Alignment invalid: different lengths")
if self.alphabet != None and self.alphabet != s.alphabet:
raise RuntimeError("Alignment invalid: different alphabets")
self.alphabet = s.alphabet
def getnamelen(self):
namelen = 0
for seq in self.seqs:
namelen = max(len(seq.name), namelen)
return namelen
def __len__(self):
""" Defines what the "len" operator returns for an instance of Alignment, e.g.
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> aln = Alignment(seqs)
>>> print len(aln)
2
"""
return len(self.seqs)
def getSize(self):
""" Returns the size of an alignment in terms of number of columns """
return self.alignlen
def __str__(self):
string = ''
namelen = self.getnamelen()
for seq in self.seqs:
string += seq.name.ljust(namelen+1)
for sym in seq:
string += sym
string += '\n'
return string
def __getitem__(self, ndx):
return self.seqs[ndx]
def writeClustal(self, filename = None):
""" Write the alignment to a string or file using the Clustal file format. """
symbolsPerLine = 60
maxNameLength = self.getnamelen() + 1
string = ''
wholeRows = self.alignlen / symbolsPerLine
for i in range(wholeRows):
for j in range(len(self.seqs)):
string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][i*symbolsPerLine:(i+1)*symbolsPerLine] + '\n'
string += '\n'
# Possible last row
lastRowLength = self.alignlen - wholeRows*symbolsPerLine
if lastRowLength > 0:
for j in range(len(self.seqs)):
if maxNameLength > 0:
string += self.seqs[j].name.ljust(maxNameLength) + ' '
string += self.seqs[j][-lastRowLength:] + '\n'
if filename != None:
fh = open(filename, 'w')
fh.write('CLUSTAL W (1.83) multiple sequence alignment\n\n\n') # fake header so that clustal believes it
fh.write(string)
fh.close()
return
return string
def getProfile(self, pseudo = 0.0, countGaps = True):
""" Determine the probability matrix from the alignment, assuming
that each position is independent of all others. """
p = IndepJoint([self.alphabet for _ in range(self.alignlen)], pseudo)
for seq in self.seqs:
p.observe(seq, 1, countGaps = countGaps)
return p
def getConsensus(self):
""" Construct a consensus sequence. """
syms = []
for col in range(self.alignlen):
d = Distrib(self.alphabet)
for seq in self.seqs:
if seq[col] in self.alphabet:
d.observe(seq[col])
syms.append(d.getmax())
return Sequence(syms, self.alphabet)
def getConsensusForColumn(self, colidx):
symcnt = {}
for seq in self.seqs:
mysym = seq[colidx]
try:
symcnt[mysym] += 1
except KeyError:
symcnt[mysym] = 1
consensus = None
maxcnt = 0
for mysym in symcnt:
if symcnt[mysym] > maxcnt:
maxcnt = symcnt[mysym]
consensus = mysym
return consensus
def displayConsensus(self, theta1 = 0.2, theta2 = 0.05, lowercase = True):
""" Display a table with rows for each alignment column, showing
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
print "Alignment of %d sequences, with %d columns" % (len(self.seqs), self.alignlen)
print "Column\tEntropy\tGaps\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2)
for col in range(self.alignlen):
d = Distrib(self.alphabet)
gaps = 0
for seq in self.seqs:
if seq[col] in self.alphabet:
d.observe(seq[col])
else:
gaps += 1
print (col + 1), "\t%5.3f" % d.entropy(), "\t%4d\t" % gaps,
symprobs = d.getProbsort()
(_, maxprob) = symprobs[0]
if maxprob >= theta1:
print "%d\tTRUE\t" % int(maxprob * 100),
else:
print "%d\t\t" % int(maxprob * 100),
for (sym, prob) in symprobs:
if prob >= theta1:
print sym, "%d%%" % int(prob * 100),
elif prob >= theta2 and lowercase:
print sym.lower(), "%d%%" % int(prob * 100),
elif prob >= theta2:
print sym, "%d%%" % int(prob * 100),
print
def saveConsensus(self, myseq, filename, theta1 = 0.2, theta2 = 0.05, lowercase = True, compact = False):
""" Display a table with rows for each alignment column, showing
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
f = open(filename, 'w')
f.write("Alignment of %d sequences, with %d columns\n" % (len(self.seqs), self.alignlen))
if compact:
f.write("Column\tConserv\tVariab\tAll (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
else:
f.write("Column\tProb\tConserv\tSymbols (Up>=%.2f;Low>=%.2f)\n" % (theta1, theta2))
countrow = 0
for col in range(self.alignlen):
countrow += 1
if myseq[col] == '-':
continue
alist = list(self.alphabet)
alist.append('-')
gapalphabet = Alphabet(alist)
d_gap = Distrib(gapalphabet)
d_nogap = Distrib(self.alphabet)
for seq in self.seqs:
if seq[col] in gapalphabet:
d_gap.observe(seq[col])
if seq[col] in self.alphabet:
d_nogap.observe(seq[col])
f.write("%d\t" % (col + 1))
symprobs_nogap = d_nogap.getProbsort()
symprobs_gap = d_gap.getProbsort()
(maxsym, maxprob) = symprobs_nogap[0]
if compact:
if maxprob >= theta1:
f.write("%c\t" % maxsym)
else:
f.write("\t")
for (sym, prob) in symprobs_gap:
if prob >= theta2 and lowercase:
f.write("%c" % sym.lower())
elif prob >= theta2:
f.write("%c" % sym)
f.write("\t")
else:
if maxprob >= theta1:
f.write("%d\t" % int(maxprob * 100))
else:
f.write("%d\t\t" % int(maxprob * 100))
for (sym, prob) in symprobs_gap:
if prob >= theta1:
f.write("%c %d%% " % (sym, int(prob * 100)))
elif prob >= theta2 and lowercase:
f.write("%c %d%% " % (sym.lower(), int(prob * 100)))
elif prob >= theta2:
f.write("%c %d%% " % (sym, int(prob * 100)))
f.write('\n')
f.close()
def calcBackground(self):
""" Count the proportion of each amino acid's occurrence in the
alignment, and return as a probability distribution. """
p = Distrib(self.alphabet)
for seq in self.seqs:
for sym in seq:
if sym in self.alphabet: # ignore "gaps"
p.observe(sym)
return p
def calcSubstMatrix(self, background = None):
""" Return a SubstMatrix whose scores are estimated from this (un-gapped)
multiple sequence alignment. Scores are given in half-bits. """
# Get a list of the amino acids
aminoAcids = self.alphabet.symbols
columns = self.alignlen # Length of sequences in alignment
numSeqs = len(self.seqs) # Number of sequences in alignment
seqPairs = (numSeqs* (numSeqs - 1) ) / 2 # Number of pairs of sequences in ungapped alignment
aaPairs = seqPairs * columns # Number of pairs of amino acids in ungapped alignment
# For each pair of amino acids, calculate the proportion of all aligned
# amino acids in this alignment which are made up of that pair
# (i.e., q[ab] = fab / aaPairs, where fab is the number of times
# a and b are aligned in this alignment)
# See page 122 in Understanding Bioinformatics.
q = {}
for i in range( len(aminoAcids) ):
a = aminoAcids[i]
for j in range(i, len(aminoAcids)):
b = aminoAcids[j]
# Count the number of times each pair of amino acids is aligned
fab = 0
for column in range(columns):
# Count number of each amino acid in each column
col = [seq[column] for seq in self.seqs]
if a == b:
# Number of ways of pairing up n occurrences of amino
# acid a is n*(n-1)/2
cnt = col.count(a)
fab += cnt * (cnt-1)/2
else:
# Number of ways of pairing up n & m occurrences of
# amino acids a & b is n*m
fab += col.count(a)*col.count(b)
# Calculate proportion of all aligned pairs of amino acids
q[a+b] = q[b+a] = float(fab) / aaPairs
if q[a+b] == 0: # This is so we don't end up doing log(0)
q[a+b] = q[b+a] = 0.001
# Background frequency calculation if required
p = background or self.calcBackground()
# Calculate log-odds ratio for each pair of amino acids
s = SubstMatrix(self.alphabet)
for a in aminoAcids:
for b in aminoAcids:
# Calculate the random chance probability (eab)
if a == b:
eab = p[a]**2
else:
eab = 2*p[a]*p[b]
if eab == 0:
eab = 0.001
# Calculate final score to be set in the substitution matrix
odds = q[a+b] / eab
sab = math.log(odds, 2) # log_2 transform
sab = sab * 2 # units in half bits
s.set(a, b, int(round(sab)))
return s
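# Worked example of the scoring above (illustrative numbers): if a pair is
# observed with q[ab] = 0.02 while chance predicts eab = 0.005, the log-odds
# is log2(0.02 / 0.005) = 2 bits, stored as 4 in half-bit units.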
def calcDistances(self, measure, a=1.0):
""" Calculate the evolutionary distance between all pairs of sequences
in this alignment, using the given measure. Measure can be one of
'fractional', 'poisson', 'gamma', 'jc' or 'k2p'. If 'gamma' or 'k2p' is
given, then the parameter a must also be specified (or else it will use
the default value of 1.0).
Definitions of each distance metric are found in Zvelebil and Baum p268-276.
These are mostly intended for DNA, but adapted for protein (as below).
Note however that there are alternative distance matrices for proteins (p276).
"""
measure = measure.lower()
if not measure in ['fractional', 'poisson', 'gamma', 'jc', 'k2p']:
raise RuntimeError('Unsupported evolutionary distance measure: %s' % measure)
a = float(a)
if len(self.alphabet) == 4:
oneless = 3
alphalen = 4
elif len(self.alphabet) == 20:
oneless = 19
alphalen = 20
else:
raise RuntimeError('Invalid sequence alphabet: %s' % str(self.alphabet))
distmat = numpy.zeros((len(self.seqs), len(self.seqs)))
# Loop through each pair of sequences
for i in range(len(self.seqs)):
for j in range(i + 1, len(self.seqs)):
seqA = self.seqs[i]
seqB = self.seqs[j]
# Calculate the fractional distance (p) first
# The two sequences of interest are in seqA and seqB
L = 0
D = 0
for k in range(self.alignlen):
# For every non-gapped column, put to L
# For every non-gapped column where the sequences are
# different, put to D
if seqA[k] != '-' and seqB[k] != '-':
L += 1
if seqA[k] != seqB[k]:
D += 1
p = float(D)/L
# Now calculate the specified measure based on p
if measure == 'fractional':
dist = p
elif measure == 'poisson':
dist = -math.log(1-p)
elif measure == 'jc':
dist = -(float(oneless)/alphalen)*math.log(1 - (float(alphalen)/oneless)*p)
elif measure == 'k2p':
dist = (float(oneless)/alphalen)*a*((1 - (float(alphalen)/oneless)*p)**(-1/a) - 1)
else: # measure == 'gamma'
dist = a*((1-p)**(-1/a) - 1)
distmat[i, j] = distmat[j, i] = dist
return distmat
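# Worked example (Jukes-Cantor on DNA, illustrative): with a fraction
# p = 0.2 of differing (non-gapped) sites, alphalen = 4 and oneless = 3 give
# d = -(3/4) * ln(1 - (4/3) * 0.2), approximately 0.233 substitutions per site.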
def writeHTML(self, filename):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
fh = open(filename, 'w')
fh.write('<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n')
maxNameLength = self.getnamelen()
html = ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += str(i/10+1)[-1]
else:
html += ' '
html += '%s\n' % (self.alignlen)
fh.write(html)
if self.alignlen > 10:
html = ''.ljust(maxNameLength) + ' '
for i in range(self.alignlen - 1):
if (i+1) % 10 == 0:
html += '0'
else:
html += ' '
html += '\n'
fh.write(html)
for seq in self.seqs:
html = seq.name.ljust(maxNameLength) + ' '
for sym in seq:
color = self.alphabet.getAnnotation('html-color', sym)
if not color:
color = 'white'
html += '<font style="BACKGROUND-COLOR: %s">%s</font>' % (color, sym)
html += '\n'
fh.write(html)
fh.write('</pre></body></html>\n')
fh.close()
def saveConsensus(aln, theta1 = 0.99, theta2 = 0.01, countgaps = False, consensus = True, filename = None):
""" Write a row for each alignment column, showing the column index and the
symbols that meet the inclusion threshold, in order of decreasing probability.
theta1 is the probability threshold for printing the consensus symbol
(it is always printed if consensus is true),
theta2 is the probability threshold for inclusion (symbols below are ignored).
countgaps, if true, count gaps (default false).
filename is the name of the file to save the output to (default stdout)."""
if filename == None:
f = sys.stdout
else:
filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
f = open(filename, 'w')
if consensus:
f.write("Alignment of %d sequences, with %d columns\n" % (len(aln.seqs), aln.alignlen))
f.write("Consensus>=%.2f;Inclusion>=%.2f)\n" % (theta1, theta2))
for col in range(aln.alignlen):
# collect probabilities for column, with or without gap
myalpha = aln.alphabet
if countgaps:
alist = list(aln.alphabet)
alist.append('-')
myalpha = Alphabet(alist)
d = Distrib(myalpha)
for seq in aln.seqs:
if seq[col] in myalpha:
d.observe(seq[col])
symprobs = d.getProbsort() # the symbols sorted by probability
ninclusions = 0
for (s, p) in symprobs:
if p >= theta2:
ninclusions += 1
else:
break
if consensus or ninclusions > 1:
f.write("%d " % (col + 1))
(maxs, maxp) = symprobs[0]
if maxp >= theta1 or consensus:
f.write("%c" % maxs)
for (s, p) in symprobs[1:]:
if p >= theta2:
f.write("%c" % s)
f.write("; ")
f.write('\n')
if f != sys.stdout: f.close()
def alignGlobal(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Needleman-Wunsch
(global) algorithm. subsMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S)
S = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = i * gap
for j in range(lenB + 1):
S[0, j] = j * gap
# Calculate the optimum score at each location in the matrix S
# (where the score represents the best possible score for an alignment
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
S[i, j] = max([match, delete, insert])
# Traceback the optimal alignment
alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
# Start at the end (bottom-right corner of S)
i = lenA
j = lenB
# Stop when we hit the beginning of at least one sequence
while i > 0 and j > 0:
if S[i, j] == S[i-1, j] + gap:
# Got here by a gap in sequence B (go up)
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif S[i, j] == S[i, j-1] + gap:
# Got here by a gap in sequence A (go left)
alignA = '-' + alignA
alignB = seqB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
i -= 1
j -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., traceback all the way to S[0, 0])
while i > 0:
# Go up
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
while j > 0:
# Go left
alignA = '-' + alignA
alignB = seqB[j-1] + alignB
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
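# Usage sketch (illustrative; assumes a protein substitution matrix such as
# the BLOSUM62 instance loaded via readSubstMatrix further below):
# >>> aln = alignGlobal(Sequence('THISLINE'), Sequence('ISALIGNED'), BLOSUM62, gap = -8)
# >>> print aln
# alignLocal below takes the same arguments but returns the best local match.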
def alignLocal(seqA, seqB, substMatrix, gap = -1):
""" Align seqA with seqB using the Smith-Waterman
(local) algorithm. subsMatrix is the substitution matrix to use and
gap is the linear gap penalty to use. """
lenA, lenB = len(seqA), len(seqB)
# Create the scoring matrix (S)
S = numpy.zeros((lenA + 1, lenB + 1))
# Fill the first row and column of S with multiples of the gap penalty
for i in range(lenA + 1):
S[i, 0] = 0 # Local: init 0
for j in range(lenB + 1):
S[0, j] = 0 # Local: init 0
# Calculate the optimum score at each location in the matrix S
# (where the score represents the best possible score for an alignment
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap
insert = S[i , j-1] + gap
S[i, j] = max([match, delete, insert, 0]) # Local: add option that we re-start alignment from "0"
# Trace back the optimal alignment
alignA = ''
alignB = ''
# Local: start at the cell which has the highest score; find it
i = 0
j = 0
for ii in range(1, lenA + 1):
for jj in range(1, lenB + 1):
if S[ii, jj] > S[i, j]:
i = ii
j = jj
# Stop when we hit the end of a sequence
# Local: also stop when we hit a score 0
while i > 0 and j > 0 and S[i, j] > 0:
if S[i, j] == S[i-1, j] + gap:
# Got here by a gap in sequence B (go up)
alignA = seqA[i-1] + alignA
alignB = '-' + alignB
i -= 1
elif S[i, j] == S[i, j-1] + gap:
# Got here by a gap in sequence A (go left)
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
j -= 1
else:
# Got here by aligning the bases (go diagonally)
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
i -= 1
j -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True), Sequence(alignB, seqB.alphabet, seqB.name, gappy = True)])
def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
""" Triplet-wise align this sequence with sequences seqB and seqC,
using the Needleman-Wunsch (global) algorithm. subsMatrix is the
substitution matrix to use and gap is the linear gap penalty to use. """
lenA, lenB, lenC = [s.length for s in [seqA, seqB, seqC]]
# Create the 3D scoring matrix
traceback = numpy.zeros((lenA+1, lenB+1, lenC+1))
# Fill the first row (in each dimension) with multiples of the gap penalty
S = numpy.zeros((lenA+1, lenB+1, lenC+1))
for i in range(lenA+1):
S[i,0,0] = i * gap
for j in range(lenB+1):
S[0,j,0] = j * gap
for k in range(lenC+1):
S[0,0,k] = k * gap
# Calculate the optimum score at each location in the matrix
for i in range(1, lenA+1):
for j in range(1, lenB+1):
for k in range(1, lenC+1):
# Scored using sum-of-pairs
matchABC = S[i-1, j-1, k-1] + subsMatrix.get(seqA[i-1], seqB[j-1]) \
+ subsMatrix.get(seqA[i-1], seqC[k-1]) \
+ subsMatrix.get(seqB[j-1], seqC[k-1])
matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.get(seqA[i-1], seqB[j-1])
matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.get(seqB[j-1], seqC[k-1])
matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.get(seqA[i-1], seqC[k-1])
gapAB = S[i, j, k-1] + 3*gap
gapBC = S[i-1, j, k] + 3*gap
gapAC = S[i, j-1, k] + 3*gap
# Use maximum of the 7 options for this location
S[i, j, k] = max([matchABC, matchAB, matchBC, matchAC, gapAB, gapBC, gapAC])
# Remember which one was max., for the traceback
if S[i, j, k] == matchABC:
traceback[i, j, k] = 0 #"matchABC"
elif S[i, j, k] == matchBC:
traceback[i, j, k] = 1 #"matchBC"
elif S[i, j, k] == matchAC:
traceback[i, j, k] = 2 #"matchAC"
elif S[i, j, k] == matchAB:
traceback[i, j, k] = 3 #"matchAB"
elif S[i, j, k] == gapAB:
traceback[i, j, k] = 4 #"gapAB"
elif S[i, j, k] == gapBC:
traceback[i, j, k] = 5 #"gapBC"
elif S[i, j, k] == gapAC:
traceback[i, j, k] = 6 #"gapAC"
# Traceback the optimal alignment
alignA = ""
alignB = ""
alignC = ""
# Start at the end
i = lenA
j = lenB
k = lenC
# Stop when we hit the end of all but one sequence
while (i>0 and j>0) or (j>0 and k>0) or (i>0 and k>0):
if traceback[i, j, k] == 0: #"matchABC":
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
alignC = seqC[k-1] + alignC
i -= 1
j -= 1
k -= 1
elif traceback[i, j, k] == 3: #"matchAB":
alignA = seqA[i-1] + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
i -= 1
j -= 1
elif traceback[i, j, k] == 2: #"matchAC":
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
i -= 1
k -= 1
elif traceback[i, j, k] == 1: #"matchBC":
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = seqC[k-1] + alignC
j -= 1
k -= 1
elif traceback[i, j, k] == 4: #"gapAB":
alignA = "-" + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
k -= 1
elif traceback[i, j, k] == 6: #"gapAC":
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
j -= 1
elif traceback[i, j, k] == 5: #"gapBC":
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = "-" + alignC
i -= 1
# Fill in the rest of the alignment if it begins with gaps
# (i.e., traceback all the way to S[0, 0, 0])
while i > 0:
alignA = seqA[i-1] + alignA
alignB = "-" + alignB
alignC = "-" + alignC
i -= 1
while j > 0:
alignA = "-" + alignA
alignB = seqB[j-1] + alignB
alignC = "-" + alignC
j -= 1
while k > 0:
alignA = "-" + alignA
alignB = "-" + alignB
alignC = seqC[k-1] + alignC
k -= 1
return Alignment([Sequence(alignA, seqA.alphabet, seqA.name, gappy = True),
Sequence(alignB, seqB.alphabet, seqB.name, gappy = True),
Sequence(alignC, seqC.alphabet, seqC.name, gappy = True)])
def readClustal(string, alphabet):
""" Read a ClustalW2 alignment in the given string and return as an
Alignment object. """
seqs = {} # sequence data
for line in string.splitlines():
if line.startswith('CLUSTAL') or line.startswith('STOCKHOLM') \
or line.startswith('#'):
continue
if len(line.strip()) == 0:
continue
if line[0] == ' ' or '*' in line or ':' in line:
continue
sections = line.split()
name, seqstr = sections[0:2]
index = name.find('/')
if index >= 0:
name = name[0:index]
if seqs.has_key(name):
seqs[name] += seqstr
else:
seqs[name] = seqstr
sequences = []
for name, seqstr in seqs.items():
sequences.append(Sequence(seqstr, alphabet, name, gappy = True))
return Alignment(sequences)
def readClustalFile(filename, alphabet):
""" Read a ClustalW2 alignment file and return an Alignment object
containing the alignment. """
fh = open(filename)
data = fh.read()
fh.close()
aln = readClustal(data, alphabet)
return aln
# Substitution Matrix ------------------
class SubstMatrix():
scoremat = None
alphabet = None
def __init__(self, alphabet):
self.alphabet = alphabet
self.scoremat = {}
def setScores(self, scoremat):
""" Set all scores in one go.
scoremat is a (sym1, sym2)-keyed dictionary of scores. """
self.scoremat = scoremat
def _getkey(self, sym1, sym2):
""" Construct canonical (unordered) key for two symbols """
if sym1 <= sym2:
return tuple([sym1, sym2])
else:
return tuple([sym2, sym1])
def set(self, sym1, sym2, score):
""" Add a score to the substitution matrix """
self.scoremat[self._getkey(sym1, sym2)] = score
def get(self, sym1, sym2):
return self.scoremat[self._getkey(sym1, sym2)]
def __str__(self):
symbols = self.alphabet.symbols # what symbols are in the alphabet
i = len(symbols)
string = ''
for a in symbols:
string += a + ' '
for b in symbols[:len(symbols)-i+1]:
score = self.scoremat.get(self._getkey(a, b))
if score != None:
string += str(score).rjust(3) + ' '
else:
string += "?".rjust(3) + ' '
string += '\n'
i -= 1
string += ' ' + ' '.join(self.alphabet.symbols)
return string
def writeFile(self, filename):
""" Write this substitution matrix to the given file. """
fh = open(filename, 'w')
contents = '' # avoid shadowing the built-in file
for key in self.scoremat:
contents += ''.join(key) + ': ' + str(self.scoremat[key]) + '\n'
fh.write(contents)
fh.close()
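# Usage sketch (illustrative): scores are stored under a canonical key,
# so lookups are symmetric.
# >>> sm = SubstMatrix(Protein_Alphabet)
# >>> sm.set('A', 'R', -1)
# >>> sm.get('R', 'A')
# -1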
def readSubstMatrix(filename, alphabet):
""" Read in the substitution matrix stored in the given file. """
mat = SubstMatrix(alphabet)
fh = open(filename, 'r')
data = fh.read()
fh.close()
lines = data.splitlines()
for line in lines:
if len(line.strip()) == 0:
continue
symbols, score = line.split(':')
score = int(score)
mat.set(symbols[0], symbols[1], score)
return mat
#import os
#os.chdir('/Users/mikael/workspace/binf/data/') # set to the directory where you keep your files
#BLOSUM62 = readSubstMatrix('blosum62.matrix', Protein_Alphabet)
# Motifs -------------------
class Regexp(object):
""" A class that defines a sequence pattern in terms of a
given regular expression, with . indicating any symbol and square brackets
indicating a selection. See standard regexp definitions for more. """
def __init__(self, pattern):
""" Create a new consensus sequence with the given pattern. """
try:
self.pattern = pattern
self.regex = re.compile(pattern)
except re.error:
raise RuntimeError('invalid consensus sequence given: %s' % pattern)
def __str__(self):
return self.pattern
def search(self, sequence):
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a consensus sequence either matches
or doesn't. """
if not type(sequence) is Sequence:
sequence = Sequence(sequence)
sequenceString = sequence[:]
results = []
for match in self.regex.finditer(sequenceString):
results.append((match.start(), match.group(), 1.0))
return results
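# Example (illustrative):
# >>> motif = Regexp('Y.N.[DE]')
# >>> motif.search('AAYANADAA')
# [(2, 'YANAD', 1.0)]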
class PWM(object):
""" A position weight matrix. """
def __init__(self, foreground, background = None, start = 0, end = None, pseudo = 0.0):
""" Create a new PWM from the given probability matrix/ces.
foreground: can be either an Alignment, a list of Distrib's or an instance of IndepJoint.
background: must be a Distrib instance or None (in which case a uniform background will be used)
Specify only a section of the matrix to use with start and end. """
if isinstance(foreground, Alignment):
foreground = foreground.getProfile(pseudo = pseudo)
if isinstance(foreground, IndepJoint):
foreground = foreground.store
self.start = start
self.end = end or len(foreground)
self.length = self.end - self.start
self.alphabet = foreground[self.start].alpha
if False in [ col.alpha == self.alphabet for col in foreground[self.start + 1 : self.end] ]:
raise RuntimeError("All positions need to be based on the same alphabet")
self.symbols = self.alphabet.symbols
# Set foreground probabilities from given alignment
self.m = numpy.zeros((len(self.symbols), self.length))
self.fg = foreground[self.start:self.end]
self.bg = background or Distrib(self.alphabet, 1.0) # specified background or uniform
if not self.alphabet == self.bg.alpha:
raise RuntimeError("Background needs to use the same alphabet as the foreground")
p = self.bg.prob()
for i in range(self.length):
q = self.fg[i].prob()
for j in range(len(self.alphabet)):
self.m[j][i] = self.logme(q[j], p[j])
def __len__(self):
return self.length
def getRC(self, swap = [('A', 'T'), ('C', 'G')] ):
""" Get the reverse complement of the current PWM.
Use for DNA sequences with default params.
"""
new_fg = self.fg[::-1] # backwards
for s in swap:
new_fg = [d.swapxcopy(s[0], s[1]) for d in new_fg]
return PWM(new_fg, self.bg)
MIN_VALUE = 0.00000000001
def logme(self, fg, bg):
if fg > self.MIN_VALUE and bg > self.MIN_VALUE:
ratio = fg / bg
return math.log(ratio)
# if not, one of fg and bg is practically zero
if fg > self.MIN_VALUE: # bg is zero
return math.log(fg / self.MIN_VALUE)
else: # fg is zero
return math.log(self.MIN_VALUE)
def getMatrix(self):
return self.m
def __str__(self):
outstr = '' # avoid shadowing the built-in str
for j in range(len(self.alphabet)):
outstr += "%s\t%s\n" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
return outstr
def display(self, format = 'COLUMN'):
if format == 'COLUMN':
print " \t%s" % (' '.join(" %5d" % (i + 1) for i in range(self.length)))
for j in range(len(self.alphabet)):
print "%s\t%s" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
elif format == 'JASPAR':
for j in range(len(self.alphabet)):
print "%s\t[%s]" % (self.alphabet[j], ' '.join("%+6.2f" % (y) for y in self.m[j]))
def search(self, sequence, lowerBound=0):
""" Find matches to the motif in a specified sequence. Returns a list
of results as triples: (position, matched string, score).
The optional argument lowerBound specifies a lower bound on reported
scores. """
results = []
for i in range(len(sequence)-self.length+1):
subseq = sequence[i:i + self.length]
ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
score = 0.0
for w in range(len(ndxseq)):
score += self.m[ ndxseq[w] ][ w ]
if score > lowerBound:
results.append((i, subseq, score))
return results
def maxscore(self, sequence):
""" Find matches to the motif in a specified sequence.
Returns the maximum score found in the sequence and its index as a tuple:
(maxscore, maxindex) """
maxscore = None
maxindex = None
for i in range(len(sequence)-self.length+1):
subseq = sequence[i:i + self.length]
ndxseq = [ self.alphabet.index(sym) for sym in subseq ]
score = 0.0
for w in range(len(ndxseq)):
score += self.m[ ndxseq[w] ][ w ]
if maxscore == None:
maxscore = score
maxindex = i
elif maxscore < score:
maxscore = score
maxindex = i
return (maxscore, maxindex)
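# Usage sketch (illustrative; assumes a small gap-free DNA alignment):
# >>> aln = Alignment([Sequence('TATAAT', DNA_Alphabet), Sequence('TATTAT', DNA_Alphabet)])
# >>> pwm = PWM(aln, pseudo = 0.1)
# >>> pwm.maxscore(Sequence('GGTATAATGG', DNA_Alphabet))
# returns the best log-odds score in the scanned sequence and its position.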
# Web Service Functions -------------------
def getSequence(id, database = 'uniprotkb', start=None, end=None):
""" Get the sequence identified by the given ID from the given database
(e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
object. An error is caused if the sequence ID is not found. If start and
end are given, then only that section of the sequence is returned.
Note: more flexible search options are supported by using webservice.fetch
directly."""
MAX_TRY = 5
for i in range(MAX_TRY):
try:
fastaData = fetch(id, database)
seq = readFasta(fastaData)[0]
break
except Exception:
from time import sleep
print 'Failed attempt %d of %d for id %s' % (i + 1, MAX_TRY, id)
sleep(0.1)
try:
return Sequence(seq[start:end], seq.alphabet, seq.name, seq.info)
except Exception:
raise RuntimeError('An error occurred while retrieving the specified sequence: %s (maybe the ID doesn\'t exist)' % id)
def searchSequences(query, database='uniprot'):
""" Search for sequences matching the given query in the given database
(must be 'uniprot'), and return a list of sequence IDs. """
ids = search(query, limit = None)
return ids
def runClustal(sequences, method='slow'):
""" Run a ClustalOmega alignment of the given list of Sequence objects.
Return an Alignment object. Method should be one of 'fast' or 'slow'. """
alpha = None
for seq in sequences:
if alpha == None:
alpha = seq.alphabet
elif alpha != seq.alphabet:
raise RuntimeError("Invalid alphabet: " + str(seq.alphabet) + ". Not compatible with " + str(alpha))
serviceName = 'clustalo'
resultType = 'aln-clustal'
fastaSeqs = ''.join([seq.writeFasta() for seq in sequences])
params = {'alignment': method.lower(), 'sequence': fastaSeqs}
service = EBI(serviceName)
result = service.submit(params, resultType)
alignment = readClustal(result, alpha)
return alignment
def createTree(alignment, type):
""" Run a ClustalW 2 phylogeny tree creation of either a 'Neighbour-joining'
or 'UPGMA' type tree from the given multiple sequence Alignment object. """
if not type in ['Neighbour-joining', 'UPGMA']:
raise RuntimeError('type must be either \'Neighbour-joining\' or \'UPGMA\'.')
serviceName = 'clustalw2_phylogeny'
resultType = 'tree'
output = 'dist'
clustalAln = alignment.writeClustal()
params = {'tree': output, 'sequence': clustalAln, 'clustering': type, 'tossgaps': 'true'}
service = EBI(serviceName)
tree = service.submit(params, resultType)
return tree
def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
""" Run a BLAST search of nucleotide mouse databases using the given
sequence as a query. Return a list of matched sequence IDs, in descending
order of similarity to query sequence.
program: either blastn (nucleotide) or blastp (protein)
database: many available, e.g. uniprotkb, pdb (protein); em_rel, nrnl1 (EMBL nucleotide, non-redundant resp)
(for protein see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-protein.html#database)
(for nucleotide see http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-nucleotide.html#database)
exp: E-value threshold (select only hits that have a better E-value than this)
"""
if sequence.alphabet == predefAlphabets['DNA']:
stype = 'dna'
elif sequence.alphabet == predefAlphabets['RNA']:
stype = 'rna'
else:
stype = 'protein'
serviceName = 'ncbiblast'
resultTypes = ['ids', 'out'] # request
fastaSeq = sequence.writeFasta()
databases = [database]
params = {'program': program, 'database': databases, 'sequence': fastaSeq,
'stype': stype, 'exp': exp}
service = EBI(serviceName)
idsData, output = service.submit(params, resultTypes)
ids=[]
for id in idsData.splitlines():
if len(id) > 0:
ids.append(id.split(':')[1])
return ids
if __name__ == '__main__':
seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True)
print 'Read', len(seqs), 'sequences'
'''
A module to enable experimentation with various methods for predicting properties
assigned to sequence elements, e.g. secondary structure of proteins.
A neural net wrapper class is provided.
A couple of example applications are found at the end of this module.
'''
import numpy
import sym
import prob
import sequence
import ml
def slidewin(seq, winsize):
""" Produce a list of sub-sequences of a given length from a complete sequence """
subseqs = []
for i in range(len(seq) - winsize + 1):
subseqs.append(seq[i : i + winsize])
return subseqs
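# e.g. slidewin('ABCDE', 3) gives ['ABC', 'BCD', 'CDE'] (works for strings
# and Sequence objects alike, since both support len and slicing).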
def _onehotIndex(alpha, sym):
""" Create array with "one-hot" bit codes (only adding "ones" to an all-"zero" array) """
symlen = len(sym)
alphalen = len(alpha)
indices = [ alpha.index(sym[i]) + (i * alphalen) for i in range(symlen) ]
return indices
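# Example (illustrative): with a 4-symbol DNA alphabet (A, C, G, T) and the
# window 'AG', the indices are [0, 6]: A is bit 0 of the first block of 4,
# G is bit 2 of the second block (2 + 1*4 = 6).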
class SeqNN():
""" A neural net wrapper for multinomial classification of sequence input """
def __init__(self, inp_len, inp_alpha, outp_alpha, nhidden, cascade = 0):
""" Construct a neural net with numeric inputs and outputs
depending on alphabets used for inputs and outputs.
inp_len: number of symbols to use as input
inp_alpha: input alphabet
outp_alpha: output alphabet (defines number of classes)
nhidden: number of "hidden" nodes in the net
cascade: if non-zero, number of positions to feed into a cascaded structure-to-structure NN (also the number of hidden nodes of this NN)
"""
self.nn1 = ml.NN(inp_len * len(inp_alpha), nhidden, len(outp_alpha)) # neural net
self.nn2 = None
self.cascade = cascade
if cascade > 0:
self.nn2 = ml.NN(cascade * len(outp_alpha), cascade, len(outp_alpha)) # cascaded neural net
self.inp_len = inp_len
self.inp_alpha = inp_alpha
self.outp_alpha = outp_alpha
def _encodeseq(self, seqs, targets = None):
""" Convert a list of sequences into numeric input suitable as input to NN. """
try:
len(seqs[0]) # if this does not throw error, it is a multi-input already
except TypeError: # a single sequence (and target) was given; wrap in lists
seqs = [ seqs ]
targets = [ targets ] if targets else None
totlen = 0
alpha = None
for seq in seqs:
if not alpha:
alpha = seq.alphabet
totlen += len(seq) - self.inp_len + 1
im = numpy.zeros((totlen, self.inp_len * len(alpha)))
if targets:
om = numpy.zeros((totlen, len(self.outp_alpha)))
row = 0
for i in range(len(seqs)):
subseqs = slidewin(seqs[i], self.inp_len)
if targets:
# Note how we remove the targets at the ends of the sequence
subtarg = targets[i][self.inp_len/2:-self.inp_len/2+1]
for k in range(len(subseqs)):
im[row, _onehotIndex(alpha, subseqs[k])] = 1
if targets: om[row, self.outp_alpha.index(subtarg[k])] = 1
row += 1
print "There are", row, "entries in data set"
if targets:
return im, om
else:
return im, None
def observeAll(self, seqs, targets, eta = 0.1, niter = 1):
""" Train a classifier to map from all possible windows to the target symbols.
Decompose each sequence to all full-width sub-sequences. Map each sub-sequence
to the target symbol for the symbol in the centre of the sub-sequence. """
assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
im, om = self._encodeseq(seqs, targets)
for i in range(niter): # train first NN
rmse = self.nn1.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
if not self.cascade: # if there's no cascaded NN, finish here
return rmse
nn1seqs = [] # a list of new SS sequences ...
for seq in seqs: # ... based on AA sequences
nn1seq = self.predict(seq, useCascade = False) # construct a new sequence which consists of SS predictions
nn1seqs.append(nn1seq)
im, om = self._encodeseq(nn1seqs, targets) # construct input/output patterns from SS sequences
for i in range(niter): # train cascaded NN
rmse = self.nn2.train(im, om, eta = eta, niter = 1)
print i, ":", rmse
return rmse
def testAll(self, seqs, targets):
""" Test the neural network on the specified sequences and target sequences.
Returns a confusion matrix with the predictions. """
assert len(seqs) == len(targets), "Number of input sequences needs to match the number of target sequences"
if not self.cascade:
im, om = self._encodeseq(seqs, targets)
cm = self.nn1.test(im, om)
return cm
else:
nn1seqs = []
for seq in seqs:
nn1seq = self.predict(seq, useCascade = False)
nn1seqs.append(nn1seq)
im, om = self._encodeseq(nn1seqs, targets)
cm = self.nn2.test(im, om)
return cm
def predict(self, inpseq, useCascade = True):
""" Classify each symbol in a sequence.
Return the predictions as a list of symbols. """
W = self.nn1.ninput / len(self.inp_alpha)
if useCascade and self.cascade:
nn1seq = self.predict(inpseq, useCascade = False)
subseqs = slidewin(nn1seq, self.cascade)
predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
for i in range(len(subseqs)): # for each input sub-sequence of the primary NN
invec = numpy.zeros(self.cascade * len(self.outp_alpha)) # avoid shadowing the built-in input
invec[_onehotIndex(self.outp_alpha, subseqs[i])] = 1
outvec = self.nn2.feedforward(invec)
d = prob.Distrib(self.outp_alpha)
for k in range(len(outvec)):
d.observe(self.outp_alpha[k], outvec[k])
predsyms[i + self.cascade / 2] = d.getmax() # use the symbol with the highest probability
return sequence.Sequence(predsyms, self.outp_alpha)
else: # only predict using the first NN
subseqs = slidewin(inpseq, W)
predsyms = ['C' for _ in range(len(inpseq))] # use coil for positions in flanking regions
for i in range(len(subseqs)): # for each input sub-sequence of the primary NN
invec = numpy.zeros(self.inp_len * len(self.inp_alpha)) # avoid shadowing the built-in input
invec[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
outvec = self.nn1.feedforward(invec)
d = prob.Distrib(self.outp_alpha)
for k in range(len(outvec)):
d.observe(self.outp_alpha[k], outvec[k])
predsyms[i + W / 2] = d.getmax() # use the symbol with the highest probability
return sequence.Sequence(predsyms, self.outp_alpha)
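# Usage sketch (illustrative; alphabets as in the sstruct module below;
# train_seqs/train_targets are hypothetical, paired lists of Sequence objects):
# >>> nn = SeqNN(inp_len = 7, inp_alpha = sym.Protein_Alphabet,
# ... outp_alpha = sym.DSSP3_Alphabet, nhidden = 30)
# >>> rmse = nn.observeAll(train_seqs, train_targets, eta = 0.1, niter = 20)
# >>> pred = nn.predict(test_seqs[0])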
'''
Module sstruct -- methods for protein secondary structure
'''
import sequence
import sym
cf_dict = { # Chou-Fasman table
# P(a), P(b), P(t), f(i), f(i+1), f(i+2), f(i+3)
'A': ( 142, 83, 66, 0.060, 0.076, 0.035, 0.058 ), # Alanine
'R': ( 98, 93, 95, 0.070, 0.106, 0.099, 0.085 ), # Arginine
'N': ( 67, 89, 156, 0.161, 0.083, 0.191, 0.091 ), # Asparagine
'D': ( 101, 54, 146, 0.147, 0.110, 0.179, 0.081 ), # Aspartic Acid
'C': ( 70, 119, 119, 0.149, 0.050, 0.117, 0.128 ), # Cysteine
'E': ( 151, 37, 74, 0.056, 0.060, 0.077, 0.064 ), # Glutamic Acid
'Q': ( 111, 110, 98, 0.074, 0.098, 0.037, 0.098 ), # Glutamine
'G': ( 57, 75, 156, 0.102, 0.085, 0.190, 0.152 ), # Glycine
'H': ( 100, 87, 95, 0.140, 0.047, 0.093, 0.054 ), # Histidine
'I': ( 108, 160, 47, 0.043, 0.034, 0.013, 0.056 ), # Isoleucine
'L': ( 121, 130, 59, 0.061, 0.025, 0.036, 0.070 ), # Leucine
'K': ( 114, 74, 101, 0.055, 0.115, 0.072, 0.095 ), # Lysine
'M': ( 145, 105, 60, 0.068, 0.082, 0.014, 0.055 ), # Methionine
'F': ( 113, 138, 60, 0.059, 0.041, 0.065, 0.065 ), # Phenylalanine
'P': ( 57, 55, 152, 0.102, 0.301, 0.034, 0.068 ), # Proline
'S': ( 77, 75, 143, 0.120, 0.139, 0.125, 0.106 ), # Serine
'T': ( 83, 119, 96, 0.086, 0.108, 0.065, 0.079 ), # Threonine
'W': ( 108, 137, 96, 0.077, 0.013, 0.064, 0.167 ), # Tryptophan
'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 )} # Valine
prot_alpha = sym.Protein_Alphabet
sstr_alpha = sym.DSSP3_Alphabet
def makesstr(seq, sym = '*', gap = '-'):
""" Create a string from a list of booleans (seq) that indicate with sym what elements are true.
gap is used for elements that are false.
"""
sstr = ''
for yes in seq:
if yes:
sstr += sym
else:
sstr += gap
return sstr
def markCountAbove(scores, width = 6, call_cnt = 4):
""" Create a list of booleans that mark all positions within a window
of specified width that have scores above 100.
scores: a list of scores (one for each position in sequence)
width: width of window
call_cnt: required number of positions within the window with score above 100
return: list of "calls" (positions in windows with at least call_cnt)
"""
above = [False for _ in range(len(scores))]
cnt = 0 # keep track of how many in the current window that are > 100
for i in range(len(scores)):
if scores[i] > 100: cnt += 1
if i >= width:
if scores[i - width] > 100: cnt -= 1
if cnt >= call_cnt:
for j in range(max(0, i - width + 1), i + 1):
above[j] = True
return above
def markAvgAbove(scores, width = 4, call_avg = 100.0):
""" Create a list of booleans that mark all positions within a window of specified width
that have an average score above specified call_avg.
"""
above = [False for _ in range(len(scores))]
sum = 0.0 # running total over the current window
for i in range(len(scores)):
sum += scores[i]
if i >= width: # window is full; drop the score leaving the window
sum -= scores[i - width]
if sum >= call_avg * width:
for j in range(max(0, i - width + 1), i + 1):
above[j] = True
return above
def extendDownstream(scores, calls, width = 4):
""" Create a list of booleans that mark all positions that are contained
in supplied calls list AND extend this list downstream containing a
specified width average of 100.
"""
sum = 0.0
order = range(0, len(calls) - 1, +1) # we are extending calls downstream
cnt = 0
for i in order: # extend to the right
if calls[i]: # to extend a call is required in the first place
cnt += 1
sum += scores[i] # keep a sum to be able to average
if cnt >= width: # only average over a width
sum -= scores[i - width + 1]
if not calls[i + 1] and sum + scores[i + 1] > width * 100: # check
calls[i + 1] = True
else: # no call, reset sum
cnt = 0
sum = 0.0
return calls
def extendUpstream(scores, calls, width = 4):
""" Create a list of booleans that mark all positions that are contained in supplied calls list
AND extend this list upstream containing a specified width average of 100.
"""
sum = 0.0
order = range(len(calls) - 1, 0, -1) # we are extending calls upstream/to-the-left
cnt = 0
for i in order: # extend to the left
if calls[i]: # a requirement to extend is to have a call in the first place
cnt += 1
sum += scores[i] # keep a sum to be able to average
if cnt >= width: # only average over a width
sum -= scores[i + width - 1]
if not calls[i - 1] and sum + scores[i - 1] > width * 100: # check average
calls[i - 1] = True
else: # no call, reset sum
cnt = 0
sum = 0.0
return calls
def calcRegionAverage(scores, calls):
""" Determine for each position in a calls list the average score over the region
in which it is contained.
"""
region_avg = []
sum = 0.0
cnt = 0
# First determine the average for each region
for i in range(len(scores)): # go through each position
if calls[i]: # position is part of a "called" region
sum += scores[i] # add the score of that position to the average
cnt += 1 # keep track of the number of positions in the region
else: # we are outside a "called" region
if cnt > 0: # if it is the first AFTER a called region
region_avg.append(sum/cnt) # save the average
sum = 0.0 # reset average
cnt = 0
if cnt > 0: # flush a called region that runs to the end of the sequence
region_avg.append(sum/cnt) # save the average
# with all averages known, we'll populate the sequence of "averages"
region = 0
pos_avg = []
cnt = 0
for i in range(len(scores)):
if calls[i]:
pos_avg.append(region_avg[region])
cnt += 1
else:
pos_avg.append(0)
if cnt > 0:
region += 1
cnt = 0
return pos_avg
def checkSupport(calls, diff):
""" Create a list of booleans indicating if each true position is supported
by a positive score """
supported = []
for i in range(len(calls)): # go through each position
supported.append(calls[i] and diff[i] > 0)
return supported
def getScores(seq, index = 0):
""" Create a score list for a sequence by referencing the Chou-Fasman table.
"""
return [cf_dict[s.upper()][index] for s in seq]
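# Pipeline sketch (illustrative) for calling helix regions with the helpers above;
# myprotein is a hypothetical protein Sequence (or string of amino acids):
# >>> scores = getScores(myprotein, 0) # P(a) column
# >>> calls = markCountAbove(scores, width = 6, call_cnt = 4)
# >>> calls = extendDownstream(scores, calls, width = 4)
# >>> calls = extendUpstream(scores, calls, width = 4)
# >>> print makesstr(calls, 'H')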
import math
'''
Module with methods for doing some statistics.
'''
# Fisher's Exact Test
def getFETpval(a1, a2, b1, b2, left=True):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the p-value of b's outcomes 1 and 2.
The default setting is to use the "left" side of the density
to determine the p-value.
Returns p-value."""
(prob, sless, sright, sleft, slarg)=getFETprob(a1, a2, b1, b2)
if left:
return sless
else:
return slarg
def getFET2tail(a1, a2, b1, b2):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the two-tailed p-value of b's outcomes 1 and 2.
Returns p-value."""
(prob, sless, sright, sleft, slarg)=getFETprob(a1, a2, b1, b2)
return min(1.0, sleft + sright)
def getFETprob(a1, a2, b1, b2):
"""Computes Fisher's exact test based on a
null-hypothesis distribution specified by the totals, and
an observed distribution specified by b1 and b2, i.e.
determines the probability of b's outcomes 1 and 2.
Returns an immutable list consisting of the exact
probability, and assorted p-values (sless, sright, sleft,
slarg) based on the density."""
sless = 0.0
sright = 0.0
sleft = 0.0
slarg = 0.0
n = a1 + a2 + b1 + b2
row1 = a1 + a2 # the row containing the null hypothesis
col1 = a1 + b1 # the column containing samples for outcome 1
maxval = row1 # avoid shadowing the built-in max
if col1 < maxval:
maxval = col1
minval = row1 + col1 - n # avoid shadowing the built-in min
if minval < 0:
minval = 0
if minval == maxval:
rt = (prob, sless, sright, sleft, slarg) = (1.0, 1.0, 1.0, 1.0, 1.0)
return rt
prob = hyper0(a1, row1, col1, n)
sleft = 0.0
p = hyper(minval)
i = minval + 1
while p < (0.99999999 * prob):
sleft = sleft + p
p = hyper(i)
i = i + 1
i = i - 1
if p < (1.00000001 * prob):
sleft = sleft + p
else:
i = i - 1
sright = 0.0
p = hyper(maxval)
j = maxval - 1
while p < (0.99999999 * prob):
sright = sright + p
p = hyper(j)
j = j - 1
j = j + 1
if p < (1.00000001 * prob):
sright = sright + p
else:
j = j + 1
if abs(i - a1) < abs(j - a1):
sless = sleft
slarg = 1.0 - sleft + prob
else:
sless = 1.0 - sright + prob
slarg = sright
return (prob, sless, sright, sleft, slarg)
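# Usage sketch (illustrative): for the 2x2 contingency table [[a1, a2], [b1, b2]],
# >>> pval = getFETpval(8, 2, 1, 5) # left-tail p-value
# >>> pval2 = getFET2tail(8, 2, 1, 5) # two-tailed p-value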
def lngamm(z):
# Reference: "Lanczos, C. 'A precision approximation
# of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
# Translation of Alan Miller's FORTRAN-implementation
# See http://lib.stat.cmu.edu/apstat/245
x = 0.0
x = x + 0.1659470187408462e-06/(z+7.0)
x = x + 0.9934937113930748e-05/(z+6.0)
x = x - 0.1385710331296526 /(z+5.0)
x = x + 12.50734324009056 /(z+4.0)
x = x - 176.6150291498386 /(z+3.0)
x = x + 771.3234287757674 /(z+2.0)
x = x - 1259.139216722289 /(z+1.0)
x = x + 676.5203681218835 /(z)
x = x + 0.9999999999995183
return math.log(x)-5.58106146679532777-z+(z-0.5)*math.log(z+6.5)
def lnfact(n):
if n<=1:
return 0.0
return lngamm(n+1.0)
def lnbico(n, k):
return lnfact(n)-lnfact(k)-lnfact(n-k)
def hyper_323(n11, n1_, n_1, n):
return math.exp(lnbico(n1_,n11)+lnbico(n-n1_,n_1-n11)-lnbico(n,n_1))
(_sn11, _sn1_, _sn_1, _sn, _sprob) = (0,0,0,0,0.0) # global variables used by hyper0
def hyper0(n11i, n1_i, n_1i, ni):
""" Hypergeometric probability, memoised via the module-level globals above.
Call with all four table margins to (re)initialise; subsequent calls with
only n11i (the other arguments 0) reuse the stored margins, updating
incrementally from the previous value where possible. """
global _sn11, _sn1_, _sn_1, _sn, _sprob
if not ((n1_i | n_1i | ni) != 0): # only n11i given: reuse the stored margins
if not (n11i % 10 == 0):
if n11i == _sn11 + 1: # one step up from the last n11: incremental update
_sprob = _sprob * ((_sn1_-_sn11)/float(n11i))*((_sn_1-_sn11)/float(n11i+_sn-_sn1_-_sn_1))
_sn11 = n11i
return _sprob
if n11i == _sn11 - 1: # one step down from the last n11: incremental update
_sprob = _sprob * ((_sn11)/float(_sn1_-n11i))*((_sn11+_sn-_sn1_-_sn_1)/float(_sn_1-n11i))
_sn11 = n11i
return _sprob
_sn11 = n11i
else: # all margins given: store them
_sn11 = n11i
_sn1_ = n1_i
_sn_1 = n_1i
_sn = ni
_sprob = hyper_323(_sn11, _sn1_, _sn_1, _sn) # compute from scratch
return _sprob
def hyper(n11):
return hyper0(n11,0,0,0)
def mean(X):
""" The arithmetic mean of the sample. """
total = 0.0 # accumulate as float so integer input does not truncate the result
for x in X:
total += x
return total / len(X)
def meanvar(X):
""" The mean and variance of the sample. """
mu = mean(X)
dev = 0
for x in X:
dev += (x - mu) * (x - mu)
return (mu, dev / len(X))
def getZScore(X, sample):
(mu, var) = meanvar(X)
return (sample - mu) / math.sqrt(var)
def getZScores(X):
(mu, var) = meanvar(X)
Y = [((x - mu) / math.sqrt(var)) for x in X]
return Y
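# Worked example: for X = [1.0, 2.0, 3.0], mean = 2.0 and variance = 2/3,
# so getZScores(X) gives approximately [-1.2247, 0.0, 1.2247].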
def getPearson(X, Y):
""" Pearson correlation coefficient (r). Note that we are using the standard deviation of the sample, NOT the sample standard deviation (see http://en.wikipedia.org/wiki/Standard_deviation).
"""
(Xmu, Xvar) = meanvar(X)
(Ymu, Yvar) = meanvar(Y)
if len(X) != len(Y):
raise RuntimeError('Vectors are of uneven length')
n = len(X)
sum = 0
for i in range(n):
sum += (X[i] * Y[i])
if n == 0 or Xvar == 0 or Yvar == 0:
return 0
return (sum - n * (Xmu * Ymu)) / (n * math.sqrt(Xvar) * math.sqrt(Yvar))
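# Worked example: getPearson([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) returns 1.0,
# since the second vector is an exact positive scaling of the first.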
# normal distribution
def error(x):
"""
Error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
xsq = 0.0
s = 0.0
p = 0.0
q = 0.0
s = +1
if x<0:
s = -1
x = abs(x)
if x<0.5:
xsq = x*x
p = 0.007547728033418631287834
p = 0.288805137207594084924010+xsq*p
p = 14.3383842191748205576712+xsq*p
p = 38.0140318123903008244444+xsq*p
p = 3017.82788536507577809226+xsq*p
p = 7404.07142710151470082064+xsq*p
p = 80437.3630960840172832162+xsq*p
q = 0.0
q = 1.00000000000000000000000+xsq*q
q = 38.0190713951939403753468+xsq*q
q = 658.070155459240506326937+xsq*q
q = 6379.60017324428279487120+xsq*q
q = 34216.5257924628539769006+xsq*q
q = 80437.3630960840172826266+xsq*q
result = s*1.1283791670955125738961589031*x*p/q
return result
elif x>=10:
result = s
return result
result = s*(1-errorComplement(x))
return result
def errorComplement(x):
"""
Complementary error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
p = 0.0
q = 0.0
if x<0.0:
result = 2.0-errorComplement(-x)
return result
elif x<0.5:
result = 1.0-errorComplement(x)
return result
elif x>=10:
result = 0
return result
p = 0.0
p = 0.5641877825507397413087057563+x*p
p = 9.675807882987265400604202961+x*p
p = 77.08161730368428609781633646+x*p
p = 368.5196154710010637133875746+x*p
p = 1143.262070703886173606073338+x*p
p = 2320.439590251635247384768711+x*p
p = 2898.0293292167655611275846+x*p
p = 1826.3348842295112592168999+x*p
q = 1.0
q = 17.14980943627607849376131193+x*q
q = 137.1255960500622202878443578+x*q
q = 661.7361207107653469211984771+x*q
q = 2094.384367789539593790281779+x*q
q = 4429.612803883682726711528526+x*q
q = 6089.5424232724435504633068+x*q
q = 4958.82756472114071495438422+x*q
q = 1826.3348842295112595576438+x*q
result = math.exp(-(x*x))*p/q
return result
def f(x):
"""
Normal distribution function
Returns the area under the Gaussian probability density
function, integrated from minus infinity to x
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
result = 0.5*(error(x/1.41421356237309504880)+1)
return result
def inverseError(e):
"""
Inverse of the error function
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
result = inverse(0.5*(e+1))/math.sqrt(2)
return result
def inverse(y0):
"""
Inverse of Normal distribution function
Returns the argument, x, for which the area under the
Gaussian probability density function (integrated from
minus infinity to x) is equal to y.
For small arguments 0 < y < exp(-2), the program computes
z = sqrt( -2.0 * log(y) ); then the approximation is
x = z - log(z)/z - (1/z) P(1/z) / Q(1/z).
There are two rational functions P/Q, one for 0 < y < exp(-32)
and the other for y up to exp(-2). For larger arguments,
w = y - 0.5, and x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
Cephes Math Library Release 2.8: June, 2000
Copyright 1984, 1987, 1988, 1992, 2000 by Stephen L. Moshier
"""
result = 0.0
expm2 = 0.0
s2pi = 0.0
x = 0.0
y = 0.0
z = 0.0
y2 = 0.0
x0 = 0.0
x1 = 0.0
code = 0 # int
p0 = 0.0
q0 = 0.0
p1 = 0.0
q1 = 0.0
p2 = 0.0
q2 = 0.0
MAX_VALUE = 1.e23
expm2 = 0.13533528323661269189
s2pi = 2.50662827463100050242
if y0<=0:
result = -MAX_VALUE
return result
elif y0>=1:
result = MAX_VALUE
return result
code = 1
y = y0
if y>1.0-expm2:
y = 1.0-y
code = 0
if y>expm2:
y = y-0.5
y2 = y*y
p0 = -59.9633501014107895267
p0 = 98.0010754185999661536+y2*p0
p0 = -56.6762857469070293439+y2*p0
p0 = 13.9312609387279679503+y2*p0
p0 = -1.23916583867381258016+y2*p0
q0 = 1.0
q0 = 1.95448858338141759834+y2*q0
q0 = 4.67627912898881538453+y2*q0
q0 = 86.3602421390890590575+y2*q0
q0 = -225.462687854119370527+y2*q0
q0 = 200.260212380060660359+y2*q0
q0 = -82.0372256168333339912+y2*q0
q0 = 15.9056225126211695515+y2*q0
q0 = -1.18331621121330003142+y2*q0
x = y+y*y2*p0/q0
x = x*s2pi
result = x
return result
x = math.sqrt(-(2.0*math.log(y)))
x0 = x-math.log(x)/x
z = 1.0/x
if x<8.0:
p1 = 4.05544892305962419923
p1 = 31.5251094599893866154+z*p1
p1 = 57.1628192246421288162+z*p1
p1 = 44.0805073893200834700+z*p1
p1 = 14.6849561928858024014+z*p1
p1 = 2.18663306850790267539+z*p1
p1 = -(1.40256079171354495875*0.1)+z*p1
p1 = -(3.50424626827848203418*0.01)+z*p1
p1 = -(8.57456785154685413611*0.0001)+z*p1
q1 = 1.0
q1 = 15.7799883256466749731+z*q1
q1 = 45.3907635128879210584+z*q1
q1 = 41.3172038254672030440+z*q1
q1 = 15.0425385692907503408+z*q1
q1 = 2.50464946208309415979+z*q1
q1 = -(1.42182922854787788574*0.1)+z*q1
q1 = -(3.80806407691578277194*0.01)+z*q1
q1 = -(9.33259480895457427372*0.0001)+z*q1
x1 = z*p1/q1
else:
p2 = 3.23774891776946035970
p2 = 6.91522889068984211695+z*p2
p2 = 3.93881025292474443415+z*p2
p2 = 1.33303460815807542389+z*p2
p2 = 2.01485389549179081538*0.1+z*p2
p2 = 1.23716634817820021358*0.01+z*p2
p2 = 3.01581553508235416007*0.0001+z*p2
p2 = 2.65806974686737550832*0.000001+z*p2
p2 = 6.23974539184983293730*0.000000001+z*p2
q2 = 1.0
q2 = 6.02427039364742014255+z*q2
q2 = 3.67983563856160859403+z*q2
q2 = 1.37702099489081330271+z*q2
q2 = 2.16236993594496635890*0.1+z*q2
q2 = 1.34204006088543189037*0.01+z*q2
q2 = 3.28014464682127739104*0.0001+z*q2
q2 = 2.89247864745380683936*0.000001+z*q2
q2 = 6.79019408009981274425*0.000000001+z*q2
x1 = z*p2/q2
x = x0-x1
if code!=0:
x = -x
result = x
return result
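# A round-trip check (a sketch; the helper name is illustrative): inverse() should
# invert the Normal CDF f() to well within the tolerance used here, since the
# rational approximations above are accurate to many more digits.
def _example_inverse():
    for y in [0.001, 0.1, 0.5, 0.9, 0.999]:
        assert abs(f(inverse(y)) - y) < 1e-6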
def getRSpval(a, b):
"""
Compute the Wilcoxon rank sum test (aka the Mann-Whitney U-test), return the p-value
The approximation is based on the normal distribution and is reliable
when sample sets are of size 5 or larger.
The default is based on the area of the left side of the Gaussian, relative to the
estimated z-value.
NULL hypothesis: a==b. ONE-SIDED alternative: a<b (default, left tail); for the ONE-SIDED alternative b<a (right tail) use 1 minus the returned value.
For a two-tailed test, double the p-value.
Implemented by Mikael Boden
"""
# create a new list consisting of the two sample sets that can be sorted
lst=[]
for elem in a:
lst.append([elem, +1, 0])
for elem in b:
lst.append([elem, -1, 0])
# ok sort it
lst.sort(key=lambda p: p[0]) # sort by the measurement only
# let's go through it and edit each rank
rank=0
na=0
nb=0 # the number of points in each set (A & B)
same=[] # a dynamic list to keep track of elements with same measurement
measurement=lst[0][0]
for row in lst:
if row[1]==+1: # belongs to class 'a'
na=na+1
else:
nb=nb+1
if (measurement!=row[0]): # here's an entry that differed from the previous...
# before moving on to handling the new element we need to sort out the "old" same list
firstInGroup=rank+1-len(same)
lastInGroup=rank
average=float(lastInGroup-firstInGroup)/2.0
for srow in same:
srow[2]=firstInGroup+average
same=[]
measurement=row[0]
same.append(row)
rank=rank+1
# the last batch of entries is handled outside the loop...
firstInGroup=rank+1-len(same)
lastInGroup=rank
average=float(lastInGroup-firstInGroup)/2.0
for srow in same:
srow[2]=firstInGroup+average
n=na+nb # the total number of measurements
ta_obs=0 # sum of na ranks in group A
tb_obs=0 # sum of nb ranks in group B
# sum the ranks (replace the measurements)
for entry in lst:
if entry[1]==+1: # class 'a'
ta_obs+=entry[2]
else:
tb_obs+=entry[2]
tab=ta_obs+tb_obs # sum of n ranks in groups A and B combined
sd=math.sqrt((na*nb*(n+1.0))/12.0) # the standard deviation is the same in both sets
ta_null=na*(n+1.0)/2.0 # the sum of the "null" case
tb_null=nb*(n+1.0)/2.0 # the sum of the "null" case
ta_max=na*nb+(na*(na+1.0))/2.0 # the max sum set A can take
tb_max=na*nb+(nb*(nb+1.0))/2.0 # the max sum set B can take
ua=ta_max-ta_obs # the "U" value for A which is the mirror of ...
ub=tb_max-tb_obs # the "U" value for B (we only need one)
ua_null=ta_max-ta_null # the U value for the null case
ub_null=tb_max-tb_null
if ta_obs>ta_null: # a "continuity correction" for A
da=-0.5
else:
da=+0.5
if tb_obs>tb_null: # a "continuity correction" for B
db=-0.5
else:
db=+0.5
za=((ta_obs-ta_null)+da)/sd # the z value for A which is the mirror of ...
zb=((tb_obs-tb_null)+db)/sd # the z value for B (we only need one)
p=f(za) # figure out the area of the normal distribution
u = ua # remember one of the U values (not used below, kept for reference)
return p # the p-value: null is that a==b, one-sided (a has lower values)
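# Example usage (a sketch; the data are made up). With five or more values per
# sample the normal approximation used above is considered reliable; the returned
# p-value tests the one-sided alternative that a has lower values than b.
def _example_getRSpval():
    a = [1.2, 2.3, 2.5, 3.1, 3.3]
    b = [2.9, 3.5, 4.0, 4.4, 5.1]
    p = getRSpval(a, b)
    print 'one-sided p = %.4f (two-tailed: %.4f)' % (p, 2 * p)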
def getPointBiserialCorr(group1, group2):
"""
The point biserial correlation coefficient (rpb) is a correlation coefficient used when one variable (e.g. Y) is dichotomous,
with continuous data divided into two groups (group1 and group2 here).
group1 corresponds to "greater", group2 to "lesser", i.e. 1 and 0 respectively.
See https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
"""
n1 = len(group1)
n0 = len(group2)
if n1 < 1 or n0 < 1:
raise RuntimeError('At least one group is empty')
n = n1 + n0
M1 = sum(group1) / float(n1)
M0 = sum(group2) / float(n0)
M = (M1 * n1 + M0 * n0) / float(n)
combined = [] # pool both groups (avoid shadowing the built-in all())
combined.extend(group1)
combined.extend(group2)
sn = math.sqrt(sum([(x_i - M)**2 for x_i in combined]) / float(n))
return (M1 - M0) / sn * math.sqrt((n1 * n0) / float(n**2))
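# Example usage (a sketch; the values are made up): rpb near +1 indicates that
# group1 scores are consistently higher than group2 scores.
def _example_getPointBiserialCorr():
    rpb = getPointBiserialCorr([4.0, 5.0, 6.0], [1.0, 2.0, 3.0])
    print 'point-biserial r = %.3f' % rpb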
"""
Module symbol is for defining alphabets (of symbols), and
for storing and operating on symbols and tuples (ordered or
unordered).
"""
import os
# ------------------ Alphabet ------------------
class Alphabet(object):
""" Defines an immutable biological alphabet (e.g. the alphabet for DNA is AGCT)
that can be used to create sequences (see sequence.py).
We use alphabets to define "tuple" tables, where entries are keyed by combinations
of symbols of an alphabet (see class TupleStore below).
Alphabets are used to define probability distributions for stochastic events
(see prob.py). """
def __init__(self, symbolString):
""" Construct an alphabet from a string of symbols. Lower case characters
will be converted to upper case, repeated characters are ignored.
Example of constructing the DNA alphabet:
>>> alpha = Alphabet('ACGTttga')
>>> alpha.symbols
('A', 'C', 'G', 'T') """
# Add each symbol to the symbols list, one at a time, and ignore doubles (could use "set" here...)
_symbols = [] # create a temporary list
for s in symbolString:
if not str(s).upper()[0] in _symbols:
_symbols.append(str(s).upper()[0])
_symbols.sort() # we put them in alphabetical (one canonical) order
# OK done extracting, put them in place
self.symbols = tuple(_symbols) # create the immutable tuple from the extracted list
self.length = len(self.symbols)
self.annotations = {}
def __str__(self):
return str(self.symbols)
def __len__(self):
return len(self.symbols)
def __iter__(self):
return self.symbols.__iter__()
def __getitem__(self, ndx):
""" Retrieve the symbol(s) at the specified index (or slice of indices) """
return self.symbols[ndx]
def __contains__(self, sym):
""" Check if the given symbol is a member of the alphabet. """
return sym in self.symbols
def index(self, sym):
""" Retrieve the index of the given symbol in the alphabet. """
# If the symbol is valid, use the tuple's index function
if sym in self.symbols:
syms = self.symbols
return syms.index(sym)
else:
raise RuntimeError('Symbol %s is not indexed by alphabet %s' % (sym, str(self.symbols)))
def __eq__(self, rhs):
""" Test if the rhs alphabet is equal to ours. """
if rhs == None:
return False
if len(rhs) != len(self):
return False
# OK we know they're same size...
for sym in self.symbols:
if not sym in rhs:
return False
return True
def isSubsetOf(self, alpha2):
""" Test if this alphabet is a subset of alpha2. """
for sym in self.symbols:
if not sym in alpha2: # membership test via Alphabet.__contains__ (there is no isValidSymbol method)
return False
return True
def isSupersetOf(self, alpha2):
""" Test if this alphabet is a superset of alpha2. """
return alpha2.isSubsetOf(self)
def annotateSym(self, label, sym, value):
try:
lookup = self.annotations[label]
except KeyError:
lookup = self.annotations[label] = {}
lookup[sym] = value
def annotateAll(self, label, symdictOrFilename):
if isinstance(symdictOrFilename, str): # we assume it is a filename
fh = open(symdictOrFilename)
string = fh.read()
d = {}
for line in string.splitlines():
if len(line.strip()) == 0:
continue
sections = line.split()
symstr, value = sections[0:2]
for sym in symstr:
d[sym] = value
fh.close()
else: # we assume it is a dictionary
d = symdictOrFilename
for sym in d:
self.annotateSym(label, sym, d[sym])
def getAnnotation(self, label, sym):
try:
lookup = self.annotations[label]
return lookup[sym]
except KeyError:
return None
""" Below we declare alphabets that are going to be available when
this module is imported """
Bool_Alphabet = Alphabet('TF')
DNA_Alphabet = Alphabet('ACGT')
DNA_Alphabet_wN = Alphabet('ACGTN')
RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Protein_Alphabet_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC')
predefAlphabets = {'DNA': DNA_Alphabet,
'RNA': RNA_Alphabet,
'DNAwN': Alphabet('ACGTN'),
'RNAwN': Alphabet('ACGUN'),
'Protein': Protein_Alphabet,
'ProteinwX': Protein_wX}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder = ['DNA', 'RNA', 'DNAwN', 'RNAwN', 'Protein', 'ProteinwX']
# Useful annotations
DNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','T':'#66bbff'})
RNA_Alphabet.annotateAll('html-color', {'A':'green','C':'orange','G':'red','U':'#66bbff'})
Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
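# Example usage (a sketch; the helper name is illustrative): symbol lookup and
# annotation retrieval on the predefined DNA alphabet.
def _example_alphabet():
    print DNA_Alphabet.index('G')                        # 2
    print DNA_Alphabet.getAnnotation('html-color', 'T')  # '#66bbff'
    print 'N' in DNA_Alphabet                            # False (use DNA_Alphabet_wN for that)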
# ------------------ Substitution Matrix ------------------
class TupleStore(dict):
""" Internal utility class that can be used for associating
a value with ordered n-tuples (n=1..N).
Read/write functions are defined for instances of this class.
"""
def __init__(self, alphas=None, entries=None, sparse=True):
"""
Manage entries keyed by symbol-tuples with values of arbitrary type.
If alphas is None, the alphabet(s) are inferred from the provided entries.
If entries is None, all entries are defined by possible combinations of symbols from specified alphabets,
and are assumed to be None until specified. Either alphas or entries must be supplied.
If sparse is True, a sparse memory-saving encoding is used, if false, a time-saving, more flexible encoding is used.
>>> matrix = TupleStore(entries = {'AA': 2, 'AW': -3, 'WW': 4, 'AR': -1})
>>> matrix['AW']
-3
>>> matrix['AR']
-1
"""
assert sparse, "Currently only sparse encoding is implemented."
assert alphas or entries, "Either alphabets or entries (from which alphabets can be inferred) must be supplied."
self.sparse = sparse # sparse encoding if true
if alphas == None:
self.alphas = None # need to figure out alphabet from supplied entries
self.keylen = None # tuple length not known yet
elif type(alphas) is Alphabet:
self.alphas = tuple ([ alphas ]) # make it into a tuple
self.keylen = 1 # tuple length 1
else:
self.alphas = alphas # alphabets are supplied
self.keylen = len(alphas) # length of tuples is the same as the number of alphabets
# Check if entries are supplied to the constructor
if entries == None:
self.entries = entries = {}
elif type(entries) is not dict:
raise RuntimeError("When specified, entries must be a dictionary")
else:
self.entries = {} # supplied entries are validated and copied in below
# Check length of tuples, must be the same for all
for entry in entries:
if self.keylen == None:
self.keylen = len(entry)
elif self.keylen != len(entry):
raise RuntimeError("All entries must have the same number of symbols")
# go through each position in tuples, to check what alphabet is right
myalphas = [] # my suggestions from entries (need to be subsets of specified)
for idx in range(self.keylen):
symset = set() # we collect all symbols in position idx here
for key in entries:
symset.add(key[idx])
myalpha = Alphabet(symset)
myalphas.append(myalpha)
if self.alphas != None: # if specified it needs to be a superset of that we constructed
if not self.alphas[idx].isSupersetOf(myalpha):
raise RuntimeError("Specified alphabet is not compatible with specified entries")
if self.alphas == None: # if not specified to constructor use those we found
self.alphas = tuple(myalphas)
for key in entries:
self[key] = entries[key]
def _isValid(self, symkey):
for idx in range(self.keylen):
if not symkey[idx] in self.alphas[idx]:
return False
return True
def __setitem__(self, symkey, value):
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
self.entries[symkey] = value
def __getitem__(self, symkey):
""" Return the score matching the given symbols together."""
assert self.keylen == len(symkey), "Entries must be of the same length"
try:
return self.entries[symkey]
except KeyError:
return None
def __iadd__(self, symkey, ivalue):
""" Add ivalue to the entry keyed by symkey. Note: takes two arguments, so it
must be called explicitly (the += operator cannot supply both). """
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
try:
self.entries[symkey] += ivalue
except KeyError:
self.entries[symkey] = ivalue
def __isub__(self, symkey, ivalue):
""" Subtract ivalue from the entry keyed by symkey; called explicitly, like __iadd__ above. """
assert self.keylen == len(symkey), "All entries in dictionary must be equally long"
assert self._isValid(symkey), "Invalid symbol in entry"
try:
self.entries[symkey] -= ivalue
except KeyError:
self.entries[symkey] = -ivalue
def getAll(self, symkey=None):
""" Return the values matching the given symbols together.
symkey: tuple (or list) of symbols or None (symcount symbol); if tuple is None, all entries are iterated over.
"""
if symkey == None:
symkey = []
for idx in range(self.keylen):
symkey.append(None)
else:
assert self.keylen == len(symkey), "Entries must be of the same length"
for idx in range(self.keylen):
if symkey[idx] != None:
if not symkey[idx] in self.alphas[idx]:
raise RuntimeError("Invalid entry: must be symbols from specified alphabet or None")
return TupleEntries(self, symkey)
def __iter__(self):
return TupleEntries(self, tuple([None for _ in range(self.keylen)]))
def items(self, sort = False):
""" In a dictionary-like way return all entries as a list of 2-tuples (key, prob).
If sort is True, entries are sorted in descending order of value.
Note that this function should NOT be used for big (>5 variables) tables."""
ret = []
for s in self.entries:
if self[s] != None:
ret.append((s, self[s]))
if sort:
return sorted(ret, key=lambda v: v[1], reverse=True)
return ret
class TupleEntries(object):
""" Iterator class for multiple entries in a tuple store.
"""
def __init__(self, tuplestore, symkey):
self.tuplestore = tuplestore
self.symkey = symkey
self.symcount = []
self.indices = []
for ndx in range(tuplestore.keylen):
if symkey[ndx] == None:
self.indices.append(ndx)
self.symcount.append(0) # start at this index to alter symbol
else:
self.symcount.append(None) # do not alter this symbol
self.nextIsLast = False
def __iter__(self):
return self
def next(self):
""" Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations."""
if self.nextIsLast:
raise StopIteration
mykey = [] # construct current combination from known and unspecified symbols
for ndx in range(self.tuplestore.keylen):
if (self.symkey[ndx] == None):
sym = self.tuplestore.alphas[ndx][self.symcount[ndx]]
mykey.append(sym)
else:
mykey.append(self.symkey[ndx])
# decide which ndx that should be increased (only one)
self.nextIsLast = True # assume this is the last round (all counters are re-set)
for ndx in self.indices:
if self.symcount[ndx] == len(self.tuplestore.alphas[ndx]) - 1: # if we just entered the last symbol of this alphabet
self.symcount[ndx] = 0 # reset count here
else:
self.symcount[ndx] = self.symcount[ndx] + 1
self.nextIsLast = False
break
return tuple(mykey)
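# Example usage (a sketch; the helper name is illustrative): getAll() with None as
# a wildcard iterates over every combination in that position. Iteration yields
# tuples, while entries set with string keys are retrieved with string keys, hence
# the join below; combinations without a value come back as None.
def _example_tuplestore():
    matrix = TupleStore(entries={'AA': 2, 'AW': -3, 'WW': 4, 'AR': -1})
    for key in matrix.getAll(('A', None)):   # first symbol fixed to 'A'
        print key, matrix[''.join(key)]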
import urllib, urllib2
import os
from time import sleep
import stats
from StringIO import StringIO
import gzip
""" This module is collection of functions for accessing the EBI REST web services,
including sequence retrieval, searching, gene ontology, BLAST and ClustalW.
The class EBI takes precautions so as not to send too many requests when
performing BLAST and ClustalW queries.
See
http://www.ebi.ac.uk/Tools/webservices/tutorials/01_intro and
http://www.ebi.ac.uk/Tools/webservices/tutorials/02_rest
http://www.ebi.ac.uk/Tools/webservices/tutorials/06_programming/python/rest/urllib
"""
__ebiUrl__ = 'http://www.ebi.ac.uk/Tools/' # Use UQ mirror when available
__ebiGOUrl__ = 'http://www.ebi.ac.uk/QuickGO/' # Use UQ mirror when available
__uniprotUrl__ = 'http://www.uniprot.org/' #
def fetch(entryId, dbName='uniprotkb', format='fasta'):
"""
Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE' (database dependent; examples for uniprotkb)
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
format: file format specific to database e.g. 'fasta' or 'uniprot' for uniprotkb (see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases)
See http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp for more info re URL syntax
"""
# Construct URL
url = __ebiUrl__ + 'dbfetch/dbfetch?style=raw&db=' + dbName + '&format=' + format + '&id=' + entryId
# Get the entry
try:
data = urllib2.urlopen(url).read()
if data.startswith('ERROR'):
raise RuntimeError(data)
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
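# Example usage (a sketch; performs a live HTTP request against EBI dbfetch, so
# it requires network access). P63166 is the mouse SUMO1 accession mentioned above.
def _example_fetch():
    print fetch('P63166', dbName='uniprotkb', format='fasta')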
def search(query, dbName='uniprot', format='list', limit=100):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
query: search term(s) e.g. 'organism:9606+AND+antigen'
dbName: name of database e.g. 'uniprot', "refseq:protein", "refseq:pubmed"
format: file format e.g. 'list', 'fasta' or 'txt'
limit: max number of results (specify None for all results)
See http://www.uniprot.org/faq/28 for more info re UniprotKB's URL syntax
See http://www.ncbi.nlm.nih.gov/books/NBK25499/ for more on NCBI's E-utils
"""
if dbName.startswith('uniprot'):
# Construct URL
if limit == None: # no limit to number of results returned
url = __uniprotUrl__ + dbName + '/?format=' + format + '&query=' + query
else:
url = __uniprotUrl__ + dbName + '/?format=' + format + '&limit=' + str(limit) + '&query=' + query
# Get the entries
try:
data = urllib2.urlopen(url).read()
if format == 'list':
return data.splitlines()
else:
return data
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
elif dbName.startswith('refseq'):
dbs = dbName.split(":")
if len(dbs) > 1:
dbName = dbs[1]
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + "esearch.fcgi?db=" + dbName + "&term=" + query + "&retmax=" + str(limit)
# Get the entries
try:
data = urllib2.urlopen(url).read()
words = data.split("</Id>")
words = [w[w.find("<Id>")+4:] for w in words[:-1]]
if format == 'list':
return words
elif format == 'fasta' and len(words) > 0:
url = base + "efetch.fcgi?db=" + dbName + "&rettype=fasta&id="
for w in words:
url += w + ","
data = urllib2.urlopen(url).read()
return data
else:
return ''
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
return
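# Example usage (a sketch; performs a live HTTP request): list the first ten
# UniProtKB identifiers matching the query from the docstring above.
def _example_search():
    for entry_id in search('organism:9606+AND+antigen', dbName='uniprot', format='list', limit=10):
        print entry_id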
authorised_database_tag = {9606: ['Homo sapiens', 'ACC', 'ID'],
3702: ['Arabidopsis thaliana', 'TAIR_ID'],
4932: ['Saccharomyces cerevisiae', 'SGD_ID', 'CYGD_ID'],
10090: ['Mus musculus', 'MGI_ID']}
def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False):
"""
Map identifiers between databases (based on UniProtKB; see http://www.uniprot.org/faq/28)
identifiers: a list of identifiers (list of strings)
frm: the tag/abbreviation for the identifier FROM which to idmap
to: the tag/abbreviation for the identifier TO which to idmap
format: the results format to use
reverse: reverse the returned mapping key (to) -> value (from)
Returns a dictionary with key (from) -> value (to)
Set reverse to True if dictionary should contain the reverse mapping, useful if the mapping is non-unique
"""
url = __uniprotUrl__ + 'mapping/'
# construct query by concatenating the list of identifiers
if isinstance(identifiers, str):
query = identifiers.strip()
else: # assume it is a list of strings
query = ''
for id in identifiers:
query = query + id.strip() + ' '
query = query.strip() # remove trailing spaces
params = {
'from' : frm,
'to' : to,
'format' : format,
'query' : query
}
if len(query) > 0:
request = urllib2.Request(url, urllib.urlencode(params))
response = urllib2.urlopen(request).read()
d = dict()
for row in response.splitlines()[1:]:
pair = row.split('\t')
if not reverse:
d[pair[0]] = pair[1]
else:
d[pair[1]] = pair[0]
return d
else:
return dict()
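# Example usage (a sketch; performs a live HTTP request; the accessions are
# illustrative): map UniProtKB accessions to RefSeq protein identifiers.
def _example_idmap():
    d = idmap(['P63166', 'P63165'], frm='ACC', to='P_REFSEQ_AC')
    for acc in d:
        print acc, '->', d[acc]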
"""
Gene Ontology service (QuickGO)
http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def getGOReport(positives, background = None, database = 'UniProtKB'):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
(GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str]).
E-value is a Bonferroni-corrected p-value.
"""
pos = set(positives)
fg_map = getGOTerms(pos, database)
fg_list = []
for id in fg_map:
for t in fg_map[id]:
fg_list.append(t)
bg_map = {}
bg_list = []
neg = set()
if background != None:
neg = set(background).difference(pos)
bg_map = getGOTerms(neg, database)
for id in bg_map:
for t in bg_map[id]:
bg_list.append(t)
term_set = set(fg_list)
term_cnt = {}
nPos = len(pos)
nNeg = len(neg)
if background == None:
for t in term_set:
term_cnt[t] = fg_list.count(t)
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1], reverse=True)
else: # a background is provided
for t in term_set:
fg_hit = fg_list.count(t)
bg_hit = bg_list.count(t)
fg_nohit = nPos - fg_hit
bg_nohit = nNeg - bg_hit
term_cnt[t] = (fg_hit, bg_hit, stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False))
sorted_cnt = sorted(term_cnt.items(), key=lambda v: v[1][2], reverse=False)
ret = []
for t in sorted_cnt:
defin = getGODef(t[0])
if background != None:
ret.append((t[0], t[1][2] * len(term_set), t[1][0], t[1][1], defin['name'])) # (term, E-value, fg count, bg count, name)
else:
ret.append((t[0], t[1], defin['name']))
return ret
def getGODef(goterm):
"""
Retrieve information about a GO term
goterm: the identifier, e.g. 'GO:0002080'
"""
# Construct URL
url = __ebiGOUrl__ + 'GTerm?format=obo&id=' + goterm
# Get the entry: fill in the fields specified below
try:
entry={'id': None, 'name': None, 'def': None}
data = urllib2.urlopen(url).read()
for row in data.splitlines():
index = row.find(':')
if index > 0 and len(row[index:]) > 1:
field = row[0:index].strip()
value = row[index+1:].strip(' "') # remove spaces and quotation marks
if field in entry.keys(): # check if we need this field
if entry[field] == None: # check if not yet assigned
entry[field] = value
return entry
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
"""
Retrieve all GO terms for a given set of genes (or single gene).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
The result is given as a map (key=gene name, value=list of unique terms) OR
in the case of a single gene as a list of unique terms.
If completeAnnot is True (default is False) then the above "terms" is the first element
in a tuple with (gene-terms-map, gene-taxon-id).
"""
if type(genes) != list and type(genes) != set and type(genes) != tuple:
genes = [genes]
termsmap = dict()
taxonmap = dict()
uri_string = 'GAnnotation?format=tsv&gz&db=' + database + '&protein='
# build queries (batches of genes)
queryLength = 2000
queries = []
query = None
for gene in genes:
if query == None:
query = gene
elif len(query) < queryLength:
query += ','+gene
else:
queries.append(query)
query = gene
if query != None:
queries.append(query)
# execute queries, each involving a number of genes
for query in queries:
# Construct URL
url = __ebiGOUrl__ + uri_string + query
# Get the entry: fill in the fields specified below
try:
urlreq = urllib2.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
else:
data = response.read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
key = values[1]
if termsmap.has_key(key):
termsmap[key].add(values[6])
else:
termsmap[key] = set([values[6]])
taxonmap[key] = int(values[4])
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if completeAnnot:
if len(genes) == 1:
if len(termsmap) == 1:
return (termsmap[genes[0]], taxonmap[genes[0]])
else:
return (set(), None)
else:
return (termsmap, taxonmap)
else:
if len(genes) == 1:
if len(termsmap) == 1:
return termsmap[genes[0]]
else:
return set()
else:
return termsmap
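# Example usage (a sketch; performs a live HTTP request and can be slow): GO terms
# annotated to a single UniProtKB entry are returned as a set of term identifiers.
def _example_getGOTerms():
    for term in getGOTerms('P63166'):
        print term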
def getGenes(goterms, database='UniProtKB', taxo=None):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
database: use specified database, e.g. 'UniProtKB', 'UniGene', or 'Ensembl'
taxo: use specific taxonomic identifier, e.g. 9606 (human)
The result is given as a map (key=GO term, value=list of genes) OR
in the case of a single GO term as a list of genes.
"""
if type(goterms) != list and type(goterms) != set and type(goterms) != tuple:
goterms = [goterms]
map = dict()
if taxo == None:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&term='
else:
uri_string = 'GAnnotation?format=tsv&db=' + database + '&tax=' + str(taxo) + '&term='
for goterm in goterms:
genes = set()
# Construct URL
url = __ebiGOUrl__ + uri_string + goterm.strip()
# Get the entry: fill in the fields specified below
try:
data = urllib2.urlopen(url).read()
for row in data.splitlines()[1:]: # we ignore first (header) row
values = row.split('\t')
if len(values) >= 7:
genes.add(values[1])
map[goterm] = list(genes)
except urllib2.HTTPError, ex:
raise RuntimeError(ex.read())
if len(goterms) == 1:
return map[goterms[0]]
else:
return map
class EBI(object):
__email__ = 'anon@uq.edu.au' # to whom emails about jobs should go
__ebiServiceUrl__ = 'http://www.ebi.ac.uk/Tools/services/rest/' # Use UQ mirror when available
__checkInterval__ = 2 # how long to wait between checking job status
def __init__(self, service=None):
""" Initialise service session.
service: presently, ncbiblast and clustalw2 are supported. Use None (default) for fetch/idmap jobs.
"""
self.service = service
self.lockFile = '%s.lock' % service
def createLock(self):
""" Create a lock file to prevent submission of more than 1 job
at a time by a single user. """
fh = open(self.lockFile, 'w')
fh.write(self.jobId)
fh.close()
def removeLock(self):
""" Remove the lock file. """
os.remove(self.lockFile)
def isLocked(self):
""" Check if there is a lock on this service. If there is, check if
the job is complete, and if so remove the lock. Return True if still
locked and False if not. """
if os.path.exists(self.lockFile):
fh = open(self.lockFile, 'r')
jobId = fh.read()
fh.close()
status = self.status(jobId)
if status == 'RUNNING':
self.jobId = jobId
return True
else:
self.removeLock()
return False
else:
return False
"""
BLAST and CLUSTALW services
"""
def run(self, params):
""" Submit a job to the given service with the given parameters, given
as a dictionary. Return the jobId. """
if self.service == None:
raise RuntimeError('No service specified')
if self.isLocked():
raise RuntimeError("""You currently have a %s job running. You must
wait until it is complete before submitting another job. Go to
%sstatus/%s to check the status of the job.""" % (self.service, self.__ebiServiceUrl__, self.jobId))
url = self.__ebiServiceUrl__ + self.service + '/run/'
# ncbiblast database parameter needs special handling
if self.service == 'ncbiblast':
databaseList = params['database']
del params['database']
databaseData = ''
for db in databaseList:
databaseData += '&database=' + db
encodedParams = urllib.urlencode(params)
encodedParams += databaseData
else:
encodedParams = urllib.urlencode(params)
print url
self.jobId = urllib2.urlopen(url, encodedParams).read()
self.createLock()
return self.jobId
def status(self, jobId=None):
""" Check the status of the given job (or the current job if none is
specified), and return the result. """
if jobId is None:
jobId = self.jobId
url = self.__ebiServiceUrl__ + self.service + '/status/%s' % jobId
status = urllib2.urlopen(url).read()
return status
def resultTypes(self):
""" Get the available result types. Will only work on a finished job. """
url = self.__ebiServiceUrl__ + self.service + '/resulttypes/%s' % self.jobId
resultTypes = urllib2.urlopen(url).read()
return resultTypes
def result(self, resultType):
""" Get the result of the given job of the specified type. """
url = self.__ebiServiceUrl__ + self.service + '/result/%s/%s' % (self.jobId, resultType)
try:
result = urllib2.urlopen(url).read()
if resultType == 'error':
raise RuntimeError('An error occurred: %s' % result)
except urllib2.HTTPError:
if resultType == 'error':
raise RuntimeError('An unknown error occurred while processing the job (check your input)')
else:
self.result('error')
return result
def submit(self, params, resultTypes):
""" Submit a new job to the service with the given parameters.
Return the output in the specified format. """
params['email'] = self.__email__
self.run(params)
print 'Submitted new', self.service, 'job, jobId:', self.jobId
print 'Please be patient while the job is completed'
status = 'RUNNING'
observe = 0
while status == 'RUNNING':
observe = observe + 1
status = self.status()
sleep(self.__checkInterval__)
if status != 'FINISHED':
raise RuntimeError('An error occurred and the job could not be completed')
print 'Job complete.'
self.removeLock()
if type(resultTypes) != list:
resultTypes = [resultTypes]
results = []
for resultType in resultTypes:
results.append(self.result(resultType))
if len(results) == 1:
return results[0]
else:
return results
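# Example usage (a sketch; submits a live job and blocks until it finishes).
# The parameter name 'sequence' and result type 'aln-clustalw' follow the EBI REST
# documentation of the time and may need adjusting; the sequences are made up.
def _example_ebi_clustalw():
    ebi = EBI('clustalw2')
    aln = ebi.submit({'sequence': '>a\nACDEFGHIKL\n>b\nACDEFGHIKV\n'}, 'aln-clustalw')
    print aln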
#!/usr/bin/python
import sys, math, random, getopt
import numpy as np
import matplotlib.pyplot as plt
import prob as prb
import sequence
import sym # for sym.DNA_Alphabet etc. used below
import stats
from rcdict import *
import operator # for use with key= in max() function
import binomial
def slidewin(seq, winsize):
""" Produce a list of sub-sequences of a given length from a complete sequence """
subseqs = []
for i in range(len(seq) - winsize + 1):
subseqs.append(seq[i : i + winsize])
return subseqs
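# A worked example (a sketch; the helper name is illustrative): windows of width 3
# over a short string.
def _example_slidewin():
    assert slidewin('GATTACA', 3) == ['GAT', 'ATT', 'TTA', 'TAC', 'ACA']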
def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
""" Produce a report of enriched words of specified length.
seqs: DNA sequence data
WordWidth: length of sought words
PeakWidth: width of window around centre of sequence
PeakMargin: the width of the margin on each side of the centre window
(which delineates the positives around peak from negatives away from peak). """
pos = RCDict() # reverse complement-aware dictionary for DNA
neg = RCDict() # reverse complement-aware dictionary for DNA
for seq in seqs:
centre = len(seq)/2 # find peak
""" Construct all words around peak (positives) and count their presence """
words = set(slidewin(seq[centre-PeakWidth/2:centre+PeakWidth/2], WordWidth))
for word in words:
try:
pos[word] += 1
except KeyError:
pos[word] = 1
""" Construct all words away from peak (negatives) and count """
words = set(slidewin(seq[:centre-PeakWidth/2-PeakMargin], WordWidth))
words = words.union(slidewin(seq[centre+PeakWidth/2+PeakMargin:], WordWidth)) # union returns a new set, so re-assign
for word in words:
try:
neg[word] += 1
except KeyError:
neg[word] = 1
logratio = RCDict() # DNA dictionary for storing the log-ratio between pos and neg counts
for (word, cnt_pos) in pos.items():
cnt_neg = 0.0001
try:
cnt_neg = neg[word]
except KeyError:
pass
logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))
allpos = logratio.items() # extract all pairs of words:log-ratio
sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
print "Enriched words (sorted by ln pos/neg)"
print "Word \tln pos/neg\tE-value"
for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
cnt_pos = int(pos[word])
try: cnt_neg = int(neg[word])
except KeyError: cnt_neg = 0
# Compute p-value using Fisher's Exact test
pval = stats.getFETpval(cnt_pos, cnt_neg, len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos, len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg, False)
# Correct for multiple testing (very conservatively)
e_value = pval * len(allpos) # avoid shadowing the built-in eval()
print "%s\t%6.3f \t%e" % (word, lgr, e_value)
def getReverse(distribs):
""" Construct a new list of probability distributions of DNA, by
1. swapping their order, and
2. swapping A's and T's, and C's and G's """
return [d.swapxcopy('A','T').swapxcopy('C','G') for d in distribs[::-1]] # backwards
def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
""" Produce a plot for a scan of the specified motif.
The plot has as its x-axis position of sequence, and
the y-axis the cumulative, non-negative PWM score over all sequences. """
# check that all sequences are the same length and set sequence length
seq_len = len(seqs[0])
for seq in seqs:
if len(seq) != seq_len:
usage(sys.argv[0], "All sequences must have same length")
return
# create the motif and its reverse complement
bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
d = prb.readMultiCounts(jaspar)
try:
fg1 = d[motif]
fg2 = getReverse(d[motif])
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
# initialize things to zero
avg_motif_score = np.zeros(seq_len)
# compute average score at each position (on both strands) in sequences
i_seq = 0
motif_width = pwm1.length
for seq in seqs:
i_seq += 1
# print >> sys.stderr, "Scoring seq: %4d\r" % (i_seq),
# positive strand
hits = pwm1.search(seq, threshold)
pos_scores = seq_len * [0]
for hit in hits:
# mark hit at *center* of site (hence motif_width/2)
pos_scores[hit[0]+(motif_width/2)] = hit[2]
# negative strand
hits = pwm2.search(seq, threshold)
neg_scores = seq_len * [0]
for hit in hits:
neg_scores[hit[0]+(motif_width/2)] = hit[2]
# use maximum score on two strands
for i in range(seq_len):
score = max(pos_scores[i], neg_scores[i])
if (score > threshold):
avg_motif_score[i] += score
# compute average score
for i in range(seq_len):
avg_motif_score[i] /= len(seqs)
# hw = 5 # window width is 2*hw + 1
# smoothed_avg_motif_score = np.zeros(seq_len)
# for i in range(hw, seq_len-motif_width+1-hw):
# smoothed_avg_motif_score[i]=sum(avg_motif_score[i-hw:i+hw+1])/(2*hw+1)
# plot the average score curve
# print >> sys.stderr, ""
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
lbl = "%s" % (motif)
plt.plot(x, avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
plt.axhline(color='black', linestyle='dotted')
plt.legend(loc='lower center')
plt.xlabel('position')
plt.ylabel('average motif score')
plt.title(motif)
plt.show()
def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices.txt', seed=0):
""" Produce a plot for a scan of the specified motif.
The plot has as its x-axis position of sequence, and
the y-axis the number of sequences with a best hit at position x.
Sequences with no hit above 'threshold' are ignored.
Ties for best hit are broken randomly.
The p-value of the central region that is most "centrally enriched"
and the width of the best central region is printed in the label
of the plot.
"""
# set the random seed for repeatability
random.seed(seed)
# Copy the code from your "improved" version of scanMotifReport()
# to here, and follow the instructions in the Prac to develop this
# new function.
# check that all sequences are the same length and set sequence length
seq_len = len(seqs[0])
for seq in seqs:
if len(seq) != seq_len:
usage(sys.argv[0], "All sequences must have same length")
return
# create the motif and its reverse complement
bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
d = prb.readMultiCounts(jaspar)
try:
fg1 = d[motif]
fg2 = getReverse(d[motif])
except KeyError:
usage(sys.argv[0], "Unknown motif %s" % motif)
return
print "Motif %s:" % motif
pwm1 = sequence.PWM(fg1, bg)
pwm1.display(format='JASPAR')
print "Motif %s (reverse complement):" % motif
pwm2 = sequence.PWM(fg2, bg)
pwm2.display(format='JASPAR')
# initialize things to zero
hit_count = np.zeros(seq_len)
n_seqs_with_hits = 0.0
# Scan each sequence for all hits on both strands and record
# the number of "best hits" at each sequence position.
#
motif_width = pwm1.length
i_seq = 0
for seq in seqs:
i_seq += 1
# print >> sys.stderr, "Scoring seq: %4d\r" % (i_seq),
# scan with both motifs
hits = pwm1.search(seq, threshold) + pwm2.search(seq, threshold)
# Record position of best hit
if (hits):
n_seqs_with_hits += 1
# find best hit score
best_score = max(hits, key=operator.itemgetter(2))[2] # hit[2] holds the score
# find ties
best_hits = [ hit for hit in hits if hit[2] == best_score ]
# break ties at random
best_hit = random.choice(best_hits)
# mark hit at *center* of site (hence pwm1.length/2)
hit_count[best_hit[0] + pwm1.length/2] += 1
# convert counts to probabilities: divide each position's best-hit count by the number of sequences with a hit
site_probability = [ (cnt/n_seqs_with_hits) for cnt in hit_count ]
print >> sys.stderr, "Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits)
# STATISTICS
# Get the cumulative hit counts in concentric windows
# and perform the Binomial Test. Report best region and its p-value.
#
best_r = 0
best_log_pvalue = 1
center = seq_len/2 # center of sequence
cum_hit_count = np.zeros(seq_len) # total hits in the central window of half-width i
for i in range(1, (seq_len - pwm1.length/2 + 1)/2):
cum_hit_count[i] = cum_hit_count[i-1] + hit_count[center-i] + hit_count[center+i]
# Compute probability of observed or more best hits in central window
# assuming uniform probability distribution in each sequence.
# successes = cum_hit_count[i]
# trials = n_seqs_with_hits
# p_success = ?
# log_pvalue = ?
# if (log_pvalue < best_log_pvalue):
# best_log_pvalue = log_pvalue
# best_r = 2*i
# End STATISTICS
hw = 5
smoothed_site_probability = np.zeros(seq_len)
for i in range(hw, seq_len-motif_width+1-hw):
smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)
x = range(-(seq_len/2), (seq_len/2)) # call center of sequence X=0
lbl = "%s, t=%.2f" % (motif, threshold)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt.plot(x, smoothed_site_probability, label=lbl)
plt.axhline(color='black', linestyle='dotted')
plt.legend(loc='lower center')
plt.xlabel('Position of best site')
plt.ylabel('Smoothed probability')
plt.title(motif)
plt.show()
def usage(name, errmsg = None):
if errmsg != None:
print "Error: %s" % errmsg
print """Usage: %s [options]
-f <fasta-filename> (required)
-d discover enriched words
-w <word width, default 8>
-p <peak width, default 100>
-m <peak margin, default 100>
-s <JASPAR-ID> scan for JASPAR motif
-h print this help""" % name
if __name__ == '__main__':
try:
optlst, args = getopt.getopt(sys.argv[1:], 'f:hds:j:w:p:m:')
except getopt.GetoptError, err:
usage(sys.argv[0], str(err))
sys.exit(2)
FILENAME = None
DISCOVER_MODE = False
SCAN_MODE = False
WORD_WIDTH = 8
PEAK_WIDTH = 100
PEAK_MARGIN = 100
MOTIF_ID = 'MA0112.2'
JASPAR_FILE = 'JASPAR_matrices.txt'
for o, a in optlst:
if o == '-h': usage(sys.argv[0]); sys.exit(0)
elif o == '-f': FILENAME = a
elif o == '-d': DISCOVER_MODE = True
elif o == '-w': WORD_WIDTH = int(a)
elif o == '-p': PEAK_WIDTH = int(a)
elif o == '-m': PEAK_MARGIN = int(a)
elif o == '-s': SCAN_MODE = True; MOTIF_ID = a
elif o == '-j': JASPAR_FILE = a
if FILENAME == None:
usage(sys.argv[0], "Filename not specified")
sys.exit(3)
seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
if DISCOVER_MODE:
print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
elif SCAN_MODE:
scanMotifReport(seqs, MOTIF_ID)
else:
usage(sys.argv[0], "No run mode selected")