Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
ac6c5d6b
Commit
ac6c5d6b
authored
Feb 14, 2017
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
python3_5
parent
934c2bff
Changes
16
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
860 additions
and
1533 deletions
+860
-1533
binomial.py
binomial.py
+3
-3
genome.py
genome.py
+32
-43
gibbs.py
gibbs.py
+2
-2
godata.py
godata.py
+41
-43
guide.py
guide.py
+172
-844
ml.py
ml.py
+2
-2
phylo.py
phylo.py
+19
-5
prob.py
prob.py
+6
-8
sam.py
sam.py
+44
-44
seqdata.py
seqdata.py
+38
-35
sequence.py
sequence.py
+31
-32
spred.py
spred.py
+3
-3
sstruct.py
sstruct.py
+2
-2
sym.py
sym.py
+1
-1
webservice.py
webservice.py
+447
-449
wordcount.py
wordcount.py
+17
-17
No files found.
binomial.py
View file @
ac6c5d6b
...
@@ -95,8 +95,8 @@ def betacf(a, b, x):
...
@@ -95,8 +95,8 @@ def betacf(a, b, x):
h
*=
delta
h
*=
delta
if
(
abs
(
delta
-
1.0
)
<
EPS
):
break
if
(
abs
(
delta
-
1.0
)
<
EPS
):
break
if
(
m
>
MAXIT
):
print
>>
sys
.
stderr
,
(
"a or b too big or MAXIT too small "
if
(
m
>
MAXIT
):
print
(
(
"a or b too big or MAXIT too small "
"in betacf"
)
"in betacf"
)
,
file
=
sys
.
stderr
)
return
h
return
h
...
@@ -118,5 +118,5 @@ def gammaln(x):
...
@@ -118,5 +118,5 @@ def gammaln(x):
def
die
(
string
):
def
die
(
string
):
print
>>
sys
.
stderr
,
string
print
(
string
,
file
=
sys
.
stderr
)
genome.py
View file @
ac6c5d6b
...
@@ -105,7 +105,7 @@ class GeneExpression:
...
@@ -105,7 +105,7 @@ class GeneExpression:
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
"""
"""
if
names
==
None
:
if
names
==
None
:
return
self
.
genes
.
keys
(
)
return
list
(
self
.
genes
.
keys
()
)
elif
isinstance
(
names
,
str
):
elif
isinstance
(
names
,
str
):
return
self
.
matrix
[
self
.
genes
[
names
],:]
return
self
.
matrix
[
self
.
genes
[
names
],:]
else
:
else
:
...
@@ -148,7 +148,7 @@ class GeneExpression:
...
@@ -148,7 +148,7 @@ class GeneExpression:
except
:
except
:
index
=
samples
index
=
samples
mygenes
=
{}
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
self
.
matrix
[
ndx
,
index
]
mygenes
[
name
]
=
self
.
matrix
[
ndx
,
index
]
return
mygenes
return
mygenes
...
@@ -165,7 +165,7 @@ class GeneExpression:
...
@@ -165,7 +165,7 @@ class GeneExpression:
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
index
])
.
argsort
()
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
index
])
.
argsort
()
except
:
except
:
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
sample
])
.
argsort
()
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
sample
])
.
argsort
()
name_tuples
=
sorted
(
self
.
genes
.
items
(
),
key
=
lambda
v
:
v
[
1
])
# put all gene names in order of the matrix of profiles
name_tuples
=
sorted
(
list
(
self
.
genes
.
items
()
),
key
=
lambda
v
:
v
[
1
])
# put all gene names in order of the matrix of profiles
names
=
[]
names
=
[]
if
descending
:
if
descending
:
for
(
name
,
index
)
in
[
name_tuples
[
index
]
for
index
in
sort_ndx
[::
-
1
]]:
# reverse the order
for
(
name
,
index
)
in
[
name_tuples
[
index
]
for
index
in
sort_ndx
[::
-
1
]]:
# reverse the order
...
@@ -199,7 +199,7 @@ class GeneExpression:
...
@@ -199,7 +199,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding ratios. """
Creates and returns a gene dictionary with the corresponding ratios. """
mygenes
=
{}
mygenes
=
{}
mdiv
=
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
]
mdiv
=
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
]
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
mdiv
[
ndx
]
mygenes
[
name
]
=
mdiv
[
ndx
]
return
mygenes
return
mygenes
...
@@ -208,7 +208,7 @@ class GeneExpression:
...
@@ -208,7 +208,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding log-ratios. """
Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes
=
{}
mygenes
=
{}
mlr
=
np
.
log2
(
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
])
mlr
=
np
.
log2
(
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
])
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
mlr
[
ndx
]
mygenes
[
name
]
=
mlr
[
ndx
]
return
mygenes
return
mygenes
...
@@ -218,7 +218,7 @@ class GeneExpression:
...
@@ -218,7 +218,7 @@ class GeneExpression:
index
=
self
.
genes
[
probeID
]
index
=
self
.
genes
[
probeID
]
profile
=
self
.
matrix
[
index
,
:]
profile
=
self
.
matrix
[
index
,
:]
mygenes
=
{}
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
other
=
self
.
matrix
[
ndx
,
:]
other
=
self
.
matrix
[
ndx
,
:]
mygenes
[
name
]
=
pearson
(
profile
,
other
)
mygenes
[
name
]
=
pearson
(
profile
,
other
)
return
mygenes
return
mygenes
...
@@ -252,7 +252,7 @@ class GeneExpression:
...
@@ -252,7 +252,7 @@ class GeneExpression:
# Calculate Z-score for the given column for each gene
# Calculate Z-score for the given column for each gene
zscore
=
(
self
.
matrix
[:,
index
]
-
mu
)
/
sd
zscore
=
(
self
.
matrix
[:,
index
]
-
mu
)
/
sd
mygenes
=
{}
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
try
:
try
:
mygenes
[
name
]
=
zscore
[
ndx
,
:]
mygenes
[
name
]
=
zscore
[
ndx
,
:]
except
IndexError
:
except
IndexError
:
...
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
...
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
genes
[
name
]
=
values
genes
[
name
]
=
values
if
len
(
genes
)
==
0
:
if
len
(
genes
)
==
0
:
raise
RuntimeError
(
'No data in file'
)
raise
RuntimeError
(
'No data in file'
)
print
'Data set
%
s contains
%
d entries'
%
(
dataset
,
len
(
genes
))
print
(
'Data set
%
s contains
%
d genes'
%
(
dataset
,
len
(
genes
)
))
if
cnt_null
>
0
:
if
cnt_null
>
0
:
print
'Data set has
%
d null-values'
%
(
cnt_null
)
print
(
'Data set has
%
d null-values'
%
(
cnt_null
)
)
return
GeneExpression
(
dataset
,
headers
[
2
:],
genes
)
return
GeneExpression
(
dataset
,
headers
[
2
:],
genes
)
...
@@ -357,40 +357,29 @@ def pearson(X, Y):
...
@@ -357,40 +357,29 @@ def pearson(X, Y):
return
0
return
0
return
(
sum
-
n
*
(
Xmu
*
Ymu
))
/
(
n
*
math
.
sqrt
(
Xvar
)
*
math
.
sqrt
(
Yvar
))
return
(
sum
-
n
*
(
Xmu
*
Ymu
))
/
(
n
*
math
.
sqrt
(
Xvar
)
*
math
.
sqrt
(
Yvar
))
# ------------------- Example ---------------------
# ------------------- Example
(basically exercise 7 in prac 9)
---------------------
ge3716
=
readGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS3716.soft'
)
if
__name__
==
'__main__'
:
ratio
=
GeneExpression
(
'GDS3716_ratio'
)
g
=
readGEOFile
(
'GDS3198.soft'
,
id_column
=
1
)
ratio
.
addSamples
(
'S1_ER+/Healthy'
,
ge3716
.
getRatio
(
33
,
0
))
meanfold
=
{}
ratio
.
addSamples
(
'S2_ER+/Healthy'
,
ge3716
.
getRatio
(
34
,
1
))
for
gene
in
g
.
genes
:
ratio
.
addSamples
(
'S3_ER+/Healthy'
,
ge3716
.
getRatio
(
35
,
2
))
profile
=
g
.
getGenes
(
gene
)
ratio
.
addSamples
(
'S4_ER+/Healthy'
,
ge3716
.
getRatio
(
36
,
3
))
meanfold
[
gene
]
=
(
np
.
log2
(
profile
[
0
]
/
profile
[
3
])
+
np
.
log2
(
profile
[
1
]
/
profile
[
4
])
+
np
.
log2
(
profile
[
2
]
/
profile
[
5
]))
/
3
ratio
.
addSamples
(
'S5_ER+/Healthy'
,
ge3716
.
getRatio
(
37
,
4
))
ratio
.
addSamples
(
'S6_ER+/Healthy'
,
ge3716
.
getRatio
(
38
,
5
))
ratio
.
addSamples
(
'S7_ER+/Healthy'
,
ge3716
.
getRatio
(
39
,
6
))
ratio
.
addSamples
(
'S8_ER+/Healthy'
,
ge3716
.
getRatio
(
40
,
7
))
ratio
.
addSamples
(
'S9_ER+/Healthy'
,
ge3716
.
getRatio
(
41
,
8
))
ratio
.
addSamples
(
'S1_ER-/Healthy'
,
ge3716
.
getRatio
(
24
,
9
))
ratio
.
addSamples
(
'S2_ER-/Healthy'
,
ge3716
.
getRatio
(
25
,
10
))
ratio
.
addSamples
(
'S3_ER-/Healthy'
,
ge3716
.
getRatio
(
26
,
11
))
ratio
.
addSamples
(
'S4_ER-/Healthy'
,
ge3716
.
getRatio
(
27
,
12
))
ratio
.
addSamples
(
'S5_ER-/Healthy'
,
ge3716
.
getRatio
(
28
,
13
))
ratio
.
addSamples
(
'S6_ER-/Healthy'
,
ge3716
.
getRatio
(
29
,
14
))
ratio
.
addSamples
(
'S7_ER-/Healthy'
,
ge3716
.
getRatio
(
30
,
15
))
ratio
.
addSamples
(
'S8_ER-/Healthy'
,
ge3716
.
getRatio
(
31
,
16
))
ratio
.
addSamples
(
'S9_ER-/Healthy'
,
ge3716
.
getRatio
(
32
,
17
))
ratio
.
writeGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft'
)
print
ge3716
.
getHeaders
()
import
matplotlib.pyplot
as
plt
scores
=
[
y
for
y
in
list
(
meanfold
.
values
())
if
not
np
.
isnan
(
y
)]
hist
,
bins
=
np
.
histogram
(
scores
,
bins
=
50
)
width
=
0.7
*
(
bins
[
1
]
-
bins
[
0
])
center
=
(
bins
[:
-
1
]
+
bins
[
1
:])
/
2
plt
.
bar
(
center
,
hist
,
align
=
'center'
,
width
=
width
)
plt
.
show
()
z
=
ratio
.
getZScore
(
0
)
# NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
result
=
sorted
(
list
(
meanfold
.
items
()),
key
=
lambda
v
:
v
[
1
])
print
(
'========== Wildtype may down-regulate =========='
)
ge38
=
readGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS38.soft'
,
id_column
=
1
)
for
r
in
result
[
0
:
100
]:
cln2_profile
=
ge38
.
getGenes
(
'CLN2'
)
print
(
r
[
0
],
r
[
1
])
pcorr
=
ge38
.
getPearson
(
'CLN2'
)
print
(
'========== Wildtype may up-regulate =========='
)
gp
=
GeneExpression
(
'Ex3'
,
'PC_CLN2'
,
pcorr
)
for
r
in
result
[
-
1
:
-
100
:
-
1
]:
sorted
=
gp
.
sort
(
'PC_CLN2'
,
True
)
print
(
r
[
0
],
r
[
1
])
print
sorted
[
0
],
ge38
.
getGenes
(
sorted
[
0
])
print
sorted
[
1
],
ge38
.
getGenes
(
sorted
[
1
])
gibbs.py
View file @
ac6c5d6b
...
@@ -138,7 +138,7 @@ class GibbsMotif():
...
@@ -138,7 +138,7 @@ class GibbsMotif():
LL
+=
math
.
log
(
Qk
/
Pk
)
LL
+=
math
.
log
(
Qk
/
Pk
)
except
ZeroDivisionError
:
except
ZeroDivisionError
:
pass
pass
print
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
print
(
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
)
# end main for-loop
# end main for-loop
self
.
q
=
q
self
.
q
=
q
...
@@ -312,7 +312,7 @@ class GibbsAlign():
...
@@ -312,7 +312,7 @@ class GibbsAlign():
LL
+=
math
.
log
(
Qk
/
Pk
)
LL
+=
math
.
log
(
Qk
/
Pk
)
except
ZeroDivisionError
:
except
ZeroDivisionError
:
pass
pass
print
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
print
(
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
)
# end main for-loop
# end main for-loop
self
.
q
=
q
self
.
q
=
q
...
...
godata.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
guide.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
ml.py
View file @
ac6c5d6b
...
@@ -21,7 +21,7 @@ class NN():
...
@@ -21,7 +21,7 @@ class NN():
self
.
b_hid
=
numpy
.
random
.
randn
(
nHidden
)
# biases hidden layer
self
.
b_hid
=
numpy
.
random
.
randn
(
nHidden
)
# biases hidden layer
self
.
w_out
=
numpy
.
random
.
randn
(
nOutput
,
nHidden
)
# weights hid -> out
self
.
w_out
=
numpy
.
random
.
randn
(
nOutput
,
nHidden
)
# weights hid -> out
self
.
b_out
=
numpy
.
random
.
randn
(
nOutput
)
# biases output layer
self
.
b_out
=
numpy
.
random
.
randn
(
nOutput
)
# biases output layer
print
"Constructed NN with
%
d inputs,
%
d hidden and
%
d output nodes."
%
(
self
.
ninput
,
len
(
self
.
hidden
),
len
(
self
.
output
))
print
(
"Constructed NN with
%
d inputs,
%
d hidden and
%
d output nodes."
%
(
self
.
ninput
,
len
(
self
.
hidden
),
len
(
self
.
output
)
))
def
writeFile
(
self
,
filename
):
def
writeFile
(
self
,
filename
):
""" Save NN to a file. """
""" Save NN to a file. """
...
@@ -110,7 +110,7 @@ class NN():
...
@@ -110,7 +110,7 @@ class NN():
multi_targ
=
[
target
]
multi_targ
=
[
target
]
for
i
in
range
(
niter
):
for
i
in
range
(
niter
):
mse
=
0.0
mse
=
0.0
entries
=
range
(
len
(
multi_input
))
entries
=
list
(
range
(
len
(
multi_input
)
))
if
shuffle
:
if
shuffle
:
random
.
shuffle
(
entries
)
random
.
shuffle
(
entries
)
for
p
in
entries
:
for
p
in
entries
:
...
...
phylo.py
View file @
ac6c5d6b
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
Module with methods and classes for phylogeny.
Module with methods and classes for phylogeny.
@author: mikael
@author: mikael
'''
'''
##
import sequence
import
sequence
class
PhyloTree
:
class
PhyloTree
:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
...
@@ -141,6 +141,18 @@ class PhyloNode:
...
@@ -141,6 +141,18 @@ class PhyloNode:
elif
self
.
left
and
self
.
right
:
elif
self
.
left
and
self
.
right
:
return
'('
+
left
+
','
+
right
+
')'
+
dist
return
'('
+
left
+
','
+
right
+
')'
+
dist
def
__le__
(
self
,
other
):
""" Returns indication of less than other node. """
return
other
and
self
.
__hash__
()
<=
other
.
__hash__
()
def
__eq__
(
self
,
other
):
""" Returns indication of equivalence to other node. """
return
other
and
self
.
__hash__
()
==
other
.
__hash__
()
def
__hash__
(
self
):
""" Returns hash of object. """
return
hash
((
self
.
label
,
self
.
dist
,
self
.
sequence
))
def
_printSequences
(
self
,
start
,
end
):
def
_printSequences
(
self
,
start
,
end
):
""" Returns string with node (incl descendants) in a Newick style. """
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
left
=
right
=
label
=
dist
=
''
...
@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
...
@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
find the *closest* pair of clusters, and
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
In each case, the new cluster is represented by the (phylo)node that is formed. """
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
gn
le below
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
ng
le below
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_dist
=
None
# The distance between them
closest_dist
=
None
# The distance between them
for
pair
in
D
:
# check all pairs which should be merged
for
pair
in
D
:
# check all pairs which should be merged
dist
=
D
[
pair
]
dist
=
D
[
pair
]
if
dist
<
closest_dist
or
closest_dist
==
None
:
if
closest_dist
==
None
or
dist
<
closest_dist
:
closest_dist
=
dist
closest_dist
=
dist
closest_pair
=
pair
closest_pair
=
pair
# So we know the closest, now we need to merge...
# So we know the closest, now we need to merge...
...
@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
...
@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
y
=
closest_pair
[
1
]
y
=
closest_pair
[
1
]
z
=
PhyloNode
()
# create a new node for the cluster z
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
_getkey
(
x
,
y
))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
z
.
dist
=
D
.
pop
(
_getkey
(
x
,
y
))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx
=
N
.
pop
(
x
)
# find number of sequences in x, remove the cluster from list N
Nx
=
N
.
pop
(
x
,
None
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
)
# find number of sequences in y, remove the cluster from list N
Ny
=
N
.
pop
(
y
,
None
)
# find number of sequences in y, remove the cluster from list N
if
Nx
==
None
or
Ny
==
None
:
continue
dz
=
{}
# new distances to cluster z
dz
=
{}
# new distances to cluster z
for
w
in
N
:
# for each node w ...
for
w
in
N
:
# for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
...
...
prob.py
View file @
ac6c5d6b
...
@@ -277,7 +277,7 @@ def _readDistrib(linelist):
...
@@ -277,7 +277,7 @@ def _readDistrib(linelist):
if
len
(
d
)
==
0
:
if
len
(
d
)
==
0
:
return
None
return
None
alpha
=
Alphabet
(
symstr
)
alpha
=
Alphabet
(
symstr
)
if
'*'
in
d
.
keys
(
):
# tot provided
if
'*'
in
list
(
d
.
keys
()
):
# tot provided
for
sym
in
d
:
for
sym
in
d
:
if
sym
!=
'*'
:
if
sym
!=
'*'
:
d
[
sym
]
=
d
[
sym
]
*
d
[
'*'
]
d
[
sym
]
=
d
[
sym
]
*
d
[
'*'
]
...
@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
...
@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
ncol
=
len
(
counts
)
ncol
=
len
(
counts
)
if
len
(
name
)
==
1
:
# proper symbol
if
len
(
name
)
==
1
:
# proper symbol
symcount
[
name
]
=
counts
symcount
[
name
]
=
counts
alpha
=
Alphabet
(
''
.
join
(
symcount
.
keys
(
)))
alpha
=
Alphabet
(
''
.
join
(
list
(
symcount
.
keys
()
)))
distribs
=
[]
distribs
=
[]
for
col
in
range
(
ncol
):
for
col
in
range
(
ncol
):
d
=
dict
([(
sym
,
symcount
[
sym
][
col
])
for
sym
in
symcount
])
d
=
dict
([(
sym
,
symcount
[
sym
][
col
])
for
sym
in
symcount
])
...
@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
...
@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
"""
"""
d
=
readMultiCounts
(
filename
,
format
=
format
)
d
=
readMultiCounts
(
filename
,
format
=
format
)
if
len
(
d
)
>
0
:
if
len
(
d
)
>
0
:
return
d
.
values
(
)[
0
]
return
list
(
d
.
values
()
)[
0
]
#################################################################################################
#################################################################################################
# Joint class
# Joint class
...
@@ -628,12 +628,12 @@ class IndepJoint(Joint):
...
@@ -628,12 +628,12 @@ class IndepJoint(Joint):
def
displayMatrix
(
self
,
count
=
False
):
def
displayMatrix
(
self
,
count
=
False
):
""" Pretty-print matrix """
""" Pretty-print matrix """
print
"
\t
%
s"
%
(
''
.
join
(
"
\t
%5
d"
%
(
i
+
1
)
for
i
in
range
(
len
(
self
.
alphas
))))
print
((
"
\t
%
s"
%
(
''
.
join
(
"
\t
%5
d"
%
(
i
+
1
)
for
i
in
range
(
len
(
self
.
alphas
))
))))
for
a
in
self
.
alphas
[
0
]:
for
a
in
self
.
alphas
[
0
]:
if
count
:
if
count
:
print
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5
d"
%
(
y
)
for
y
in
self
.
getRow
(
a
,
True
)))
print
((
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5
d"
%
(
y
)
for
y
in
self
.
getRow
(
a
,
True
))
)))
else
:
else
:
print
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5.3
f"
%
(
y
)
for
y
in
self
.
getRow
(
a
)))
print
((
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5.3
f"
%
(
y
)
for
y
in
self
.
getRow
(
a
))
)))
def
__str__
(
self
):
def
__str__
(
self
):
""" Text representation of the table. Note that size is an issue so big tables
""" Text representation of the table. Note that size is an issue so big tables
...
@@ -718,5 +718,3 @@ class NaiveBayes():
...
@@ -718,5 +718,3 @@ class NaiveBayes():
prob
*=
condprob
[
i
][
key
[
i
]]
or
0.0
prob
*=
condprob
[
i
][
key
[
i
]]
or
0.0
out
.
observe
(
outsym
,
prob
)
out
.
observe
(
outsym
,
prob
)
return
out
return
out
sam.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
seqdata.py
View file @
ac6c5d6b
...
@@ -381,11 +381,11 @@ class BedFile():
...
@@ -381,11 +381,11 @@ class BedFile():
index_name
=
{}
index_name
=
{}
for
i
in
range
(
len
(
self
.
rows
)):
for
i
in
range
(
len
(
self
.
rows
)):
row
=
self
.
rows
[
i
]
row
=
self
.
rows
[
i
]
if
not
index_start
.
has_key
(
row
.
chrom
)
:
# seeing chromosome entry first time
if
not
row
.
chrom
in
index_start
:
# seeing chromosome entry first time
index_start
[
row
.
chrom
]
=
[]
index_start
[
row
.
chrom
]
=
[]
if
not
index_centre
.
has_key
(
row
.
chrom
)
:
# seeing chromosome entry first time
if
not
row
.
chrom
in
index_centre
:
# seeing chromosome entry first time
index_centre
[
row
.
chrom
]
=
[]
index_centre
[
row
.
chrom
]
=
[]
if
not
index_end
.
has_key
(
row
.
chrom
)
:
# seeing chromosome entry first time
if
not
row
.
chrom
in
index_end
:
# seeing chromosome entry first time
index_end
[
row
.
chrom
]
=
[]
index_end
[
row
.
chrom
]
=
[]
index_start
[
row
.
chrom
]
.
append
((
row
.
chromStart
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_start
[
row
.
chrom
]
.
append
((
row
.
chromStart
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
i
))
...
@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
...
@@ -725,11 +725,11 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
for
row
in
entries
:
for
row
in
entries
:
if
format
==
'Peaks'
:
if
format
==
'Peaks'
:
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
#f.write("%s %d %d %s %d %s %f %f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue, row.pValue)) # seems to cause issues in UCSD Genome Browser
f
.
write
(
"
%
s
%
d
%
d
%
s
%
d
%
s
%
f"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
,
row
.
name
,
row
.
score
,
row
.
strand
,
row
.
signalValue
))
f
.
write
(
"
%
s
\t
%
d
\t
%
d
\t
%
s
\t
%
d
\t
%
s
\t
%
f"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
,
row
.
name
,
row
.
score
,
row
.
strand
,
row
.
signalValue
))
elif
format
==
'Limited'
:
elif
format
==
'Limited'
:
f
.
write
(
"
%
s
%
d
%
d"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
))
f
.
write
(
"
%
s
\t
%
d
\t
%
d"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
))
else
:
else
:
f
.
write
(
"
%
s
%
d
%
d
%
s
%
d
%
s"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
,
row
.
name
,
row
.
score
,
row
.
strand
))
f
.
write
(
"
%
s
\t
%
d
\t
%
d
\t
%
s
\t
%
d
\t
%
s"
%
(
row
.
chrom
,
row
.
chromStart
,
row
.
chromEnd
,
row
.
name
,
row
.
score
,
row
.
strand
))
f
.
write
(
"
\n
"
)
f
.
write
(
"
\n
"
)
f
.
close
()
f
.
close
()
...
@@ -760,7 +760,7 @@ try:
...
@@ -760,7 +760,7 @@ try:
except
ImportError
:
except
ImportError
:
strerror
=
lambda
x
:
'strerror not supported'
strerror
=
lambda
x
:
'strerror not supported'
from
os.path
import
exists
from
os.path
import
exists
from
itertools
import
izip
from
itertools
import
chain
def
true_long_type
():
def
true_long_type
():
"""
"""
...
@@ -805,7 +805,7 @@ def base_to_bin(x):
...
@@ -805,7 +805,7 @@ def base_to_bin(x):
def
create_byte_table
():
def
create_byte_table
():
"""create BYTE_TABLE"""
"""create BYTE_TABLE"""
d
=
{}
d
=
{}
for
x
in
x
range
(
2
**
8
):
for
x
in
range
(
2
**
8
):
d
[
x
]
=
byte_to_bases
(
x
)
d
[
x
]
=
byte_to_bases
(
x
)
return
d
return
d
...
@@ -821,9 +821,9 @@ def split16(x):
...
@@ -821,9 +821,9 @@ def split16(x):
def
create_twobyte_table
():
def
create_twobyte_table
():
"""create TWOBYTE_TABLE"""
"""create TWOBYTE_TABLE"""
d
=
{}
d
=
{}
for
x
in
x
range
(
2
**
16
):
for
x
in
range
(
2
**
16
):
c
,
f
=
split16
(
x
)
c
,
f
=
split16
(
x
)
d
[
x
]
=
byte_to_bases
(
c
)
+
byte_to_bases
(
f
)
d
[
x
]
=
chain
(
byte_to_bases
(
c
),
byte_to_bases
(
f
)
)
return
d
return
d
BYTE_TABLE
=
create_byte_table
()
BYTE_TABLE
=
create_byte_table
()
...
@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
...
@@ -836,7 +836,7 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
"""
"""
longs_len
=
len
(
longs
)
longs_len
=
len
(
longs
)
# dna = ctypes.create_string_buffer(array_size)
# dna = ctypes.create_string_buffer(array_size)
dna
=
array
(
'
c
'
,
'N'
*
longs_len
)
dna
=
array
(
'
b
'
,
'N'
*
longs_len
)
# translate from 32-bit blocks to bytes
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
# this method ensures correct endianess (byteswap as neeed)
bytes
=
array
(
'B'
)
bytes
=
array
(
'B'
)
...
@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
...
@@ -845,14 +845,14 @@ def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
4
)])
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
4
)])
i
=
16
-
first_base_offset
i
=
16
-
first_base_offset
if
array_size
<
i
:
i
=
array_size
if
array_size
<
i
:
i
=
array_size
dna
[
0
:
i
]
=
array
(
'
c
'
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
dna
[
0
:
i
]
=
array
(
'
b
'
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
if
longs_len
==
1
:
return
dna
if
longs_len
==
1
:
return
dna
# middle blocks (implicitly skipped if they don't exist)
# middle blocks (implicitly skipped if they don't exist)
for
byte
in
bytes
[
4
:
-
4
]:
for
byte
in
bytes
[
4
:
-
4
]:
dna
[
i
:
i
+
4
]
=
array
(
'
c
'
,
BYTE_TABLE
[
byte
])
dna
[
i
:
i
+
4
]
=
array
(
'
b
'
,
BYTE_TABLE
[
byte
])
i
+=
4
i
+=
4
# last block
# last block
last_block
=
array
(
'
c
'
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
last_block
=
array
(
'
b
'
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
return
dna
return
dna
...
@@ -889,7 +889,7 @@ class TwoBitFile(dict):
...
@@ -889,7 +889,7 @@ class TwoBitFile(dict):
self
.
_file_handle
=
open
(
foo
,
'rb'
)
self
.
_file_handle
=
open
(
foo
,
'rb'
)
self
.
_load_header
()
self
.
_load_header
()
self
.
_load_index
()
self
.
_load_index
()
for
name
,
offset
in
self
.
_offset_dict
.
ite
rite
ms
():
for
name
,
offset
in
self
.
_offset_dict
.
items
():
self
[
name
]
=
TwoBitSequence
(
self
.
_file_handle
,
offset
,
self
[
name
]
=
TwoBitSequence
(
self
.
_file_handle
,
offset
,
self
.
_byteswapped
)
self
.
_byteswapped
)
return
return
...
@@ -926,13 +926,16 @@ class TwoBitFile(dict):
...
@@ -926,13 +926,16 @@ class TwoBitFile(dict):
if
remaining
==
0
:
break
if
remaining
==
0
:
break
name_size
=
array
(
'B'
)
name_size
=
array
(
'B'
)
name_size
.
fromfile
(
file_handle
,
1
)
name_size
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
name_size
.
byteswap
()
if
byteswapped
:
name
=
array
(
'c'
)
name_size
.
byteswap
()
if
byteswapped
:
name
.
byteswap
()
name
=
array
(
'b'
)
if
byteswapped
:
name
.
byteswap
()
name
.
fromfile
(
file_handle
,
name_size
[
0
])
name
.
fromfile
(
file_handle
,
name_size
[
0
])
offset
=
array
(
LONG
)
offset
=
array
(
LONG
)
offset
.
fromfile
(
file_handle
,
1
)
offset
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
offset
.
byteswap
()
if
byteswapped
:
offset
.
byteswap
()
sequence_offsets
.
append
((
name
.
tostring
(),
offset
[
0
]))
sequence_offsets
.
append
((
name
.
tostring
(),
offset
[
0
]))
remaining
-=
1
remaining
-=
1
self
.
_sequence_offsets
=
sequence_offsets
self
.
_sequence_offsets
=
sequence_offsets
...
@@ -943,7 +946,7 @@ class TwoBitFile(dict):
...
@@ -943,7 +946,7 @@ class TwoBitFile(dict):
d
=
{}
d
=
{}
file_handle
=
self
.
_file_handle
file_handle
=
self
.
_file_handle
byteswapped
=
self
.
_byteswapped
byteswapped
=
self
.
_byteswapped
for
name
,
offset
in
self
.
_offset_dict
.
ite
rite
ms
():
for
name
,
offset
in
self
.
_offset_dict
.
items
():
file_handle
.
seek
(
offset
)
file_handle
.
seek
(
offset
)
dna_size
=
array
(
LONG
)
dna_size
=
array
(
LONG
)
dna_size
.
fromfile
(
file_handle
,
1
)
dna_size
.
fromfile
(
file_handle
,
1
)
...
@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
...
@@ -1078,7 +1081,7 @@ class TwoBitSequence(object):
if
byteswapped
:
fourbyte_dna
.
byteswap
()
if
byteswapped
:
fourbyte_dna
.
byteswap
()
string_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
string_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
last_base_offset
,
region_size
)
last_base_offset
,
region_size
)
for
start
,
size
in
i
zip
(
n_block_starts
,
n_block_sizes
):
for
start
,
size
in
zip
(
n_block_starts
,
n_block_sizes
):
end
=
start
+
size
end
=
start
+
size
if
end
<=
min_
:
continue
if
end
<=
min_
:
continue
if
start
>
max_
:
break
if
start
>
max_
:
break
...
@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
...
@@ -1086,14 +1089,14 @@ class TwoBitSequence(object):
if
end
>
max_
:
end
=
max_
if
end
>
max_
:
end
=
max_
start
-=
min_
start
-=
min_
end
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'
c
'
,
'N'
*
(
end
-
start
))
string_as_array
[
start
:
end
]
=
array
(
'
b
'
,
'N'
*
(
end
-
start
))
lower
=
str
.
lower
lower
=
str
.
lower
first_masked_region
=
max
(
0
,
first_masked_region
=
max
(
0
,
bisect_right
(
mask_block_starts
,
min_
)
-
1
)
bisect_right
(
mask_block_starts
,
min_
)
-
1
)
last_masked_region
=
min
(
len
(
mask_block_starts
),
last_masked_region
=
min
(
len
(
mask_block_starts
),
1
+
bisect_right
(
mask_block_starts
,
max_
,
1
+
bisect_right
(
mask_block_starts
,
max_
,
lo
=
first_masked_region
))
lo
=
first_masked_region
))
for
start
,
size
in
i
zip
(
mask_block_starts
[
first_masked_region
:
last_masked_region
],
for
start
,
size
in
zip
(
mask_block_starts
[
first_masked_region
:
last_masked_region
],
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
end
=
start
+
size
end
=
start
+
size
if
end
<=
min_
:
continue
if
end
<=
min_
:
continue
...
@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
...
@@ -1102,9 +1105,9 @@ class TwoBitSequence(object):
if
end
>
max_
:
end
=
max_
if
end
>
max_
:
end
=
max_
start
-=
min_
start
-=
min_
end
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'
c
'
,
lower
(
string_as_array
[
start
:
end
]
.
tostring
()))
string_as_array
[
start
:
end
]
=
array
(
'
b
'
,
lower
(
string_as_array
[
start
:
end
]
.
tostring
()))
if
not
len
(
string_as_array
)
==
max_
-
min_
:
if
not
len
(
string_as_array
)
==
max_
-
min_
:
raise
RuntimeError
,
"Sequence was longer than it should be"
raise
RuntimeError
(
"Sequence was longer than it should be"
)
if
reverse
:
if
reverse
:
return
self
.
reverseComplement
(
string_as_array
.
tostring
())
return
self
.
reverseComplement
(
string_as_array
.
tostring
())
return
string_as_array
.
tostring
()
return
string_as_array
.
tostring
()
...
@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
...
@@ -1124,7 +1127,7 @@ class TwoBitSequence(object):
"""
"""
return
self
.
__getslice__
(
0
,
None
)
return
self
.
__getslice__
(
0
,
None
)
class
TwoBitFileError
(
StandardError
):
class
TwoBitFileError
(
Exception
):
"""
"""
Base exception for TwoBit module
Base exception for TwoBit module
"""
"""
...
...
sequence.py
View file @
ac6c5d6b
...
@@ -55,10 +55,11 @@ class Sequence(object):
...
@@ -55,10 +55,11 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
'R', 'S', 'T', 'V', 'W', 'Y'] """
try
:
# convert sequence data into a compact array representation
#try: # convert sequence data into a compact array representation
self
.
sequence
=
array
.
array
(
'c'
,
''
.
join
([
s
.
upper
()
for
s
in
sequence
]))
# self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
except
TypeError
:
#except TypeError:
raise
RuntimeError
(
'Sequence data is not specified correctly: must be iterable'
)
# raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self
.
sequence
=
sequence
# Assign an alphabet
# Assign an alphabet
self
.
alphabet
=
None
self
.
alphabet
=
None
...
@@ -133,15 +134,15 @@ class Sequence(object):
...
@@ -133,15 +134,15 @@ class Sequence(object):
Calling self.__getitem__(3) is equivalent to self[3]
Calling self.__getitem__(3) is equivalent to self[3]
"""
"""
if
type
(
ndx
)
is
slice
:
if
type
(
ndx
)
is
slice
:
return
self
.
sequence
[
ndx
]
.
tostring
(
)
return
''
.
join
(
self
.
sequence
[
ndx
]
)
else
:
else
:
return
self
.
sequence
[
ndx
]
return
self
.
sequence
[
ndx
]
def
writeFasta
(
self
):
def
writeFasta
(
self
):
""" Write one sequence in FASTA format to a string and return it. """
""" Write one sequence in FASTA format to a string and return it. """
fasta
=
'>'
+
self
.
name
+
' '
+
self
.
info
+
'
\n
'
fasta
=
'>'
+
self
.
name
+
' '
+
self
.
info
+
'
\n
'
data
=
self
.
sequence
.
tostring
(
)
data
=
''
.
join
(
self
.
sequence
)
nlines
=
(
len
(
self
.
sequence
)
-
1
)
/
60
+
1
nlines
=
int
(
math
.
ceil
((
len
(
self
.
sequence
)
-
1
)
/
60
+
1
))
for
i
in
range
(
nlines
):
for
i
in
range
(
nlines
):
lineofseq
=
''
.
join
(
data
[
i
*
60
:
(
i
+
1
)
*
60
])
+
'
\n
'
lineofseq
=
''
.
join
(
data
[
i
*
60
:
(
i
+
1
)
*
60
])
+
'
\n
'
fasta
+=
lineofseq
fasta
+=
lineofseq
...
@@ -164,7 +165,7 @@ class Sequence(object):
...
@@ -164,7 +165,7 @@ class Sequence(object):
def
find
(
self
,
findme
):
def
find
(
self
,
findme
):
""" Find the position of the specified symbol or sub-sequence """
""" Find the position of the specified symbol or sub-sequence """
return
self
.
sequence
.
tostring
(
)
.
find
(
findme
)
return
''
.
join
(
self
.
sequence
)
.
find
(
findme
)
"""
"""
Below are some useful methods for loading data from strings and files.
Below are some useful methods for loading data from strings and files.
...
@@ -438,8 +439,8 @@ class Alignment():
...
@@ -438,8 +439,8 @@ class Alignment():
column index, entropy, number of gaps, and symbols in order of decreasing probability.
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
theta2 is the threshold for showing symbols at all, and in lower case. """
print
"Alignment of
%
d sequences, with
%
d columns"
%
(
len
(
self
.
seqs
),
self
.
alignlen
)
print
((
"Alignment of
%
d sequences, with
%
d columns"
%
(
len
(
self
.
seqs
),
self
.
alignlen
))
)
print
"Column
\t
Entropy
\t
Gaps
\t
Prob
\t
Conserv
\t
Symbols (Up>=
%.2
f;Low>=
%.2
f)
\n
"
%
(
theta1
,
theta2
)
print
((
"Column
\t
Entropy
\t
Gaps
\t
Prob
\t
Conserv
\t
Symbols (Up>=
%.2
f;Low>=
%.2
f)
\n
"
%
(
theta1
,
theta2
))
)
for
col
in
range
(
self
.
alignlen
):
for
col
in
range
(
self
.
alignlen
):
d
=
Distrib
(
self
.
alphabet
)
d
=
Distrib
(
self
.
alphabet
)
gaps
=
0
gaps
=
0
...
@@ -448,21 +449,21 @@ class Alignment():
...
@@ -448,21 +449,21 @@ class Alignment():
d
.
observe
(
seq
[
col
])
d
.
observe
(
seq
[
col
])
else
:
else
:
gaps
+=
1
gaps
+=
1
print
(
col
+
1
),
"
\t
%5.3
f"
%
d
.
entropy
(),
"
\t
%4
d
\t
"
%
gaps
,
print
(((
col
+
1
),
"
\t
%5.3
f"
%
d
.
entropy
(),
"
\t
%4
d
\t
"
%
gaps
,))
symprobs
=
d
.
getProbsort
()
symprobs
=
d
.
getProbsort
()
(
_
,
maxprob
)
=
symprobs
[
0
]
(
_
,
maxprob
)
=
symprobs
[
0
]
if
maxprob
>=
theta1
:
if
maxprob
>=
theta1
:
print
"
%
d
\t
TRUE
\t
"
%
int
(
maxprob
*
100
),
print
((
"
%
d
\t
TRUE
\t
"
%
int
(
maxprob
*
100
),))
else
:
else
:
print
"
%
d
\t\t
"
%
int
(
maxprob
*
100
),
print
((
"
%
d
\t\t
"
%
int
(
maxprob
*
100
),))
for
(
sym
,
prob
)
in
symprobs
:
for
(
sym
,
prob
)
in
symprobs
:
if
prob
>=
theta1
:
if
prob
>=
theta1
:
print
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),
print
((
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),))
elif
prob
>=
theta2
and
lowercase
:
elif
prob
>=
theta2
and
lowercase
:
print
sym
.
lower
(),
"
%
d
%%
"
%
int
(
prob
*
100
),
print
((
sym
.
lower
(),
"
%
d
%%
"
%
int
(
prob
*
100
),))
elif
prob
>=
theta2
:
elif
prob
>=
theta2
:
print
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),
print
((
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),))
print
print
()
def
saveConsensus
(
self
,
myseq
,
filename
,
theta1
=
0.2
,
theta2
=
0.05
,
lowercase
=
True
,
compact
=
False
):
def
saveConsensus
(
self
,
myseq
,
filename
,
theta1
=
0.2
,
theta2
=
0.05
,
lowercase
=
True
,
compact
=
False
):
""" Display a table with rows for each alignment column, showing
""" Display a table with rows for each alignment column, showing
...
@@ -681,10 +682,9 @@ class Alignment():
...
@@ -681,10 +682,9 @@ class Alignment():
htmlstr
+=
html
htmlstr
+=
html
htmlstr
+=
'<pre>'
htmlstr
+=
'<pre>'
if
filename
:
if
filename
:
fh
=
open
(
filename
,
'w'
)
with
open
(
filename
,
'w+'
)
as
fh
:
fh
.
write
(
htmlstr
)
fh
.
write
(
htmlstr
)
fh
.
write
(
'</body></html>
\n
'
)
fh
.
write
(
'</body></html>
\n
'
)
fh
.
close
()
else
:
else
:
return
htmlstr
return
htmlstr
...
@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
...
@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
index
=
name
.
find
(
'/'
)
index
=
name
.
find
(
'/'
)
if
index
>=
0
:
if
index
>=
0
:
name
=
name
[
0
:
index
]
name
=
name
[
0
:
index
]
if
seqs
.
has_key
(
name
)
:
if
name
in
seqs
:
seqs
[
name
]
+=
seqstr
seqs
[
name
]
+=
seqstr
else
:
else
:
seqs
[
name
]
=
seqstr
seqs
[
name
]
=
seqstr
sequences
=
[]
sequences
=
[]
for
name
,
seqstr
in
seqs
.
items
(
):
for
name
,
seqstr
in
list
(
seqs
.
items
()
):
sequences
.
append
(
Sequence
(
seqstr
,
alphabet
,
name
,
gappy
=
True
))
sequences
.
append
(
Sequence
(
seqstr
,
alphabet
,
name
,
gappy
=
True
))
return
Alignment
(
sequences
)
return
Alignment
(
sequences
)
...
@@ -1180,12 +1180,12 @@ class PWM(object):
...
@@ -1180,12 +1180,12 @@ class PWM(object):
def
display
(
self
,
format
=
'COLUMN'
):
def
display
(
self
,
format
=
'COLUMN'
):
if
format
==
'COLUMN'
:
if
format
==
'COLUMN'
:
print
"
\t
%
s"
%
(
' '
.
join
(
"
%5
d"
%
(
i
+
1
)
for
i
in
range
(
self
.
length
)))
print
((
"
\t
%
s"
%
(
' '
.
join
(
"
%5
d"
%
(
i
+
1
)
for
i
in
range
(
self
.
length
))
)))
for
j
in
range
(
len
(
self
.
alphabet
)):
for
j
in
range
(
len
(
self
.
alphabet
)):
print
"
%
s
\t
%
s"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]
))
print
((
"
%
s
\t
%
s"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]))
))
elif
format
==
'JASPAR'
:
elif
format
==
'JASPAR'
:
for
j
in
range
(
len
(
self
.
alphabet
)):
for
j
in
range
(
len
(
self
.
alphabet
)):
print
"
%
s
\t
[
%
s]"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]
))
print
((
"
%
s
\t
[
%
s]"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]))
))
def
search
(
self
,
sequence
,
lowerBound
=
0
):
def
search
(
self
,
sequence
,
lowerBound
=
0
):
""" Find matches to the motif in a specified sequence. Returns a list
""" Find matches to the motif in a specified sequence. Returns a list
...
@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
...
@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for
i
in
range
(
MAX_TRY
):
for
i
in
range
(
MAX_TRY
):
try
:
try
:
fastaData
=
fetch
(
id
,
database
)
fastaData
=
fetch
(
id
,
database
)
.
decode
(
"utf-8"
)
seq
=
readFasta
(
fastaData
)[
0
]
seq
=
readFasta
(
fastaData
)[
0
]
break
break
except
:
except
:
from
time
import
sleep
from
time
import
sleep
print
'Failed on {i}th try for id {id}'
.
format
(
i
=
i
,
id
=
id
)
print
((
'Failed on {i}th try for id {id}'
.
format
(
i
=
i
,
id
=
id
))
)
sleep
(
0.1
)
sleep
(
0.1
)
try
:
try
:
return
Sequence
(
seq
[
start
:
end
],
seq
.
alphabet
,
seq
.
name
,
seq
.
info
)
return
Sequence
(
seq
[
start
:
end
],
seq
.
alphabet
,
seq
.
name
,
seq
.
info
)
...
@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
...
@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
seqs
=
readFastaFile
(
'/Users/mikael/ASR/CYP11/CYP11_aln_full.fa'
,
Protein_wX
,
gappy
=
True
)
seqs
=
readFastaFile
(
'/Users/mikael/ASR/CYP11/CYP11_aln_full.fa'
,
Protein_wX
,
gappy
=
True
)
print
'Read'
,
len
(
seqs
),
'sequences'
print
((
'Read'
,
len
(
seqs
),
'sequences'
))
spred.py
View file @
ac6c5d6b
...
@@ -71,7 +71,7 @@ class SeqNN():
...
@@ -71,7 +71,7 @@ class SeqNN():
im
[
row
,
_onehotIndex
(
alpha
,
subseqs
[
k
])]
=
1
im
[
row
,
_onehotIndex
(
alpha
,
subseqs
[
k
])]
=
1
if
targets
:
om
[
row
,
self
.
outp_alpha
.
index
(
subtarg
[
k
])]
=
1
if
targets
:
om
[
row
,
self
.
outp_alpha
.
index
(
subtarg
[
k
])]
=
1
row
+=
1
row
+=
1
print
"There are"
,
row
,
"entries in data set"
print
(
"There are"
,
row
,
"entries in data set"
)
if
targets
:
if
targets
:
return
im
,
om
return
im
,
om
else
:
else
:
...
@@ -85,7 +85,7 @@ class SeqNN():
...
@@ -85,7 +85,7 @@ class SeqNN():
im
,
om
=
self
.
_encodeseq
(
seqs
,
targets
)
im
,
om
=
self
.
_encodeseq
(
seqs
,
targets
)
for
i
in
range
(
niter
):
# train first NN
for
i
in
range
(
niter
):
# train first NN
rmse
=
self
.
nn1
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
rmse
=
self
.
nn1
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
print
i
,
":"
,
rmse
print
(
i
,
":"
,
rmse
)
if
not
self
.
cascade
:
# if there's no cascaded NN, finish here
if
not
self
.
cascade
:
# if there's no cascaded NN, finish here
return
rmse
return
rmse
nn1seqs
=
[]
# a list of new SS sequences ...
nn1seqs
=
[]
# a list of new SS sequences ...
...
@@ -95,7 +95,7 @@ class SeqNN():
...
@@ -95,7 +95,7 @@ class SeqNN():
im
,
om
=
self
.
_encodeseq
(
nn1seqs
,
targets
)
# construct input/output patterns from SS sequences
im
,
om
=
self
.
_encodeseq
(
nn1seqs
,
targets
)
# construct input/output patterns from SS sequences
for
i
in
range
(
niter
):
# train cascaded NN
for
i
in
range
(
niter
):
# train cascaded NN
rmse
=
self
.
nn2
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
rmse
=
self
.
nn2
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
print
i
,
":"
,
rmse
print
(
i
,
":"
,
rmse
)
return
rmse
return
rmse
def
testAll
(
self
,
seqs
,
targets
):
def
testAll
(
self
,
seqs
,
targets
):
...
...
sstruct.py
View file @
ac6c5d6b
...
@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
...
@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
specified width average of 100.
specified width average of 100.
"""
"""
sum
=
0.0
sum
=
0.0
order
=
range
(
0
,
len
(
calls
)
-
1
,
+
1
)
# we are extending calls downstream
order
=
list
(
range
(
0
,
len
(
calls
)
-
1
,
+
1
)
)
# we are extending calls downstream
cnt
=
0
cnt
=
0
for
i
in
order
:
# extend to the right
for
i
in
order
:
# extend to the right
if
calls
[
i
]:
# to extend a call is required in the first place
if
calls
[
i
]:
# to extend a call is required in the first place
...
@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
...
@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
AND extend this list upstream containing a specified width average of 100.
AND extend this list upstream containing a specified width average of 100.
"""
"""
sum
=
0.0
sum
=
0.0
order
=
range
(
len
(
calls
)
-
1
,
0
,
-
1
)
# we are extending calls upstream/to-the-left
order
=
list
(
range
(
len
(
calls
)
-
1
,
0
,
-
1
)
)
# we are extending calls upstream/to-the-left
cnt
=
0
cnt
=
0
for
i
in
order
:
# extend to the right
for
i
in
order
:
# extend to the right
if
calls
[
i
]:
# a requirement to extend is to have a call in the first place
if
calls
[
i
]:
# a requirement to extend is to have a call in the first place
...
...
sym.py
View file @
ac6c5d6b
...
@@ -291,7 +291,7 @@ class TupleEntries(object):
...
@@ -291,7 +291,7 @@ class TupleEntries(object):
def
__iter__
(
self
):
def
__iter__
(
self
):
return
self
return
self
def
next
(
self
):
def
__next__
(
self
):
""" Step through sequence of entries, either
""" Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations."""
(if sparse) with calls to tuple store based on all possible symbol combinations."""
...
...
webservice.py
View file @
ac6c5d6b
import
urllib
,
urllib2
import
urllib
.request
import
os
import
os
from
time
import
sleep
from
time
import
sleep
import
stats
import
stats
from
StringIO
import
StringIO
from
io
import
StringIO
import
gzip
import
gzip
""" This module is collection of functions for accessing the EBI REST web services,
""" This module is collection of functions for accessing the EBI REST web services,
...
@@ -32,11 +32,11 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
...
@@ -32,11 +32,11 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
# Get the entry
# Get the entry
try
:
try
:
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
if
data
.
startswith
(
'ERROR'
):
if
data
.
startswith
(
b
'ERROR'
):
raise
RuntimeError
(
data
)
raise
RuntimeError
(
data
)
return
data
return
data
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
):
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
):
...
@@ -57,12 +57,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
...
@@ -57,12 +57,12 @@ def search(query, dbName='uniprot', format='list', limit=100):
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
# Get the entries
# Get the entries
try
:
try
:
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
if
format
==
'list'
:
if
format
==
'list'
:
return
data
.
splitlines
()
return
data
.
splitlines
()
else
:
else
:
return
data
return
data
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
elif
dbName
.
startswith
(
'refseq'
):
elif
dbName
.
startswith
(
'refseq'
):
dbs
=
dbName
.
split
(
":"
)
dbs
=
dbName
.
split
(
":"
)
...
@@ -72,7 +72,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
...
@@ -72,7 +72,7 @@ def search(query, dbName='uniprot', format='list', limit=100):
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
# Get the entries
# Get the entries
try
:
try
:
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
words
=
data
.
split
(
"</Id>"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
if
format
==
'list'
:
...
@@ -81,11 +81,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
...
@@ -81,11 +81,11 @@ def search(query, dbName='uniprot', format='list', limit=100):
url
=
base
+
"efetch.fcgi?db="
+
dbName
+
"&rettype=fasta&id="
url
=
base
+
"efetch.fcgi?db="
+
dbName
+
"&rettype=fasta&id="
for
w
in
words
:
for
w
in
words
:
url
+=
w
+
","
url
+=
w
+
","
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
return
data
return
data
else
:
else
:
return
''
return
''
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
return
...
@@ -121,8 +121,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False)
...
@@ -121,8 +121,8 @@ def idmap(identifiers, frm='ACC', to='P_REFSEQ_AC', format='tab', reverse=False)
'query'
:
query
'query'
:
query
}
}
if
len
(
query
)
>
0
:
if
len
(
query
)
>
0
:
request
=
urllib
2
.
Request
(
url
,
urllib
.
urlencode
(
params
))
request
=
urllib
.
request
.
Request
(
url
,
urllib
.
parse
.
urlencode
(
params
))
response
=
urllib
2
.
urlopen
(
request
)
.
read
()
response
=
urllib
.
request
.
urlopen
(
request
)
.
read
()
d
=
dict
()
d
=
dict
()
for
row
in
response
.
splitlines
()[
1
:]:
for
row
in
response
.
splitlines
()[
1
:]:
pair
=
row
.
split
(
'
\t
'
)
pair
=
row
.
split
(
'
\t
'
)
...
@@ -170,7 +170,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
...
@@ -170,7 +170,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
if
background
==
None
:
if
background
==
None
:
for
t
in
term_set
:
for
t
in
term_set
:
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
sorted_cnt
=
sorted
(
term_cnt
.
items
(),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
else
:
# a background is provided
else
:
# a background is provided
for
t
in
term_set
:
for
t
in
term_set
:
fg_hit
=
fg_list
.
count
(
t
)
fg_hit
=
fg_list
.
count
(
t
)
...
@@ -178,7 +178,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
...
@@ -178,7 +178,7 @@ def getGOReport(positives, background = None, database = 'UniProtKB'):
fg_nohit
=
nPos
-
fg_hit
fg_nohit
=
nPos
-
fg_hit
bg_nohit
=
nNeg
-
bg_hit
bg_nohit
=
nNeg
-
bg_hit
term_cnt
[
t
]
=
(
fg_hit
,
fg_hit
+
bg_hit
,
stats
.
getFETpval
(
fg_hit
,
bg_hit
,
fg_nohit
,
bg_nohit
,
False
))
term_cnt
[
t
]
=
(
fg_hit
,
fg_hit
+
bg_hit
,
stats
.
getFETpval
(
fg_hit
,
bg_hit
,
fg_nohit
,
bg_nohit
,
False
))
sorted_cnt
=
sorted
(
term_cnt
.
items
(),
key
=
lambda
v
:
v
[
1
][
2
],
reverse
=
False
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
][
2
],
reverse
=
False
)
ret
=
[]
ret
=
[]
for
t
in
sorted_cnt
:
for
t
in
sorted_cnt
:
...
@@ -199,17 +199,17 @@ def getGODef(goterm):
...
@@ -199,17 +199,17 @@ def getGODef(goterm):
# Get the entry: fill in the fields specified below
# Get the entry: fill in the fields specified below
try
:
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'def'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'def'
:
None
}
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
for
row
in
data
.
splitlines
():
for
row
in
data
.
splitlines
():
index
=
row
.
find
(
':'
)
index
=
row
.
find
(
':'
)
if
index
>
0
and
len
(
row
[
index
:])
>
1
:
if
index
>
0
and
len
(
row
[
index
:])
>
1
:
field
=
row
[
0
:
index
]
.
strip
()
field
=
row
[
0
:
index
]
.
strip
()
value
=
row
[
index
+
1
:]
.
strip
(
' "'
)
# remove spaces and quotation marks
value
=
row
[
index
+
1
:]
.
strip
(
' "'
)
# remove spaces and quotation marks
if
field
in
entry
.
keys
():
# check if we need this field
if
field
in
list
(
entry
.
keys
()):
# check if we need this field
if
entry
[
field
]
==
None
:
# check if not yet assigned
if
entry
[
field
]
==
None
:
# check if not yet assigned
entry
[
field
]
=
value
entry
[
field
]
=
value
return
entry
return
entry
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
,
database
=
'UniProtKB'
,
completeAnnot
=
False
):
def
getGOTerms
(
genes
,
database
=
'UniProtKB'
,
completeAnnot
=
False
):
...
@@ -246,9 +246,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
...
@@ -246,9 +246,9 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
url
=
__ebiGOUrl__
+
uri_string
+
query
url
=
__ebiGOUrl__
+
uri_string
+
query
# Get the entry: fill in the fields specified below
# Get the entry: fill in the fields specified below
try
:
try
:
urlreq
=
urllib
2
.
Request
(
url
)
urlreq
=
urllib
.
request
.
Request
(
url
)
urlreq
.
add_header
(
'Accept-encoding'
,
'gzip'
)
urlreq
.
add_header
(
'Accept-encoding'
,
'gzip'
)
response
=
urllib
2
.
urlopen
(
urlreq
)
response
=
urllib
.
request
.
urlopen
(
urlreq
)
if
response
.
info
()
.
get
(
'Content-Encoding'
)
==
'gzip'
:
if
response
.
info
()
.
get
(
'Content-Encoding'
)
==
'gzip'
:
buf
=
StringIO
(
response
.
read
())
buf
=
StringIO
(
response
.
read
())
f
=
gzip
.
GzipFile
(
fileobj
=
buf
)
f
=
gzip
.
GzipFile
(
fileobj
=
buf
)
...
@@ -259,12 +259,12 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
...
@@ -259,12 +259,12 @@ def getGOTerms(genes, database='UniProtKB', completeAnnot = False):
values
=
row
.
split
(
'
\t
'
)
values
=
row
.
split
(
'
\t
'
)
if
len
(
values
)
>=
7
:
if
len
(
values
)
>=
7
:
key
=
values
[
1
]
key
=
values
[
1
]
if
termsmap
.
has_key
(
key
):
if
key
in
termsmap
:
termsmap
[
key
]
.
add
(
values
[
6
])
termsmap
[
key
]
.
add
(
values
[
6
])
else
:
else
:
termsmap
[
key
]
=
set
([
values
[
6
]])
termsmap
[
key
]
=
set
([
values
[
6
]])
taxonmap
[
key
]
=
int
(
values
[
4
])
taxonmap
[
key
]
=
int
(
values
[
4
])
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
if
completeAnnot
:
if
completeAnnot
:
if
len
(
genes
)
==
1
:
if
len
(
genes
)
==
1
:
...
@@ -304,13 +304,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
...
@@ -304,13 +304,13 @@ def getGenes(goterms, database='UniProtKB', taxo=None):
url
=
__ebiGOUrl__
+
uri_string
+
goterm
.
strip
()
url
=
__ebiGOUrl__
+
uri_string
+
goterm
.
strip
()
# Get the entry: fill in the fields specified below
# Get the entry: fill in the fields specified below
try
:
try
:
data
=
urllib
2
.
urlopen
(
url
)
.
read
()
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
for
row
in
data
.
splitlines
()[
1
:]:
# we ignore first (header) row
for
row
in
data
.
splitlines
()[
1
:]:
# we ignore first (header) row
values
=
row
.
split
(
'
\t
'
)
values
=
row
.
split
(
'
\t
'
)
if
len
(
values
)
>=
7
:
if
len
(
values
)
>=
7
:
genes
.
add
(
values
[
1
])
genes
.
add
(
values
[
1
])
map
[
goterm
]
=
list
(
genes
)
map
[
goterm
]
=
list
(
genes
)
except
urllib2
.
HTTPError
,
ex
:
except
(
urllib
.
error
.
HTTPError
,
ex
):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
if
len
(
goterms
)
==
1
:
if
len
(
goterms
)
==
1
:
return
map
[
goterms
[
0
]]
return
map
[
goterms
[
0
]]
...
@@ -381,12 +381,12 @@ class EBI(object):
...
@@ -381,12 +381,12 @@ class EBI(object):
databaseData
=
''
databaseData
=
''
for
db
in
databaseList
:
for
db
in
databaseList
:
databaseData
+=
'&database='
+
db
databaseData
+=
'&database='
+
db
encodedParams
=
urllib
.
urlencode
(
params
)
encodedParams
=
urllib
.
parse
.
urlencode
(
params
)
encodedParams
+=
databaseData
encodedParams
+=
databaseData
else
:
else
:
encodedParams
=
urllib
.
urlencode
(
params
)
encodedParams
=
urllib
.
parse
.
urlencode
(
params
)
print
url
print
(
url
)
self
.
jobId
=
urllib
2
.
urlopen
(
url
,
encodedParams
)
.
read
()
self
.
jobId
=
urllib
.
request
.
urlopen
(
url
,
encodedParams
)
.
read
()
self
.
createLock
()
self
.
createLock
()
return
self
.
jobId
return
self
.
jobId
...
@@ -396,23 +396,23 @@ class EBI(object):
...
@@ -396,23 +396,23 @@ class EBI(object):
if
jobId
is
None
:
if
jobId
is
None
:
jobId
=
self
.
jobId
jobId
=
self
.
jobId
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/status/
%
s'
%
jobId
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/status/
%
s'
%
jobId
status
=
urllib
2
.
urlopen
(
url
)
.
read
()
status
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
return
status
return
status
def
resultTypes
(
self
):
def
resultTypes
(
self
):
""" Get the available result types. Will only work on a finished job. """
""" Get the available result types. Will only work on a finished job. """
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/resulttypes/
%
s'
%
self
.
jobId
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/resulttypes/
%
s'
%
self
.
jobId
resultTypes
=
urllib
2
.
urlopen
(
url
)
.
read
()
resultTypes
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
return
resultTypes
return
resultTypes
def
result
(
self
,
resultType
):
def
result
(
self
,
resultType
):
""" Get the result of the given job of the specified type. """
""" Get the result of the given job of the specified type. """
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/result/
%
s/
%
s'
%
(
self
.
jobId
,
resultType
)
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/result/
%
s/
%
s'
%
(
self
.
jobId
,
resultType
)
try
:
try
:
result
=
urllib
2
.
urlopen
(
url
)
.
read
()
result
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
if
resultType
==
'error'
:
if
resultType
==
'error'
:
raise
RuntimeError
(
'An error occurred:
%
s'
%
result
)
raise
RuntimeError
(
'An error occurred:
%
s'
%
result
)
except
urllib2
.
HTTPError
:
except
(
urllib
.
error
.
HTTPError
):
if
resultType
==
'error'
:
if
resultType
==
'error'
:
raise
RuntimeError
(
'An unknown error occurred while processing the job (check your input)'
)
raise
RuntimeError
(
'An unknown error occurred while processing the job (check your input)'
)
else
:
else
:
...
@@ -424,8 +424,8 @@ class EBI(object):
...
@@ -424,8 +424,8 @@ class EBI(object):
Return the output in the specified format. """
Return the output in the specified format. """
params
[
'email'
]
=
self
.
__email__
params
[
'email'
]
=
self
.
__email__
self
.
run
(
params
)
self
.
run
(
params
)
print
'Submitted new'
,
self
.
service
,
'job, jobId:'
,
self
.
jobId
print
((
'Submitted new'
,
self
.
service
,
'job, jobId:'
,
self
.
jobId
))
print
'Please be patient while the job is completed'
print
(
'Please be patient while the job is completed'
)
status
=
'RUNNING'
status
=
'RUNNING'
observe
=
0
observe
=
0
while
status
==
'RUNNING'
:
while
status
==
'RUNNING'
:
...
@@ -434,7 +434,7 @@ class EBI(object):
...
@@ -434,7 +434,7 @@ class EBI(object):
sleep
(
self
.
__checkInterval__
)
sleep
(
self
.
__checkInterval__
)
if
status
!=
'FINISHED'
:
if
status
!=
'FINISHED'
:
raise
RuntimeError
(
'An error occurred and the job could not be completed'
)
raise
RuntimeError
(
'An error occurred and the job could not be completed'
)
print
'Job complete.'
print
(
'Job complete.'
)
self
.
removeLock
()
self
.
removeLock
()
if
type
(
resultTypes
)
!=
list
:
if
type
(
resultTypes
)
!=
list
:
resultTypes
=
[
resultTypes
]
resultTypes
=
[
resultTypes
]
...
@@ -445,5 +445,3 @@ class EBI(object):
...
@@ -445,5 +445,3 @@ class EBI(object):
return
results
[
0
]
return
results
[
0
]
else
:
else
:
return
results
return
results
wordcount.py
View file @
ac6c5d6b
...
@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
...
@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
neg
[
word
]
=
1
neg
[
word
]
=
1
logratio
=
RCDict
()
# DNA dictionary for storing the log-ration between pos and neg
logratio
=
RCDict
()
# DNA dictionary for storing the log-ration between pos and neg
for
(
word
,
cnt_pos
)
in
pos
.
items
(
):
for
(
word
,
cnt_pos
)
in
list
(
pos
.
items
()
):
cnt_neg
=
0.0001
cnt_neg
=
0.0001
try
:
try
:
cnt_neg
=
neg
[
word
]
cnt_neg
=
neg
[
word
]
...
@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
...
@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pass
pass
logratio
[
word
]
=
math
.
log
(
float
(
cnt_pos
)
/
float
(
cnt_neg
))
logratio
[
word
]
=
math
.
log
(
float
(
cnt_pos
)
/
float
(
cnt_neg
))
allpos
=
l
ogratio
.
items
(
)
# extract all pairs of words:log-ratio
allpos
=
l
ist
(
logratio
.
items
()
)
# extract all pairs of words:log-ratio
sortpos
=
sorted
(
allpos
,
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
# sort them
sortpos
=
sorted
(
allpos
,
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
# sort them
print
"Enriched words (sorted by ln pos/neg)"
print
(
"Enriched words (sorted by ln pos/neg)"
)
print
"Word
\t
ln pos/neg
\t
E-value"
print
(
"Word
\t
ln pos/neg
\t
E-value"
)
for
(
word
,
lgr
)
in
sortpos
[
0
:
100
]:
# Look at the top-entries according to log-ratio, compute e-values
for
(
word
,
lgr
)
in
sortpos
[
0
:
100
]:
# Look at the top-entries according to log-ratio, compute e-values
cnt_pos
=
int
(
pos
[
word
])
cnt_pos
=
int
(
pos
[
word
])
try
:
cnt_neg
=
int
(
neg
[
word
])
try
:
cnt_neg
=
int
(
neg
[
word
])
...
@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
...
@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pval
=
stats
.
getFETpval
(
cnt_pos
,
cnt_neg
,
len
(
seqs
)
*
(
PeakWidth
-
WordWidth
+
1
)
-
cnt_pos
,
len
(
seqs
)
*
(
len
(
seq
)
-
(
PeakMargin
*
2
+
PeakWidth
)
-
(
WordWidth
-
1
)
*
2
)
-
cnt_neg
,
False
)
pval
=
stats
.
getFETpval
(
cnt_pos
,
cnt_neg
,
len
(
seqs
)
*
(
PeakWidth
-
WordWidth
+
1
)
-
cnt_pos
,
len
(
seqs
)
*
(
len
(
seq
)
-
(
PeakMargin
*
2
+
PeakWidth
)
-
(
WordWidth
-
1
)
*
2
)
-
cnt_neg
,
False
)
# Correct for multiple testing (very conservatively)
# Correct for multiple testing (very conservatively)
eval
=
pval
*
len
(
allpos
)
eval
=
pval
*
len
(
allpos
)
print
"
%
s
\t
%6.3
f
\t
%
e"
%
(
word
,
lgr
,
eval
)
print
(
"
%
s
\t
%6.3
f
\t
%
e"
%
(
word
,
lgr
,
eval
)
)
def
getReverse
(
distribs
):
def
getReverse
(
distribs
):
""" Construct a new list of probability distributions of DNA, by
""" Construct a new list of probability distributions of DNA, by
...
@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
...
@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
except
KeyError
:
except
KeyError
:
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
return
return
print
"Motif
%
s:"
%
motif
print
(
"Motif
%
s:"
%
motif
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
.
display
(
format
=
'JASPAR'
)
pwm1
.
display
(
format
=
'JASPAR'
)
print
"Motif
%
s (reverse complement):"
%
motif
print
(
"Motif
%
s (reverse complement):"
%
motif
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
.
display
(
format
=
'JASPAR'
)
pwm2
.
display
(
format
=
'JASPAR'
)
...
@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
...
@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
# plot the average score curve
# plot the average score curve
# print >> sys.stderr, ""
# print >> sys.stderr, ""
x
=
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
))
# call center of sequence X=0
x
=
list
(
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
)
))
# call center of sequence X=0
lbl
=
"
%
s"
%
(
motif
)
lbl
=
"
%
s"
%
(
motif
)
plt
.
plot
(
x
,
avg_motif_score
,
label
=
lbl
)
plt
.
plot
(
x
,
avg_motif_score
,
label
=
lbl
)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
...
@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
...
@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
except
KeyError
:
except
KeyError
:
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
return
return
print
"Motif
%
s:"
%
motif
print
(
"Motif
%
s:"
%
motif
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
.
display
(
format
=
'JASPAR'
)
pwm1
.
display
(
format
=
'JASPAR'
)
print
"Motif
%
s (reverse complement):"
%
motif
print
(
"Motif
%
s (reverse complement):"
%
motif
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
.
display
(
format
=
'JASPAR'
)
pwm2
.
display
(
format
=
'JASPAR'
)
...
@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
...
@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
# divide number of sequences with hit by total number of hits
# divide number of sequences with hit by total number of hits
site_probability
=
[
(
cnt
/
n_seqs_with_hits
)
for
cnt
in
hit_count
]
site_probability
=
[
(
cnt
/
n_seqs_with_hits
)
for
cnt
in
hit_count
]
print
>>
sys
.
stderr
,
"Number of sequences with hit (score >=
%
f):
%
d"
%
(
threshold
,
n_seqs_with_hits
)
print
(
"Number of sequences with hit (score >=
%
f):
%
d"
%
(
threshold
,
n_seqs_with_hits
),
file
=
sys
.
stderr
)
# STATISTICS
# STATISTICS
# Get the cumulative hit counts in concentric windows
# Get the cumulative hit counts in concentric windows
...
@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
...
@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
for
i
in
range
(
hw
,
seq_len
-
motif_width
+
1
-
hw
):
for
i
in
range
(
hw
,
seq_len
-
motif_width
+
1
-
hw
):
smoothed_site_probability
[
i
]
=
sum
(
site_probability
[
i
-
hw
:
i
+
hw
+
1
])
/
(
2
*
hw
+
1
)
smoothed_site_probability
[
i
]
=
sum
(
site_probability
[
i
-
hw
:
i
+
hw
+
1
])
/
(
2
*
hw
+
1
)
x
=
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
))
# call center of sequence X=0
x
=
list
(
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
)
))
# call center of sequence X=0
lbl
=
"
%
s, t=
%.2
f"
%
(
motif
,
threshold
)
lbl
=
"
%
s, t=
%.2
f"
%
(
motif
,
threshold
)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt
.
plot
(
x
,
smoothed_site_probability
,
label
=
lbl
)
plt
.
plot
(
x
,
smoothed_site_probability
,
label
=
lbl
)
...
@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
...
@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
def
usage
(
name
,
errmsg
=
None
):
def
usage
(
name
,
errmsg
=
None
):
if
errmsg
!=
None
:
if
errmsg
!=
None
:
print
"Error:
%
s"
%
errmsg
print
(
"Error:
%
s"
%
errmsg
)
print
"""Usage:
%
s [options]
print
(
"""Usage:
%
s [options]
-f <fasta-filename> (required)
-f <fasta-filename> (required)
-d discover enriched words
-d discover enriched words
-w <word width, default 8>
-w <word width, default 8>
-p <peak width, default 100>
-p <peak width, default 100>
-m <peak margin, default 100>
-m <peak margin, default 100>
-s <JASPAR-ID> scan for JASPAR motif
-s <JASPAR-ID> scan for JASPAR motif
-h print this help"""
%
name
-h print this help"""
%
name
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
try
:
try
:
optlst
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'f:hds:j:w:p:m:'
)
optlst
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'f:hds:j:w:p:m:'
)
except
getopt
.
GetoptError
,
err
:
except
getopt
.
GetoptError
as
err
:
usage
(
sys
.
argv
[
0
],
str
(
err
))
usage
(
sys
.
argv
[
0
],
str
(
err
))
sys
.
exit
(
2
)
sys
.
exit
(
2
)
FILENAME
=
None
FILENAME
=
None
...
@@ -301,7 +301,7 @@ if __name__ == '__main__':
...
@@ -301,7 +301,7 @@ if __name__ == '__main__':
sys
.
exit
(
3
)
sys
.
exit
(
3
)
seqs
=
sequence
.
readFastaFile
(
FILENAME
,
sym
.
DNA_Alphabet_wN
)
seqs
=
sequence
.
readFastaFile
(
FILENAME
,
sym
.
DNA_Alphabet_wN
)
if
DISCOVER_MODE
:
if
DISCOVER_MODE
:
print
"Discover (f=
%
s; w=
%
d; p=
%
d; m=
%
d)"
%
(
FILENAME
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
print
(
"Discover (f=
%
s; w=
%
d; p=
%
d; m=
%
d)"
%
(
FILENAME
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
)
countWordsReport
(
seqs
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
countWordsReport
(
seqs
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
elif
SCAN_MODE
:
elif
SCAN_MODE
:
scanMotifReport
(
seqs
,
MOTIF_ID
)
scanMotifReport
(
seqs
,
MOTIF_ID
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment