Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
ac6c5d6b
Commit
ac6c5d6b
authored
Feb 14, 2017
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
python3_5
parent
934c2bff
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
860 additions
and
1533 deletions
+860
-1533
binomial.py
binomial.py
+3
-3
genome.py
genome.py
+32
-43
gibbs.py
gibbs.py
+2
-2
godata.py
godata.py
+41
-43
guide.py
guide.py
+172
-844
ml.py
ml.py
+2
-2
phylo.py
phylo.py
+19
-5
prob.py
prob.py
+6
-8
sam.py
sam.py
+44
-44
seqdata.py
seqdata.py
+38
-35
sequence.py
sequence.py
+31
-32
spred.py
spred.py
+3
-3
sstruct.py
sstruct.py
+2
-2
sym.py
sym.py
+1
-1
webservice.py
webservice.py
+447
-449
wordcount.py
wordcount.py
+17
-17
No files found.
binomial.py
View file @
ac6c5d6b
...
...
@@ -95,8 +95,8 @@ def betacf(a, b, x):
h
*=
delta
if
(
abs
(
delta
-
1.0
)
<
EPS
):
break
if
(
m
>
MAXIT
):
print
>>
sys
.
stderr
,
(
"a or b too big or MAXIT too small "
"in betacf"
)
if
(
m
>
MAXIT
):
print
(
(
"a or b too big or MAXIT too small "
"in betacf"
)
,
file
=
sys
.
stderr
)
return
h
...
...
@@ -118,5 +118,5 @@ def gammaln(x):
def
die
(
string
):
print
>>
sys
.
stderr
,
string
print
(
string
,
file
=
sys
.
stderr
)
genome.py
View file @
ac6c5d6b
...
...
@@ -105,7 +105,7 @@ class GeneExpression:
{'G2': array([ 4.1, -0.9]), 'G3': array([ 2.1, -2.1])}
"""
if
names
==
None
:
return
self
.
genes
.
keys
(
)
return
list
(
self
.
genes
.
keys
()
)
elif
isinstance
(
names
,
str
):
return
self
.
matrix
[
self
.
genes
[
names
],:]
else
:
...
...
@@ -148,7 +148,7 @@ class GeneExpression:
except
:
index
=
samples
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
self
.
matrix
[
ndx
,
index
]
return
mygenes
...
...
@@ -165,7 +165,7 @@ class GeneExpression:
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
index
])
.
argsort
()
except
:
sort_ndx
=
np
.
nan_to_num
(
self
.
matrix
[:,
sample
])
.
argsort
()
name_tuples
=
sorted
(
self
.
genes
.
items
(
),
key
=
lambda
v
:
v
[
1
])
# put all gene names in order of the matrix of profiles
name_tuples
=
sorted
(
list
(
self
.
genes
.
items
()
),
key
=
lambda
v
:
v
[
1
])
# put all gene names in order of the matrix of profiles
names
=
[]
if
descending
:
for
(
name
,
index
)
in
[
name_tuples
[
index
]
for
index
in
sort_ndx
[::
-
1
]]:
# reverse the order
...
...
@@ -199,7 +199,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding ratios. """
mygenes
=
{}
mdiv
=
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
]
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
mdiv
[
ndx
]
return
mygenes
...
...
@@ -208,7 +208,7 @@ class GeneExpression:
Creates and returns a gene dictionary with the corresponding log-ratios. """
mygenes
=
{}
mlr
=
np
.
log2
(
self
.
matrix
[:,
index1
]
/
self
.
matrix
[:,
index2
])
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
mygenes
[
name
]
=
mlr
[
ndx
]
return
mygenes
...
...
@@ -218,7 +218,7 @@ class GeneExpression:
index
=
self
.
genes
[
probeID
]
profile
=
self
.
matrix
[
index
,
:]
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
other
=
self
.
matrix
[
ndx
,
:]
mygenes
[
name
]
=
pearson
(
profile
,
other
)
return
mygenes
...
...
@@ -252,7 +252,7 @@ class GeneExpression:
# Calculate Z-score for the given column for each gene
zscore
=
(
self
.
matrix
[:,
index
]
-
mu
)
/
sd
mygenes
=
{}
for
(
name
,
ndx
)
in
self
.
genes
.
items
(
):
for
(
name
,
ndx
)
in
list
(
self
.
genes
.
items
()
):
try
:
mygenes
[
name
]
=
zscore
[
ndx
,
:]
except
IndexError
:
...
...
@@ -331,9 +331,9 @@ def readGEOFile(filename, id_column=0):
genes
[
name
]
=
values
if
len
(
genes
)
==
0
:
raise
RuntimeError
(
'No data in file'
)
print
'Data set
%
s contains
%
d entries'
%
(
dataset
,
len
(
genes
))
print
(
'Data set
%
s contains
%
d genes'
%
(
dataset
,
len
(
genes
)
))
if
cnt_null
>
0
:
print
'Data set has
%
d null-values'
%
(
cnt_null
)
print
(
'Data set has
%
d null-values'
%
(
cnt_null
)
)
return
GeneExpression
(
dataset
,
headers
[
2
:],
genes
)
...
...
@@ -357,40 +357,29 @@ def pearson(X, Y):
return
0
return
(
sum
-
n
*
(
Xmu
*
Ymu
))
/
(
n
*
math
.
sqrt
(
Xvar
)
*
math
.
sqrt
(
Yvar
))
# ------------------- Example ---------------------
# ------------------- Example
(basically exercise 7 in prac 9)
---------------------
ge3716
=
readGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS3716.soft'
)
if
__name__
==
'__main__'
:
ratio
=
GeneExpression
(
'GDS3716_ratio'
)
ratio
.
addSamples
(
'S1_ER+/Healthy'
,
ge3716
.
getRatio
(
33
,
0
))
ratio
.
addSamples
(
'S2_ER+/Healthy'
,
ge3716
.
getRatio
(
34
,
1
))
ratio
.
addSamples
(
'S3_ER+/Healthy'
,
ge3716
.
getRatio
(
35
,
2
))
ratio
.
addSamples
(
'S4_ER+/Healthy'
,
ge3716
.
getRatio
(
36
,
3
))
ratio
.
addSamples
(
'S5_ER+/Healthy'
,
ge3716
.
getRatio
(
37
,
4
))
ratio
.
addSamples
(
'S6_ER+/Healthy'
,
ge3716
.
getRatio
(
38
,
5
))
ratio
.
addSamples
(
'S7_ER+/Healthy'
,
ge3716
.
getRatio
(
39
,
6
))
ratio
.
addSamples
(
'S8_ER+/Healthy'
,
ge3716
.
getRatio
(
40
,
7
))
ratio
.
addSamples
(
'S9_ER+/Healthy'
,
ge3716
.
getRatio
(
41
,
8
))
ratio
.
addSamples
(
'S1_ER-/Healthy'
,
ge3716
.
getRatio
(
24
,
9
))
ratio
.
addSamples
(
'S2_ER-/Healthy'
,
ge3716
.
getRatio
(
25
,
10
))
ratio
.
addSamples
(
'S3_ER-/Healthy'
,
ge3716
.
getRatio
(
26
,
11
))
ratio
.
addSamples
(
'S4_ER-/Healthy'
,
ge3716
.
getRatio
(
27
,
12
))
ratio
.
addSamples
(
'S5_ER-/Healthy'
,
ge3716
.
getRatio
(
28
,
13
))
ratio
.
addSamples
(
'S6_ER-/Healthy'
,
ge3716
.
getRatio
(
29
,
14
))
ratio
.
addSamples
(
'S7_ER-/Healthy'
,
ge3716
.
getRatio
(
30
,
15
))
ratio
.
addSamples
(
'S8_ER-/Healthy'
,
ge3716
.
getRatio
(
31
,
16
))
ratio
.
addSamples
(
'S9_ER-/Healthy'
,
ge3716
.
getRatio
(
32
,
17
))
ratio
.
writeGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS3716_ratios.soft'
)
print
ge3716
.
getHeaders
()
z
=
ratio
.
getZScore
(
0
)
# NOT recommended! Ratios are NOT normally distributed! Use log-ratios instead.
ge38
=
readGEOFile
(
'/Users/mikael/workspace/COSC2000/GDS38.soft'
,
id_column
=
1
)
cln2_profile
=
ge38
.
getGenes
(
'CLN2'
)
pcorr
=
ge38
.
getPearson
(
'CLN2'
)
gp
=
GeneExpression
(
'Ex3'
,
'PC_CLN2'
,
pcorr
)
sorted
=
gp
.
sort
(
'PC_CLN2'
,
True
)
print
sorted
[
0
],
ge38
.
getGenes
(
sorted
[
0
])
print
sorted
[
1
],
ge38
.
getGenes
(
sorted
[
1
])
g
=
readGEOFile
(
'GDS3198.soft'
,
id_column
=
1
)
meanfold
=
{}
for
gene
in
g
.
genes
:
profile
=
g
.
getGenes
(
gene
)
meanfold
[
gene
]
=
(
np
.
log2
(
profile
[
0
]
/
profile
[
3
])
+
np
.
log2
(
profile
[
1
]
/
profile
[
4
])
+
np
.
log2
(
profile
[
2
]
/
profile
[
5
]))
/
3
import
matplotlib.pyplot
as
plt
scores
=
[
y
for
y
in
list
(
meanfold
.
values
())
if
not
np
.
isnan
(
y
)]
hist
,
bins
=
np
.
histogram
(
scores
,
bins
=
50
)
width
=
0.7
*
(
bins
[
1
]
-
bins
[
0
])
center
=
(
bins
[:
-
1
]
+
bins
[
1
:])
/
2
plt
.
bar
(
center
,
hist
,
align
=
'center'
,
width
=
width
)
plt
.
show
()
result
=
sorted
(
list
(
meanfold
.
items
()),
key
=
lambda
v
:
v
[
1
])
print
(
'========== Wildtype may down-regulate =========='
)
for
r
in
result
[
0
:
100
]:
print
(
r
[
0
],
r
[
1
])
print
(
'========== Wildtype may up-regulate =========='
)
for
r
in
result
[
-
1
:
-
100
:
-
1
]:
print
(
r
[
0
],
r
[
1
])
gibbs.py
View file @
ac6c5d6b
...
...
@@ -138,7 +138,7 @@ class GibbsMotif():
LL
+=
math
.
log
(
Qk
/
Pk
)
except
ZeroDivisionError
:
pass
print
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
print
(
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
)
# end main for-loop
self
.
q
=
q
...
...
@@ -312,7 +312,7 @@ class GibbsAlign():
LL
+=
math
.
log
(
Qk
/
Pk
)
except
ZeroDivisionError
:
pass
print
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
print
(
"LL @
%5
d=
\t
%5.2
f"
%
(
round
,
LL
)
)
# end main for-loop
self
.
q
=
q
...
...
godata.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
guide.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
ml.py
View file @
ac6c5d6b
...
...
@@ -21,7 +21,7 @@ class NN():
self
.
b_hid
=
numpy
.
random
.
randn
(
nHidden
)
# biases hidden layer
self
.
w_out
=
numpy
.
random
.
randn
(
nOutput
,
nHidden
)
# weights hid -> out
self
.
b_out
=
numpy
.
random
.
randn
(
nOutput
)
# biases output layer
print
"Constructed NN with
%
d inputs,
%
d hidden and
%
d output nodes."
%
(
self
.
ninput
,
len
(
self
.
hidden
),
len
(
self
.
output
))
print
(
"Constructed NN with
%
d inputs,
%
d hidden and
%
d output nodes."
%
(
self
.
ninput
,
len
(
self
.
hidden
),
len
(
self
.
output
)
))
def
writeFile
(
self
,
filename
):
""" Save NN to a file. """
...
...
@@ -110,7 +110,7 @@ class NN():
multi_targ
=
[
target
]
for
i
in
range
(
niter
):
mse
=
0.0
entries
=
range
(
len
(
multi_input
))
entries
=
list
(
range
(
len
(
multi_input
)
))
if
shuffle
:
random
.
shuffle
(
entries
)
for
p
in
entries
:
...
...
phylo.py
View file @
ac6c5d6b
...
...
@@ -2,7 +2,7 @@
Module with methods and classes for phylogeny.
@author: mikael
'''
##
import sequence
import
sequence
class
PhyloTree
:
""" Rooted, binary (bifurcating) tree for representing phylogenetic relationships.
...
...
@@ -140,7 +140,19 @@ class PhyloNode:
return
left
+
','
elif
self
.
left
and
self
.
right
:
return
'('
+
left
+
','
+
right
+
')'
+
dist
def
__le__
(
self
,
other
):
""" Returns indication of less than other node. """
return
other
and
self
.
__hash__
()
<=
other
.
__hash__
()
def
__eq__
(
self
,
other
):
""" Returns indication of equivalence to other node. """
return
other
and
self
.
__hash__
()
==
other
.
__hash__
()
def
__hash__
(
self
):
""" Returns hash of object. """
return
hash
((
self
.
label
,
self
.
dist
,
self
.
sequence
))
def
_printSequences
(
self
,
start
,
end
):
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
...
...
@@ -352,12 +364,12 @@ def runUPGMA(aln, measure, absoluteDistances = False):
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
gn
le below
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
ng
le below
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_dist
=
None
# The distance between them
for
pair
in
D
:
# check all pairs which should be merged
dist
=
D
[
pair
]
if
dist
<
closest_dist
or
closest_dist
==
None
:
if
closest_dist
==
None
or
dist
<
closest_dist
:
closest_dist
=
dist
closest_pair
=
pair
# So we know the closest, now we need to merge...
...
...
@@ -365,8 +377,10 @@ def runUPGMA(aln, measure, absoluteDistances = False):
y
=
closest_pair
[
1
]
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
_getkey
(
x
,
y
))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx
=
N
.
pop
(
x
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
)
# find number of sequences in y, remove the cluster from list N
Nx
=
N
.
pop
(
x
,
None
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
,
None
)
# find number of sequences in y, remove the cluster from list N
if
Nx
==
None
or
Ny
==
None
:
continue
dz
=
{}
# new distances to cluster z
for
w
in
N
:
# for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
...
...
prob.py
View file @
ac6c5d6b
...
...
@@ -277,7 +277,7 @@ def _readDistrib(linelist):
if
len
(
d
)
==
0
:
return
None
alpha
=
Alphabet
(
symstr
)
if
'*'
in
d
.
keys
(
):
# tot provided
if
'*'
in
list
(
d
.
keys
()
):
# tot provided
for
sym
in
d
:
if
sym
!=
'*'
:
d
[
sym
]
=
d
[
sym
]
*
d
[
'*'
]
...
...
@@ -338,7 +338,7 @@ def _readMultiCount(linelist, format = 'JASPAR'):
ncol
=
len
(
counts
)
if
len
(
name
)
==
1
:
# proper symbol
symcount
[
name
]
=
counts
alpha
=
Alphabet
(
''
.
join
(
symcount
.
keys
(
)))
alpha
=
Alphabet
(
''
.
join
(
list
(
symcount
.
keys
()
)))
distribs
=
[]
for
col
in
range
(
ncol
):
d
=
dict
([(
sym
,
symcount
[
sym
][
col
])
for
sym
in
symcount
])
...
...
@@ -412,7 +412,7 @@ def readMultiCount(filename, format = 'JASPAR'):
"""
d
=
readMultiCounts
(
filename
,
format
=
format
)
if
len
(
d
)
>
0
:
return
d
.
values
(
)[
0
]
return
list
(
d
.
values
()
)[
0
]
#################################################################################################
# Joint class
...
...
@@ -628,12 +628,12 @@ class IndepJoint(Joint):
def
displayMatrix
(
self
,
count
=
False
):
""" Pretty-print matrix """
print
"
\t
%
s"
%
(
''
.
join
(
"
\t
%5
d"
%
(
i
+
1
)
for
i
in
range
(
len
(
self
.
alphas
))))
print
((
"
\t
%
s"
%
(
''
.
join
(
"
\t
%5
d"
%
(
i
+
1
)
for
i
in
range
(
len
(
self
.
alphas
))
))))
for
a
in
self
.
alphas
[
0
]:
if
count
:
print
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5
d"
%
(
y
)
for
y
in
self
.
getRow
(
a
,
True
)))
print
((
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5
d"
%
(
y
)
for
y
in
self
.
getRow
(
a
,
True
))
)))
else
:
print
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5.3
f"
%
(
y
)
for
y
in
self
.
getRow
(
a
)))
print
((
"
%
s
\t
%
s"
%
(
a
,
''
.
join
(
"
\t
%5.3
f"
%
(
y
)
for
y
in
self
.
getRow
(
a
))
)))
def
__str__
(
self
):
""" Text representation of the table. Note that size is an issue so big tables
...
...
@@ -718,5 +718,3 @@ class NaiveBayes():
prob
*=
condprob
[
i
][
key
[
i
]]
or
0.0
out
.
observe
(
outsym
,
prob
)
return
out
sam.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
seqdata.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
sequence.py
View file @
ac6c5d6b
...
...
@@ -55,10 +55,11 @@ class Sequence(object):
['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
'R', 'S', 'T', 'V', 'W', 'Y'] """
try
:
# convert sequence data into a compact array representation
self
.
sequence
=
array
.
array
(
'c'
,
''
.
join
([
s
.
upper
()
for
s
in
sequence
]))
except
TypeError
:
raise
RuntimeError
(
'Sequence data is not specified correctly: must be iterable'
)
#try: # convert sequence data into a compact array representation
# self.sequence = sequence.encode("utf-8") #array.array('b', ''.join([s.upper() for s in sequence]))
#except TypeError:
# raise RuntimeError('S"""""""""""""""""""""""""""""""equence data is not specified correctly: must be iterable')
self
.
sequence
=
sequence
# Assign an alphabet
self
.
alphabet
=
None
...
...
@@ -133,15 +134,15 @@ class Sequence(object):
Calling self.__getitem__(3) is equivalent to self[3]
"""
if
type
(
ndx
)
is
slice
:
return
self
.
sequence
[
ndx
]
.
tostring
(
)
return
''
.
join
(
self
.
sequence
[
ndx
]
)
else
:
return
self
.
sequence
[
ndx
]
def
writeFasta
(
self
):
""" Write one sequence in FASTA format to a string and return it. """
fasta
=
'>'
+
self
.
name
+
' '
+
self
.
info
+
'
\n
'
data
=
self
.
sequence
.
tostring
(
)
nlines
=
(
len
(
self
.
sequence
)
-
1
)
/
60
+
1
data
=
''
.
join
(
self
.
sequence
)
nlines
=
int
(
math
.
ceil
((
len
(
self
.
sequence
)
-
1
)
/
60
+
1
))
for
i
in
range
(
nlines
):
lineofseq
=
''
.
join
(
data
[
i
*
60
:
(
i
+
1
)
*
60
])
+
'
\n
'
fasta
+=
lineofseq
...
...
@@ -164,7 +165,7 @@ class Sequence(object):
def
find
(
self
,
findme
):
""" Find the position of the specified symbol or sub-sequence """
return
self
.
sequence
.
tostring
(
)
.
find
(
findme
)
return
''
.
join
(
self
.
sequence
)
.
find
(
findme
)
"""
Below are some useful methods for loading data from strings and files.
...
...
@@ -438,8 +439,8 @@ class Alignment():
column index, entropy, number of gaps, and symbols in order of decreasing probability.
theta1 is the threshold for displaying symbols in upper case,
theta2 is the threshold for showing symbols at all, and in lower case. """
print
"Alignment of
%
d sequences, with
%
d columns"
%
(
len
(
self
.
seqs
),
self
.
alignlen
)
print
"Column
\t
Entropy
\t
Gaps
\t
Prob
\t
Conserv
\t
Symbols (Up>=
%.2
f;Low>=
%.2
f)
\n
"
%
(
theta1
,
theta2
)
print
((
"Alignment of
%
d sequences, with
%
d columns"
%
(
len
(
self
.
seqs
),
self
.
alignlen
))
)
print
((
"Column
\t
Entropy
\t
Gaps
\t
Prob
\t
Conserv
\t
Symbols (Up>=
%.2
f;Low>=
%.2
f)
\n
"
%
(
theta1
,
theta2
))
)
for
col
in
range
(
self
.
alignlen
):
d
=
Distrib
(
self
.
alphabet
)
gaps
=
0
...
...
@@ -448,21 +449,21 @@ class Alignment():
d
.
observe
(
seq
[
col
])
else
:
gaps
+=
1
print
(
col
+
1
),
"
\t
%5.3
f"
%
d
.
entropy
(),
"
\t
%4
d
\t
"
%
gaps
,
print
(((
col
+
1
),
"
\t
%5.3
f"
%
d
.
entropy
(),
"
\t
%4
d
\t
"
%
gaps
,))
symprobs
=
d
.
getProbsort
()
(
_
,
maxprob
)
=
symprobs
[
0
]
if
maxprob
>=
theta1
:
print
"
%
d
\t
TRUE
\t
"
%
int
(
maxprob
*
100
),
print
((
"
%
d
\t
TRUE
\t
"
%
int
(
maxprob
*
100
),))
else
:
print
"
%
d
\t\t
"
%
int
(
maxprob
*
100
),
print
((
"
%
d
\t\t
"
%
int
(
maxprob
*
100
),))
for
(
sym
,
prob
)
in
symprobs
:
if
prob
>=
theta1
:
print
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),
print
((
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),))
elif
prob
>=
theta2
and
lowercase
:
print
sym
.
lower
(),
"
%
d
%%
"
%
int
(
prob
*
100
),
print
((
sym
.
lower
(),
"
%
d
%%
"
%
int
(
prob
*
100
),))
elif
prob
>=
theta2
:
print
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),
print
print
((
sym
,
"
%
d
%%
"
%
int
(
prob
*
100
),))
print
()
def
saveConsensus
(
self
,
myseq
,
filename
,
theta1
=
0.2
,
theta2
=
0.05
,
lowercase
=
True
,
compact
=
False
):
""" Display a table with rows for each alignment column, showing
...
...
@@ -644,7 +645,7 @@ class Alignment():
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
""" Generate HTML that displays the alignment in color.
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
...
...
@@ -681,10 +682,9 @@ class Alignment():
htmlstr
+=
html
htmlstr
+=
'<pre>'
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
htmlstr
)
fh
.
write
(
'</body></html>
\n
'
)
fh
.
close
()
with
open
(
filename
,
'w+'
)
as
fh
:
fh
.
write
(
htmlstr
)
fh
.
write
(
'</body></html>
\n
'
)
else
:
return
htmlstr
...
...
@@ -985,12 +985,12 @@ def readClustal(string, alphabet):
index
=
name
.
find
(
'/'
)
if
index
>=
0
:
name
=
name
[
0
:
index
]
if
seqs
.
has_key
(
name
)
:
if
name
in
seqs
:
seqs
[
name
]
+=
seqstr
else
:
seqs
[
name
]
=
seqstr
sequences
=
[]
for
name
,
seqstr
in
seqs
.
items
(
):
for
name
,
seqstr
in
list
(
seqs
.
items
()
):
sequences
.
append
(
Sequence
(
seqstr
,
alphabet
,
name
,
gappy
=
True
))
return
Alignment
(
sequences
)
...
...
@@ -1180,12 +1180,12 @@ class PWM(object):
def
display
(
self
,
format
=
'COLUMN'
):
if
format
==
'COLUMN'
:
print
"
\t
%
s"
%
(
' '
.
join
(
"
%5
d"
%
(
i
+
1
)
for
i
in
range
(
self
.
length
)))
print
((
"
\t
%
s"
%
(
' '
.
join
(
"
%5
d"
%
(
i
+
1
)
for
i
in
range
(
self
.
length
))
)))
for
j
in
range
(
len
(
self
.
alphabet
)):
print
"
%
s
\t
%
s"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]
))
print
((
"
%
s
\t
%
s"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]))
))
elif
format
==
'JASPAR'
:
for
j
in
range
(
len
(
self
.
alphabet
)):
print
"
%
s
\t
[
%
s]"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]
))
print
((
"
%
s
\t
[
%
s]"
%
(
self
.
alphabet
[
j
],
' '
.
join
(
"
%+6.2
f"
%
(
y
)
for
y
in
self
.
m
[
j
]))
))
def
search
(
self
,
sequence
,
lowerBound
=
0
):
""" Find matches to the motif in a specified sequence. Returns a list
...
...
@@ -1229,7 +1229,7 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
""" Get the sequence identified by the given ID from the given database
(e.g. 'uniprotkb', 'refseqn' or 'refseqp'), and return it as a Sequence
object. An error is caused if the sequence ID is not found. If start and
end are given, then only that section of the sequence is returned.
end are given, then only that section of the sequence is returned.
Note: more flexible search options are supported by using webservice.fetch
directly."""
...
...
@@ -1237,12 +1237,12 @@ def getSequence(id, database = 'uniprotkb', start=None, end=None):
for
i
in
range
(
MAX_TRY
):
try
:
fastaData
=
fetch
(
id
,
database
)
fastaData
=
fetch
(
id
,
database
)
.
decode
(
"utf-8"
)
seq
=
readFasta
(
fastaData
)[
0
]
break
except
:
from
time
import
sleep
print
'Failed on {i}th try for id {id}'
.
format
(
i
=
i
,
id
=
id
)
print
((
'Failed on {i}th try for id {id}'
.
format
(
i
=
i
,
id
=
id
))
)
sleep
(
0.1
)
try
:
return
Sequence
(
seq
[
start
:
end
],
seq
.
alphabet
,
seq
.
name
,
seq
.
info
)
...
...
@@ -1319,5 +1319,4 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'):
if
__name__
==
'__main__'
:
seqs
=
readFastaFile
(
'/Users/mikael/ASR/CYP11/CYP11_aln_full.fa'
,
Protein_wX
,
gappy
=
True
)
print
'Read'
,
len
(
seqs
),
'sequences'
print
((
'Read'
,
len
(
seqs
),
'sequences'
))
spred.py
View file @
ac6c5d6b
...
...
@@ -71,7 +71,7 @@ class SeqNN():
im
[
row
,
_onehotIndex
(
alpha
,
subseqs
[
k
])]
=
1
if
targets
:
om
[
row
,
self
.
outp_alpha
.
index
(
subtarg
[
k
])]
=
1
row
+=
1
print
"There are"
,
row
,
"entries in data set"
print
(
"There are"
,
row
,
"entries in data set"
)
if
targets
:
return
im
,
om
else
:
...
...
@@ -85,7 +85,7 @@ class SeqNN():
im
,
om
=
self
.
_encodeseq
(
seqs
,
targets
)
for
i
in
range
(
niter
):
# train first NN
rmse
=
self
.
nn1
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
print
i
,
":"
,
rmse
print
(
i
,
":"
,
rmse
)
if
not
self
.
cascade
:
# if there's no cascaded NN, finish here
return
rmse
nn1seqs
=
[]
# a list of new SS sequences ...
...
...
@@ -95,7 +95,7 @@ class SeqNN():
im
,
om
=
self
.
_encodeseq
(
nn1seqs
,
targets
)
# construct input/output patterns from SS sequences
for
i
in
range
(
niter
):
# train cascaded NN
rmse
=
self
.
nn2
.
train
(
im
,
om
,
eta
=
eta
,
niter
=
1
)
print
i
,
":"
,
rmse
print
(
i
,
":"
,
rmse
)
return
rmse
def
testAll
(
self
,
seqs
,
targets
):
...
...
sstruct.py
View file @
ac6c5d6b
...
...
@@ -85,7 +85,7 @@ def extendDownstream(scores, calls, width = 4):
specified width average of 100.
"""
sum
=
0.0
order
=
range
(
0
,
len
(
calls
)
-
1
,
+
1
)
# we are extending calls downstream
order
=
list
(
range
(
0
,
len
(
calls
)
-
1
,
+
1
)
)
# we are extending calls downstream
cnt
=
0
for
i
in
order
:
# extend to the right
if
calls
[
i
]:
# to extend a call is required in the first place
...
...
@@ -105,7 +105,7 @@ def extendUpstream(scores, calls, width = 4):
AND extend this list upstream containing a specified width average of 100.
"""
sum
=
0.0
order
=
range
(
len
(
calls
)
-
1
,
0
,
-
1
)
# we are extending calls upstream/to-the-left
order
=
list
(
range
(
len
(
calls
)
-
1
,
0
,
-
1
)
)
# we are extending calls upstream/to-the-left
cnt
=
0
for
i
in
order
:
# extend to the right
if
calls
[
i
]:
# a requirement to extend is to have a call in the first place
...
...
sym.py
View file @
ac6c5d6b
...
...
@@ -291,7 +291,7 @@ class TupleEntries(object):
def
__iter__
(
self
):
return
self
def
next
(
self
):
def
__next__
(
self
):
""" Step through sequence of entries, either
(if not sparse) with a step-size based on alphabet-sizes and what symbols are specified or
(if sparse) with calls to tuple store based on all possible symbol combinations."""
...
...
webservice.py
View file @
ac6c5d6b
This diff is collapsed.
Click to expand it.
wordcount.py
View file @
ac6c5d6b
...
...
@@ -45,7 +45,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
neg
[
word
]
=
1
logratio
=
RCDict
()
# DNA dictionary for storing the log-ration between pos and neg
for
(
word
,
cnt_pos
)
in
pos
.
items
(
):
for
(
word
,
cnt_pos
)
in
list
(
pos
.
items
()
):
cnt_neg
=
0.0001
try
:
cnt_neg
=
neg
[
word
]
...
...
@@ -53,10 +53,10 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pass
logratio
[
word
]
=
math
.
log
(
float
(
cnt_pos
)
/
float
(
cnt_neg
))
allpos
=
l
ogratio
.
items
(
)
# extract all pairs of words:log-ratio
allpos
=
l
ist
(
logratio
.
items
()
)
# extract all pairs of words:log-ratio
sortpos
=
sorted
(
allpos
,
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
# sort them
print
"Enriched words (sorted by ln pos/neg)"
print
"Word
\t
ln pos/neg
\t
E-value"
print
(
"Enriched words (sorted by ln pos/neg)"
)
print
(
"Word
\t
ln pos/neg
\t
E-value"
)
for
(
word
,
lgr
)
in
sortpos
[
0
:
100
]:
# Look at the top-entries according to log-ratio, compute e-values
cnt_pos
=
int
(
pos
[
word
])
try
:
cnt_neg
=
int
(
neg
[
word
])
...
...
@@ -65,7 +65,7 @@ def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
pval
=
stats
.
getFETpval
(
cnt_pos
,
cnt_neg
,
len
(
seqs
)
*
(
PeakWidth
-
WordWidth
+
1
)
-
cnt_pos
,
len
(
seqs
)
*
(
len
(
seq
)
-
(
PeakMargin
*
2
+
PeakWidth
)
-
(
WordWidth
-
1
)
*
2
)
-
cnt_neg
,
False
)
# Correct for multiple testing (very conservatively)
eval
=
pval
*
len
(
allpos
)
print
"
%
s
\t
%6.3
f
\t
%
e"
%
(
word
,
lgr
,
eval
)
print
(
"
%
s
\t
%6.3
f
\t
%
e"
%
(
word
,
lgr
,
eval
)
)
def
getReverse
(
distribs
):
""" Construct a new list of probability distributions of DNA, by
...
...
@@ -94,10 +94,10 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
except
KeyError
:
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
return
print
"Motif
%
s:"
%
motif
print
(
"Motif
%
s:"
%
motif
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
.
display
(
format
=
'JASPAR'
)
print
"Motif
%
s (reverse complement):"
%
motif
print
(
"Motif
%
s (reverse complement):"
%
motif
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
.
display
(
format
=
'JASPAR'
)
...
...
@@ -141,7 +141,7 @@ def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
# plot the average score curve
# print >> sys.stderr, ""
x
=
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
))
# call center of sequence X=0
x
=
list
(
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
)
))
# call center of sequence X=0
lbl
=
"
%
s"
%
(
motif
)
plt
.
plot
(
x
,
avg_motif_score
,
label
=
lbl
)
#plt.plot(x, smoothed_avg_motif_score, label=lbl)
...
...
@@ -187,10 +187,10 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
except
KeyError
:
usage
(
sys
.
argv
[
0
],
"Unknown motif
%
s"
%
motif
)
return
print
"Motif
%
s:"
%
motif
print
(
"Motif
%
s:"
%
motif
)
pwm1
=
sequence
.
PWM
(
fg1
,
bg
)
pwm1
.
display
(
format
=
'JASPAR'
)
print
"Motif
%
s (reverse complement):"
%
motif
print
(
"Motif
%
s (reverse complement):"
%
motif
)
pwm2
=
sequence
.
PWM
(
fg2
,
bg
)
pwm2
.
display
(
format
=
'JASPAR'
)
...
...
@@ -222,7 +222,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
# divide number of sequences with hit by total number of hits
site_probability
=
[
(
cnt
/
n_seqs_with_hits
)
for
cnt
in
hit_count
]
print
>>
sys
.
stderr
,
"Number of sequences with hit (score >=
%
f):
%
d"
%
(
threshold
,
n_seqs_with_hits
)
print
(
"Number of sequences with hit (score >=
%
f):
%
d"
%
(
threshold
,
n_seqs_with_hits
),
file
=
sys
.
stderr
)
# STATISTICS
# Get the cumulative hit counts in concentric windows
...
...
@@ -250,7 +250,7 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
for
i
in
range
(
hw
,
seq_len
-
motif_width
+
1
-
hw
):
smoothed_site_probability
[
i
]
=
sum
(
site_probability
[
i
-
hw
:
i
+
hw
+
1
])
/
(
2
*
hw
+
1
)
x
=
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
))
# call center of sequence X=0
x
=
list
(
range
(
-
(
seq_len
/
2
),
(
seq_len
/
2
)
))
# call center of sequence X=0
lbl
=
"
%
s, t=
%.2
f"
%
(
motif
,
threshold
)
#lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
plt
.
plot
(
x
,
smoothed_site_probability
,
label
=
lbl
)
...
...
@@ -263,20 +263,20 @@ def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices
def
usage
(
name
,
errmsg
=
None
):
if
errmsg
!=
None
:
print
"Error:
%
s"
%
errmsg
print
"""Usage:
%
s [options]
print
(
"Error:
%
s"
%
errmsg
)
print
(
"""Usage:
%
s [options]
-f <fasta-filename> (required)
-d discover enriched words
-w <word width, default 8>
-p <peak width, default 100>
-m <peak margin, default 100>
-s <JASPAR-ID> scan for JASPAR motif
-h print this help"""
%
name
-h print this help"""
%
name
)
if
__name__
==
'__main__'
:
try
:
optlst
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'f:hds:j:w:p:m:'
)
except
getopt
.
GetoptError
,
err
:
except
getopt
.
GetoptError
as
err
:
usage
(
sys
.
argv
[
0
],
str
(
err
))
sys
.
exit
(
2
)
FILENAME
=
None
...
...
@@ -301,7 +301,7 @@ if __name__ == '__main__':
sys
.
exit
(
3
)
seqs
=
sequence
.
readFastaFile
(
FILENAME
,
sym
.
DNA_Alphabet_wN
)
if
DISCOVER_MODE
:
print
"Discover (f=
%
s; w=
%
d; p=
%
d; m=
%
d)"
%
(
FILENAME
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
print
(
"Discover (f=
%
s; w=
%
d; p=
%
d; m=
%
d)"
%
(
FILENAME
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
)
countWordsReport
(
seqs
,
WORD_WIDTH
,
PEAK_WIDTH
,
PEAK_MARGIN
)
elif
SCAN_MODE
:
scanMotifReport
(
seqs
,
MOTIF_ID
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment