Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
ffe94c34
Commit
ffe94c34
authored
Mar 09, 2017
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
phylo_bugfixes
parent
85897a23
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
64 deletions
+49
-64
guide.py
guide.py
+10
-18
phylo.py
phylo.py
+29
-38
sequence.py
sequence.py
+10
-8
No files found.
guide.py
View file @
ffe94c34
...
...
@@ -670,7 +670,7 @@ def readGeoFile(filename, id_column = 0):
# Our implementations are mainly serviced by EBI.
###############################################################################
def
getSequence
(
entryId
,
dbName
=
'uniprotkb'
,
alphabet
=
Protein_Alphabet
,
format
=
'fasta'
):
def
getSequence
(
entryId
,
dbName
=
'uniprotkb'
,
alphabet
=
Protein_Alphabet
,
format
=
'fasta'
,
debug
:
bool
=
True
):
""" Retrieve a single entry from a database
entryId: ID for entry e.g. 'P63166' or 'SUMO1_MOUSE'
dbName: name of database e.g. 'uniprotkb' or 'pdb' or 'refseqn'; see http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/dbfetch.databases for available databases
...
...
@@ -681,6 +681,8 @@ def getSequence(entryId, dbName = 'uniprotkb', alphabet = Protein_Alphabet, form
entryId
=
entryId
.
decode
(
"utf-8"
)
url
=
'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?style=raw&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
try
:
if
debug
:
print
(
'DEBUG: Querying URL: {0}'
.
format
(
url
))
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
if
format
==
'fasta'
:
return
readFastaString
(
data
.
decode
(
"utf-8"
),
alphabet
)[
0
]
...
...
@@ -1200,7 +1202,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
nodes
[
i
]
.
dist
=
0.0
N
[
nodes
[
i
]]
=
1
# each cluster contains a single sequence
for
j
in
range
(
0
,
i
):
D
[
_getkey
(
nodes
[
i
],
nodes
[
j
])]
=
M
[
i
,
j
]
D
[
frozenset
([
nodes
[
i
],
nodes
[
j
]
])]
=
M
[
i
,
j
]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
...
...
@@ -1211,26 +1213,25 @@ def runUPGMA(aln, measure, absoluteDistances=False):
closest_dist
=
None
# The distance between them
for
pair
in
D
:
# check all pairs which should be merged
dist
=
D
[
pair
]
if
dist
<
closest_dist
or
closest_dist
==
None
:
if
closest_dist
==
None
or
dist
<
closest_dist
:
closest_dist
=
dist
closest_pair
=
pair
closest_pair
=
list
(
pair
)
# So we know the closest, now we need to merge...
x
=
closest_pair
[
0
]
# See Zvelebil and Baum p. 278 for notation
y
=
closest_pair
[
1
]
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
_getkey
(
x
,
y
))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
z
.
dist
=
D
.
pop
(
frozenset
([
x
,
y
]))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx
=
N
.
pop
(
x
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
)
# find number of sequences in y, remove the cluster from list N
dz
=
{}
# new distances to cluster z
for
w
in
N
:
# for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw
=
D
.
pop
(
_getkey
(
x
,
w
))
# retrieve and remove distance from D: x to w
dyw
=
D
.
pop
(
_getkey
(
y
,
w
))
# retrieve and remove distance from D: y to w
dxw
=
D
.
pop
(
frozenset
([
x
,
w
]
))
# retrieve and remove distance from D: x to w
dyw
=
D
.
pop
(
frozenset
([
y
,
w
]
))
# retrieve and remove distance from D: y to w
dz
[
w
]
=
(
Nx
*
dxw
+
Ny
*
dyw
)
/
(
Nx
+
Ny
)
# distance: z to w
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
_getkey
(
z
,
w
)]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
D
[
frozenset
([
z
,
w
]
)]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
z
.
right
=
y
nodes
.
append
(
z
)
...
...
@@ -1240,15 +1241,6 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
def
_getkey
(
node1
,
node2
):
""" Construct canonical (unordered) key for two symbols """
if
node1
<=
node2
:
return
tuple
([
node1
,
node2
])
else
:
return
tuple
([
node2
,
node1
])
def
_findComma
(
string
,
level
=
0
):
""" Find first comma at specified level of embedding """
mylevel
=
0
...
...
phylo.py
View file @
ffe94c34
...
...
@@ -343,68 +343,59 @@ class PhyloNode:
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def
runUPGMA
(
aln
,
measure
,
absoluteDistances
=
False
):
def
runUPGMA
(
aln
,
measure
,
absoluteDistances
=
False
):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
If absoluteDistances is True, the tree will be assigned the total distance from provided species.
Otherwise, the relative addition at each path will be assigned."""
D
=
{}
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
nodes
=
[
PhyloNode
(
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
nodes
=
[
PhyloNode
(
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for
i
in
range
(
len
(
nodes
)):
nodes
[
i
]
.
sequence
=
aln
.
seqs
[
i
]
nodes
[
i
]
.
dist
=
0.0
N
[
nodes
[
i
]]
=
1
# each cluster contains a single sequence
N
[
nodes
[
i
]]
=
1
# each cluster contains a single sequence
for
j
in
range
(
0
,
i
):
D
[
_getkey
(
nodes
[
i
],
nodes
[
j
])]
=
M
[
i
,
j
]
D
[
frozenset
([
nodes
[
i
],
nodes
[
j
]
])]
=
M
[
i
,
j
]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a sing
le below
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_dist
=
None
# The distance between them
for
pair
in
D
:
# check all pairs which should be merged
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a sign
le below
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_dist
=
None
# The distance between them
for
pair
in
D
:
# check all pairs which should be merged
dist
=
D
[
pair
]
if
closest_dist
==
None
or
dist
<
closest_dist
:
closest_dist
=
dist
closest_pair
=
pair
closest_pair
=
list
(
pair
)
# So we know the closest, now we need to merge...
x
=
closest_pair
[
0
]
# See Zvelebil and Baum p. 278 for notation
x
=
closest_pair
[
0
]
# See Zvelebil and Baum p. 278 for notation
y
=
closest_pair
[
1
]
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
_getkey
(
x
,
y
))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx
=
N
.
pop
(
x
,
None
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
,
None
)
# find number of sequences in y, remove the cluster from list N
if
Nx
==
None
or
Ny
==
None
:
continue
dz
=
{}
# new distances to cluster z
for
w
in
N
:
# for each node w ...
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
frozenset
([
x
,
y
]))
/
2.0
# assign the absolute distance, travelled so far, note: this will change to relative distance later
Nx
=
N
.
pop
(
x
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
)
# find number of sequences in y, remove the cluster from list N
dz
=
{}
# new distances to cluster z
for
w
in
N
:
# for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw
=
D
.
pop
(
_getkey
(
x
,
w
))
# retrieve and remove distance from D: x to w
dyw
=
D
.
pop
(
_getkey
(
y
,
w
))
# retrieve and remove distance from D: y to w
dz
[
w
]
=
(
Nx
*
dxw
+
Ny
*
dyw
)
/
(
Nx
+
Ny
)
# distance: z to w
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
_getkey
(
z
,
w
)]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
dxw
=
D
.
pop
(
frozenset
([
x
,
w
]))
# retrieve and remove distance from D: x to w
dyw
=
D
.
pop
(
frozenset
([
y
,
w
]))
# retrieve and remove distance from D: y to w
dz
[
w
]
=
(
Nx
*
dxw
+
Ny
*
dyw
)
/
(
Nx
+
Ny
)
# distance: z to w
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
z
.
right
=
y
nodes
.
append
(
z
)
if
not
absoluteDistances
:
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
y
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down right path
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
def
_getkey
(
node1
,
node2
):
""" Construct canonical (unordered) key for two symbols """
if
node1
<=
node2
:
return
tuple
([
node1
,
node2
])
else
:
return
tuple
([
node2
,
node1
])
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
y
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down right path
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
...
...
sequence.py
View file @
ffe94c34
...
...
@@ -17,7 +17,9 @@ PWM -- defines a weight matrix that can score any site in actual sequences
Incorporates methods for loading and saving files relevant to the above (e.g. FASTA, ALN, substitution matrices)
and methods for retrieving relevant data from web services
This code has gone through many updates and has benefitted from kind contributions of course participants.
This code has been adapted to Python 3.5 in 2017
This code has gone through many updates and has benefited from kind contributions of course participants.
Please keep suggestions coming!
Email: m.boden@uq.edu.au
"""
...
...
@@ -91,7 +93,7 @@ class Sequence(object):
def
__len__
(
self
):
""" Defines what the "len" operator returns for an instance of Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print
len(seq
)
>>> print
(len(seq)
)
9
"""
return
len
(
self
.
sequence
)
...
...
@@ -107,7 +109,7 @@ class Sequence(object):
""" Defines how a Sequence should be "iterated", i.e. what its elements are, e.g.
>>> seq = Sequence('AGGAT', DNA_Alphabet)
>>> for sym in seq:
print
sym
print
(sym)
will print A, G, G, A, T (each on a separate row)
"""
tsyms
=
tuple
(
self
.
sequence
)
...
...
@@ -116,12 +118,12 @@ class Sequence(object):
def
__contains__
(
self
,
item
):
""" Defines what is returned when the "in" operator is used on a Sequence, e.g.
>>> seq = Sequence('ACGGTAGGA', DNA_Alphabet)
>>> print
'T' in seq
>>> print
('T' in seq)
True
which is equivalent to
>>> print
seq.__contains__('T'
)
>>> print
(seq.__contains__('T')
)
True
>>> print
'X' in seq
>>> print
('X' in seq)
False
"""
for
sym
in
self
.
sequence
:
...
...
@@ -319,7 +321,7 @@ class Alignment():
one symbol is gap '-'
Example usage:
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> print
Alignment(seqs
)
>>> print
(Alignment(seqs)
)
THIS-LI-NE-
--ISALIGNED
"""
...
...
@@ -351,7 +353,7 @@ class Alignment():
""" Defines what the "len" operator returns for an instance of Alignment, e.g.
>>> seqs = [Sequence('THIS-LI-NE', Protein_Alphabet, gappy = True), Sequence('--ISALIGNED', Protein_Alphabet, gappy = True)]
>>> aln = Alignment(seqs)
>>> print
len(aln
)
>>> print
(len(aln)
)
2
"""
return
len
(
self
.
seqs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment