Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
8fa94535
Commit
8fa94535
authored
Jun 03, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_regex_search_in_gappy_sequences
parent
9889428a
Pipeline
#46
failed with stages
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
300 additions
and
291 deletions
+300
-291
annotation_test.py
annotation_test.py
+13
-11
annotations.py
annotations.py
+3
-0
gtf.py
gtf.py
+1
-1
phylo.py
phylo.py
+149
-211
sequence.py
sequence.py
+38
-23
sym.py
sym.py
+33
-2
webservice.py
webservice.py
+63
-43
No files found.
annotation_test.py
View file @
8fa94535
import
annotations
import
annotations
import
phylo
import
phylo
tree
=
phylo
.
parseNewick
(
"(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);"
)
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...
@@ -10,17 +12,17 @@ import phylo
...
@@ -10,17 +12,17 @@ import phylo
working_dir
=
"/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
tree
=
phylo
.
read_nexus
(
working_dir
+
"annotation_simple.nexus"
)
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
print
(
tree
)
#
print (tree)
print
(
tree
.
nexus_annotations
.
annotations
)
#
print (tree.nexus_annotations.annotations)
#
tree
.
swap_annotations
(
"PDB"
)
#
tree.swap_annotations("PDB")
#
print
(
tree
)
#
print (tree)
print
(
tree
.
nexus_annotations
.
annotations
)
#
print (tree.nexus_annotations.annotations)
#
#
# tree.write_to_nexus(working_dir + "output.nexus")
# tree.write_to_nexus(working_dir + "output.nexus")
...
...
annotations.py
View file @
8fa94535
from
collections
import
defaultdict
from
collections
import
defaultdict
from
phylo
import
*
from
phylo
import
*
import
phylo
import
matplotlib
import
matplotlib
import
random
import
random
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
def
generate_colour_list
(
self
,
num
):
def
generate_colour_list
(
self
,
num
):
return
num
return
num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
gtf.py
View file @
8fa94535
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f
.
close
()
f
.
close
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
77
.gtf'
)
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
89
.gtf'
)
print
(
bf
.
chroms
.
keys
())
print
(
bf
.
chroms
.
keys
())
g
=
bf
.
generate
(
'chr12'
)
g
=
bf
.
generate
(
'chr12'
)
print
(
next
(
g
))
print
(
next
(
g
))
...
...
phylo.py
View file @
8fa94535
'''
'''
Module with methods and classes for phylogeny.
Module with methods and classes for phylogeny.
Extended to handle n-ary trees (Jan 2019).
@author: mikael
@author: mikael
'''
'''
import
sequence
import
sequence
from
collections
import
defaultdict
from
collections
import
defaultdict
import
annotations
import
annotations
class
PhyloTree
:
class
PhyloTree
:
""" Rooted,
binary (bifurcating)
tree for representing phylogenetic relationships.
""" Rooted,
n-ary
tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of
single, bifurcating
rooted tree by UPGMA.
generation of rooted tree by UPGMA.
Known issues:
Binary only;
Parsimony does not handle gaps in alignment.
Known issues: Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
Programmers should note that almost all functionality is implemented through recursion. """
def
__init__
(
self
,
root
):
def
__init__
(
self
,
root
):
...
@@ -27,7 +27,6 @@ class PhyloTree:
...
@@ -27,7 +27,6 @@ class PhyloTree:
def
putAnnotations
(
self
,
nexus_annotations
):
def
putAnnotations
(
self
,
nexus_annotations
):
self
.
nexus_annotations
=
nexus_annotations
self
.
nexus_annotations
=
nexus_annotations
# Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
# Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
for
node
in
self
.
getNodes
():
for
node
in
self
.
getNodes
():
if
node
.
label
in
self
.
nexus_annotations
.
leaf_annotations
:
if
node
.
label
in
self
.
nexus_annotations
.
leaf_annotations
:
...
@@ -60,10 +59,18 @@ class PhyloTree:
...
@@ -60,10 +59,18 @@ class PhyloTree:
node
=
queue
.
pop
()
node
=
queue
.
pop
()
nodes
.
append
(
node
)
nodes
.
append
(
node
)
# if strategy.upper().startswith('DEPTH'):
# if strategy.upper().startswith('DEPTH'):
if
no
de
.
left
:
queue
.
append
(
node
.
left
)
if
no
t
node
.
isLeaf
():
if
node
.
right
:
queue
.
append
(
node
.
right
)
queue
.
extend
(
node
.
children
)
return
nodes
return
nodes
def
getLeaves
(
self
):
all
=
self
.
getNodes
()
leaves
=
[]
for
n
in
all
:
if
n
.
isLeaf
():
leaves
.
append
(
n
)
return
leaves
def
getDescendantsOf
(
self
,
node
,
transitive
=
False
):
def
getDescendantsOf
(
self
,
node
,
transitive
=
False
):
""" Retrieve and return the (list of) descendants (children) of a specified node.
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
Node can be the label or the instance.
...
@@ -86,28 +93,7 @@ class PhyloTree:
...
@@ -86,28 +93,7 @@ class PhyloTree:
if
not
isinstance
(
node
,
PhyloNode
):
if
not
isinstance
(
node
,
PhyloNode
):
node
=
self
.
findLabel
(
node
)
node
=
self
.
findLabel
(
node
)
if
node
:
if
node
:
myroot
=
self
.
root
return
node
.
getAncestors
(
transitive
)
found
=
False
branching
=
[]
while
not
found
and
myroot
!=
None
:
branching
.
append
(
myroot
)
# check if "myroot" is a leaf node, i.e. does not have children
if
myroot
.
left
==
node
or
myroot
.
right
==
node
:
found
=
True
break
if
myroot
.
left
!=
None
:
# myroot has a "left" child
# check if the "left" child of "myroot" is the ancestor of "node"
if
myroot
.
left
.
isAncestorOf
(
node
,
transitive
=
True
):
# if yes,
myroot
=
myroot
.
left
# move to the "left" child
else
:
# if not,
myroot
=
myroot
.
right
# move to the "right" child
else
:
# myroot does NOT have a "left" child, so let's move "right"
myroot
=
myroot
.
right
if
found
and
transitive
:
return
branching
elif
found
and
len
(
branching
)
>
0
:
return
branching
[
len
(
branching
)
-
1
]
return
None
def
parsimony
(
self
):
def
parsimony
(
self
):
""" Solve the "small parsimony problem",
""" Solve the "small parsimony problem",
...
@@ -117,12 +103,8 @@ class PhyloTree:
...
@@ -117,12 +103,8 @@ class PhyloTree:
self
.
root
.
_backwardParsimony
(
self
.
aln
)
# use scores to determine sequences
self
.
root
.
_backwardParsimony
(
self
.
aln
)
# use scores to determine sequences
return
self
.
root
.
getSequence
()
# return the sequence found at the root
return
self
.
root
.
getSequence
()
# return the sequence found at the root
def
canonise
(
self
):
self
.
root
.
_canonise
()
def
swap_annotations
(
self
,
annotation_key
):
def
swap_annotations
(
self
,
annotation_key
):
try
:
try
:
for
node
in
self
.
getNodes
():
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
if
node
.
isLeaf
():
node
.
label
=
self
.
nexus_annotations
.
leaf_annotations
[
node
][
annotation_key
]
node
.
label
=
self
.
nexus_annotations
.
leaf_annotations
[
node
][
annotation_key
]
...
@@ -135,103 +117,91 @@ class PhyloTree:
...
@@ -135,103 +117,91 @@ class PhyloTree:
:param out_path: The path to write the NEXUS file to
:param out_path: The path to write the NEXUS file to
:param nexus_annotations: The NexusAnnotations containing the annotations
:param nexus_annotations: The NexusAnnotations containing the annotations
"""
"""
if
write_annotations
and
not
nexus_annotations
:
if
write_annotations
and
not
nexus_annotations
:
if
not
self
.
nexus_annotations
:
if
not
self
.
nexus_annotations
:
raise
RuntimeError
(
"This tree file has no associated annotation file. Either associate or supply one as a parameter."
)
raise
RuntimeError
(
"This tree file has no associated annotation file. Either associate or supply one as a parameter."
)
nexus_annotations
=
self
.
nexus_annotations
nexus_annotations
=
self
.
nexus_annotations
if
nexus_annotations
:
if
nexus_annotations
:
for
node
in
self
.
getNodes
():
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
node_annotations
:
if
node
in
self
.
nexus_annotations
.
node_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
node_annotations
,
self
.
nexus_annotations
.
annotation_symbols
,
exclude_annotations
,
use_symbols
)
node
.
annotate_node
(
self
.
nexus_annotations
.
node_annotations
,
self
.
nexus_annotations
.
annotation_symbols
,
exclude_annotations
,
use_symbols
)
tree_annotation
=
str
(
self
)
+
";"
tree_annotation
=
str
(
self
)
+
";"
self
.
swap_annotations
(
"Original"
)
self
.
swap_annotations
(
"Original"
)
for
node
in
self
.
getNodes
():
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
leaf_annotations
:
if
node
in
self
.
nexus_annotations
.
leaf_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
leaf_annotations
,
exclude_annotations
)
node
.
annotate_node
(
self
.
nexus_annotations
.
leaf_annotations
,
exclude_annotations
)
leaves
=
[]
leaves
=
[]
for
node
in
self
.
getNodes
():
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
if
node
.
isLeaf
():
leaves
.
append
(
node
.
label
)
leaves
.
append
(
node
.
label
)
leaf_annotation
=
""
leaf_annotation
=
""
for
leaf
in
leaves
:
for
leaf
in
leaves
:
leaf_annotation
+=
"
\n\t
%
s"
%
(
leaf
)
leaf_annotation
+=
"
\n\t
%
s"
%
(
leaf
)
with
open
(
out_path
,
"w+"
)
as
file
:
with
open
(
out_path
,
"w+"
)
as
file
:
file
.
write
(
file
.
write
(
"#NEXUS
\n
begin taxa;
\n\t
dimensions ntax=
%
d;
\n\t
taxlabels
%
s
\n
;
\n
end;
\n\n
begin trees;
\n\t
tree tree_1 = "
"#NEXUS
\n
begin taxa;
\n\t
dimensions ntax=
%
d;
\n\t
taxlabels
%
s
\n
;
\n
end;
\n\n
begin trees;
\n\t
tree tree_1 = "
"[&R]
%
s
\n
end;"
%
(
len
(
leaves
),
leaf_annotation
,
tree_annotation
))
"[&R]
%
s
\n
end;"
%
(
len
(
leaves
),
leaf_annotation
,
tree_annotation
))
class
PhyloNode
:
class
PhyloNode
:
""" A class for a node in a rooted,
binary (bifurcating)
tree.
""" A class for a node in a rooted,
n-ary
tree.
Contains pointers to
descendants/daughters (left and right)
,
Contains pointers to
multiple descendants/daughters
,
optional fields include data, label, sequence and dist.
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
are not intended to be used from outside the class. """
def
__init__
(
self
,
label
=
''
):
def
__init__
(
self
,
parent
=
None
,
label
=
''
):
""" Initialise a
n initially unlinked
node.
""" Initialise a node.
Populate fields left and right to link it with other nodes
.
Set its parent (another PhyloNode), parent can be None
.
Set label to name it.
Set label to name it.
Use field data for any type of information associated with node.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
Other fields are used internally, including sequence for associated alignment,
seqscores, back
left and backright
for maximum parsimony. """
seqscores, back for maximum parsimony. """
self
.
left
=
None
self
.
parent
=
parent
self
.
right
=
None
self
.
children
=
None
self
.
data
=
None
self
.
data
=
None
self
.
label
=
label
self
.
label
=
label
self
.
dist
=
None
self
.
dist
=
None
self
.
sequence
=
None
# The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
self
.
sequence
=
None
# The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
backleft
=
None
# Pointers back to left child: what symbol rendered current/parent symbols
self
.
backptr
=
None
# Pointers back to children: what symbol rendered current/parent symbols
self
.
backright
=
None
# Pointers back to right child: what symbol rendered current/parent symbols
def
isLeaf
(
self
):
def
isLeaf
(
self
):
return
self
.
left
==
self
.
right
==
None
return
self
.
nChildren
()
==
0
def
nChildren
(
self
):
if
self
.
children
==
None
:
return
0
else
:
return
len
(
self
.
children
)
def
__str__
(
self
):
def
__str__
(
self
):
""" Returns string with node (incl descendants) in a Newick style. """
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
if
self
.
left
:
label
=
dist
=
''
left
=
str
(
self
.
left
)
for
i
in
range
(
self
.
nChildren
()):
if
self
.
right
:
stubs
[
i
]
=
str
(
self
.
children
[
i
])
right
=
str
(
self
.
right
)
if
self
.
dist
or
self
.
dist
==
0.0
:
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
label
!=
None
:
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
label
=
str
(
self
.
label
)
if
not
self
.
left
and
not
self
.
right
:
if
self
.
nChildren
()
==
0
:
return
label
+
dist
return
label
+
dist
else
:
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
stubstr
=
'('
else
:
# there is no label
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
# there is no label
'''
if not self.left and self.right:
if not self.left and self.right:
return ',' + right
return ',' + right
elif self.left and not self.right:
elif self.left and not self.right:
return left + ','
return left + ','
elif self.left and self.right:
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
return '(' + left + ',' + right + ')' + dist
'''
# def __le__(self, other):
# def __le__(self, other):
# """ Returns indication of less than other node. """
# """ Returns indication of less than other node. """
...
@@ -247,38 +217,31 @@ class PhyloNode:
...
@@ -247,38 +217,31 @@ class PhyloNode:
def
_printSequences
(
self
,
start
,
end
):
def
_printSequences
(
self
,
start
,
end
):
""" Returns string with node (incl descendants) in a Newick style. """
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
if
self
.
left
:
label
=
dist
=
''
left
=
self
.
left
.
_printSequences
(
start
,
end
)
for
i
in
range
(
self
.
nChildren
()):
if
self
.
right
:
stubs
[
i
]
=
self
.
_printSequences
(
self
.
children
[
i
],
start
,
end
)
right
=
self
.
right
.
_printSequences
(
start
,
end
)
if
self
.
dist
or
self
.
dist
==
0.0
:
if
self
.
dist
:
dist
=
':'
+
str
(
self
.
dist
)
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
sequence
!=
None
:
if
self
.
label
!=
None
:
label
=
""
.
join
(
self
.
sequence
[
start
:
end
])
+
""
label
=
str
(
self
.
label
)
if
not
self
.
left
and
not
self
.
right
:
if
self
.
nChildren
()
==
0
:
return
label
+
dist
return
label
+
dist
else
:
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
stubstr
=
'('
else
:
# there is no label
for
i
in
range
(
len
(
stubs
)
-
1
):
if
not
self
.
left
and
self
.
right
:
stubstr
+=
stubs
[
i
]
+
','
return
','
+
right
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
elif
self
.
left
and
not
self
.
right
:
return
left
+
','
elif
self
.
left
and
self
.
right
:
return
'('
+
left
+
','
+
right
+
')'
+
dist
def
_findLabel
(
self
,
label
):
def
_findLabel
(
self
,
label
):
""" Find a node by label at this node or in any descendants (recursively). """
""" Find a node by label at this node or in any descendants (recursively). """
if
self
.
label
==
label
:
if
self
.
label
==
label
:
return
self
return
self
else
:
else
:
if
self
.
left
:
for
i
in
range
(
self
.
nChildren
()):
foundLeft
=
self
.
left
.
_findLabel
(
label
)
found
=
self
.
children
[
i
]
.
_findLabel
(
label
)
if
foundLeft
:
if
found
:
return
foundLeft
return
found
if
self
.
right
:
return
self
.
right
.
_findLabel
(
label
)
return
None
return
None
def
_propagateDistance
(
self
,
parent_dist
):
def
_propagateDistance
(
self
,
parent_dist
):
...
@@ -286,24 +249,21 @@ class PhyloNode:
...
@@ -286,24 +249,21 @@ class PhyloNode:
The only parameter is the absolute distance to the parent of this node. """
The only parameter is the absolute distance to the parent of this node. """
travelled
=
self
.
dist
# absolute distance to this node
travelled
=
self
.
dist
# absolute distance to this node
self
.
dist
=
parent_dist
-
self
.
dist
# relative distance to this node
self
.
dist
=
parent_dist
-
self
.
dist
# relative distance to this node
if
self
.
left
!=
None
:
# if there is a child node...
for
i
in
range
(
self
.
nChildren
()):
self
.
left
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
self
.
children
[
i
]
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
if
self
.
right
!=
None
:
self
.
right
.
_propagateDistance
(
travelled
)
def
_assignAlignment
(
self
,
aln
):
def
_assignAlignment
(
self
,
aln
):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
available in the alignment. """
self
.
sequence
=
None
self
.
sequence
=
None
if
self
.
left
!=
None
:
for
i
in
range
(
self
.
nChildren
()):
self
.
left
.
_assignAlignment
(
aln
)
self
.
children
[
i
]
.
_assignAlignment
(
aln
)
if
self
.
right
!=
None
:
self
.
right
.
_assignAlignment
(
aln
)
for
seq
in
aln
.
seqs
:
for
seq
in
aln
.
seqs
:
if
seq
.
name
==
self
.
label
:
if
seq
.
name
==
self
.
label
:
self
.
sequence
=
seq
self
.
sequence
=
seq
break
break
""" # Not sure if this is required (putting nodes into a canonical ordering)
def _canonise(self):
def _canonise(self):
if self.left == None and self.right == None: # at leaf
if self.left == None and self.right == None: # at leaf
return self.label
return self.label
...
@@ -315,52 +275,38 @@ class PhyloNode:
...
@@ -315,52 +275,38 @@ class PhyloNode:
self.right = tmpnode
self.right = tmpnode
return myright
return myright
return myleft
return myleft
"""
def
_forwardParsimony
(
self
,
aln
):
def
_forwardParsimony
(
self
,
aln
):
""" Internal function that operates recursively to first initialise each node (forward),
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
then to propagate scores from sequence assigned nodes to root (backward). """
if
self
.
sequence
==
None
:
# no sequence has been assigned
if
self
.
sequence
==
None
:
# no sequence has been assigned
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
scoresleft
=
scoresright
=
None
scores
=
[
None
for
_
in
range
(
self
.
nChildren
())]
if
self
.
left
!=
None
:
for
i
in
range
(
self
.
nChildren
()):
scoresleft
=
self
.
left
.
_forwardParsimony
(
aln
)
scores
[
i
]
=
self
.
children
[
i
]
.
_forwardParsimony
(
aln
)
if
self
.
right
!=
None
:
scoresright
=
self
.
right
.
_forwardParsimony
(
aln
)
# for each position in the alignment,
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
# introduce (initially zero) score for each symbol in alphabet
self
.
seqscores
=
[[
0
for
_
in
aln
.
alphabet
]
for
col
in
range
(
aln
.
alignlen
)]
self
.
seqscores
=
[[
0
for
_
in
aln
.
alphabet
]
for
col
in
range
(
aln
.
alignlen
)]
# for each position in the alignment,
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
# allocate a position to put the each child symbol from which each current node symbol score was determined
self
.
backleft
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
self
.
backptr
=
[[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
for
_
in
range
(
self
.
nChildren
())]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self
.
backright
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
for
col
in
range
(
aln
.
alignlen
):
for
col
in
range
(
aln
.
alignlen
):
# left child will contribute first
for
i
in
range
(
self
.
nChildren
()):
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
# left child will contribute first
best_score_left
=
+
9999999
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_symb_left
=
0
best_score
=
+
9999999
for
a
in
range
(
len
(
aln
.
alphabet
)):
best_symb
=
0
score
=
(
scoresleft
[
col
][
a
]
+
(
for
a
in
range
(
len
(
aln
.
alphabet
)):
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
score
=
(
scores
[
i
][
col
][
a
]
+
(
if
score
<
best_score_left
:
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
best_symb_left
=
a
if
score
<
best_score
:
best_score_left
=
score
best_symb
=
a
self
.
seqscores
[
col
][
a_parent
]
=
best_score_left
best_score
=
score
self
.
backleft
[
col
][
a_parent
]
=
best_symb_left
self
.
seqscores
[
col
][
a_parent
]
+=
best_score
# right child will contribute next
self
.
backptr
[
i
][
col
][
a_parent
]
=
best_symb
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score_right
=
+
9999999
best_symb_right
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scoresright
[
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score_right
:
best_symb_right
=
a
best_score_right
=
score
self
.
seqscores
[
col
][
a_parent
]
+=
best_score_right
self
.
backright
[
col
][
a_parent
]
=
best_symb_right
else
:
else
:
self
.
seqscores
=
[[
0
if
a
==
sym
else
999999
for
a
in
aln
.
alphabet
]
for
sym
in
self
.
seqscores
=
[[
0
if
a
==
sym
else
999999
for
a
in
aln
.
alphabet
]
for
sym
in
self
.
sequence
]
# if we want to weight scores, this would need to change
self
.
sequence
]
# if we want to weight scores, this would need to change
...
@@ -370,39 +316,37 @@ class PhyloNode:
...
@@ -370,39 +316,37 @@ class PhyloNode:
""" Internal function that operates recursively to inspect scores to determine
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
most parsimonious sequence, from root to leaves. """
if
self
.
sequence
==
None
:
# no sequence has been assigned
if
self
.
sequence
==
None
:
# no sequence has been assigned
leftbuf
=
[]
childbuf
=
[[]
for
_
in
range
(
self
.
nChildren
())]
rightbuf
=
[]
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
if
seq
==
None
:
# Only root can do this, no parents to consider, so we pick the lowest scoring symbol
if
seq
==
None
:
# Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf
=
[]
currbuf
=
[]
for
col
in
range
(
aln
.
alignlen
):
for
col
in
range
(
aln
.
alignlen
):
min_score
=
999999
min_score
=
999999
min_symb
=
None
min_symb
=
None
left_symb
=
None
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())]
right_symb
=
None
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
if
self
.
seqscores
[
col
][
a_parent
]
<
min_score
:
if
self
.
seqscores
[
col
][
a_parent
]
<
min_score
:
min_score
=
self
.
seqscores
[
col
][
a_parent
]
min_score
=
self
.
seqscores
[
col
][
a_parent
]
min_symb
=
a_parent
min_symb
=
a_parent
left_symb
=
self
.
backleft
[
col
][
a_parent
]
for
i
in
range
(
self
.
nChildren
()):
right_symb
=
self
.
backright
[
col
][
a_parent
]
child_symb
[
i
]
=
self
.
backptr
[
i
]
[
col
][
a_parent
]
currbuf
.
append
(
aln
.
alphabet
[
min_symb
])
currbuf
.
append
(
aln
.
alphabet
[
min_symb
])
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
for
i
in
range
(
self
.
nChildren
()):
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
childbuf
[
i
]
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
self
.
sequence
=
sequence
.
Sequence
(
currbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
)
self
.
sequence
=
sequence
.
Sequence
(
currbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
)
else
:
# Non-root, but not leaf
else
:
# Non-root, but not leaf
self
.
sequence
=
seq
self
.
sequence
=
seq
col
=
0
col
=
0
for
sym_parent
in
self
.
sequence
:
for
sym_parent
in
self
.
sequence
:
a_parent
=
aln
.
alphabet
.
index
(
sym_parent
)
a_parent
=
aln
.
alphabet
.
index
(
sym_parent
)
left_symb
=
self
.
backleft
[
col
][
a_parent
]
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())
]
right_symb
=
self
.
backright
[
col
][
a_parent
]
for
i
in
range
(
self
.
nChildren
()):
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
child_symb
[
i
]
=
self
.
backptr
[
i
][
col
][
a_parent
]
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
childbuf
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
col
+=
1
col
+=
1
self
.
left
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
leftbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
for
i
in
range
(
self
.
nChildren
()):
self
.
right
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
rightbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
self
.
children
[
i
]
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
childbuf
[
i
]
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
return
self
.
sequence
return
self
.
sequence
def
getSequence
(
self
):
def
getSequence
(
self
):
...
@@ -418,26 +362,35 @@ class PhyloNode:
...
@@ -418,26 +362,35 @@ class PhyloNode:
""" Decide if this node is the ancestor of specified node.
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
If transitive is False, only direct descendants are included. """
if
node
==
self
.
left
or
node
==
self
.
right
:
for
i
in
range
(
self
.
nChildren
()):
return
True
if
node
==
self
.
children
[
i
]:
elif
transitive
:
return
True
if
self
.
left
:
elif
transitive
:
statusLeft
=
self
.
left
.
isAncestorOf
(
node
,
transitive
)
status
=
self
.
children
[
i
]
.
isAncestorOf
(
node
,
transitive
)
if
statusLeft
:
return
True
if
status
:
return
True
if
self
.
right
:
return
self
.
right
.
isAncestorOf
(
node
,
transitive
)
else
:
else
:
return
False
return
False
def
getAncestors
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) parent nodes.
If transitive is False (default), only the direct parent is included.
If transitive is True, all parents (parents of parents etc) are included. """
if
self
.
parent
==
None
:
return
[]
if
not
transitive
:
return
[
self
.
parent
]
else
:
parents
=
self
.
parent
.
getAncestors
(
transitive
)
parents
.
append
(
self
.
parent
)
return
parents
def
getDescendants
(
self
,
transitive
=
False
):
def
getDescendants
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) nodes descendant of this.
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
If transitive is True, all descendants are (recursively) included. """
children
=
[]
children
=
[]
if
self
.
left
:
for
i
in
range
(
self
.
nChildren
()):
children
.
append
(
self
.
left
)
children
.
append
(
self
.
children
[
i
])
if
self
.
right
:
children
.
append
(
self
.
right
)
if
not
transitive
:
if
not
transitive
:
return
children
return
children
else
:
else
:
...
@@ -450,13 +403,11 @@ class PhyloNode:
...
@@ -450,13 +403,11 @@ class PhyloNode:
return
children
return
children
def
annotate_node
(
self
,
annotations
,
annotation_symbols
=
None
,
exclude_annotations
=
[],
use_symbols
=
False
):
def
annotate_node
(
self
,
annotations
,
annotation_symbols
=
None
,
exclude_annotations
=
[],
use_symbols
=
False
):
annotation_string
=
"[&"
annotation_string
=
"[&"
for
key
,
val_list
in
annotations
[
self
]
.
items
():
for
key
,
val_list
in
annotations
[
self
]
.
items
():
if
type
(
val_list
)
!=
list
:
if
type
(
val_list
)
!=
list
:
val_list
=
[
val_list
]
val_list
=
[
val_list
]
if
key
not
in
exclude_annotations
:
if
key
not
in
exclude_annotations
:
# If we are using annotation symbols and the annotation has an associated symbol
# If we are using annotation symbols and the annotation has an associated symbol
for
val
in
val_list
:
for
val
in
val_list
:
if
use_symbols
and
val
in
annotation_symbols
:
if
use_symbols
and
val
in
annotation_symbols
:
...
@@ -464,11 +415,8 @@ class PhyloNode:
...
@@ -464,11 +415,8 @@ class PhyloNode:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
sorted_symbols
]))
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
sorted_symbols
]))
else
:
else
:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
val_list
]))
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
val_list
]))
# Remove the final comma and add in a closing bracket
# Remove the final comma and add in a closing bracket
annotation_string
=
annotation_string
[
0
:
len
(
annotation_string
)
-
1
]
+
"]"
annotation_string
=
annotation_string
[
0
:
len
(
annotation_string
)
-
1
]
+
"]"
if
len
(
annotation_string
)
>
2
:
if
len
(
annotation_string
)
>
2
:
if
":"
in
self
.
label
:
if
":"
in
self
.
label
:
self
.
label
=
self
.
label
.
split
(
":"
)[
0
]
+
annotation_string
+
self
.
label
.
split
(
":"
)[
1
]
self
.
label
=
self
.
label
.
split
(
":"
)[
0
]
+
annotation_string
+
self
.
label
.
split
(
":"
)[
1
]
...
@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
...
@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D
=
{}
D
=
{}
N
=
{}
# The number of sequences in each node
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
nodes
=
[
PhyloNode
(
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
nodes
=
[
PhyloNode
(
label
=
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
""" For each node-pair, assign the distance between them. """
""" For each node-pair, assign the distance between them. """
for
i
in
range
(
len
(
nodes
)):
for
i
in
range
(
len
(
nodes
)):
nodes
[
i
]
.
sequence
=
aln
.
seqs
[
i
]
nodes
[
i
]
.
sequence
=
aln
.
seqs
[
i
]
...
@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
...
@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
x
.
parent
=
z
z
.
right
=
y
y
.
parent
=
z
z
.
children
=
[
x
,
y
]
nodes
.
append
(
z
)
nodes
.
append
(
z
)
if
not
absoluteDistances
:
if
not
absoluteDistances
:
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
...
@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
...
@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
return
PhyloTree
(
z
)
# make it to tree, return
""" ----------------------------------------------------------------------------------------
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
----------------------------------------------------------------------------------------"""
def
_findComma
(
string
,
level
=
0
):
def
_findComma
(
string
,
level
=
0
):
""" Find
first comma
at specified level of embedding """
""" Find
all commas
at specified level of embedding """
mylevel
=
0
mylevel
=
0
commas
=
[]
for
i
in
range
(
len
(
string
)):
for
i
in
range
(
len
(
string
)):
if
string
[
i
]
==
'('
:
if
string
[
i
]
==
'('
:
mylevel
+=
1
mylevel
+=
1
elif
string
[
i
]
==
')'
:
elif
string
[
i
]
==
')'
:
mylevel
-=
1
mylevel
-=
1
elif
string
[
i
]
==
','
and
mylevel
==
level
:
elif
string
[
i
]
==
','
and
mylevel
==
level
:
return
i
commas
.
append
(
i
)
return
-
1
return
commas
def
parseNewickNode
(
string
):
def
parseNewickNode
(
string
):
""" Utility function that recursively parses embedded string using Newick format. """
""" Utility function that recursively parses embedded string using Newick format. """
...
@@ -559,7 +506,7 @@ def parseNewickNode(string):
...
@@ -559,7 +506,7 @@ def parseNewickNode(string):
last
=
string
[::
-
1
]
.
find
(
')'
)
# look from the back
last
=
string
[::
-
1
]
.
find
(
')'
)
# look from the back
if
first
==
-
1
and
last
==
-
1
:
# we are at leaf
if
first
==
-
1
and
last
==
-
1
:
# we are at leaf
y
=
string
.
split
(
':'
)
y
=
string
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
node
=
PhyloNode
(
label
=
y
[
0
])
if
len
(
y
)
>=
2
:
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
node
.
dist
=
float
(
y
[
1
])
return
node
return
node
...
@@ -569,17 +516,24 @@ def parseNewickNode(string):
...
@@ -569,17 +516,24 @@ def parseNewickNode(string):
embed
=
string
[
first
+
1
:
last
]
embed
=
string
[
first
+
1
:
last
]
tail
=
string
[
last
+
1
:]
tail
=
string
[
last
+
1
:]
# find where corresp comma is
# find where corresp comma is
comma
=
_findComma
(
embed
)
comma
s
=
_findComma
(
embed
)
if
comma
==
-
1
:
if
len
(
commas
)
<
1
:
raise
RuntimeError
(
'Invalid format: invalid placement of "," in sub-string "'
+
embed
+
'"'
)
raise
RuntimeError
(
'Invalid format: invalid placement of "," in sub-string "'
+
embed
+
'"'
)
left
=
embed
[
0
:
comma
]
.
strip
()
prev_comma
=
0
right
=
embed
[
comma
+
1
:]
.
strip
()
child_tokens
=
[]
for
comma
in
commas
:
child_tokens
.
append
(
embed
[
prev_comma
:
comma
]
.
strip
())
prev_comma
=
comma
+
1
child_tokens
.
append
(
embed
[
prev_comma
:]
.
strip
())
y
=
tail
.
split
(
':'
)
y
=
tail
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
# node is an instance of the PhyloNode() class
node
=
PhyloNode
(
label
=
y
[
0
])
# node is an instance of the PhyloNode() class
if
len
(
y
)
>=
2
:
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
node
.
dist
=
float
(
y
[
1
])
node
.
left
=
parseNewickNode
(
left
)
node
.
children
=
[]
node
.
right
=
parseNewickNode
(
right
)
for
tok
in
child_tokens
:
child
=
parseNewickNode
(
tok
)
child
.
parent
=
node
node
.
children
.
append
(
child
)
return
node
return
node
else
:
else
:
raise
RuntimeError
(
'Invalid format: unbalanced parentheses in sub-string "'
+
string
+
'"'
)
raise
RuntimeError
(
'Invalid format: unbalanced parentheses in sub-string "'
+
string
+
'"'
)
...
@@ -628,8 +582,6 @@ def parse_nexus(string):
...
@@ -628,8 +582,6 @@ def parse_nexus(string):
taxon_num
=
num
+
1
taxon_num
=
num
+
1
while
not
lines
[
taxon_num
]
.
strip
()
.
startswith
(
";"
):
while
not
lines
[
taxon_num
]
.
strip
()
.
startswith
(
";"
):
taxon_name
=
lines
[
taxon_num
]
.
split
(
"["
)[
0
]
.
strip
()
taxon_name
=
lines
[
taxon_num
]
.
split
(
"["
)[
0
]
.
strip
()
for
annot_line
in
lines
[
taxon_num
]
.
split
(
"[&"
)[
1
]
.
split
(
","
):
for
annot_line
in
lines
[
taxon_num
]
.
split
(
"[&"
)[
1
]
.
split
(
","
):
#TODO: Make these regex calls
#TODO: Make these regex calls
# print ("Annotation Key is ", annot_line.split("=")[0])
# print ("Annotation Key is ", annot_line.split("=")[0])
...
@@ -641,34 +593,18 @@ def parse_nexus(string):
...
@@ -641,34 +593,18 @@ def parse_nexus(string):
annot_val
=
annot_line
.
split
(
"="
)[
1
]
.
split
(
"]"
)[
0
]
annot_val
=
annot_line
.
split
(
"="
)[
1
]
.
split
(
"]"
)[
0
]
annotation_dict
[
taxon_name
][
annot_key
.
strip
()]
=
annot_val
annotation_dict
[
taxon_name
][
annot_key
.
strip
()]
=
annot_val
taxon_num
+=
1
taxon_num
+=
1
if
line
.
strip
()
.
startswith
(
"begin trees"
):
if
line
.
strip
()
.
startswith
(
"begin trees"
):
tree_num
=
num
+
1
tree_num
=
num
+
1
tree
=
(
lines
[
tree_num
]
.
split
(
"[&R]"
)[
1
])
tree
=
(
lines
[
tree_num
]
.
split
(
"[&R]"
)[
1
])
phylo_tree
=
parseNewick
(
tree
)
phylo_tree
=
parseNewick
(
tree
)
nexus_annotations
=
annotations
.
NexusAnnotations
(
tree
=
phylo_tree
)
nexus_annotations
=
annotations
.
NexusAnnotations
(
tree
=
phylo_tree
)
nexus_annotations
.
add_annotations
(
annotation_dict
)
nexus_annotations
.
add_annotations
(
annotation_dict
)
# print (nexus_annotations.annotations)
phylo_tree
.
putAnnotations
(
nexus_annotations
)
phylo_tree
.
putAnnotations
(
nexus_annotations
)
## Extract all of the annotations from the tree and add them to the NexusAnnotations object
## Extract all of the annotations from the tree and add them to the NexusAnnotations object
print
(
"Number of taxons is
%
s "
%
(
taxon_number
))
print
(
"Number of taxons is
%
s "
%
(
taxon_number
))
return
phylo_tree
return
phylo_tree
""" ----------------------------------------------------------------------------------------
""" ----------------------------------------------------------------------------------------
Method for generating a PhyloTree with unique tip names
Method for generating a PhyloTree with unique tip names
----------------------------------------------------------------------------------------"""
----------------------------------------------------------------------------------------"""
...
@@ -676,7 +612,6 @@ def parse_nexus(string):
...
@@ -676,7 +612,6 @@ def parse_nexus(string):
def
get_unique_tree
(
tree
):
def
get_unique_tree
(
tree
):
unique_tree
=
tree
unique_tree
=
tree
unique_labels
=
{}
unique_labels
=
{}
for
node
in
unique_tree
.
getNodes
():
for
node
in
unique_tree
.
getNodes
():
if
node
.
isLeaf
()
and
node
.
label
in
unique_labels
:
if
node
.
isLeaf
()
and
node
.
label
in
unique_labels
:
unique_labels
[
node
.
label
]
=
unique_labels
[
node
.
label
]
+
1
unique_labels
[
node
.
label
]
=
unique_labels
[
node
.
label
]
+
1
...
@@ -688,3 +623,6 @@ def get_unique_tree(tree):
...
@@ -688,3 +623,6 @@ def get_unique_tree(tree):
def
unpack_list
(
list
):
def
unpack_list
(
list
):
return
(
" "
.
join
([
"
%
s"
]
*
len
(
list
))
+
"!"
)
%
(
x
for
x
in
list
)
return
(
" "
.
join
([
"
%
s"
]
*
len
(
list
))
+
"!"
)
%
(
x
for
x
in
list
)
if
__name__
==
'__main__'
:
tree
=
readNewick
(
'/Users/mikael/simhome/ASR/edge1.nwk'
)
print
(
tree
)
\ No newline at end of file
sequence.py
View file @
8fa94535
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if
parse_defline
:
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
seqname
=
parsed
[
0
]
else
:
seqinfo
=
line
[
1
:]
else
:
# we are not parsing the sequence name so no need to duplicate it in the info
seqname
=
seqinfo
[
0
]
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
if
len
(
seqinfo
)
>
0
:
# more than a name
edited_info
=
''
for
infopart
in
seqinfo
[
1
:]:
edited_info
+=
infopart
+
' '
seqinfo
=
edited_info
else
:
seqinfo
=
''
except
IndexError
as
errmsg
:
except
IndexError
as
errmsg
:
if
not
ignore
:
if
not
ignore
:
raise
RuntimeError
(
errmsg
)
raise
RuntimeError
(
errmsg
)
...
@@ -717,60 +724,62 @@ class Alignment():
...
@@ -717,60 +724,62 @@ class Alignment():
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
return
distmat
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
def
writeHTML
(
self
,
filename
=
None
,
col_start
=
None
,
col_end
=
None
):
""" Generate HTML that displays the alignment in color.
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
"""
col_start
=
col_start
or
0
col_end
=
col_end
or
self
.
alignlen
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
+=
'''<p style="font-size:12px">
\n
'''
maxNameLength
=
self
.
getnamelen
()
maxNameLength
=
self
.
getnamelen
()
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
str
(
i
/
10
+
1
)[
0
]
html
+=
str
(
i
/
10
+
1
)[
0
]
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
' '
html
+=
'
%
s
\n
'
%
(
self
.
alignlen
)
# html += '%s\n' % (col_end)
html
+=
'
\n
'
if
self
.
alignlen
>
10
:
if
self
.
alignlen
>
10
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
if
self
.
alignlen
>
100
:
if
self
.
alignlen
>
100
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
:
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
elif
(
i
>=
col_start
and
i
<
col_end
):
else
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
if
self
.
alignlen
>
1000
:
if
self
.
alignlen
>
1000
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
elif
(
i
>=
col_start
and
i
<
col_end
):
else
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
for
seq
in
self
.
seqs
:
for
seq
in
self
.
seqs
:
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
for
sym
in
seq
:
for
sym
in
seq
[
col_start
:
col_end
]
:
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
if
not
color
:
if
not
color
:
color
=
'white'
color
=
'white'
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'
\n
'
html
+=
'
\n
'
html
+=
'</pre></body></html>'
html
+=
'</p
></p
re></body></html>'
if
filename
:
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
html
)
fh
.
write
(
html
)
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
def
__str__
(
self
):
def
__str__
(
self
):
return
self
.
pattern
return
self
.
pattern
def
search
(
self
,
sequence
):
def
search
(
self
,
sequence
,
gappy
=
False
):
""" Find matches to the motif in the specified sequence. Returns a list
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a regexp either matches
the score is always 1.0 because a regexp either matches
or doesn't. """
or doesn't. """
if
not
type
(
sequence
)
is
Sequence
:
if
not
type
(
sequence
)
is
Sequence
:
sequence
=
Sequence
(
sequence
)
sequence
=
Sequence
(
sequence
)
sequenceString
=
sequence
[:]
if
gappy
==
False
or
sequence
.
gappy
==
False
:
results
=
[]
sequenceString
=
sequence
[:]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
=
[]
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
return
results
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
else
:
# if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
degapped
,
idxs
=
sequence
.
getDegapped
()
results
=
[]
for
match
in
self
.
regex
.
finditer
(
''
.
join
(
degapped
)):
results
.
append
((
idxs
[
match
.
start
()],
match
.
group
(),
1.0
))
return
results
class
PWM
(
object
):
class
PWM
(
object
):
...
...
sym.py
View file @
8fa94535
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein'
:
Protein_Alphabet
,
'Protein'
:
Protein_Alphabet
,
'ProteinwX'
:
Protein_wX
,
'ProteinwX'
:
Protein_wX
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwGAP'
:
Protein_wGAP
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
# The preferred order in which a predefined alphabet is assigned to a sequence
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'ProteinwGAP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
# Useful annotations
# Useful annotations
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
'G'
:
'orange'
,
'P'
:
'orange'
,
'S'
:
'orange'
,
'T'
:
'orange'
,
'H'
:
'red'
,
'K'
:
'red'
,
'R'
:
'red'
,
'F'
:
'#66bbff'
,
'Y'
:
'#66bbff'
,
'W'
:
'#66bbff'
,
'I'
:
'green'
,
'L'
:
'green'
,
'M'
:
'green'
,
'V'
:
'green'
})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
#orange*/
'G'
:
"#F5A259"
,
#green*/
'N'
:
"#00f900"
,
'Q'
:
"#00f900"
,
'S'
:
"#00f900"
,
'T'
:
"#00f900"
,
#red*/
'K'
:
"#f62f00"
,
'R'
:
"#f62f00"
,
#blue/purple*/
'A'
:
"#92b2f3"
,
'I'
:
"#92b2f3"
,
'L'
:
"#92b2f3"
,
'M'
:
"#92b2f3"
,
'V'
:
"#92b2f3"
,
'W'
:
"#92b2f3"
,
'F'
:
"#92b2f3"
,
#yellow*/
'P'
:
"#FFFB00"
,
#pink*/
'C'
:
"#F59692"
,
#aqua*/
'H'
:
"#04B2B3"
,
'Y'
:
"#04B2B3"
,
#purple*/
'D'
:
"#CE64CB"
,
'E'
:
"#CE64CB"
})
# ------------------ Substitution Matrix ------------------
# ------------------ Substitution Matrix ------------------
...
...
webservice.py
View file @
8fa94535
import
urllib.request
import
urllib.request
import
urllib.parse
import
os
import
os
from
time
import
sleep
from
time
import
sleep
import
stats
import
stats
...
@@ -16,10 +17,11 @@ import json
...
@@ -16,10 +17,11 @@ import json
http://www.ebi.ac.uk/Tools/webservices/tutorials
http://www.ebi.ac.uk/Tools/webservices/tutorials
"""
"""
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
"""
"""
...
@@ -31,7 +33,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
...
@@ -31,7 +33,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
"""
"""
# Construct URL
# Construct URL
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
# Get the entry
# Get the entry
try
:
try
:
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
except
urllib
.
error
.
HTTPError
as
ex
:
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
"""
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
Retrieve multiple entries matching query from a database currently only via UniProtKB
...
@@ -54,10 +57,14 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -54,10 +57,14 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
"""
"""
if
dbName
.
startswith
(
'uniprot'
):
if
dbName
.
startswith
(
'uniprot'
):
# Construct URL
# Construct URL
if
limit
==
None
:
# no limit to number of results returned
if
limit
==
None
:
# no limit to number of results returned
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
urllib
.
parse
.
quote
(
query
),
columns
)
else
:
else
:
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&limit={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
str
(
limit
),
urllib
.
parse
.
quote
(
query
),
columns
)
# Get the entries
# Get the entries
try
:
try
:
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
dbs
=
dbName
.
split
(
":"
)
dbs
=
dbName
.
split
(
":"
)
if
len
(
dbs
)
>
1
:
if
len
(
dbs
)
>
1
:
dbName
=
dbs
[
1
]
dbName
=
dbs
[
1
]
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db={}&term={}+AND+srcdb_refseq["
\
"prop]&retmax={}"
.
format
(
dbName
,
urllib
.
parse
.
quote
(
query
),
str
(
limit
))
print
(
url
)
# Get the entries
# Get the entries
try
:
try
:
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
words
=
data
.
split
(
"</Id>"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
if
format
==
'list'
:
return
words
return
words
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
...
@@ -93,9 +107,10 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -93,9 +107,10 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
return
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
10090
:
[
'Mus musculus'
,
'MGI_ID'
]}
10090
:
[
'Mus musculus'
,
'MGI_ID'
]}
"""
"""
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
Note that this service can be slow for queries involving a large number of entries.
"""
"""
def
getGOReport
(
positives
,
background
=
None
):
def
getGOReport
(
positives
,
background
=
None
):
""" Generate a complete GO term report for a set of genes (positives).
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...
@@ -135,7 +151,7 @@ def getGOReport(positives, background = None):
...
@@ -135,7 +151,7 @@ def getGOReport(positives, background = None):
for
t
in
term_set
:
for
t
in
term_set
:
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
else
:
# a background is provided
else
:
# a background is provided
for
t
in
term_set
:
for
t
in
term_set
:
fg_hit
=
fg_list
.
count
(
t
)
fg_hit
=
fg_list
.
count
(
t
)
bg_hit
=
bg_list
.
count
(
t
)
bg_hit
=
bg_list
.
count
(
t
)
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for
t
in
sorted_cnt
:
for
t
in
sorted_cnt
:
defin
=
getGODef
(
t
[
0
])
defin
=
getGODef
(
t
[
0
])
if
background
!=
None
:
if
background
!=
None
:
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
else
:
else
:
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
return
ret
return
ret
def
getGODef
(
goterm
):
def
getGODef
(
goterm
):
"""
"""
Retrieve information about a GO term
Retrieve information about a GO term
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
# Get the entry: fill in the fields specified below
# Get the entry: fill in the fields specified below
try
:
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
ret
=
json
.
loads
(
data
)
ret
=
json
.
loads
(
data
)
for
row
in
ret
[
'results'
]:
for
row
in
ret
[
'results'
]:
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
except
urllib
.
error
.
HTTPError
as
ex
:
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
):
def
getGOTerms
(
genes
):
"""
"""
Retrieve all GO terms for a given set of genes (or single gene).
Retrieve all GO terms for a given set of genes (or single gene).
...
@@ -187,9 +205,9 @@ def getGOTerms(genes):
...
@@ -187,9 +205,9 @@ def getGOTerms(genes):
if
type
(
genes
)
!=
list
and
type
(
genes
)
!=
set
and
type
(
genes
)
!=
tuple
:
if
type
(
genes
)
!=
list
and
type
(
genes
)
!=
set
and
type
(
genes
)
!=
tuple
:
genes
=
[
genes
]
genes
=
[
genes
]
map
=
dict
()
map
=
dict
()
batchsize
=
100
# size of query batch
batchsize
=
100
# size of query batch
genecnt
=
0
genecnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
genecnt
<
len
(
genes
):
while
genecnt
<
len
(
genes
):
genebatch
=
[]
genebatch
=
[]
for
index
in
range
(
batchsize
):
for
index
in
range
(
batchsize
):
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
map
return
map
def
getGenes
(
goterms
,
taxo
=
None
):
def
getGenes
(
goterms
,
taxo
=
None
):
"""
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
...
@@ -247,9 +266,9 @@ def getGenes(goterms, taxo=None):
...
@@ -247,9 +266,9 @@ def getGenes(goterms, taxo=None):
if
type
(
goterms
)
!=
list
and
type
(
goterms
)
!=
set
and
type
(
goterms
)
!=
tuple
:
if
type
(
goterms
)
!=
list
and
type
(
goterms
)
!=
set
and
type
(
goterms
)
!=
tuple
:
goterms
=
[
goterms
]
goterms
=
[
goterms
]
map
=
dict
()
map
=
dict
()
batchsize
=
10
# size of query batch
batchsize
=
10
# size of query batch
termcnt
=
0
termcnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
termcnt
<
len
(
goterms
):
while
termcnt
<
len
(
goterms
):
termbatch
=
[]
termbatch
=
[]
for
index
in
range
(
batchsize
):
for
index
in
range
(
batchsize
):
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
else
:
else
:
break
break
termcnt
+=
1
termcnt
+=
1
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
for
i
in
range
(
len
(
termbatch
)):
for
i
in
range
(
len
(
termbatch
)):
term
=
termbatch
[
i
]
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
...
@@ -295,11 +315,11 @@ def getGenes(goterms, taxo=None):
...
@@ -295,11 +315,11 @@ def getGenes(goterms, taxo=None):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
map
return
map
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
class
EBI
(
object
):
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__checkInterval__
=
2
# how long to wait between checking job status
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
def
__init__
(
self
,
service
=
None
):
def
__init__
(
self
,
service
=
None
):
""" Initialise service session.
""" Initialise service session.
...
@@ -349,7 +369,8 @@ class EBI(object):
...
@@ -349,7 +369,8 @@ class EBI(object):
if
self
.
isLocked
():
if
self
.
isLocked
():
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
wait until it is complete before submitting another job. Go to
wait until it is complete before submitting another job. Go to
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
# ncbiblast database parameter needs special handling
# ncbiblast database parameter needs special handling
if
self
.
service
==
'ncbiblast'
:
if
self
.
service
==
'ncbiblast'
:
...
@@ -423,8 +444,8 @@ class EBI(object):
...
@@ -423,8 +444,8 @@ class EBI(object):
else
:
else
:
return
results
return
results
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
"""
"""
:param ids: The list of UniProt IDs
:param ids: The list of UniProt IDs
...
@@ -439,11 +460,11 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -439,11 +460,11 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
*** EXAMPLE USAGE ***
*** EXAMPLE USAGE ***
Get a list of UniProt IDs and a list of UniProt columns you're interested in.
Get a list of UniProt IDs and a list of UniProt columns you're interested in.
Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]
cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]
up_dict = getUniProtDict(uniprot_names, cols)
up_dict = getUniProtDict(uniprot_names, cols)
for record in up_dict:
for record in up_dict:
print (record, up_dict[record].get("lineage(SUPERKINGDOM)"))
print (record, up_dict[record].get("lineage(SUPERKINGDOM)"))
...
@@ -452,22 +473,21 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -452,22 +473,21 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
for record in up_dict:
for record in up_dict:
print (record, up_dict[record].get("genes"))
print (record, up_dict[record].get("genes"))
If a record doesn't have an entry in UniProt for that column it'll just return None
If a record doesn't have an entry in UniProt for that column it'll just return None
print (up_dict['Q1JUQ1'])
print (up_dict['Q1JUQ1'])
print (up_dict['Q1JUQ1']['lineage(KINGDOM)'])
print (up_dict['Q1JUQ1']['lineage(KINGDOM)'])
*** EXAMPLE USAGE FOR UNIREF SEARCHING ***
*** EXAMPLE USAGE FOR UNIREF SEARCHING ***
up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities = 1.0)
up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities = 1.0)
You can either pass a list of identities for each UniProt identifier (in which case the list of identities must be
You can either pass a list of identities for each UniProt identifier (in which case the list of identities must be
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
"""
"""
# Format the lists of IDs and columns correctly
# Format the lists of IDs and columns correctly
cols
=
","
.
join
(
cols
)
cols
=
","
.
join
(
cols
)
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
if
type
(
identities
)
!=
list
:
if
type
(
identities
)
!=
list
:
identities
=
[
identities
]
*
len
(
ids
)
identities
=
[
identities
]
*
len
(
ids
)
elif
len
(
identities
)
!=
len
(
ids
):
elif
len
(
identities
)
!=
len
(
ids
):
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
# Check that the identity thresholds are valid values
# Check that the identity thresholds are valid values
for
x
in
identities
:
for
x
in
identities
:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
# Add the query syntax around the identifiers
# Add the query syntax around the identifiers
updated_ids
=
""
updated_ids
=
""
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
url
=
'https://www.uniprot.org/'
+
db
+
'/'
url
=
'https://www.uniprot.org/'
+
db
+
'/'
params
=
{
params
=
{
'format'
:
'tab'
,
'format'
:
'tab'
,
'query'
:
updated_ids
,
'query'
:
updated_ids
,
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
# For each record we retrieve, split the line by tabs and build up the UniProt dict
# For each record we retrieve, split the line by tabs and build up the UniProt dict
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
if
line
:
if
line
:
splitlines
=
line
.
split
(
"
\t
"
)
splitlines
=
line
.
split
(
"
\t
"
)
id_dict
=
{}
id_dict
=
{}
pos
=
1
pos
=
1
for
col
in
cols
.
split
(
","
):
for
col
in
cols
.
split
(
","
):
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
pos
+=
1
pos
+=
1
up_dict
[
splitlines
[
0
]]
=
id_dict
up_dict
[
splitlines
[
0
]]
=
id_dict
return
up_dict
return
up_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment