Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
8fa94535
Commit
8fa94535
authored
Jun 03, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_regex_search_in_gappy_sequences
parent
9889428a
Pipeline
#46
failed with stages
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
300 additions
and
291 deletions
+300
-291
annotation_test.py
annotation_test.py
+13
-11
annotations.py
annotations.py
+3
-0
gtf.py
gtf.py
+1
-1
phylo.py
phylo.py
+149
-211
sequence.py
sequence.py
+38
-23
sym.py
sym.py
+33
-2
webservice.py
webservice.py
+63
-43
No files found.
annotation_test.py
View file @
8fa94535
import
annotations
import
phylo
tree
=
phylo
.
parseNewick
(
"(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Esc
herichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);"
)
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...
...
@@ -10,17 +12,17 @@ import phylo
working_dir
=
"/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree
=
phylo
.
read_nexus
(
working_dir
+
"annotation_simple.nexus"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
tree
.
swap_annotations
(
"PDB"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
#
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
#
tree.swap_annotations("PDB")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
...
...
annotations.py
View file @
8fa94535
from
collections
import
defaultdict
from
phylo
import
*
import
phylo
import
matplotlib
import
random
...
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
def
generate_colour_list
(
self
,
num
):
return
num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
gtf.py
View file @
8fa94535
...
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f
.
close
()
if
__name__
==
'__main__'
:
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
77
.gtf'
)
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
89
.gtf'
)
print
(
bf
.
chroms
.
keys
())
g
=
bf
.
generate
(
'chr12'
)
print
(
next
(
g
))
...
...
phylo.py
View file @
8fa94535
'''
Module with methods and classes for phylogeny.
Extended to handle n-ary trees (Jan 2019).
@author: mikael
'''
import
sequence
from
collections
import
defaultdict
import
annotations
class
PhyloTree
:
""" Rooted,
binary (bifurcating)
tree for representing phylogenetic relationships.
""" Rooted,
n-ary
tree for representing phylogenetic relationships.
Functionality includes labelling and traversing nodes; reading and writing to Newick format;
association with sequence alignment; maximum parsimony inference of ancestral sequence;
generation of
single, bifurcating
rooted tree by UPGMA.
Known issues:
Binary only;
Parsimony does not handle gaps in alignment.
generation of rooted tree by UPGMA.
Known issues: Parsimony does not handle gaps in alignment.
Programmers should note that almost all functionality is implemented through recursion. """
def
__init__
(
self
,
root
):
...
...
@@ -27,7 +27,6 @@ class PhyloTree:
def
putAnnotations
(
self
,
nexus_annotations
):
self
.
nexus_annotations
=
nexus_annotations
# Update the annotations dictionary so that it contains PhyloNode objects as keys, not text labels
for
node
in
self
.
getNodes
():
if
node
.
label
in
self
.
nexus_annotations
.
leaf_annotations
:
...
...
@@ -60,10 +59,18 @@ class PhyloTree:
node
=
queue
.
pop
()
nodes
.
append
(
node
)
# if strategy.upper().startswith('DEPTH'):
if
no
de
.
left
:
queue
.
append
(
node
.
left
)
if
node
.
right
:
queue
.
append
(
node
.
right
)
if
no
t
node
.
isLeaf
():
queue
.
extend
(
node
.
children
)
return
nodes
def
getLeaves
(
self
):
all
=
self
.
getNodes
()
leaves
=
[]
for
n
in
all
:
if
n
.
isLeaf
():
leaves
.
append
(
n
)
return
leaves
def
getDescendantsOf
(
self
,
node
,
transitive
=
False
):
""" Retrieve and return the (list of) descendants (children) of a specified node.
Node can be the label or the instance.
...
...
@@ -86,28 +93,7 @@ class PhyloTree:
if
not
isinstance
(
node
,
PhyloNode
):
node
=
self
.
findLabel
(
node
)
if
node
:
myroot
=
self
.
root
found
=
False
branching
=
[]
while
not
found
and
myroot
!=
None
:
branching
.
append
(
myroot
)
# check if "myroot" is a leaf node, i.e. does not have children
if
myroot
.
left
==
node
or
myroot
.
right
==
node
:
found
=
True
break
if
myroot
.
left
!=
None
:
# myroot has a "left" child
# check if the "left" child of "myroot" is the ancestor of "node"
if
myroot
.
left
.
isAncestorOf
(
node
,
transitive
=
True
):
# if yes,
myroot
=
myroot
.
left
# move to the "left" child
else
:
# if not,
myroot
=
myroot
.
right
# move to the "right" child
else
:
# myroot does NOT have a "left" child, so let's move "right"
myroot
=
myroot
.
right
if
found
and
transitive
:
return
branching
elif
found
and
len
(
branching
)
>
0
:
return
branching
[
len
(
branching
)
-
1
]
return
None
return
node
.
getAncestors
(
transitive
)
def
parsimony
(
self
):
""" Solve the "small parsimony problem",
...
...
@@ -117,12 +103,8 @@ class PhyloTree:
self
.
root
.
_backwardParsimony
(
self
.
aln
)
# use scores to determine sequences
return
self
.
root
.
getSequence
()
# return the sequence found at the root
def
canonise
(
self
):
self
.
root
.
_canonise
()
def
swap_annotations
(
self
,
annotation_key
):
try
:
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
node
.
label
=
self
.
nexus_annotations
.
leaf_annotations
[
node
][
annotation_key
]
...
...
@@ -135,103 +117,91 @@ class PhyloTree:
:param out_path: The path to write the NEXUS file to
:param nexus_annotations: The NexusAnnotations containing the annotations
"""
if
write_annotations
and
not
nexus_annotations
:
if
not
self
.
nexus_annotations
:
raise
RuntimeError
(
"This tree file has no associated annotation file. Either associate or supply one as a parameter."
)
nexus_annotations
=
self
.
nexus_annotations
if
nexus_annotations
:
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
node_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
node_annotations
,
self
.
nexus_annotations
.
annotation_symbols
,
exclude_annotations
,
use_symbols
)
tree_annotation
=
str
(
self
)
+
";"
self
.
swap_annotations
(
"Original"
)
for
node
in
self
.
getNodes
():
if
node
in
self
.
nexus_annotations
.
leaf_annotations
:
node
.
annotate_node
(
self
.
nexus_annotations
.
leaf_annotations
,
exclude_annotations
)
leaves
=
[]
for
node
in
self
.
getNodes
():
if
node
.
isLeaf
():
leaves
.
append
(
node
.
label
)
leaf_annotation
=
""
for
leaf
in
leaves
:
leaf_annotation
+=
"
\n\t
%
s"
%
(
leaf
)
with
open
(
out_path
,
"w+"
)
as
file
:
file
.
write
(
"#NEXUS
\n
begin taxa;
\n\t
dimensions ntax=
%
d;
\n\t
taxlabels
%
s
\n
;
\n
end;
\n\n
begin trees;
\n\t
tree tree_1 = "
"[&R]
%
s
\n
end;"
%
(
len
(
leaves
),
leaf_annotation
,
tree_annotation
))
class
PhyloNode
:
""" A class for a node in a rooted,
binary (bifurcating)
tree.
Contains pointers to
descendants/daughters (left and right)
,
""" A class for a node in a rooted,
n-ary
tree.
Contains pointers to
multiple descendants/daughters
,
optional fields include data, label, sequence and dist.
If parsimony is used scores and traceback pointers are available.
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
def
__init__
(
self
,
label
=
''
):
""" Initialise a
n initially unlinked
node.
Populate fields left and right to link it with other nodes
.
def
__init__
(
self
,
parent
=
None
,
label
=
''
):
""" Initialise a node.
Set its parent (another PhyloNode), parent can be None
.
Set label to name it.
Use field data for any type of information associated with node.
Use dist to indicate the distance to its parent (if any).
Other fields are used internally, including sequence for associated alignment,
seqscores, back
left and backright
for maximum parsimony. """
self
.
left
=
None
self
.
right
=
None
seqscores, back for maximum parsimony. """
self
.
parent
=
parent
self
.
children
=
None
self
.
data
=
None
self
.
label
=
label
self
.
dist
=
None
self
.
sequence
=
None
# The sequence after an alignment have been mapped (leaf) or the most parsimonous sequence (ancestral)
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
backleft
=
None
# Pointers back to left child: what symbol rendered current/parent symbols
self
.
backright
=
None
# Pointers back to right child: what symbol rendered current/parent symbols
self
.
seqscores
=
None
# The scores propagated from leaves via children
self
.
backptr
=
None
# Pointers back to children: what symbol rendered current/parent symbols
def
isLeaf
(
self
):
return
self
.
left
==
self
.
right
==
None
return
self
.
nChildren
()
==
0
def
nChildren
(
self
):
if
self
.
children
==
None
:
return
0
else
:
return
len
(
self
.
children
)
def
__str__
(
self
):
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
if
self
.
left
:
left
=
str
(
self
.
left
)
if
self
.
right
:
right
=
str
(
self
.
right
)
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
label
=
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
str
(
self
.
children
[
i
])
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
if
not
self
.
left
and
not
self
.
right
:
return
label
+
dist
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
else
:
# there is no label
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
# there is no label
'''
if not self.left and self.right:
return ',' + right
elif self.left and not self.right:
return left + ','
elif self.left and self.right:
return '(' + left + ',' + right + ')' + dist
'''
# def __le__(self, other):
# """ Returns indication of less than other node. """
...
...
@@ -247,38 +217,31 @@ class PhyloNode:
def
_printSequences
(
self
,
start
,
end
):
""" Returns string with node (incl descendants) in a Newick style. """
left
=
right
=
label
=
dist
=
''
if
self
.
left
:
left
=
self
.
left
.
_printSequences
(
start
,
end
)
if
self
.
right
:
right
=
self
.
right
.
_printSequences
(
start
,
end
)
if
self
.
dist
:
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
label
=
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
self
.
_printSequences
(
self
.
children
[
i
],
start
,
end
)
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
sequence
!=
None
:
label
=
""
.
join
(
self
.
sequence
[
start
:
end
])
+
""
if
not
self
.
left
and
not
self
.
right
:
return
label
+
dist
else
:
return
'('
+
left
+
','
+
right
+
')'
+
label
+
dist
else
:
# there is no label
if
not
self
.
left
and
self
.
right
:
return
','
+
right
elif
self
.
left
and
not
self
.
right
:
return
left
+
','
elif
self
.
left
and
self
.
right
:
return
'('
+
left
+
','
+
right
+
')'
+
dist
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
def
_findLabel
(
self
,
label
):
""" Find a node by label at this node or in any descendants (recursively). """
if
self
.
label
==
label
:
return
self
else
:
if
self
.
left
:
foundLeft
=
self
.
left
.
_findLabel
(
label
)
if
foundLeft
:
return
foundLeft
if
self
.
right
:
return
self
.
right
.
_findLabel
(
label
)
for
i
in
range
(
self
.
nChildren
()):
found
=
self
.
children
[
i
]
.
_findLabel
(
label
)
if
found
:
return
found
return
None
def
_propagateDistance
(
self
,
parent_dist
):
...
...
@@ -286,24 +249,21 @@ class PhyloNode:
The only parameter is the absolute distance to the parent of this node. """
travelled
=
self
.
dist
# absolute distance to this node
self
.
dist
=
parent_dist
-
self
.
dist
# relative distance to this node
if
self
.
left
!=
None
:
# if there is a child node...
self
.
left
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
if
self
.
right
!=
None
:
self
.
right
.
_propagateDistance
(
travelled
)
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_propagateDistance
(
travelled
)
# pass absolute distance to this node
def
_assignAlignment
(
self
,
aln
):
""" Assign an alignment to the node, which implies assigning a sequence to it if one is
available in the alignment. """
self
.
sequence
=
None
if
self
.
left
!=
None
:
self
.
left
.
_assignAlignment
(
aln
)
if
self
.
right
!=
None
:
self
.
right
.
_assignAlignment
(
aln
)
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_assignAlignment
(
aln
)
for
seq
in
aln
.
seqs
:
if
seq
.
name
==
self
.
label
:
self
.
sequence
=
seq
break
""" # Not sure if this is required (putting nodes into a canonical ordering)
def _canonise(self):
if self.left == None and self.right == None: # at leaf
return self.label
...
...
@@ -315,52 +275,38 @@ class PhyloNode:
self.right = tmpnode
return myright
return myleft
"""
def
_forwardParsimony
(
self
,
aln
):
""" Internal function that operates recursively to first initialise each node (forward),
stopping only once a sequence has been assigned to the node,
then to propagate scores from sequence assigned nodes to root (backward). """
if
self
.
sequence
==
None
:
# no sequence has been assigned
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
scoresleft
=
scoresright
=
None
if
self
.
left
!=
None
:
scoresleft
=
self
.
left
.
_forwardParsimony
(
aln
)
if
self
.
right
!=
None
:
scoresright
=
self
.
right
.
_forwardParsimony
(
aln
)
scores
=
[
None
for
_
in
range
(
self
.
nChildren
())]
for
i
in
range
(
self
.
nChildren
()):
scores
[
i
]
=
self
.
children
[
i
]
.
_forwardParsimony
(
aln
)
# for each position in the alignment,
# introduce (initially zero) score for each symbol in alphabet
self
.
seqscores
=
[[
0
for
_
in
aln
.
alphabet
]
for
col
in
range
(
aln
.
alignlen
)]
# for each position in the alignment,
# allocate a position to put the left child symbol from which each current node symbol score was determined
self
.
backleft
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
# allocate a position to put the right child symbol from which each current node symbol score was determined
self
.
backright
=
[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
# allocate a position to put the each child symbol from which each current node symbol score was determined
self
.
backptr
=
[[[
None
for
_
in
aln
.
alphabet
]
for
_
in
range
(
aln
.
alignlen
)]
for
_
in
range
(
self
.
nChildren
())]
for
col
in
range
(
aln
.
alignlen
):
# left child will contribute first
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score_left
=
+
9999999
best_symb_left
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scoresleft
[
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score_left
:
best_symb_left
=
a
best_score_left
=
score
self
.
seqscores
[
col
][
a_parent
]
=
best_score_left
self
.
backleft
[
col
][
a_parent
]
=
best_symb_left
# right child will contribute next
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score_right
=
+
9999999
best_symb_right
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scoresright
[
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score_right
:
best_symb_right
=
a
best_score_right
=
score
self
.
seqscores
[
col
][
a_parent
]
+=
best_score_right
self
.
backright
[
col
][
a_parent
]
=
best_symb_right
for
i
in
range
(
self
.
nChildren
()):
# left child will contribute first
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
best_score
=
+
9999999
best_symb
=
0
for
a
in
range
(
len
(
aln
.
alphabet
)):
score
=
(
scores
[
i
][
col
][
a
]
+
(
1
if
a
!=
a_parent
else
0
))
# if we want to weight scores, this would need to change
if
score
<
best_score
:
best_symb
=
a
best_score
=
score
self
.
seqscores
[
col
][
a_parent
]
+=
best_score
self
.
backptr
[
i
][
col
][
a_parent
]
=
best_symb
else
:
self
.
seqscores
=
[[
0
if
a
==
sym
else
999999
for
a
in
aln
.
alphabet
]
for
sym
in
self
.
sequence
]
# if we want to weight scores, this would need to change
...
...
@@ -370,39 +316,37 @@ class PhyloNode:
""" Internal function that operates recursively to inspect scores to determine
most parsimonious sequence, from root to leaves. """
if
self
.
sequence
==
None
:
# no sequence has been assigned
leftbuf
=
[]
rightbuf
=
[]
if
self
.
left
==
None
and
self
.
right
==
None
:
# no children, so terminal, cannot propagate scores
childbuf
=
[[]
for
_
in
range
(
self
.
nChildren
())]
if
self
.
nChildren
()
==
0
:
# no children, so terminal, cannot propagate scores
raise
RuntimeError
(
"No sequence assigned to leaf node:"
,
self
.
label
)
if
seq
==
None
:
# Only root can do this, no parents to consider, so we pick the lowest scoring symbol
currbuf
=
[]
for
col
in
range
(
aln
.
alignlen
):
min_score
=
999999
min_symb
=
None
left_symb
=
None
right_symb
=
None
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())]
for
a_parent
in
range
(
len
(
aln
.
alphabet
)):
if
self
.
seqscores
[
col
][
a_parent
]
<
min_score
:
min_score
=
self
.
seqscores
[
col
][
a_parent
]
min_symb
=
a_parent
left_symb
=
self
.
backleft
[
col
][
a_parent
]
right_symb
=
self
.
backright
[
col
][
a_parent
]
for
i
in
range
(
self
.
nChildren
()):
child_symb
[
i
]
=
self
.
backptr
[
i
]
[
col
][
a_parent
]
currbuf
.
append
(
aln
.
alphabet
[
min_symb
])
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
for
i
in
range
(
self
.
nChildren
()):
childbuf
[
i
]
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
self
.
sequence
=
sequence
.
Sequence
(
currbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
)
else
:
# Non-root, but not leaf
self
.
sequence
=
seq
col
=
0
for
sym_parent
in
self
.
sequence
:
a_parent
=
aln
.
alphabet
.
index
(
sym_parent
)
left_symb
=
self
.
backleft
[
col
][
a_parent
]
right_symb
=
self
.
backright
[
col
][
a_parent
]
leftbuf
.
append
(
aln
.
alphabet
[
left_symb
])
rightbuf
.
append
(
aln
.
alphabet
[
right_symb
])
child_symb
=
[
None
for
_
in
range
(
self
.
nChildren
())
]
for
i
in
range
(
self
.
nChildren
()):
child_symb
[
i
]
=
self
.
backptr
[
i
][
col
][
a_parent
]
childbuf
.
append
(
aln
.
alphabet
[
child_symb
[
i
]
])
col
+=
1
self
.
left
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
leftbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
self
.
right
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
rightbuf
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
childbuf
[
i
]
,
aln
.
alphabet
,
self
.
label
,
gappy
=
True
))
return
self
.
sequence
def
getSequence
(
self
):
...
...
@@ -418,26 +362,35 @@ class PhyloNode:
""" Decide if this node is the ancestor of specified node.
If transitive is True (default), all descendants are included.
If transitive is False, only direct descendants are included. """
if
node
==
self
.
left
or
node
==
self
.
right
:
return
True
elif
transitive
:
if
self
.
left
:
statusLeft
=
self
.
left
.
isAncestorOf
(
node
,
transitive
)
if
statusLeft
:
return
True
if
self
.
right
:
return
self
.
right
.
isAncestorOf
(
node
,
transitive
)
for
i
in
range
(
self
.
nChildren
()):
if
node
==
self
.
children
[
i
]:
return
True
elif
transitive
:
status
=
self
.
children
[
i
]
.
isAncestorOf
(
node
,
transitive
)
if
status
:
return
True
else
:
return
False
def
getAncestors
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) parent nodes.
If transitive is False (default), only the direct parent is included.
If transitive is True, all parents (parents of parents etc) are included. """
if
self
.
parent
==
None
:
return
[]
if
not
transitive
:
return
[
self
.
parent
]
else
:
parents
=
self
.
parent
.
getAncestors
(
transitive
)
parents
.
append
(
self
.
parent
)
return
parents
def
getDescendants
(
self
,
transitive
=
False
):
""" Retrieve and return (list of) nodes descendant of this.
If transitive is False (default), only direct descendants are included.
If transitive is True, all descendants are (recursively) included. """
children
=
[]
if
self
.
left
:
children
.
append
(
self
.
left
)
if
self
.
right
:
children
.
append
(
self
.
right
)
for
i
in
range
(
self
.
nChildren
()):
children
.
append
(
self
.
children
[
i
])
if
not
transitive
:
return
children
else
:
...
...
@@ -450,13 +403,11 @@ class PhyloNode:
return
children
def
annotate_node
(
self
,
annotations
,
annotation_symbols
=
None
,
exclude_annotations
=
[],
use_symbols
=
False
):
annotation_string
=
"[&"
for
key
,
val_list
in
annotations
[
self
]
.
items
():
if
type
(
val_list
)
!=
list
:
val_list
=
[
val_list
]
if
key
not
in
exclude_annotations
:
# If we are using annotation symbols and the annotation has an associated symbol
for
val
in
val_list
:
if
use_symbols
and
val
in
annotation_symbols
:
...
...
@@ -464,11 +415,8 @@ class PhyloNode:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
sorted_symbols
]))
else
:
annotation_string
+=
'
%
s="
%
s",'
%
(
key
,
' '
.
join
([
'
%
s'
%
(
val
,)
for
val
in
val_list
]))
# Remove the final comma and add in a closing bracket
annotation_string
=
annotation_string
[
0
:
len
(
annotation_string
)
-
1
]
+
"]"
if
len
(
annotation_string
)
>
2
:
if
":"
in
self
.
label
:
self
.
label
=
self
.
label
.
split
(
":"
)[
0
]
+
annotation_string
+
self
.
label
.
split
(
":"
)[
1
]
...
...
@@ -488,7 +436,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D
=
{}
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
nodes
=
[
PhyloNode
(
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
nodes
=
[
PhyloNode
(
label
=
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for
i
in
range
(
len
(
nodes
)):
nodes
[
i
]
.
sequence
=
aln
.
seqs
[
i
]
...
...
@@ -525,8 +473,9 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
z
.
left
=
x
# link the phylogenetic tree
z
.
right
=
y
x
.
parent
=
z
y
.
parent
=
z
z
.
children
=
[
x
,
y
]
nodes
.
append
(
z
)
if
not
absoluteDistances
:
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
...
...
@@ -534,24 +483,22 @@ def runUPGMA(aln, measure, absoluteDistances=False):
z
.
dist
=
0.0
# root z is at distance 0 from merged x and y
return
PhyloTree
(
z
)
# make it to tree, return
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def
_findComma
(
string
,
level
=
0
):
""" Find
first comma
at specified level of embedding """
""" Find
all commas
at specified level of embedding """
mylevel
=
0
commas
=
[]
for
i
in
range
(
len
(
string
)):
if
string
[
i
]
==
'('
:
mylevel
+=
1
elif
string
[
i
]
==
')'
:
mylevel
-=
1
elif
string
[
i
]
==
','
and
mylevel
==
level
:
return
i
return
-
1
commas
.
append
(
i
)
return
commas
def
parseNewickNode
(
string
):
""" Utility function that recursively parses embedded string using Newick format. """
...
...
@@ -559,7 +506,7 @@ def parseNewickNode(string):
last
=
string
[::
-
1
]
.
find
(
')'
)
# look from the back
if
first
==
-
1
and
last
==
-
1
:
# we are at leaf
y
=
string
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
node
=
PhyloNode
(
label
=
y
[
0
])
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
return
node
...
...
@@ -569,17 +516,24 @@ def parseNewickNode(string):
embed
=
string
[
first
+
1
:
last
]
tail
=
string
[
last
+
1
:]
# find where corresp comma is
comma
=
_findComma
(
embed
)
if
comma
==
-
1
:
comma
s
=
_findComma
(
embed
)
if
len
(
commas
)
<
1
:
raise
RuntimeError
(
'Invalid format: invalid placement of "," in sub-string "'
+
embed
+
'"'
)
left
=
embed
[
0
:
comma
]
.
strip
()
right
=
embed
[
comma
+
1
:]
.
strip
()
prev_comma
=
0
child_tokens
=
[]
for
comma
in
commas
:
child_tokens
.
append
(
embed
[
prev_comma
:
comma
]
.
strip
())
prev_comma
=
comma
+
1
child_tokens
.
append
(
embed
[
prev_comma
:]
.
strip
())
y
=
tail
.
split
(
':'
)
node
=
PhyloNode
(
y
[
0
])
# node is an instance of the PhyloNode() class
node
=
PhyloNode
(
label
=
y
[
0
])
# node is an instance of the PhyloNode() class
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
node
.
left
=
parseNewickNode
(
left
)
node
.
right
=
parseNewickNode
(
right
)
node
.
children
=
[]
for
tok
in
child_tokens
:
child
=
parseNewickNode
(
tok
)
child
.
parent
=
node
node
.
children
.
append
(
child
)
return
node
else
:
raise
RuntimeError
(
'Invalid format: unbalanced parentheses in sub-string "'
+
string
+
'"'
)
...
...
@@ -628,8 +582,6 @@ def parse_nexus(string):
taxon_num
=
num
+
1
while
not
lines
[
taxon_num
]
.
strip
()
.
startswith
(
";"
):
taxon_name
=
lines
[
taxon_num
]
.
split
(
"["
)[
0
]
.
strip
()
for
annot_line
in
lines
[
taxon_num
]
.
split
(
"[&"
)[
1
]
.
split
(
","
):
#TODO: Make these regex calls
# print ("Annotation Key is ", annot_line.split("=")[0])
...
...
@@ -641,34 +593,18 @@ def parse_nexus(string):
annot_val
=
annot_line
.
split
(
"="
)[
1
]
.
split
(
"]"
)[
0
]
annotation_dict
[
taxon_name
][
annot_key
.
strip
()]
=
annot_val
taxon_num
+=
1
if
line
.
strip
()
.
startswith
(
"begin trees"
):
tree_num
=
num
+
1
tree
=
(
lines
[
tree_num
]
.
split
(
"[&R]"
)[
1
])
phylo_tree
=
parseNewick
(
tree
)
nexus_annotations
=
annotations
.
NexusAnnotations
(
tree
=
phylo_tree
)
nexus_annotations
.
add_annotations
(
annotation_dict
)
# print (nexus_annotations.annotations)
phylo_tree
.
putAnnotations
(
nexus_annotations
)
## Extract all of the annotations from the tree and add them to the NexusAnnotations object
print
(
"Number of taxons is
%
s "
%
(
taxon_number
))
return
phylo_tree
""" ----------------------------------------------------------------------------------------
Method for generating a PhyloTree with unique tip names
----------------------------------------------------------------------------------------"""
...
...
@@ -676,7 +612,6 @@ def parse_nexus(string):
def
get_unique_tree
(
tree
):
unique_tree
=
tree
unique_labels
=
{}
for
node
in
unique_tree
.
getNodes
():
if
node
.
isLeaf
()
and
node
.
label
in
unique_labels
:
unique_labels
[
node
.
label
]
=
unique_labels
[
node
.
label
]
+
1
...
...
@@ -688,3 +623,6 @@ def get_unique_tree(tree):
def
unpack_list
(
list
):
return
(
" "
.
join
([
"
%
s"
]
*
len
(
list
))
+
"!"
)
%
(
x
for
x
in
list
)
if
__name__
==
'__main__'
:
tree
=
readNewick
(
'/Users/mikael/simhome/ASR/edge1.nwk'
)
print
(
tree
)
\ No newline at end of file
sequence.py
View file @
8fa94535
...
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
else
:
seqinfo
=
line
[
1
:]
else
:
# we are not parsing the sequence name so no need to duplicate it in the info
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
if
len
(
seqinfo
)
>
0
:
# more than a name
edited_info
=
''
for
infopart
in
seqinfo
[
1
:]:
edited_info
+=
infopart
+
' '
seqinfo
=
edited_info
else
:
seqinfo
=
''
except
IndexError
as
errmsg
:
if
not
ignore
:
raise
RuntimeError
(
errmsg
)
...
...
@@ -717,60 +724,62 @@ class Alignment():
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
def
writeHTML
(
self
,
filename
=
None
,
col_start
=
None
,
col_end
=
None
):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
col_start
=
col_start
or
0
col_end
=
col_end
or
self
.
alignlen
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
+=
'''<p style="font-size:12px">
\n
'''
maxNameLength
=
self
.
getnamelen
()
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
str
(
i
/
10
+
1
)[
0
]
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
%
s
\n
'
%
(
self
.
alignlen
)
# html += '%s\n' % (col_end)
html
+=
'
\n
'
if
self
.
alignlen
>
10
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
100
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
:
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
1000
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
for
seq
in
self
.
seqs
:
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
for
sym
in
seq
:
for
sym
in
seq
[
col_start
:
col_end
]
:
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
if
not
color
:
color
=
'white'
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'
\n
'
html
+=
'</pre></body></html>'
html
+=
'</p
></p
re></body></html>'
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
html
)
...
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
    def __str__(self):
        """ Return the textual regular-expression pattern of this motif. """
        return self.pattern
def
search
(
self
,
sequence
):
def
search
(
self
,
sequence
,
gappy
=
False
):
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a regexp either matches
or doesn't. """
if
not
type
(
sequence
)
is
Sequence
:
sequence
=
Sequence
(
sequence
)
sequenceString
=
sequence
[:]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
if
gappy
==
False
or
sequence
.
gappy
==
False
:
sequenceString
=
sequence
[:]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
else
:
# if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
degapped
,
idxs
=
sequence
.
getDegapped
()
results
=
[]
for
match
in
self
.
regex
.
finditer
(
''
.
join
(
degapped
)):
results
.
append
((
idxs
[
match
.
start
()],
match
.
group
(),
1.0
))
return
results
class
PWM
(
object
):
...
...
sym.py
View file @
8fa94535
...
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein'
:
Protein_Alphabet
,
'ProteinwX'
:
Protein_wX
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwGAP'
:
Protein_wGAP
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
# The preferred order in which a predefined alphabet is assigned to a
# sequence (e.g., we'd want to assign DNA to 'AGCT', even though Protein
# is also valid). 'ProteinwGAP' is tried before the DSSP alphabets so
# gapped protein sequences are recognised as protein.
preferredOrder = ['Bool_Alphabet', 'DNA', 'RNA', 'DNAwN', 'RNAwN',
                  'Protein', 'ProteinwX', 'ProteinwSTOP', 'ProteinwGAP',
                  'DSSP_Alphabet', 'DSSP3_Alphabet']
# Useful annotations
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
'G'
:
'orange'
,
'P'
:
'orange'
,
'S'
:
'orange'
,
'T'
:
'orange'
,
'H'
:
'red'
,
'K'
:
'red'
,
'R'
:
'red'
,
'F'
:
'#66bbff'
,
'Y'
:
'#66bbff'
,
'W'
:
'#66bbff'
,
'I'
:
'green'
,
'L'
:
'green'
,
'M'
:
'green'
,
'V'
:
'green'
})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
#orange*/
'G'
:
"#F5A259"
,
#green*/
'N'
:
"#00f900"
,
'Q'
:
"#00f900"
,
'S'
:
"#00f900"
,
'T'
:
"#00f900"
,
#red*/
'K'
:
"#f62f00"
,
'R'
:
"#f62f00"
,
#blue/purple*/
'A'
:
"#92b2f3"
,
'I'
:
"#92b2f3"
,
'L'
:
"#92b2f3"
,
'M'
:
"#92b2f3"
,
'V'
:
"#92b2f3"
,
'W'
:
"#92b2f3"
,
'F'
:
"#92b2f3"
,
#yellow*/
'P'
:
"#FFFB00"
,
#pink*/
'C'
:
"#F59692"
,
#aqua*/
'H'
:
"#04B2B3"
,
'Y'
:
"#04B2B3"
,
#purple*/
'D'
:
"#CE64CB"
,
'E'
:
"#CE64CB"
})
# ------------------ Substitution Matrix ------------------
...
...
webservice.py
View file @
8fa94535
import
urllib.request
import
urllib.parse
import
os
from
time
import
sleep
import
stats
...
...
@@ -16,10 +17,11 @@ import json
http://www.ebi.ac.uk/Tools/webservices/tutorials
"""
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
__ebiUrl__
=
'http://www.ebi.ac.uk/Tools/'
__ebiGOUrl__
=
'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
"""
...
...
@@ -31,7 +33,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb&id=P63166&format=fasta&style=raw&Retrieve=Retrieve
"""
# Construct URL
# Construct URL
url
=
__ebiUrl__
+
'dbfetch/dbfetch?style=raw&Retrieve=Retrieve&db='
+
dbName
+
'&format='
+
format
+
'&id='
+
entryId
# Get the entry
try
:
...
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
...
...
@@ -54,10 +57,14 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
"""
if
dbName
.
startswith
(
'uniprot'
):
# Construct URL
if
limit
==
None
:
# no limit to number of results returned
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&query='
+
query
+
'&columns='
+
columns
if
limit
==
None
:
# no limit to number of results returned
url
=
"{}{}/?format={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
urllib
.
parse
.
quote
(
query
),
columns
)
else
:
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&limit={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
str
(
limit
),
urllib
.
parse
.
quote
(
query
),
columns
)
# Get the entries
try
:
...
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
dbs
=
dbName
.
split
(
":"
)
if
len
(
dbs
)
>
1
:
dbName
=
dbs
[
1
]
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db={}&term={}+AND+srcdb_refseq["
\
"prop]&retmax={}"
.
format
(
dbName
,
urllib
.
parse
.
quote
(
query
),
str
(
limit
))
print
(
url
)
# Get the entries
try
:
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
return
words
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
...
...
@@ -93,9 +107,10 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
raise
RuntimeError
(
ex
.
read
())
return
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
10090
:
[
'Mus musculus'
,
'MGI_ID'
]}
"""
...
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def
getGOReport
(
positives
,
background
=
None
):
def
getGOReport
(
positives
,
background
=
None
):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...
...
@@ -135,7 +151,7 @@ def getGOReport(positives, background = None):
for
t
in
term_set
:
term_cnt
[
t
]
=
fg_list
.
count
(
t
)
sorted_cnt
=
sorted
(
list
(
term_cnt
.
items
()),
key
=
lambda
v
:
v
[
1
],
reverse
=
True
)
else
:
# a background is provided
else
:
# a background is provided
for
t
in
term_set
:
fg_hit
=
fg_list
.
count
(
t
)
bg_hit
=
bg_list
.
count
(
t
)
...
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for
t
in
sorted_cnt
:
defin
=
getGODef
(
t
[
0
])
if
background
!=
None
:
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
else
:
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
return
ret
def
getGODef
(
goterm
):
"""
Retrieve information about a GO term
...
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
# Get the entry: fill in the fields specified below
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
ret
=
json
.
loads
(
data
)
for
row
in
ret
[
'results'
]:
...
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
):
"""
Retrieve all GO terms for a given set of genes (or single gene).
...
...
@@ -187,9 +205,9 @@ def getGOTerms(genes):
if
type
(
genes
)
!=
list
and
type
(
genes
)
!=
set
and
type
(
genes
)
!=
tuple
:
genes
=
[
genes
]
map
=
dict
()
batchsize
=
100
# size of query batch
batchsize
=
100
# size of query batch
genecnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
genecnt
<
len
(
genes
):
genebatch
=
[]
for
index
in
range
(
batchsize
):
...
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
raise
RuntimeError
(
ex
.
read
())
return
map
def
getGenes
(
goterms
,
taxo
=
None
):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
...
...
@@ -247,9 +266,9 @@ def getGenes(goterms, taxo=None):
if
type
(
goterms
)
!=
list
and
type
(
goterms
)
!=
set
and
type
(
goterms
)
!=
tuple
:
goterms
=
[
goterms
]
map
=
dict
()
batchsize
=
10
# size of query batch
batchsize
=
10
# size of query batch
termcnt
=
0
limitpage
=
100
# number of record on each returned page
limitpage
=
100
# number of record on each returned page
while
termcnt
<
len
(
goterms
):
termbatch
=
[]
for
index
in
range
(
batchsize
):
...
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
else
:
break
termcnt
+=
1
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
for
i
in
range
(
len
(
termbatch
)):
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
...
...
@@ -295,11 +315,11 @@ def getGenes(goterms, taxo=None):
raise
RuntimeError
(
ex
.
read
())
return
map
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
def
__init__
(
self
,
service
=
None
):
""" Initialise service session.
...
...
@@ -349,7 +369,8 @@ class EBI(object):
if
self
.
isLocked
():
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
wait until it is complete before submitting another job. Go to
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
# ncbiblast database parameter needs special handling
if
self
.
service
==
'ncbiblast'
:
...
...
@@ -423,8 +444,8 @@ class EBI(object):
else
:
return
results
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
"""
:param ids: The list of UniProt IDs
...
...
@@ -439,11 +460,11 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
*** EXAMPLE USAGE ***
Get a list of UniProt IDs and a list of UniProt columns you're interested in.
Full list of UniProt column names - https://www.uniprot.org/help/uniprotkb_column_names
uniprot_names = ['Q9LIR4', 'Q1JUQ1', 'P05791', 'P0ADF6']
cols = ["lineage(SUPERKINGDOM)", "genes", "lineage(KINGDOM)"]
up_dict = getUniProtDict(uniprot_names, cols)
for record in up_dict:
print (record, up_dict[record].get("lineage(SUPERKINGDOM)"))
...
...
@@ -452,22 +473,21 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
for record in up_dict:
print (record, up_dict[record].get("genes"))
If a record doesn't have an entry in UniProt for that column it'll just return None
print (up_dict['Q1JUQ1'])
print (up_dict['Q1JUQ1']['lineage(KINGDOM)'])
*** EXAMPLE USAGE FOR UNIREF SEARCHING ***
up_dict = getUniProtDict(["Q9LIR4", "P99999"], cols=["members"], db="uniref", identities = 1.0)
You can either pass a list of identities for each UniProt identifier (in which case the list of identities must be
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
"""
# Format the lists of IDs and columns correctly
cols
=
","
.
join
(
cols
)
...
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
if
type
(
identities
)
!=
list
:
identities
=
[
identities
]
*
len
(
ids
)
elif
len
(
identities
)
!=
len
(
ids
):
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
# Check that the identity thresholds are valid values
for
x
in
identities
:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
# Add the query syntax around the identifiers
updated_ids
=
""
...
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
url
=
'https://www.uniprot.org/'
+
db
+
'/'
params
=
{
'format'
:
'tab'
,
'query'
:
updated_ids
,
...
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
# For each record we retrieve, split the line by tabs and build up the UniProt dict
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
if
line
:
splitlines
=
line
.
split
(
"
\t
"
)
splitlines
=
line
.
split
(
"
\t
"
)
id_dict
=
{}
pos
=
1
for
col
in
cols
.
split
(
","
):
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
pos
+=
1
pos
+=
1
up_dict
[
splitlines
[
0
]]
=
id_dict
return
up_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment