Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
8fa94535
Commit
8fa94535
authored
Jun 03, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_regex_search_in_gappy_sequences
parent
9889428a
Pipeline
#46
failed with stages
Changes
7
Pipelines
1
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
300 additions
and
291 deletions
+300
-291
annotation_test.py
annotation_test.py
+13
-11
annotations.py
annotations.py
+3
-0
gtf.py
gtf.py
+1
-1
phylo.py
phylo.py
+149
-211
sequence.py
sequence.py
+38
-23
sym.py
sym.py
+33
-2
webservice.py
webservice.py
+63
-43
No files found.
annotation_test.py
View file @
8fa94535
import
annotations
import
annotations
import
phylo
import
phylo
tree
=
phylo
.
parseNewick
(
"(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);"
)
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...
@@ -10,17 +12,17 @@ import phylo
...
@@ -10,17 +12,17 @@ import phylo
working_dir
=
"/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
tree
=
phylo
.
read_nexus
(
working_dir
+
"annotation_simple.nexus"
)
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
print
(
tree
)
#
print (tree)
print
(
tree
.
nexus_annotations
.
annotations
)
#
print (tree.nexus_annotations.annotations)
#
tree
.
swap_annotations
(
"PDB"
)
#
tree.swap_annotations("PDB")
#
print
(
tree
)
#
print (tree)
print
(
tree
.
nexus_annotations
.
annotations
)
#
print (tree.nexus_annotations.annotations)
#
#
# tree.write_to_nexus(working_dir + "output.nexus")
# tree.write_to_nexus(working_dir + "output.nexus")
...
...
annotations.py
View file @
8fa94535
from
collections
import
defaultdict
from
collections
import
defaultdict
from
phylo
import
*
from
phylo
import
*
import
phylo
import
matplotlib
import
matplotlib
import
random
import
random
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
def
generate_colour_list
(
self
,
num
):
def
generate_colour_list
(
self
,
num
):
return
num
return
num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
gtf.py
View file @
8fa94535
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f
.
close
()
f
.
close
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
77
.gtf'
)
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
89
.gtf'
)
print
(
bf
.
chroms
.
keys
())
print
(
bf
.
chroms
.
keys
())
g
=
bf
.
generate
(
'chr12'
)
g
=
bf
.
generate
(
'chr12'
)
print
(
next
(
g
))
print
(
next
(
g
))
...
...
phylo.py
View file @
8fa94535
This diff is collapsed.
Click to expand it.
sequence.py
View file @
8fa94535
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if
parse_defline
:
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
seqname
=
parsed
[
0
]
else
:
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
seqinfo
=
line
[
1
:]
else
:
# we are not parsing the sequence name so no need to duplicate it in the info
seqname
=
seqinfo
[
0
]
if
len
(
seqinfo
)
>
0
:
# more than a name
edited_info
=
''
for
infopart
in
seqinfo
[
1
:]:
edited_info
+=
infopart
+
' '
seqinfo
=
edited_info
else
:
seqinfo
=
''
except
IndexError
as
errmsg
:
except
IndexError
as
errmsg
:
if
not
ignore
:
if
not
ignore
:
raise
RuntimeError
(
errmsg
)
raise
RuntimeError
(
errmsg
)
...
@@ -717,60 +724,62 @@ class Alignment():
...
@@ -717,60 +724,62 @@ class Alignment():
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
return
distmat
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
def
writeHTML
(
self
,
filename
=
None
,
col_start
=
None
,
col_end
=
None
):
""" Generate HTML that displays the alignment in color.
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
"""
col_start
=
col_start
or
0
col_end
=
col_end
or
self
.
alignlen
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
+=
'''<p style="font-size:12px">
\n
'''
maxNameLength
=
self
.
getnamelen
()
maxNameLength
=
self
.
getnamelen
()
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
str
(
i
/
10
+
1
)[
0
]
html
+=
str
(
i
/
10
+
1
)[
0
]
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
' '
html
+=
'
%
s
\n
'
%
(
self
.
alignlen
)
# html += '%s\n' % (col_end)
html
+=
'
\n
'
if
self
.
alignlen
>
10
:
if
self
.
alignlen
>
10
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
if
self
.
alignlen
>
100
:
if
self
.
alignlen
>
100
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
:
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
elif
(
i
>=
col_start
and
i
<
col_end
):
else
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
if
self
.
alignlen
>
1000
:
if
self
.
alignlen
>
1000
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
elif
(
i
>=
col_start
and
i
<
col_end
):
else
:
html
+=
' '
html
+=
' '
html
+=
'
\n
'
html
+=
'
\n
'
for
seq
in
self
.
seqs
:
for
seq
in
self
.
seqs
:
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
for
sym
in
seq
:
for
sym
in
seq
[
col_start
:
col_end
]
:
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
if
not
color
:
if
not
color
:
color
=
'white'
color
=
'white'
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'
\n
'
html
+=
'
\n
'
html
+=
'</pre></body></html>'
html
+=
'</p
></p
re></body></html>'
if
filename
:
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
html
)
fh
.
write
(
html
)
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
def
__str__
(
self
):
def
__str__
(
self
):
return
self
.
pattern
return
self
.
pattern
def
search
(
self
,
sequence
):
def
search
(
self
,
sequence
,
gappy
=
False
):
""" Find matches to the motif in the specified sequence. Returns a list
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a regexp either matches
the score is always 1.0 because a regexp either matches
or doesn't. """
or doesn't. """
if
not
type
(
sequence
)
is
Sequence
:
if
not
type
(
sequence
)
is
Sequence
:
sequence
=
Sequence
(
sequence
)
sequence
=
Sequence
(
sequence
)
if
gappy
==
False
or
sequence
.
gappy
==
False
:
sequenceString
=
sequence
[:]
sequenceString
=
sequence
[:]
results
=
[]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
return
results
else
:
# if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
degapped
,
idxs
=
sequence
.
getDegapped
()
results
=
[]
for
match
in
self
.
regex
.
finditer
(
''
.
join
(
degapped
)):
results
.
append
((
idxs
[
match
.
start
()],
match
.
group
(),
1.0
))
return
results
class
PWM
(
object
):
class
PWM
(
object
):
...
...
sym.py
View file @
8fa94535
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein'
:
Protein_Alphabet
,
'Protein'
:
Protein_Alphabet
,
'ProteinwX'
:
Protein_wX
,
'ProteinwX'
:
Protein_wX
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwGAP'
:
Protein_wGAP
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
# The preferred order in which a predefined alphabet is assigned to a sequence
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'ProteinwGAP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
# Useful annotations
# Useful annotations
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
'G'
:
'orange'
,
'P'
:
'orange'
,
'S'
:
'orange'
,
'T'
:
'orange'
,
'H'
:
'red'
,
'K'
:
'red'
,
'R'
:
'red'
,
'F'
:
'#66bbff'
,
'Y'
:
'#66bbff'
,
'W'
:
'#66bbff'
,
'I'
:
'green'
,
'L'
:
'green'
,
'M'
:
'green'
,
'V'
:
'green'
})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
#orange*/
'G'
:
"#F5A259"
,
#green*/
'N'
:
"#00f900"
,
'Q'
:
"#00f900"
,
'S'
:
"#00f900"
,
'T'
:
"#00f900"
,
#red*/
'K'
:
"#f62f00"
,
'R'
:
"#f62f00"
,
#blue/purple*/
'A'
:
"#92b2f3"
,
'I'
:
"#92b2f3"
,
'L'
:
"#92b2f3"
,
'M'
:
"#92b2f3"
,
'V'
:
"#92b2f3"
,
'W'
:
"#92b2f3"
,
'F'
:
"#92b2f3"
,
#yellow*/
'P'
:
"#FFFB00"
,
#pink*/
'C'
:
"#F59692"
,
#aqua*/
'H'
:
"#04B2B3"
,
'Y'
:
"#04B2B3"
,
#purple*/
'D'
:
"#CE64CB"
,
'E'
:
"#CE64CB"
})
# ------------------ Substitution Matrix ------------------
# ------------------ Substitution Matrix ------------------
...
...
webservice.py
View file @
8fa94535
import
urllib.request
import
urllib.request
import
urllib.parse
import
os
import
os
from
time
import
sleep
from
time
import
sleep
import
stats
import
stats
...
@@ -21,6 +22,7 @@ __ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
...
@@ -21,6 +22,7 @@ __ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
"""
"""
Retrieve a single entry from a database
Retrieve a single entry from a database
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
except
urllib
.
error
.
HTTPError
as
ex
:
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
"""
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
Retrieve multiple entries matching query from a database currently only via UniProtKB
...
@@ -55,9 +58,13 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -55,9 +58,13 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
if
dbName
.
startswith
(
'uniprot'
):
if
dbName
.
startswith
(
'uniprot'
):
# Construct URL
# Construct URL
if
limit
==
None
:
# no limit to number of results returned
if
limit
==
None
:
# no limit to number of results returned
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
urllib
.
parse
.
quote
(
query
),
columns
)
else
:
else
:
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&limit={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
str
(
limit
),
urllib
.
parse
.
quote
(
query
),
columns
)
# Get the entries
# Get the entries
try
:
try
:
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
dbs
=
dbName
.
split
(
":"
)
dbs
=
dbName
.
split
(
":"
)
if
len
(
dbs
)
>
1
:
if
len
(
dbs
)
>
1
:
dbName
=
dbs
[
1
]
dbName
=
dbs
[
1
]
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db={}&term={}+AND+srcdb_refseq["
\
"prop]&retmax={}"
.
format
(
dbName
,
urllib
.
parse
.
quote
(
query
),
str
(
limit
))
print
(
url
)
# Get the entries
# Get the entries
try
:
try
:
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
words
=
data
.
split
(
"</Id>"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
if
format
==
'list'
:
return
words
return
words
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
...
@@ -93,6 +107,7 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
...
@@ -93,6 +107,7 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
return
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
Note that this service can be slow for queries involving a large number of entries.
"""
"""
def
getGOReport
(
positives
,
background
=
None
):
def
getGOReport
(
positives
,
background
=
None
):
""" Generate a complete GO term report for a set of genes (positives).
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for
t
in
sorted_cnt
:
for
t
in
sorted_cnt
:
defin
=
getGODef
(
t
[
0
])
defin
=
getGODef
(
t
[
0
])
if
background
!=
None
:
if
background
!=
None
:
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
else
:
else
:
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
return
ret
return
ret
def
getGODef
(
goterm
):
def
getGODef
(
goterm
):
"""
"""
Retrieve information about a GO term
Retrieve information about a GO term
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
# Get the entry: fill in the fields specified below
# Get the entry: fill in the fields specified below
try
:
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
ret
=
json
.
loads
(
data
)
ret
=
json
.
loads
(
data
)
for
row
in
ret
[
'results'
]:
for
row
in
ret
[
'results'
]:
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
except
urllib
.
error
.
HTTPError
as
ex
:
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
):
def
getGOTerms
(
genes
):
"""
"""
Retrieve all GO terms for a given set of genes (or single gene).
Retrieve all GO terms for a given set of genes (or single gene).
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
map
return
map
def
getGenes
(
goterms
,
taxo
=
None
):
def
getGenes
(
goterms
,
taxo
=
None
):
"""
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
else
:
else
:
break
break
termcnt
+=
1
termcnt
+=
1
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
for
i
in
range
(
len
(
termbatch
)):
for
i
in
range
(
len
(
termbatch
)):
term
=
termbatch
[
i
]
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
...
@@ -295,8 +315,8 @@ def getGenes(goterms, taxo=None):
...
@@ -295,8 +315,8 @@ def getGenes(goterms, taxo=None):
raise
RuntimeError
(
ex
.
read
())
raise
RuntimeError
(
ex
.
read
())
return
map
return
map
class
EBI
(
object
):
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
__checkInterval__
=
2
# how long to wait between checking job status
...
@@ -349,7 +369,8 @@ class EBI(object):
...
@@ -349,7 +369,8 @@ class EBI(object):
if
self
.
isLocked
():
if
self
.
isLocked
():
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
wait until it is complete before submitting another job. Go to
wait until it is complete before submitting another job. Go to
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
# ncbiblast database parameter needs special handling
# ncbiblast database parameter needs special handling
if
self
.
service
==
'ncbiblast'
:
if
self
.
service
==
'ncbiblast'
:
...
@@ -423,8 +444,8 @@ class EBI(object):
...
@@ -423,8 +444,8 @@ class EBI(object):
else
:
else
:
return
results
return
results
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
"""
"""
:param ids: The list of UniProt IDs
:param ids: The list of UniProt IDs
...
@@ -467,7 +488,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -467,7 +488,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
"""
"""
# Format the lists of IDs and columns correctly
# Format the lists of IDs and columns correctly
cols
=
","
.
join
(
cols
)
cols
=
","
.
join
(
cols
)
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
if
type
(
identities
)
!=
list
:
if
type
(
identities
)
!=
list
:
identities
=
[
identities
]
*
len
(
ids
)
identities
=
[
identities
]
*
len
(
ids
)
elif
len
(
identities
)
!=
len
(
ids
):
elif
len
(
identities
)
!=
len
(
ids
):
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
# Check that the identity thresholds are valid values
# Check that the identity thresholds are valid values
for
x
in
identities
:
for
x
in
identities
:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
# Add the query syntax around the identifiers
# Add the query syntax around the identifiers
updated_ids
=
""
updated_ids
=
""
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
url
=
'https://www.uniprot.org/'
+
db
+
'/'
url
=
'https://www.uniprot.org/'
+
db
+
'/'
params
=
{
params
=
{
'format'
:
'tab'
,
'format'
:
'tab'
,
'query'
:
updated_ids
,
'query'
:
updated_ids
,
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
# For each record we retrieve, split the line by tabs and build up the UniProt dict
# For each record we retrieve, split the line by tabs and build up the UniProt dict
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
if
line
:
if
line
:
splitlines
=
line
.
split
(
"
\t
"
)
splitlines
=
line
.
split
(
"
\t
"
)
id_dict
=
{}
id_dict
=
{}
pos
=
1
pos
=
1
for
col
in
cols
.
split
(
","
):
for
col
in
cols
.
split
(
","
):
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
pos
+=
1
pos
+=
1
up_dict
[
splitlines
[
0
]]
=
id_dict
up_dict
[
splitlines
[
0
]]
=
id_dict
return
up_dict
return
up_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment