Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
8fa94535
Commit
8fa94535
authored
Jun 03, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_regex_search_in_gappy_sequences
parent
9889428a
Pipeline
#46
failed with stages
Changes
7
Pipelines
1
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
300 additions
and
291 deletions
+300
-291
annotation_test.py
annotation_test.py
+13
-11
annotations.py
annotations.py
+3
-0
gtf.py
gtf.py
+1
-1
phylo.py
phylo.py
+149
-211
sequence.py
sequence.py
+38
-23
sym.py
sym.py
+33
-2
webservice.py
webservice.py
+63
-43
No files found.
annotation_test.py
View file @
8fa94535
import
annotations
import
phylo
tree
=
phylo
.
parseNewick
(
"(Paenibacillus_thiaminolyticus:4.0,(((bacterium_endosymbiont_of_Mortierella_elongata_FMR23_6:4.0,(Pandoraea_faecigallinarum:4.0,Pandoraea_vervacti:4.0,Pandoraea_oxalativorans:4.0):4.0,(Burkholderia_sp_b14:4.0,Burkholderia_sp_b13:4.0,(Burkholderia_pseudomallei_406e:4.0,Burkholderia_pseudomallei_1710a:4.0):4.0):4.0):4.0,(Chromobacterium_amazonense:4.0,(Microvirgula_sp_AG722:4.0,Microvirgula_aerodenitrificans:4.0):4.0):4.0):4.0,(Candidatus_Endobugula:4.0,Moritella_sp_PE36:4.0,(Enterovibrio_nigricans:4.0,Photobacterium_iliopiscarium:4.0,Vibrio_campbellii:4.0):4.0,(((Pantoea_sp_AMG_501:4.0,Pantoea_wallisii:4.0,Pantoea_rodasii:4.0):4.0,(Erwinia_sp_ErVv1:4.0,Erwinia_toletana:4.0,Erwinia_mallotivora:4.0):4.0):4.0,(Candidatus_Fukatsuia:4.0,Rahnella_aquatilis:4.0,(Yersinia_pekkanenii:4.0,Yersinia_entomophaga:4.0,Yersinia_mollaretii:4.0,(Yersinia_wautersii:4.0,Yersinia_similis:4.0,Yersinia_pseudotuberculosis:4.0,Yersinia_pestis:4.0):4.0,Yersinia_enterocolitica:4.0):4.0):4.0,(Cosenzaea_myxofaciens:4.0,(Photorhabdus_laumondii:4.0,Photorhabdus_bodei:4.0,Photorhabdus_sp_HUG-39:4.0,Photorhabdus_sp_CRCIA-P01:4.0,Photorhabdus_namnaonensis:4.0,Photorhabdus_khanii:4.0,Photorhabdus_heterorhabditis:4.0,Photorhabdus_temperata:4.0,Photorhabdus_asymbiotica:4.0,Photorhabdus_australis:4.0,Photorhabdus_thracensis:4.0,Photorhabdus_luminescens:4.0):4.0,(Xenorhabdus_ishibashii:4.0,Xenorhabdus_khoisanae:4.0,Xenorhabdus_mauleonii:4.0,Xenorhabdus_miraniensis:4.0,Xenorhabdus_vietnamensis:4.0,Xenorhabdus_stockiae:4.0,Xenorhabdus_szentirmaii:4.0,Xenorhabdus_budapestensis:4.0,Xenorhabdus_bovienii:4.0,Xenorhabdus_nematophila:4.0):4.0,(Proteus_sp_TJ1640:4.0,Proteus_sp_TJ1636:4.0,Proteus_sp_FJ2001126-3:4.0,Proteus_columbae:4.0,Proteus_alimentorum:4.0,Proteus_genomosp_6_str._ATCC_51471:4.0,Proteus_genomosp_4_str._ATCC_51469:4.0,Proteus_cibarius:4.0,Proteus_hauseri:4.0,Proteus_penneri:4.0,Proteus_vulgaris:4.0):4.0,(Morganella_sp_HMSC11D09:4.0,Morganella_sp_EGD-HP17:4.0,Morganella_morganii:4.0):4.0):4.0,(Escherichia_sp_ESNIH1:4.0,Mangrovibacter_phragmitis:4.0,(Enterobacter_sp_DC4:4.0,Enterobacter_sp_BIDMC_26:4.0):4.0,Kosakonia_sacchari:4.0,Pseudescherichia_vulneris:4.0):4.0):4.0,(Pseudomonas_kribbensis:4.0,Pseudomonas_lactis:4.0,Pseudomonas_paralactis:4.0,Pseudomonas_helleri:4.0,Pseudomonas_weihenstephanensis:4.0,Pseudomonas_coleopterorum:4.0,Pseudomonas_endophytica:4.0,Pseudomonas_granadensis:4.0,Pseudomonas_prosekii:4.0,Pseudomonas_brassicacearum:4.0,Pseudomonas_deceptionensis:4.0,Pseudomonas_baetica:4.0,Pseudomonas_simiae:4.0,Pseudomonas_moraviensis:4.0,Pseudomonas_batumici:4.0,Pseudomonas_antarctica:4.0,Pseudomonas_rhizosphaerae:4.0,Pseudomonas_lini:4.0,Pseudomonas_kilonensis:4.0,Pseudomonas_psychrophila:4.0,Pseudomonas_abietaniphila:4.0,Pseudomonas_thivervalensis:4.0,Pseudomonas_jessenii:4.0,Pseudomonas_plecoglossicida:4.0,Pseudomonas_agarici:4.0,(Pseudomonas_cichorii:4.0,Pseudomonas_syringae:4.0):4.0,Pseudomonas_sp:4.0,(Pseudomonas_lundensis:4.0,Pseudomonas_fragi:4.0):4.0,(Pseudomonas_poae:4.0,Pseudomonas_mediterranea:4.0,Pseudomonas_extremorientalis:4.0,Pseudomonas_orientalis:4.0,Pseudomonas_libanensis:4.0,Pseudomonas_synxantha:4.0,Pseudomonas_corrugata:4.0,Pseudomonas_fluorescens:4.0):4.0):4.0):4.0):4.0);"
)
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree.nwk")
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
...
...
@@ -10,17 +12,17 @@ import phylo
working_dir
=
"/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree
=
phylo
.
read_nexus
(
working_dir
+
"annotation_simple.nexus"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
tree
.
swap_annotations
(
"PDB"
)
print
(
tree
)
print
(
tree
.
nexus_annotations
.
annotations
)
#
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
#
#
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
#
tree.swap_annotations("PDB")
#
#
print (tree)
#
print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
...
...
annotations.py
View file @
8fa94535
from
collections
import
defaultdict
from
phylo
import
*
import
phylo
import
matplotlib
import
random
...
...
@@ -146,3 +147,5 @@ class NexusAnnotations():
def
generate_colour_list
(
self
,
num
):
return
num
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Projects/Phylo Island/Species trees/species_tree_115.nwk")
gtf.py
View file @
8fa94535
...
...
@@ -242,7 +242,7 @@ def writeGtfFile(entries, filename, header = None):
f
.
close
()
if
__name__
==
'__main__'
:
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
77
.gtf'
)
bf
=
GtfFile
(
'/Users/mikael/simhome/NFIX/WT16
89
.gtf'
)
print
(
bf
.
chroms
.
keys
())
g
=
bf
.
generate
(
'chr12'
)
print
(
next
(
g
))
...
...
phylo.py
View file @
8fa94535
This diff is collapsed.
Click to expand it.
sequence.py
View file @
8fa94535
...
...
@@ -226,9 +226,16 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defl
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
else
:
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
else
:
# we are not parsing the sequence name so no need to duplicate it in the info
seqname
=
seqinfo
[
0
]
if
len
(
seqinfo
)
>
0
:
# more than a name
edited_info
=
''
for
infopart
in
seqinfo
[
1
:]:
edited_info
+=
infopart
+
' '
seqinfo
=
edited_info
else
:
seqinfo
=
''
except
IndexError
as
errmsg
:
if
not
ignore
:
raise
RuntimeError
(
errmsg
)
...
...
@@ -717,60 +724,62 @@ class Alignment():
distmat
[
i
,
j
]
=
distmat
[
j
,
i
]
=
dist
return
distmat
def
writeHTML
(
self
,
filename
=
None
):
def
writeHTML
(
self
,
filename
=
None
,
col_start
=
None
,
col_end
=
None
):
""" Generate HTML that displays the alignment in color.
Requires that the alphabet is annotated with the label 'html-color' (see Sequence.annotateSym)
and that each symbol maps to a text string naming the color, e.g. 'blue'
"""
col_start
=
col_start
or
0
col_end
=
col_end
or
self
.
alignlen
html
=
'''<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">
\n
<title>Sequence Alignment</title>
\n
</head><body><pre>
\n
'''
html
+=
'''<p style="font-size:12px">
\n
'''
maxNameLength
=
self
.
getnamelen
()
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
str
(
i
/
10
+
1
)[
0
]
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
%
s
\n
'
%
(
self
.
alignlen
)
# html += '%s\n' % (col_end)
html
+=
'
\n
'
if
self
.
alignlen
>
10
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][(
index
*
-
1
)
+
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
1
)
else
'0'
el
se
:
el
if
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
100
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
:
if
(
i
+
1
)
%
10
==
0
and
i
>=
99
and
(
i
>=
col_start
and
i
<
col_end
)
:
index
=
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
html
+=
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
][
-
1
]
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
'0'
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
if
self
.
alignlen
>
1000
:
html
+=
''
.
ljust
(
maxNameLength
)
+
' '
for
i
in
range
(
self
.
alignlen
-
1
):
if
(
i
+
1
)
%
10
==
0
:
if
(
i
+
1
)
%
10
==
0
and
(
i
>=
col_start
and
i
<
col_end
)
:
html
+=
'0'
if
(
len
(
str
(
i
/
10
+
1
)
.
split
(
'.'
)[
0
])
>
2
)
else
' '
else
:
elif
(
i
>=
col_start
and
i
<
col_end
):
html
+=
' '
html
+=
'
\n
'
for
seq
in
self
.
seqs
:
html
+=
seq
.
name
.
ljust
(
maxNameLength
)
+
' '
for
sym
in
seq
:
for
sym
in
seq
[
col_start
:
col_end
]
:
color
=
self
.
alphabet
.
getAnnotation
(
'html-color'
,
sym
)
if
not
color
:
color
=
'white'
html
+=
'<font style="BACKGROUND-COLOR:
%
s">
%
s</font>'
%
(
color
,
sym
)
html
+=
'
\n
'
html
+=
'</pre></body></html>'
html
+=
'</p
></p
re></body></html>'
if
filename
:
fh
=
open
(
filename
,
'w'
)
fh
.
write
(
html
)
...
...
@@ -1187,19 +1196,25 @@ class Regexp(object):
def
__str__
(
self
):
return
self
.
pattern
def
search
(
self
,
sequence
):
def
search
(
self
,
sequence
,
gappy
=
False
):
""" Find matches to the motif in the specified sequence. Returns a list
of triples, of the form (position, matched string, score). Note that
the score is always 1.0 because a regexp either matches
or doesn't. """
if
not
type
(
sequence
)
is
Sequence
:
sequence
=
Sequence
(
sequence
)
if
gappy
==
False
or
sequence
.
gappy
==
False
:
sequenceString
=
sequence
[:]
results
=
[]
for
match
in
self
.
regex
.
finditer
(
sequenceString
):
results
.
append
((
match
.
start
(),
match
.
group
(),
1.0
))
return
results
else
:
# if the sequence is gappy AND the function is called with gappy = True THEN run the regex matching on the de-gapped sequence
degapped
,
idxs
=
sequence
.
getDegapped
()
results
=
[]
for
match
in
self
.
regex
.
finditer
(
''
.
join
(
degapped
)):
results
.
append
((
idxs
[
match
.
start
()],
match
.
group
(),
1.0
))
return
results
class
PWM
(
object
):
...
...
sym.py
View file @
8fa94535
...
...
@@ -138,15 +138,46 @@ predefAlphabets = {'Bool_Alphabet': Bool_Alphabet,
'Protein'
:
Protein_Alphabet
,
'ProteinwX'
:
Protein_wX
,
'ProteinwSTOP'
:
Protein_wSTOP
,
'ProteinwGAP'
:
Protein_wGAP
,
'DSSP_Alphabet'
:
DSSP_Alphabet
,
'DSSP3_Alphabet'
:
DSSP3_Alphabet
}
# The preferred order in which a predefined alphabet is assigned to a sequence
# (e.g., we'd want to assign DNA to 'AGCT', even though Protein is also valid)
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
preferredOrder
=
[
'Bool_Alphabet'
,
'DNA'
,
'RNA'
,
'DNAwN'
,
'RNAwN'
,
'Protein'
,
'ProteinwX'
,
'ProteinwSTOP'
,
'ProteinwGAP'
,
'DSSP_Alphabet'
,
'DSSP3_Alphabet'
]
# Useful annotations
DNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'T'
:
'#66bbff'
})
RNA_Alphabet
.
annotateAll
(
'html-color'
,
{
'A'
:
'green'
,
'C'
:
'orange'
,
'G'
:
'red'
,
'U'
:
'#66bbff'
})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
'G'
:
'orange'
,
'P'
:
'orange'
,
'S'
:
'orange'
,
'T'
:
'orange'
,
'H'
:
'red'
,
'K'
:
'red'
,
'R'
:
'red'
,
'F'
:
'#66bbff'
,
'Y'
:
'#66bbff'
,
'W'
:
'#66bbff'
,
'I'
:
'green'
,
'L'
:
'green'
,
'M'
:
'green'
,
'V'
:
'green'
})
#Protein_Alphabet.annotateAll('html-color', {'G':'orange','P':'orange','S':'orange','T':'orange','H':'red','K':'red','R':'red','F':'#66bbff','Y':'#66bbff','W':'#66bbff','I':'green','L':'green','M':'green','V':'green'})
Protein_Alphabet
.
annotateAll
(
'html-color'
,
{
#orange*/
'G'
:
"#F5A259"
,
#green*/
'N'
:
"#00f900"
,
'Q'
:
"#00f900"
,
'S'
:
"#00f900"
,
'T'
:
"#00f900"
,
#red*/
'K'
:
"#f62f00"
,
'R'
:
"#f62f00"
,
#blue/purple*/
'A'
:
"#92b2f3"
,
'I'
:
"#92b2f3"
,
'L'
:
"#92b2f3"
,
'M'
:
"#92b2f3"
,
'V'
:
"#92b2f3"
,
'W'
:
"#92b2f3"
,
'F'
:
"#92b2f3"
,
#yellow*/
'P'
:
"#FFFB00"
,
#pink*/
'C'
:
"#F59692"
,
#aqua*/
'H'
:
"#04B2B3"
,
'Y'
:
"#04B2B3"
,
#purple*/
'D'
:
"#CE64CB"
,
'E'
:
"#CE64CB"
})
# ------------------ Substitution Matrix ------------------
...
...
webservice.py
View file @
8fa94535
import
urllib.request
import
urllib.parse
import
os
from
time
import
sleep
import
stats
...
...
@@ -21,6 +22,7 @@ __ebiGOUrl__ = 'https://www.ebi.ac.uk/QuickGO/services/'
__uniprotUrl__
=
'http://www.uniprot.org/'
__ebiSearchUrl__
=
'http://www.ebi.ac.uk/ebisearch/'
def
fetch
(
entryId
,
dbName
=
'uniprotkb'
,
format
=
'fasta'
):
"""
Retrieve a single entry from a database
...
...
@@ -42,6 +44,7 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
search
(
query
,
dbName
=
'uniprot'
,
format
=
'list'
,
limit
=
100
,
columns
=
""
):
"""
Retrieve multiple entries matching query from a database currently only via UniProtKB
...
...
@@ -55,9 +58,13 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
if
dbName
.
startswith
(
'uniprot'
):
# Construct URL
if
limit
==
None
:
# no limit to number of results returned
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
urllib
.
parse
.
quote
(
query
),
columns
)
else
:
url
=
__uniprotUrl__
+
dbName
+
'/?format='
+
format
+
'&limit='
+
str
(
limit
)
+
'&query='
+
query
+
'&columns='
+
columns
url
=
"{}{}/?format={}&limit={}&query={}&columns={}"
.
format
(
__uniprotUrl__
,
dbName
,
format
,
str
(
limit
),
urllib
.
parse
.
quote
(
query
),
columns
)
# Get the entries
try
:
...
...
@@ -72,13 +79,20 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
dbs
=
dbName
.
split
(
":"
)
if
len
(
dbs
)
>
1
:
dbName
=
dbs
[
1
]
base
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url
=
base
+
"esearch.fcgi?db="
+
dbName
+
"&term="
+
query
+
"&retmax="
+
str
(
limit
)
url
=
base
+
"esearch.fcgi?db={}&term={}+AND+srcdb_refseq["
\
"prop]&retmax={}"
.
format
(
dbName
,
urllib
.
parse
.
quote
(
query
),
str
(
limit
))
print
(
url
)
# Get the entries
try
:
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
words
=
data
.
split
(
"</Id>"
)
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
words
=
[
w
[
w
.
find
(
"<Id>"
)
+
4
:]
for
w
in
words
[:
-
1
]]
if
format
==
'list'
:
return
words
elif
format
==
'fasta'
and
len
(
words
)
>
0
:
...
...
@@ -93,6 +107,7 @@ def search(query, dbName='uniprot', format='list', limit=100, columns=""):
raise
RuntimeError
(
ex
.
read
())
return
authorised_database_tag
=
{
9606
:
[
'Homo sapiens'
,
'ACC'
,
'ID'
],
3702
:
[
'Arabidopsis thaliana'
,
'TAIR_ID'
],
4932
:
[
'Saccharomyces cerevisiae'
,
'SGD_ID'
,
'CYGD_ID'
],
...
...
@@ -104,7 +119,8 @@ http://www.ebi.ac.uk/QuickGO/WebServices.html
Note that this service can be slow for queries involving a large number of entries.
"""
def
getGOReport
(
positives
,
background
=
None
):
def
getGOReport
(
positives
,
background
=
None
):
""" Generate a complete GO term report for a set of genes (positives).
Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background, OR
...
...
@@ -148,11 +164,12 @@ def getGOReport(positives, background = None):
for
t
in
sorted_cnt
:
defin
=
getGODef
(
t
[
0
])
if
background
!=
None
:
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
ret
.
append
((
t
[
0
],
t
[
1
][
2
]
*
len
(
term_set
),
t
[
1
][
0
],
t
[
1
][
0
]
+
t
[
1
][
1
],
defin
[
'name'
]))
else
:
ret
.
append
((
t
[
0
],
t
[
1
],
defin
[
'name'
]))
return
ret
def
getGODef
(
goterm
):
"""
Retrieve information about a GO term
...
...
@@ -165,7 +182,7 @@ def getGODef(goterm):
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
# Get the entry: fill in the fields specified below
try
:
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
entry
=
{
'id'
:
None
,
'name'
:
None
,
'aspect'
:
None
}
data
=
urllib
.
request
.
urlopen
(
url
)
.
read
()
.
decode
(
"utf-8"
)
ret
=
json
.
loads
(
data
)
for
row
in
ret
[
'results'
]:
...
...
@@ -179,6 +196,7 @@ def getGODef(goterm):
except
urllib
.
error
.
HTTPError
as
ex
:
raise
RuntimeError
(
ex
.
read
())
def
getGOTerms
(
genes
):
"""
Retrieve all GO terms for a given set of genes (or single gene).
...
...
@@ -237,6 +255,7 @@ def getGOTerms(genes):
raise
RuntimeError
(
ex
.
read
())
return
map
def
getGenes
(
goterms
,
taxo
=
None
):
"""
Retrieve all genes/proteins for a given set of GO terms (or single GO term).
...
...
@@ -258,7 +277,8 @@ def getGenes(goterms, taxo=None):
else
:
break
termcnt
+=
1
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
uri_string
=
'annotation/search?limit='
+
str
(
limitpage
)
+
'&taxonId='
+
taxo
+
"&goId="
if
taxo
else
'annotation/search?goId='
for
i
in
range
(
len
(
termbatch
)):
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
...
...
@@ -295,8 +315,8 @@ def getGenes(goterms, taxo=None):
raise
RuntimeError
(
ex
.
read
())
return
map
class
EBI
(
object
):
class
EBI
(
object
):
__email__
=
'anon@uq.edu.au'
# to whom emails about jobs should go
__ebiServiceUrl__
=
'http://www.ebi.ac.uk/Tools/services/rest/'
# Use UQ mirror when available
__checkInterval__
=
2
# how long to wait between checking job status
...
...
@@ -349,7 +369,8 @@ class EBI(object):
if
self
.
isLocked
():
raise
RuntimeError
(
"""You currently have a
%
s job running. You must
wait until it is complete before submitting another job. Go to
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
%
sstatus/
%
s to check the status of the job."""
%
(
self
.
service
,
self
.
__ebiServiceUrl__
,
self
.
jobId
))
url
=
self
.
__ebiServiceUrl__
+
self
.
service
+
'/run/'
# ncbiblast database parameter needs special handling
if
self
.
service
==
'ncbiblast'
:
...
...
@@ -423,8 +444,8 @@ class EBI(object):
else
:
return
results
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
def
getUniProtDict
(
ids
,
cols
=
""
,
db
=
'uniprot'
,
identities
=
None
):
"""
:param ids: The list of UniProt IDs
...
...
@@ -467,7 +488,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
the same size as the list of identifiers. Or you can just pass a single identity to search Uniref at.
"""
# Format the lists of IDs and columns correctly
cols
=
","
.
join
(
cols
)
...
...
@@ -481,12 +501,14 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
if
type
(
identities
)
!=
list
:
identities
=
[
identities
]
*
len
(
ids
)
elif
len
(
identities
)
!=
len
(
ids
):
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
raise
RuntimeError
(
'Either supply a single identity threshold or supply one for each identifier in the list'
)
# Check that the identity thresholds are valid values
for
x
in
identities
:
if
x
not
in
[
1.0
,
0.9
,
0.5
]:
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
raise
RuntimeError
(
"UniRef threshold values must be either 1.0, 0.9, or 0.5. Supplied value was - "
+
str
(
x
))
# Add the query syntax around the identifiers
updated_ids
=
""
...
...
@@ -500,8 +522,6 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
url
=
'https://www.uniprot.org/'
+
db
+
'/'
params
=
{
'format'
:
'tab'
,
'query'
:
updated_ids
,
...
...
@@ -518,12 +538,12 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
# For each record we retrieve, split the line by tabs and build up the UniProt dict
for
line
in
page
.
split
(
"
\n
"
)[
1
:]:
if
line
:
splitlines
=
line
.
split
(
"
\t
"
)
splitlines
=
line
.
split
(
"
\t
"
)
id_dict
=
{}
pos
=
1
for
col
in
cols
.
split
(
","
):
id_dict
[
col
]
=
None
if
splitlines
[
pos
]
==
""
else
splitlines
[
pos
]
pos
+=
1
pos
+=
1
up_dict
[
splitlines
[
0
]]
=
id_dict
return
up_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment