Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
a236ba40
Commit
a236ba40
authored
Aug 24, 2017
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update_seqdata_to_python_3
parent
bb25ec08
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
393 additions
and
249 deletions
+393
-249
guide.py
guide.py
+1
-1
seqdata.py
seqdata.py
+392
-248
No files found.
guide.py
View file @
a236ba40
e
a
t
###################################################
###################################################
# This module is a supplement to the Python guide #
# This module is a supplement to the Python guide #
# Version 2017.3 (10/03/2017) #
# Version 2017.3 (10/03/2017) #
###################################################
###################################################
...
...
seqdata.py
View file @
a236ba40
...
@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
...
@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
Return 0 in case of NO overlap.
Return 0 in case of NO overlap.
"""
"""
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
2
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
/
2
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
max
(
minWidth
,
1
)
minWidth
=
max
(
minWidth
,
1
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
...
@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
...
@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
minimum: if True (default), then use minimum distance, if False, use centre to centre
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
"""
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
2
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
/
2
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
max
(
minWidth
,
1
)
minWidth
=
max
(
minWidth
,
1
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
...
@@ -151,33 +151,33 @@ class BedEntry():
...
@@ -151,33 +151,33 @@ class BedEntry():
end
=
self
.
chromEnd
end
=
self
.
chromEnd
start
=
self
.
chromStart
start
=
self
.
chromStart
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mycentre
=
start
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
mycentre
=
start
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
if
usesummit
:
if
usesummit
:
mycentre
=
self
.
summit
mycentre
=
self
.
summit
if
useshift
:
if
useshift
:
mycentre
=
mycentre
+
useshift
mycentre
=
mycentre
+
useshift
if
fixedwidth
:
# we need to re-calculate start and end
if
fixedwidth
:
# we need to re-calculate start and end
if
genome
:
if
genome
:
end
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
2
))
end
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
/
2
))
else
:
else
:
end
=
mycentre
+
(
mywidth
/
2
)
end
=
mycentre
+
(
mywidth
/
/
2
)
start
=
max
(
0
,
mycentre
-
(
mywidth
/
2
))
start
=
max
(
0
,
mycentre
-
(
mywidth
/
/
2
))
else
:
# other strand
else
:
# other strand
start
=
self
.
chromEnd
start
=
self
.
chromEnd
end
=
self
.
chromStart
end
=
self
.
chromStart
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mycentre
=
self
.
chromStart
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
mycentre
=
self
.
chromStart
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
if
usesummit
:
if
usesummit
:
mycentre
=
self
.
summit
mycentre
=
self
.
summit
if
useshift
:
if
useshift
:
mycentre
=
mycentre
-
useshift
# shift is reversed on other strand
mycentre
=
mycentre
-
useshift
# shift is reversed on other strand
if
fixedwidth
:
# we need to re-calculate start and end
if
fixedwidth
:
# we need to re-calculate start and end
end
=
max
(
0
,
mycentre
-
(
mywidth
/
2
))
end
=
max
(
0
,
mycentre
-
(
mywidth
/
/
2
))
if
genome
:
if
genome
:
start
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
2
))
start
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
/
2
))
else
:
else
:
start
=
mycentre
+
(
mywidth
/
2
)
start
=
mycentre
+
(
mywidth
/
/
2
)
if
genome
:
# refer to the genome sequence
if
genome
:
# refer to the genome sequence
return
genome
[
self
.
chrom
][
start
:
end
]
return
genome
[
self
.
chrom
][
start
:
end
]
...
@@ -187,9 +187,9 @@ class BedEntry():
...
@@ -187,9 +187,9 @@ class BedEntry():
def
setwidth
(
self
,
fixedwidth
=
None
,
usesummit
=
False
):
def
setwidth
(
self
,
fixedwidth
=
None
,
usesummit
=
False
):
if
fixedwidth
:
if
fixedwidth
:
if
usesummit
:
if
usesummit
:
diff
=
self
.
summit
-
fixedwidth
/
2
diff
=
self
.
summit
-
fixedwidth
/
/
2
else
:
else
:
diff
=
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
-
fixedwidth
/
2
diff
=
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
-
fixedwidth
/
/
2
self
.
chromStart
+=
diff
self
.
chromStart
+=
diff
self
.
chromStart
+=
diff
+
fixedwidth
self
.
chromStart
+=
diff
+
fixedwidth
return
(
self
.
chrom
,
self
.
chromStart
,
self
.
chromEnd
)
return
(
self
.
chrom
,
self
.
chromStart
,
self
.
chromEnd
)
...
@@ -365,11 +365,8 @@ class BedFile():
...
@@ -365,11 +365,8 @@ class BedFile():
def
__iter__
(
self
):
def
__iter__
(
self
):
return
self
.
rows
.
__iter__
()
return
self
.
rows
.
__iter__
()
def
__getslice__
(
self
,
i
,
j
):
def
__getitem__
(
self
,
key
):
return
self
.
rows
.
__getslice__
(
i
,
j
)
return
self
.
rows
[
key
]
def
__getitem__
(
self
,
i
):
return
self
.
rows
[
i
]
def
__len__
(
self
):
def
__len__
(
self
):
return
len
(
self
.
rows
)
return
len
(
self
.
rows
)
...
@@ -388,7 +385,7 @@ class BedFile():
...
@@ -388,7 +385,7 @@ class BedFile():
if
not
row
.
chrom
in
index_end
:
# seeing chromosome entry first time
if
not
row
.
chrom
in
index_end
:
# seeing chromosome entry first time
index_end
[
row
.
chrom
]
=
[]
index_end
[
row
.
chrom
]
=
[]
index_start
[
row
.
chrom
]
.
append
((
row
.
chromStart
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_start
[
row
.
chrom
]
.
append
((
row
.
chromStart
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
/
2
,
i
))
index_end
[
row
.
chrom
]
.
append
((
row
.
chromEnd
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_end
[
row
.
chrom
]
.
append
((
row
.
chromEnd
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
if
row
.
name
:
if
row
.
name
:
index_name
[
row
.
name
]
=
row
index_name
[
row
.
name
]
=
row
...
@@ -407,7 +404,7 @@ class BedFile():
...
@@ -407,7 +404,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
upper
=
len
(
entries
)
# keep an upper boundary
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
while
True
:
while
True
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
...
@@ -416,11 +413,11 @@ class BedFile():
...
@@ -416,11 +413,11 @@ class BedFile():
return
True
return
True
elif
d
>
0
:
elif
d
>
0
:
lower
=
inspect
+
1
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
inspect
+=
delta
else
:
else
:
upper
=
inspect
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
inspect
-=
delta
if
delta
==
0
:
if
delta
==
0
:
return
False
return
False
...
@@ -436,7 +433,7 @@ class BedFile():
...
@@ -436,7 +433,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
upper
=
len
(
entries
)
# keep an upper boundary
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
while
True
:
while
True
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
...
@@ -461,11 +458,11 @@ class BedFile():
...
@@ -461,11 +458,11 @@ class BedFile():
return
False
return
False
elif
d
>
0
:
elif
d
>
0
:
lower
=
inspect
+
1
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
inspect
+=
delta
else
:
else
:
upper
=
inspect
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
inspect
-=
delta
if
delta
==
0
:
if
delta
==
0
:
return
False
return
False
...
@@ -494,7 +491,7 @@ class BedFile():
...
@@ -494,7 +491,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
myloc
[
0
]]
# use start index
entries
=
self
.
indices
[
0
][
myloc
[
0
]]
# use start index
upper
=
len
(
entries
)
# keep an upper boundary
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
delta
=
None
while
not
delta
==
0
:
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
@@ -509,11 +506,11 @@ class BedFile():
...
@@ -509,11 +506,11 @@ class BedFile():
return
(
mindist
,
minentry
)
return
(
mindist
,
minentry
)
elif
d
>
0
:
elif
d
>
0
:
lower
=
inspect
+
1
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
inspect
+=
delta
else
:
else
:
upper
=
inspect
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
inspect
-=
delta
# we may have missed the closest, so need to look around this point
# we may have missed the closest, so need to look around this point
for
i_dn
in
range
(
inspect
+
1
,
len
(
entries
)):
# Look downstream since
for
i_dn
in
range
(
inspect
+
1
,
len
(
entries
)):
# Look downstream since
...
@@ -528,7 +525,7 @@ class BedFile():
...
@@ -528,7 +525,7 @@ class BedFile():
entries
=
self
.
indices
[
2
][
myloc
[
0
]]
# use end index
entries
=
self
.
indices
[
2
][
myloc
[
0
]]
# use end index
upper
=
len
(
entries
)
# keep an upper boundary
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
delta
=
None
while
not
delta
==
0
:
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
@@ -540,11 +537,11 @@ class BedFile():
...
@@ -540,11 +537,11 @@ class BedFile():
return
(
mindist
,
minentry
)
return
(
mindist
,
minentry
)
elif
d
>
0
:
elif
d
>
0
:
lower
=
inspect
+
1
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
inspect
+=
delta
else
:
else
:
upper
=
inspect
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
inspect
-=
delta
# we may have missed the closest, so need to look around this point
# we may have missed the closest, so need to look around this point
for
i_up
in
range
(
inspect
-
1
,
0
,
-
1
):
# Look upstream since
for
i_up
in
range
(
inspect
-
1
,
0
,
-
1
):
# Look upstream since
...
@@ -560,7 +557,7 @@ class BedFile():
...
@@ -560,7 +557,7 @@ class BedFile():
entries
=
self
.
indices
[
1
][
myloc
[
0
]]
# use centre index
entries
=
self
.
indices
[
1
][
myloc
[
0
]]
# use centre index
upper
=
len
(
entries
)
# keep an upper boundary
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
delta
=
None
while
not
delta
==
0
:
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
@@ -575,11 +572,11 @@ class BedFile():
...
@@ -575,11 +572,11 @@ class BedFile():
return
(
mindist
,
minentry
)
return
(
mindist
,
minentry
)
elif
d
>
0
:
elif
d
>
0
:
lower
=
inspect
+
1
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
inspect
+=
delta
else
:
else
:
upper
=
inspect
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
inspect
-=
delta
# at bottom of search
# at bottom of search
return
(
mindist
,
minentry
)
return
(
mindist
,
minentry
)
...
@@ -751,30 +748,64 @@ Modifications to package:
...
@@ -751,30 +748,64 @@ Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
"""
"""
twobitreader
Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from
array
import
array
from
array
import
array
from
bisect
import
bisect_right
from
bisect
import
bisect_right
from
errno
import
ENOENT
,
EACCES
from
errno
import
ENOENT
,
EACCES
from
os
import
R_OK
,
access
from
os
import
R_OK
,
access
try
:
try
:
from
os
import
strerror
from
os
import
strerror
except
ImportError
:
except
ImportError
:
strerror
=
lambda
x
:
'strerror not supported'
strerror
=
lambda
x
:
'strerror not supported'
from
os.path
import
exists
from
os.path
import
exists
,
getsize
from
itertools
import
chain
import
logging
import
textwrap
import
sys
if
sys
.
version_info
>
(
3
,):
izip
=
zip
xrange
=
range
_CHAR_CODE
=
'u'
iteritems
=
dict
.
items
else
:
from
itertools
import
izip
_CHAR_CODE
=
'c'
iteritems
=
dict
.
iteritems
def
safe_tostring
(
ary
):
"""
convert arrays to strings in a Python 2.x / 3.x safe way
"""
if
sys
.
version_info
>
(
3
,):
return
ary
.
tounicode
()
.
encode
(
"ascii"
)
.
decode
()
else
:
return
ary
.
tostring
()
def
true_long_type
():
def
true_long_type
():
"""
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
and switch to I (int) if needed
"""
"""
for
type_
in
[
'L'
,
'I'
]:
for
type_
in
[
'L'
,
'I'
]:
test_array
=
array
(
type_
,
[
0
])
test_array
=
array
(
type_
,
[
0
])
long_size
=
test_array
.
itemsize
long_size
=
test_array
.
itemsize
if
long_size
==
4
:
return
type_
if
long_size
==
4
:
return
type_
raise
ImportError
(
"Couldn't determine a valid 4-byte long type to use
\
raise
ImportError
(
"Couldn't determine a valid 4-byte long type to use
\
as equivalent to LONG"
)
as equivalent to LONG"
)
LONG
=
true_long_type
()
LONG
=
true_long_type
()
def
byte_to_bases
(
x
):
def
byte_to_bases
(
x
):
"""convert one byte to the four bases it encodes"""
"""convert one byte to the four bases it encodes"""
c
=
(
x
>>
4
)
&
0xf
c
=
(
x
>>
4
)
&
0xf
...
@@ -783,100 +814,162 @@ def byte_to_bases(x):
...
@@ -783,100 +814,162 @@ def byte_to_bases(x):
cf
=
c
&
0x3
cf
=
c
&
0x3
fc
=
(
f
>>
2
)
&
0x3
fc
=
(
f
>>
2
)
&
0x3
ff
=
f
&
0x3
ff
=
f
&
0x3
return
map
(
bits_to_base
,
(
cc
,
cf
,
fc
,
ff
))
return
[
bits_to_base
(
X
)
for
X
in
(
cc
,
cf
,
fc
,
ff
)]
def
bits_to_base
(
x
):
def
bits_to_base
(
x
):
"""convert integer representation of two bits to correct base"""
"""convert integer representation of two bits to correct base"""
if
x
is
0
:
return
'T'
if
x
is
0
:
if
x
is
1
:
return
'C'
return
'T'
if
x
is
2
:
return
'A'
elif
x
is
1
:
if
x
is
3
:
return
'G'
return
'C'
elif
x
is
2
:
return
'A'
elif
x
is
3
:
return
'G'
else
:
raise
ValueError
(
'Only integers 0-3 are valid inputs'
)
def
base_to_bin
(
x
):
def
base_to_bin
(
x
):
"""
"""
provided for user convenience
provided for user convenience
convert a nucleotide to its bit representation
convert a nucleotide to its bit representation
"""
"""
if
x
==
'T'
:
return
'00'
if
x
==
'T'
:
if
x
==
'C'
:
return
'01'
return
'00'
if
x
==
'A'
:
return
'10'
elif
x
==
'C'
:
if
x
==
'G'
:
return
'11'
return
'01'
elif
x
==
'A'
:
return
'10'
elif
x
==
'G'
:
return
'11'
else
:
raise
ValueError
(
'Only characters
\'
ATGC
\'
are valid inputs'
)
def
create_byte_table
():
def
create_byte_table
():
"""create BYTE_TABLE"""
"""create BYTE_TABLE"""
d
=
{}
d
=
{}
for
x
in
range
(
2
**
8
):
for
x
in
xrange
(
2
**
8
):
d
[
x
]
=
byte_to_bases
(
x
)
d
[
x
]
=
byte_to_bases
(
x
)
return
d
return
d
def
split16
(
x
):
def
split16
(
x
):
"""
"""
split a 16-bit number into integer representation
split a 16-bit number into integer representation
of its course and fine parts in binary representation
of its course and fine parts in binary representation
"""
"""
c
=
(
x
>>
8
)
&
0xff
c
=
(
x
>>
8
)
&
0xff
f
=
x
&
0xff
f
=
x
&
0xff
return
c
,
f
return
c
,
f
def
create_twobyte_table
():
def
create_twobyte_table
():
"""create TWOBYTE_TABLE"""
"""create TWOBYTE_TABLE"""
d
=
{}
d
=
{}
for
x
in
range
(
2
**
16
):
for
x
in
xrange
(
2
**
16
):
c
,
f
=
split16
(
x
)
c
,
f
=
split16
(
x
)
d
[
x
]
=
chain
(
byte_to_bases
(
c
),
byte_to_bases
(
f
))
d
[
x
]
=
list
(
byte_to_bases
(
c
))
+
list
(
byte_to_bases
(
f
))
return
d
return
d
BYTE_TABLE
=
create_byte_table
()
BYTE_TABLE
=
create_byte_table
()
TWOBYTE_TABLE
=
create_twobyte_table
()
TWOBYTE_TABLE
=
create_twobyte_table
()
def
longs_to_char_array
(
longs
,
first_base_offset
,
last_base_offset
,
array_size
):
def
longs_to_char_array
(
longs
,
first_base_offset
,
last_base_offset
,
array_size
,
more_bytes
=
None
):
"""
"""
takes in a iterable of longs and converts them to bases in a char array
takes in an array of longs (4 bytes) and converts them to bases in
returns a ctypes string buffer
a char array
you must also provide the offset in the first and last block
(note these offsets are pythonic. last_offset is not included)
and the desired array_size
If you have less than a long worth of bases at the end, you can provide
them as a string with more_bytes=
NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
is not None
returns the correct subset of the array based on provided offsets
"""
"""
if
array_size
==
0
:
return
array
(
_CHAR_CODE
)
elif
array_size
<
0
:
raise
ValueError
(
'array_size must be at least 0'
)
if
not
first_base_offset
in
range
(
16
):
raise
ValueError
(
'first_base_offset must be in range(16)'
)
if
not
last_base_offset
in
range
(
1
,
17
):
raise
ValueError
(
'last_base_offset must be in range(1, 17)'
)
longs_len
=
len
(
longs
)
longs_len
=
len
(
longs
)
# dna = ctypes.create_string_buffer(array_size)
if
more_bytes
is
None
:
dna
=
array
(
'b'
,
'N'
*
longs_len
)
shorts_length
=
0
else
:
shorts_length
=
len
(
more_bytes
)
if
array_size
>
longs_len
*
16
+
4
*
shorts_length
:
raise
ValueError
(
'array_size exceeds maximum possible for input'
)
dna
=
array
(
_CHAR_CODE
,
'N'
*
(
longs_len
*
16
+
4
*
shorts_length
))
# translate from 32-bit blocks to bytes
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
# this method ensures correct endianess (byteswap as neeed)
bytes
=
array
(
'B'
)
i
=
0
bytes
.
fromstring
(
longs
.
tostring
())
if
longs_len
>
0
:
# first block
bytes_
=
array
(
'B'
)
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
4
)])
bytes_
.
fromstring
(
longs
.
tostring
())
i
=
16
-
first_base_offset
# first block
if
array_size
<
i
:
i
=
array_size
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes_
[
x
]])
for
x
in
range
(
4
)])
dna
[
0
:
i
]
=
array
(
'b'
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
i
=
16
-
first_base_offset
if
longs_len
==
1
:
return
dna
if
array_size
<
i
:
# middle blocks (implicitly skipped if they don't exist)
i
=
array_size
for
byte
in
bytes
[
4
:
-
4
]:
dna
[
0
:
i
]
=
array
(
_CHAR_CODE
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
dna
[
i
:
i
+
4
]
=
array
(
'b'
,
BYTE_TABLE
[
byte
])
if
longs_len
>
1
:
i
+=
4
# middle blocks (implicitly skipped if they don't exist)
# last block
for
byte
in
bytes_
[
4
:
-
4
]:
last_block
=
array
(
'b'
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
dna
[
i
:
i
+
4
]
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
i
+=
4
return
dna
# last block
last_block
=
array
(
_CHAR_CODE
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes_
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
if
more_bytes
is
None
:
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
else
:
# if there are more bytes, we need the whole last block
dna
[
i
:
i
+
16
]
=
last_block
[
0
:
16
]
i
+=
16
if
more_bytes
is
not
None
:
bytes_
=
array
(
'B'
)
bytes_
.
fromstring
(
more_bytes
)
j
=
i
for
byte
in
bytes_
:
j
=
i
+
4
if
j
>
array_size
:
dnabytes
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])[
0
:(
array_size
-
i
)]
dna
[
i
:
array_size
]
=
dnabytes
break
dna
[
i
:
i
+
last_base_offset
]
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])
i
+=
4
return
dna
[
0
:
array_size
]
class
TwoBitFile
(
dict
):
class
TwoBitFile
(
dict
):
"""
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
(note: no writing support)
TwoBitFile inherits from dict
TwoBitFile inherits from dict
You may access sequences by name, e.g.
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
Sequences are returned as TwoBitSequence objects
e.g.
You may access intervals by slicing or using str() to dump the entire entry
>>> chr20[100100:100120]
e.g.
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
>>> chr20[100100:100200]
tttgcatgttaagtttttttcc'
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
>>> whole_chr20 = str(chr20)
ttgcatgttaagtttttttcc'
Fair warning: dumping the entire chromosome requires a lot of memory
>>> whole_chr20 = str(chr20)
See TwoBitSequence for more info
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
"""
def
__init__
(
self
,
foo
):
def
__init__
(
self
,
foo
):
...
@@ -886,14 +979,19 @@ class TwoBitFile(dict):
...
@@ -886,14 +979,19 @@ class TwoBitFile(dict):
if
not
access
(
foo
,
R_OK
):
if
not
access
(
foo
,
R_OK
):
raise
IOError
(
EACCES
,
strerror
(
EACCES
),
foo
)
raise
IOError
(
EACCES
,
strerror
(
EACCES
),
foo
)
self
.
_filename
=
foo
self
.
_filename
=
foo
self
.
_file_size
=
getsize
(
foo
)
self
.
_file_handle
=
open
(
foo
,
'rb'
)
self
.
_file_handle
=
open
(
foo
,
'rb'
)
self
.
_load_header
()
self
.
_load_header
()
self
.
_load_index
()
self
.
_load_index
()
for
name
,
offset
in
self
.
_offset_dict
.
items
(
):
for
name
,
offset
in
iteritems
(
self
.
_offset_dict
):
self
[
name
]
=
TwoBitSequence
(
self
.
_file_handle
,
offset
,
self
[
name
]
=
TwoBitSequence
(
self
.
_file_handle
,
offset
,
self
.
_file_size
,
self
.
_byteswapped
)
self
.
_byteswapped
)
return
return
def
__reduce__
(
self
):
# enables pickling
return
(
TwoBitFile
,
(
self
.
_filename
,))
def
_load_header
(
self
):
def
_load_header
(
self
):
file_handle
=
self
.
_file_handle
file_handle
=
self
.
_file_handle
header
=
array
(
LONG
)
header
=
array
(
LONG
)
...
@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
...
@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
header
.
byteswap
()
header
.
byteswap
()
(
signature2
,
version
,
sequence_count
,
reserved
)
=
header
(
signature2
,
version
,
sequence_count
,
reserved
)
=
header
if
not
signature2
==
0x1A412743
:
if
not
signature2
==
0x1A412743
:
raise
TwoBitFileError
(
'Signature in header should be 0x1A412743'
raise
TwoBitFileError
(
'Signature in header should be '
+
+
', instead found 0x
%
X'
%
signature
)
'0x1A412743, instead found 0x
%
X'
%
signature
)
if
not
version
==
0
:
if
not
version
==
0
:
raise
TwoBitFileError
(
'File version in header should be 0.'
)
raise
TwoBitFileError
(
'File version in header should be 0.'
)
if
not
reserved
==
0
:
if
not
reserved
==
0
:
...
@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
...
@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
remaining
=
self
.
_sequence_count
remaining
=
self
.
_sequence_count
sequence_offsets
=
[]
sequence_offsets
=
[]
file_handle
.
seek
(
16
)
file_handle
.
seek
(
16
)
while
True
:
while
remaining
>
0
:
if
remaining
==
0
:
break
name_size
=
array
(
'B'
)
name_size
=
array
(
'B'
)
name_size
.
fromfile
(
file_handle
,
1
)
name_size
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
if
byteswapped
:
name_size
.
byteswap
()
name_size
.
byteswap
()
name
=
array
(
'b'
)
# name = array(_CHAR_CODE)
name
=
array
(
'B'
)
name
.
fromfile
(
file_handle
,
name_size
[
0
])
name
=
""
.
join
([
chr
(
X
)
for
X
in
name
])
if
byteswapped
:
if
byteswapped
:
name
.
byteswap
()
name
.
byteswap
()
name
.
fromfile
(
file_handle
,
name_size
[
0
])
offset
=
array
(
LONG
)
offset
=
array
(
LONG
)
offset
.
fromfile
(
file_handle
,
1
)
offset
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
if
byteswapped
:
offset
.
byteswap
()
offset
.
byteswap
()
sequence_offsets
.
append
((
name
.
tostring
()
,
offset
[
0
]))
sequence_offsets
.
append
((
name
,
offset
[
0
]))
remaining
-=
1
remaining
-=
1
self
.
_sequence_offsets
=
sequence_offsets
self
.
_sequence_offsets
=
sequence_offsets
self
.
_offset_dict
=
dict
(
sequence_offsets
)
self
.
_offset_dict
=
dict
(
sequence_offsets
)
...
@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
...
@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
d
=
{}
d
=
{}
file_handle
=
self
.
_file_handle
file_handle
=
self
.
_file_handle
byteswapped
=
self
.
_byteswapped
byteswapped
=
self
.
_byteswapped
for
name
,
offset
in
self
.
_offset_dict
.
items
(
):
for
name
,
offset
in
iteritems
(
self
.
_offset_dict
):
file_handle
.
seek
(
offset
)
file_handle
.
seek
(
offset
)
dna_size
=
array
(
LONG
)
dna_size
=
array
(
LONG
)
dna_size
.
fromfile
(
file_handle
,
1
)
dna_size
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
dna_size
.
byteswap
()
if
byteswapped
:
dna_size
.
byteswap
()
d
[
name
]
=
dna_size
[
0
]
d
[
name
]
=
dna_size
[
0
]
return
d
return
d
class
TwoBitSequence
(
object
):
class
TwoBitSequence
(
object
):
"""
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
You may access intervals by slicing or using str() to dump the entire entry
e.g.
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
tttgcatgttaagtttttttcc'
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Fair warning: dumping the entire chromosome requires a lot of memory
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
Note that we follow python/UCSC conventions:
If you attempt to access a slice past the end of the sequence,
Coordinates are 0-based, end-open
it will be truncated at the end.
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
Your computer probably doesn't have enough memory to load a whole genome
If you attempt to access a slice past the end of the sequence,
but if you want to string-ize your TwoBitFile, here's a recipe:
it will be truncated at the end.
x = TwoBitFile('my.2bit')
d = x.dict()
Your computer probably doesn't have enough memory to load a whole genome
for k,v in d.items(): d[k] = str(v)
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
"""
"""
def
__init__
(
self
,
file_handle
,
offset
,
byteswapped
=
False
):
def
__init__
(
self
,
file_handle
,
offset
,
file_size
,
byteswapped
=
False
):
self
.
_file_size
=
file_size
self
.
_file_handle
=
file_handle
self
.
_file_handle
=
file_handle
self
.
_original_offset
=
offset
self
.
_original_offset
=
offset
self
.
_byteswapped
=
byteswapped
self
.
_byteswapped
=
byteswapped
file_handle
.
seek
(
offset
)
file_handle
.
seek
(
offset
)
header
=
array
(
LONG
)
header
=
array
(
LONG
)
header
.
fromfile
(
file_handle
,
2
)
header
.
fromfile
(
file_handle
,
2
)
if
byteswapped
:
header
.
byteswap
()
if
byteswapped
:
header
.
byteswap
()
dna_size
,
n_block_count
=
header
dna_size
,
n_block_count
=
header
self
.
_dna_size
=
dna_size
self
.
_dna_size
=
dna_size
# number of characters, 2 bits each
self
.
_packed_dna_size
=
(
dna_size
+
15
)
/
16
# this is 32-bit fragments
self
.
_n_bytes
=
(
dna_size
+
3
)
/
4
# number of bytes
# number of 32-bit fragments
self
.
_packed_dna_size
=
(
dna_size
+
15
)
/
16
n_block_starts
=
array
(
LONG
)
n_block_starts
=
array
(
LONG
)
n_block_sizes
=
array
(
LONG
)
n_block_sizes
=
array
(
LONG
)
n_block_starts
.
fromfile
(
file_handle
,
n_block_count
)
n_block_starts
.
fromfile
(
file_handle
,
n_block_count
)
if
byteswapped
:
n_block_starts
.
byteswap
()
if
byteswapped
:
n_block_starts
.
byteswap
()
n_block_sizes
.
fromfile
(
file_handle
,
n_block_count
)
n_block_sizes
.
fromfile
(
file_handle
,
n_block_count
)
if
byteswapped
:
n_block_sizes
.
byteswap
()
if
byteswapped
:
n_block_sizes
.
byteswap
()
self
.
_n_block_starts
=
n_block_starts
self
.
_n_block_starts
=
n_block_starts
self
.
_n_block_sizes
=
n_block_sizes
self
.
_n_block_sizes
=
n_block_sizes
mask_rawc
=
array
(
LONG
)
mask_rawc
=
array
(
LONG
)
mask_rawc
.
fromfile
(
file_handle
,
1
)
mask_rawc
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
mask_rawc
.
byteswap
()
if
byteswapped
:
mask_rawc
.
byteswap
()
mask_block_count
=
mask_rawc
[
0
]
mask_block_count
=
mask_rawc
[
0
]
mask_block_starts
=
array
(
LONG
)
mask_block_starts
=
array
(
LONG
)
mask_block_starts
.
fromfile
(
file_handle
,
mask_block_count
)
mask_block_starts
.
fromfile
(
file_handle
,
mask_block_count
)
if
byteswapped
:
mask_block_starts
.
byteswap
()
if
byteswapped
:
mask_block_starts
.
byteswap
()
mask_block_sizes
=
array
(
LONG
)
mask_block_sizes
=
array
(
LONG
)
mask_block_sizes
.
fromfile
(
file_handle
,
mask_block_count
)
mask_block_sizes
.
fromfile
(
file_handle
,
mask_block_count
)
if
byteswapped
:
mask_block_sizes
.
byteswap
()
if
byteswapped
:
mask_block_sizes
.
byteswap
()
self
.
_mask_block_starts
=
mask_block_starts
self
.
_mask_block_starts
=
mask_block_starts
self
.
_mask_block_sizes
=
mask_block_sizes
self
.
_mask_block_sizes
=
mask_block_sizes
file_handle
.
read
(
4
)
file_handle
.
read
(
4
)
...
@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
...
@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
def
__len__
(
self
):
def
__len__
(
self
):
return
self
.
_dna_size
return
self
.
_dna_size
def
__getslice__
(
self
,
min_
,
max_
=
None
):
def
__getitem__
(
self
,
slice_or_key
):
return
self
.
get_slice
(
min_
,
max_
)
"""
return a sub-sequence, given a slice object
"""
step
=
None
if
isinstance
(
slice_or_key
,
slice
):
step
=
slice_or_key
.
step
if
step
is
not
None
:
raise
ValueError
(
"Slicing by step not currently supported"
)
return
self
.
get_slice
(
min_
=
slice_or_key
.
start
,
max_
=
slice_or_key
.
stop
)
elif
isinstance
(
slice_or_key
,
int
):
max_
=
slice_or_key
+
1
if
max_
==
0
:
max_
=
None
return
self
.
get_slice
(
min_
=
slice_or_key
,
max_
=
max_
)
def
get_slice
(
self
,
min_
,
max_
=
None
):
def
get_slice
(
self
,
min_
,
max_
=
None
):
"""
"""
...
@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
...
@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
"""
"""
# handle negative coordinates
# handle negative coordinates
dna_size
=
self
.
_dna_size
dna_size
=
self
.
_dna_size
if
max_
<
0
:
if
min_
is
None
:
# for slicing e.g. [:]
if
max_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
min_
=
0
max_
=
dna_size
+
1
+
max_
if
max_
is
not
None
and
max_
<
0
:
if
min_
<
0
:
if
max_
<
-
dna_size
:
if
max_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
raise
IndexError
(
'index out of range'
)
min_
=
dna_size
+
1
+
min_
max_
=
dna_size
+
max_
# Find out if the reverse complement is sought
if
min_
is
not
None
and
min_
<
0
:
reverse
=
False
# assume not RC
if
min_
<
-
dna_size
:
if
min_
>
max_
and
max_
is
not
None
:
raise
IndexError
(
'index out of range'
)
reverse
=
True
min_
=
dna_size
+
min_
mymax
=
max_
# make sure there's a proper range
max_
=
min_
if
max_
is
not
None
and
min_
>
max_
:
min_
=
mymax
return
''
if
max_
==
0
:
return
''
if
max_
==
0
or
max_
==
min_
:
return
''
# load all the data
# load all the data
if
max_
>
dna_size
:
max_
=
dna_size
if
max_
is
None
or
max_
>
dna_size
:
max_
=
dna_size
file_handle
=
self
.
_file_handle
file_handle
=
self
.
_file_handle
byteswapped
=
self
.
_byteswapped
byteswapped
=
self
.
_byteswapped
n_block_starts
=
self
.
_n_block_starts
n_block_starts
=
self
.
_n_block_starts
...
@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
...
@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
mask_block_sizes
=
self
.
_mask_block_sizes
mask_block_sizes
=
self
.
_mask_block_sizes
offset
=
self
.
_offset
offset
=
self
.
_offset
packed_dna_size
=
self
.
_packed_dna_size
packed_dna_size
=
self
.
_packed_dna_size
# n_bytes = self._n_bytes
# region_size is how many bases the region is
# region_size is how many bases the region is
if
max_
is
None
:
region_size
=
dna_size
-
min_
if
max_
is
None
:
else
:
region_size
=
max_
-
min_
region_size
=
dna_size
-
min_
else
:
region_size
=
max_
-
min_
# start_block, end_block are the first/last 32-bit blocks we need
# start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0
# blocks start at 0
start_block
=
min_
/
16
start_block
=
min_
//
16
end_block
=
max_
/
16
# jump directly to desired file location
local_offset
=
offset
+
(
start_block
*
4
)
end_block
=
(
max_
-
1
+
16
)
//
16
# don't read past seq end
# don't read past seq end
if
end_block
>=
packed_dna_size
:
end_block
=
packed_dna_size
-
1
# +1 we still need to read block
blocks_to_read
=
end_block
-
start_block
+
1
# jump directly to desired file location
local_offset
=
offset
+
start_block
*
4
file_handle
.
seek
(
local_offset
)
file_handle
.
seek
(
local_offset
)
# note we won't actually read the last base
# note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset
# this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset
=
min_
%
16
first_base_offset
=
min_
%
16
last_base_offset
=
max_
%
16
last_base_offset
=
max_
%
16
if
last_base_offset
==
0
:
last_base_offset
=
16
# +1 we still need to read end_block maybe
blocks_to_read
=
end_block
-
start_block
if
(
blocks_to_read
+
start_block
)
>
packed_dna_size
:
blocks_to_read
=
packed_dna_size
-
start_block
fourbyte_dna
=
array
(
LONG
)
fourbyte_dna
=
array
(
LONG
)
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
)
# remainder_seq = None
if
byteswapped
:
fourbyte_dna
.
byteswap
()
if
(
blocks_to_read
*
4
+
local_offset
)
>
self
.
_file_size
:
string_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
-
1
)
last_base_offset
,
region_size
)
morebytes
=
file_handle
.
read
()
# read the remaining characters
for
start
,
size
in
zip
(
n_block_starts
,
n_block_sizes
):
# if byteswapped:
# morebytes = ''.join(reversed(morebytes))
else
:
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
)
morebytes
=
None
if
byteswapped
:
fourbyte_dna
.
byteswap
()
str_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
last_base_offset
,
region_size
,
more_bytes
=
morebytes
)
for
start
,
size
in
izip
(
n_block_starts
,
n_block_sizes
):
end
=
start
+
size
end
=
start
+
size
if
end
<=
min_
:
continue
if
end
<=
min_
:
if
start
>
max_
:
break
continue
if
start
<
min_
:
start
=
min_
if
start
>
max_
:
if
end
>
max_
:
end
=
max_
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
start
-=
min_
start
-=
min_
end
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'b'
,
'N'
*
(
end
-
start
))
# this should actually be decoded, 00=N, 01=n
str_as_array
[
start
:
end
]
=
array
(
_CHAR_CODE
,
'N'
*
(
end
-
start
))
lower
=
str
.
lower
lower
=
str
.
lower
first_masked_region
=
max
(
0
,
first_masked_region
=
max
(
0
,
bisect_right
(
mask_block_starts
,
min_
)
-
1
)
bisect_right
(
mask_block_starts
,
min_
)
-
1
)
last_masked_region
=
min
(
len
(
mask_block_starts
),
last_masked_region
=
min
(
len
(
mask_block_starts
),
1
+
bisect_right
(
mask_block_starts
,
max_
,
1
+
bisect_right
(
mask_block_starts
,
max_
,
lo
=
first_masked_region
))
lo
=
first_masked_region
))
for
start
,
size
in
zip
(
mask_block_starts
[
first_masked_region
:
last_masked_region
],
for
start
,
size
in
izip
(
mask_block_starts
[
first_masked_region
:
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
last_masked_region
],
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
end
=
start
+
size
end
=
start
+
size
if
end
<=
min_
:
continue
if
end
<=
min_
:
if
start
>
max_
:
break
continue
if
start
<
min_
:
start
=
min_
if
start
>
max_
:
if
end
>
max_
:
end
=
max_
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
start
-=
min_
start
-=
min_
end
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'b'
,
lower
(
string_as_array
[
start
:
end
]
.
tostring
()))
str_as_array
[
start
:
end
]
=
array
(
_CHAR_CODE
,
if
not
len
(
string_as_array
)
==
max_
-
min_
:
lower
(
safe_tostring
(
str_as_array
[
start
:
end
])))
raise
RuntimeError
(
"Sequence was longer than it should be"
)
if
not
len
(
str_as_array
)
==
max_
-
min_
:
if
reverse
:
raise
RuntimeError
(
"Sequence was the wrong size"
)
return
self
.
reverseComplement
(
string_as_array
.
tostring
())
return
safe_tostring
(
str_as_array
)
return
string_as_array
.
tostring
()
def
reverseComplement
(
self
,
dna
):
""" Return a new sequence: the reverse complement of this sequence. """
newseq
=
''
symbols
=
{
'A'
:
'T'
,
'C'
:
'G'
,
'T'
:
'A'
,
'G'
:
'C'
,
'a'
:
't'
,
'c'
:
'g'
,
't'
:
'a'
,
'g'
:
'c'
,
'n'
:
'n'
,
'N'
:
'N'
}
# reverse complement dictionary
for
symbol
in
dna
[::
-
1
]:
newsymbol
=
symbols
[
symbol
]
# uses the reverse complement symbols in dictionary
newseq
+=
newsymbol
return
newseq
# returns RC sequences
def
__str__
(
self
):
def
__str__
(
self
):
"""
"""
returns the entire chromosome
returns the entire chromosome
"""
"""
return
self
.
__getslice__
(
0
,
None
)
return
self
.
get_slice
(
0
,
None
)
class
TwoBitFileError
(
Exception
):
class
TwoBitFileError
(
Exception
):
"""
"""
Base exception for TwoBit module
Base exception for TwoBit module
"""
"""
def
__init__
(
self
,
msg
):
def
__init__
(
self
,
msg
):
errtext
=
'Invalid 2-bit file. '
+
msg
errtext
=
'Invalid 2-bit file. '
+
msg
return
super
(
TwoBitFileError
,
self
)
.
__init__
(
errtext
)
return
super
(
TwoBitFileError
,
self
)
.
__init__
(
errtext
)
def
print_specification
():
def
print_specification
():
"""
"""
Prints the twoBit file format specification I got from the Internet.
Prints the twoBit file format specification I got from the Internet.
This is only here for reference
This is only here for reference
"""
"""
return
"""
return
"""
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact
\
randomly accessible format. The two bit files contain masking information as
\
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact
\
well as the DNA itself. The file begins with a 16 byte header containing the
\
randomly accessible format. The two bit files contain masking information as
\
following fields:
well as the DNA itself. The file begins with a 16 byte header containing the
\
signature - the number 0x1A412743 in the architecture of the machine that
\
following fields:
created the file.
version - zero for now. Readers should abort if they see a version number
\
signature - the number 0x1A412743 in the architecture of the machine that
\
higher than 0.
created the file.
sequenceCount - the number of sequences in the file
version - zero for now. Readers should abort if they see a version number
\
reserved - always zero for now.
higher than 0.
All fields are 32 bits unless noted. If the signature value is not as given,
\
sequenceCount - the number of sequences in the file
the reader program should byte swap the signature and see if the swapped
\
reserved - always zero for now.
version matches. If so all multiple-byte entities in the file will need to be
\
All fields are 32 bits unless noted. If the signature value is not as given,
\
byte-swapped. This enables these binary files to be used unchanged on
\
the reader program should byte swap the signature and see if the swapped
\
different architectures.
version matches. If so all multiple-byte entities in the file will need to be
\
The header is followed by a file index. There is one entry in the index for
\
byte-swapped. This enables these binary files to be used unchanged on
\
each sequence. Each index entry contains three fields:
different architectures.
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length
\
The header is followed by a file index. There is one entry in the index for
\
depending on nameSize.
each sequence. Each index entry contains three fields:
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
nameSize - a byte containing the length of the name field
dnaSize - number of bases of DNA in the sequence.
name - this contains the sequence name itself, and is variable length
\
nBlockCount - the number of blocks of N's in the file (representing unknown
\
depending on nameSize.
sequence).
offset - 32 bit offset of the sequence data relative to the start of the file
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
The index is followed by the sequence records. These contain 9 fields:
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
dnaSize - number of bases of DNA in the sequence.
maskBlockSizes - the size of each masked block
nBlockCount - the number of blocks of N's in the file (representing unknown
\
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,
\
sequence).
11 - G. The first base is in the most significant 2 bits byte, and the last
\
nBlockStarts - a starting position for each block of N's
base in the least significant 2 bits, so that the sequence TCAG would be
\
nBlockSizes - the size of each block of N's
represented as 00011011. The packedDna field will be padded with 0 bits as
\
maskBlockCount - the number of masked (lower case) blocks
necessary so that it takes an even multiple of 32 bit in the file, as this
\
maskBlockStarts - starting position for each masked block
improves i/o performance on some machines.
maskBlockSizes - the size of each masked block
.nib files
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,
\
"""
11 - G. The first base is in the most significant 2 bits byte, and the last
\
base in the least significant 2 bits, so that the sequence TCAG would be
\
represented as 00011011. The packedDna field will be padded with 0 bits as
\
if
__name__
==
'__main__'
:
hg19
=
TwoBitFile
(
'/Users/mikael/simhome/share/hg19.2bit'
)
# assumes that the genome is stored in your current directory
necessary so that it takes an even multiple of 32 bit in the file, as this
\
for
key
in
hg19
:
improves i/o performance on some machines.
print
(
key
)
.nib files
print
(
hg19
[
'chrX'
][
1000000
:
1000060
])
"""
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment