Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
a236ba40
Commit
a236ba40
authored
Aug 24, 2017
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update_seqdata_to_python_3
parent
bb25ec08
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
393 additions
and
249 deletions
+393
-249
guide.py
guide.py
+1
-1
seqdata.py
seqdata.py
+392
-248
No files found.
guide.py
View file @
a236ba40
e
a
t
###################################################
###################################################
# This module is a supplement to the Python guide #
# Version 2017.3 (10/03/2017) #
###################################################
...
...
seqdata.py
View file @
a236ba40
...
...
@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
Return 0 in case of NO overlap.
"""
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
2
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
/
2
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
max
(
minWidth
,
1
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
...
...
@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
if
chromLoc1
[
0
]
==
chromLoc2
[
0
]:
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
2
halfWidth1
=
(
chromLoc1
[
2
]
-
chromLoc1
[
1
])
/
/
2
halfWidth2
=
(
chromLoc2
[
2
]
-
chromLoc2
[
1
])
/
/
2
minWidth
=
min
(
halfWidth1
,
halfWidth2
)
minWidth
=
max
(
minWidth
,
1
)
maxWidth
=
max
(
halfWidth1
,
halfWidth2
)
...
...
@@ -151,33 +151,33 @@ class BedEntry():
end
=
self
.
chromEnd
start
=
self
.
chromStart
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mycentre
=
start
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
mycentre
=
start
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
if
usesummit
:
mycentre
=
self
.
summit
if
useshift
:
mycentre
=
mycentre
+
useshift
if
fixedwidth
:
# we need to re-calculate start and end
if
genome
:
end
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
2
))
end
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
/
2
))
else
:
end
=
mycentre
+
(
mywidth
/
2
)
start
=
max
(
0
,
mycentre
-
(
mywidth
/
2
))
end
=
mycentre
+
(
mywidth
/
/
2
)
start
=
max
(
0
,
mycentre
-
(
mywidth
/
/
2
))
else
:
# other strand
start
=
self
.
chromEnd
end
=
self
.
chromStart
mywidth
=
fixedwidth
or
(
self
.
chromEnd
-
self
.
chromStart
)
mycentre
=
self
.
chromStart
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
mycentre
=
self
.
chromStart
+
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
if
usesummit
:
mycentre
=
self
.
summit
if
useshift
:
mycentre
=
mycentre
-
useshift
# shift is reversed on other strand
if
fixedwidth
:
# we need to re-calculate start and end
end
=
max
(
0
,
mycentre
-
(
mywidth
/
2
))
end
=
max
(
0
,
mycentre
-
(
mywidth
/
/
2
))
if
genome
:
start
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
2
))
start
=
min
(
len
(
genome
[
self
.
chrom
]),
mycentre
+
(
mywidth
/
/
2
))
else
:
start
=
mycentre
+
(
mywidth
/
2
)
start
=
mycentre
+
(
mywidth
/
/
2
)
if
genome
:
# refer to the genome sequence
return
genome
[
self
.
chrom
][
start
:
end
]
...
...
@@ -187,9 +187,9 @@ class BedEntry():
def
setwidth
(
self
,
fixedwidth
=
None
,
usesummit
=
False
):
if
fixedwidth
:
if
usesummit
:
diff
=
self
.
summit
-
fixedwidth
/
2
diff
=
self
.
summit
-
fixedwidth
/
/
2
else
:
diff
=
(
self
.
chromEnd
-
self
.
chromStart
)
/
2
-
fixedwidth
/
2
diff
=
(
self
.
chromEnd
-
self
.
chromStart
)
/
/
2
-
fixedwidth
/
/
2
self
.
chromStart
+=
diff
self
.
chromStart
+=
diff
+
fixedwidth
return
(
self
.
chrom
,
self
.
chromStart
,
self
.
chromEnd
)
...
...
@@ -365,11 +365,8 @@ class BedFile():
def
__iter__
(
self
):
return
self
.
rows
.
__iter__
()
def
__getslice__
(
self
,
i
,
j
):
return
self
.
rows
.
__getslice__
(
i
,
j
)
def
__getitem__
(
self
,
i
):
return
self
.
rows
[
i
]
def
__getitem__
(
self
,
key
):
return
self
.
rows
[
key
]
def
__len__
(
self
):
return
len
(
self
.
rows
)
...
...
@@ -388,7 +385,7 @@ class BedFile():
if
not
row
.
chrom
in
index_end
:
# seeing chromosome entry first time
index_end
[
row
.
chrom
]
=
[]
index_start
[
row
.
chrom
]
.
append
((
row
.
chromStart
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
2
,
i
))
index_centre
[
row
.
chrom
]
.
append
((
row
.
chromStart
+
(
row
.
chromEnd
-
row
.
chromStart
)
/
/
2
,
(
row
.
chromEnd
-
row
.
chromStart
)
/
/
2
,
i
))
index_end
[
row
.
chrom
]
.
append
((
row
.
chromEnd
,
row
.
chromEnd
-
row
.
chromStart
,
i
))
if
row
.
name
:
index_name
[
row
.
name
]
=
row
...
...
@@ -407,7 +404,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
while
True
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
...
...
@@ -416,11 +413,11 @@ class BedFile():
return
True
elif
d
>
0
:
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
else
:
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
if
delta
==
0
:
return
False
...
...
@@ -436,7 +433,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
elem
[
0
]]
# use the start index
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
while
True
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
d
=
distance
(
entry
.
loc
(),
elem
,
minimum
=
True
)
...
...
@@ -461,11 +458,11 @@ class BedFile():
return
False
elif
d
>
0
:
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
else
:
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
if
delta
==
0
:
return
False
...
...
@@ -494,7 +491,7 @@ class BedFile():
entries
=
self
.
indices
[
0
][
myloc
[
0
]]
# use start index
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
...
@@ -509,11 +506,11 @@ class BedFile():
return
(
mindist
,
minentry
)
elif
d
>
0
:
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
else
:
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
# we may have missed the closest, so need to look around this point
for
i_dn
in
range
(
inspect
+
1
,
len
(
entries
)):
# Look downstream since
...
...
@@ -528,7 +525,7 @@ class BedFile():
entries
=
self
.
indices
[
2
][
myloc
[
0
]]
# use end index
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
...
@@ -540,11 +537,11 @@ class BedFile():
return
(
mindist
,
minentry
)
elif
d
>
0
:
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
else
:
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
# we may have missed the closest, so need to look around this point
for
i_up
in
range
(
inspect
-
1
,
0
,
-
1
):
# Look upstream since
...
...
@@ -560,7 +557,7 @@ class BedFile():
entries
=
self
.
indices
[
1
][
myloc
[
0
]]
# use centre index
upper
=
len
(
entries
)
# keep an upper boundary
lower
=
0
# and a lower boundary
inspect
=
(
upper
-
lower
)
/
2
# start by looking in the middle
inspect
=
(
upper
-
lower
)
/
/
2
# start by looking in the middle
delta
=
None
while
not
delta
==
0
:
entry
=
self
.
rows
[
entries
[
inspect
][
2
]]
...
...
@@ -575,11 +572,11 @@ class BedFile():
return
(
mindist
,
minentry
)
elif
d
>
0
:
lower
=
inspect
+
1
delta
=
(
upper
-
inspect
)
/
2
# splitting in half, potential speed improvements with some heuristic?
delta
=
(
upper
-
inspect
)
/
/
2
# splitting in half, potential speed improvements with some heuristic?
inspect
+=
delta
else
:
upper
=
inspect
delta
=
(
inspect
-
lower
+
1
)
/
2
delta
=
(
inspect
-
lower
+
1
)
/
/
2
inspect
-=
delta
# at bottom of search
return
(
mindist
,
minentry
)
...
...
@@ -751,30 +748,64 @@ Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
"""
twobitreader
Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from
array
import
array
from
bisect
import
bisect_right
from
errno
import
ENOENT
,
EACCES
from
os
import
R_OK
,
access
try
:
from
os
import
strerror
except
ImportError
:
strerror
=
lambda
x
:
'strerror not supported'
from
os.path
import
exists
from
itertools
import
chain
from
os.path
import
exists
,
getsize
import
logging
import
textwrap
import
sys
if
sys
.
version_info
>
(
3
,):
izip
=
zip
xrange
=
range
_CHAR_CODE
=
'u'
iteritems
=
dict
.
items
else
:
from
itertools
import
izip
_CHAR_CODE
=
'c'
iteritems
=
dict
.
iteritems
def
safe_tostring
(
ary
):
"""
convert arrays to strings in a Python 2.x / 3.x safe way
"""
if
sys
.
version_info
>
(
3
,):
return
ary
.
tounicode
()
.
encode
(
"ascii"
)
.
decode
()
else
:
return
ary
.
tostring
()
def
true_long_type
():
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
"""
for
type_
in
[
'L'
,
'I'
]:
test_array
=
array
(
type_
,
[
0
])
long_size
=
test_array
.
itemsize
if
long_size
==
4
:
return
type_
if
long_size
==
4
:
return
type_
raise
ImportError
(
"Couldn't determine a valid 4-byte long type to use
\
as equivalent to LONG"
)
LONG
=
true_long_type
()
def
byte_to_bases
(
x
):
"""convert one byte to the four bases it encodes"""
c
=
(
x
>>
4
)
&
0xf
...
...
@@ -783,100 +814,162 @@ def byte_to_bases(x):
cf
=
c
&
0x3
fc
=
(
f
>>
2
)
&
0x3
ff
=
f
&
0x3
return
map
(
bits_to_base
,
(
cc
,
cf
,
fc
,
ff
))
return
[
bits_to_base
(
X
)
for
X
in
(
cc
,
cf
,
fc
,
ff
)]
def
bits_to_base
(
x
):
"""convert integer representation of two bits to correct base"""
if
x
is
0
:
return
'T'
if
x
is
1
:
return
'C'
if
x
is
2
:
return
'A'
if
x
is
3
:
return
'G'
if
x
is
0
:
return
'T'
elif
x
is
1
:
return
'C'
elif
x
is
2
:
return
'A'
elif
x
is
3
:
return
'G'
else
:
raise
ValueError
(
'Only integers 0-3 are valid inputs'
)
def
base_to_bin
(
x
):
"""
provided for user convenience
convert a nucleotide to its bit representation
provided for user convenience
convert a nucleotide to its bit representation
"""
if
x
==
'T'
:
return
'00'
if
x
==
'C'
:
return
'01'
if
x
==
'A'
:
return
'10'
if
x
==
'G'
:
return
'11'
if
x
==
'T'
:
return
'00'
elif
x
==
'C'
:
return
'01'
elif
x
==
'A'
:
return
'10'
elif
x
==
'G'
:
return
'11'
else
:
raise
ValueError
(
'Only characters
\'
ATGC
\'
are valid inputs'
)
def
create_byte_table
():
"""create BYTE_TABLE"""
d
=
{}
for
x
in
range
(
2
**
8
):
for
x
in
xrange
(
2
**
8
):
d
[
x
]
=
byte_to_bases
(
x
)
return
d
def
split16
(
x
):
"""
split a 16-bit number into integer representation
of its course and fine parts in binary representation
split a 16-bit number into integer representation
of its course and fine parts in binary representation
"""
c
=
(
x
>>
8
)
&
0xff
f
=
x
&
0xff
return
c
,
f
def
create_twobyte_table
():
"""create TWOBYTE_TABLE"""
d
=
{}
for
x
in
range
(
2
**
16
):
for
x
in
xrange
(
2
**
16
):
c
,
f
=
split16
(
x
)
d
[
x
]
=
chain
(
byte_to_bases
(
c
),
byte_to_bases
(
f
))
d
[
x
]
=
list
(
byte_to_bases
(
c
))
+
list
(
byte_to_bases
(
f
))
return
d
BYTE_TABLE
=
create_byte_table
()
TWOBYTE_TABLE
=
create_twobyte_table
()
def
longs_to_char_array
(
longs
,
first_base_offset
,
last_base_offset
,
array_size
):
def
longs_to_char_array
(
longs
,
first_base_offset
,
last_base_offset
,
array_size
,
more_bytes
=
None
):
"""
takes in a iterable of longs and converts them to bases in a char array
returns a ctypes string buffer
takes in an array of longs (4 bytes) and converts them to bases in
a char array
you must also provide the offset in the first and last block
(note these offsets are pythonic. last_offset is not included)
and the desired array_size
If you have less than a long worth of bases at the end, you can provide
them as a string with more_bytes=
NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
is not None
returns the correct subset of the array based on provided offsets
"""
if
array_size
==
0
:
return
array
(
_CHAR_CODE
)
elif
array_size
<
0
:
raise
ValueError
(
'array_size must be at least 0'
)
if
not
first_base_offset
in
range
(
16
):
raise
ValueError
(
'first_base_offset must be in range(16)'
)
if
not
last_base_offset
in
range
(
1
,
17
):
raise
ValueError
(
'last_base_offset must be in range(1, 17)'
)
longs_len
=
len
(
longs
)
# dna = ctypes.create_string_buffer(array_size)
dna
=
array
(
'b'
,
'N'
*
longs_len
)
if
more_bytes
is
None
:
shorts_length
=
0
else
:
shorts_length
=
len
(
more_bytes
)
if
array_size
>
longs_len
*
16
+
4
*
shorts_length
:
raise
ValueError
(
'array_size exceeds maximum possible for input'
)
dna
=
array
(
_CHAR_CODE
,
'N'
*
(
longs_len
*
16
+
4
*
shorts_length
))
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
bytes
=
array
(
'B'
)
bytes
.
fromstring
(
longs
.
tostring
())
# first block
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
4
)])
i
=
16
-
first_base_offset
if
array_size
<
i
:
i
=
array_size
dna
[
0
:
i
]
=
array
(
'b'
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
if
longs_len
==
1
:
return
dna
# middle blocks (implicitly skipped if they don't exist)
for
byte
in
bytes
[
4
:
-
4
]:
dna
[
i
:
i
+
4
]
=
array
(
'b'
,
BYTE_TABLE
[
byte
])
i
+=
4
# last block
last_block
=
array
(
'b'
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
return
dna
i
=
0
if
longs_len
>
0
:
bytes_
=
array
(
'B'
)
bytes_
.
fromstring
(
longs
.
tostring
())
# first block
first_block
=
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes_
[
x
]])
for
x
in
range
(
4
)])
i
=
16
-
first_base_offset
if
array_size
<
i
:
i
=
array_size
dna
[
0
:
i
]
=
array
(
_CHAR_CODE
,
first_block
[
first_base_offset
:
first_base_offset
+
i
])
if
longs_len
>
1
:
# middle blocks (implicitly skipped if they don't exist)
for
byte
in
bytes_
[
4
:
-
4
]:
dna
[
i
:
i
+
4
]
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])
i
+=
4
# last block
last_block
=
array
(
_CHAR_CODE
,
''
.
join
([
''
.
join
(
BYTE_TABLE
[
bytes_
[
x
]])
for
x
in
range
(
-
4
,
0
)]))
if
more_bytes
is
None
:
dna
[
i
:
i
+
last_base_offset
]
=
last_block
[
0
:
last_base_offset
]
else
:
# if there are more bytes, we need the whole last block
dna
[
i
:
i
+
16
]
=
last_block
[
0
:
16
]
i
+=
16
if
more_bytes
is
not
None
:
bytes_
=
array
(
'B'
)
bytes_
.
fromstring
(
more_bytes
)
j
=
i
for
byte
in
bytes_
:
j
=
i
+
4
if
j
>
array_size
:
dnabytes
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])[
0
:(
array_size
-
i
)]
dna
[
i
:
array_size
]
=
dnabytes
break
dna
[
i
:
i
+
last_base_offset
]
=
array
(
_CHAR_CODE
,
BYTE_TABLE
[
byte
])
i
+=
4
return
dna
[
0
:
array_size
]
class
TwoBitFile
(
dict
):
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100120]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
def
__init__
(
self
,
foo
):
...
...
@@ -886,14 +979,19 @@ class TwoBitFile(dict):
if
not
access
(
foo
,
R_OK
):
raise
IOError
(
EACCES
,
strerror
(
EACCES
),
foo
)
self
.
_filename
=
foo
self
.
_file_size
=
getsize
(
foo
)
self
.
_file_handle
=
open
(
foo
,
'rb'
)
self
.
_load_header
()
self
.
_load_index
()
for
name
,
offset
in
self
.
_offset_dict
.
items
(
):
for
name
,
offset
in
iteritems
(
self
.
_offset_dict
):
self
[
name
]
=
TwoBitSequence
(
self
.
_file_handle
,
offset
,
self
.
_file_size
,
self
.
_byteswapped
)
return
def
__reduce__
(
self
):
# enables pickling
return
(
TwoBitFile
,
(
self
.
_filename
,))
def
_load_header
(
self
):
file_handle
=
self
.
_file_handle
header
=
array
(
LONG
)
...
...
@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
header
.
byteswap
()
(
signature2
,
version
,
sequence_count
,
reserved
)
=
header
if
not
signature2
==
0x1A412743
:
raise
TwoBitFileError
(
'Signature in header should be 0x1A412743'
+
', instead found 0x
%
X'
%
signature
)
raise
TwoBitFileError
(
'Signature in header should be '
+
'0x1A412743, instead found 0x
%
X'
%
signature
)
if
not
version
==
0
:
raise
TwoBitFileError
(
'File version in header should be 0.'
)
if
not
reserved
==
0
:
...
...
@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
remaining
=
self
.
_sequence_count
sequence_offsets
=
[]
file_handle
.
seek
(
16
)
while
True
:
if
remaining
==
0
:
break
while
remaining
>
0
:
name_size
=
array
(
'B'
)
name_size
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
name_size
.
byteswap
()
name
=
array
(
'b'
)
# name = array(_CHAR_CODE)
name
=
array
(
'B'
)
name
.
fromfile
(
file_handle
,
name_size
[
0
])
name
=
""
.
join
([
chr
(
X
)
for
X
in
name
])
if
byteswapped
:
name
.
byteswap
()
name
.
fromfile
(
file_handle
,
name_size
[
0
])
offset
=
array
(
LONG
)
offset
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
offset
.
byteswap
()
sequence_offsets
.
append
((
name
.
tostring
()
,
offset
[
0
]))
sequence_offsets
.
append
((
name
,
offset
[
0
]))
remaining
-=
1
self
.
_sequence_offsets
=
sequence_offsets
self
.
_offset_dict
=
dict
(
sequence_offsets
)
...
...
@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
d
=
{}
file_handle
=
self
.
_file_handle
byteswapped
=
self
.
_byteswapped
for
name
,
offset
in
self
.
_offset_dict
.
items
(
):
for
name
,
offset
in
iteritems
(
self
.
_offset_dict
):
file_handle
.
seek
(
offset
)
dna_size
=
array
(
LONG
)
dna_size
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
dna_size
.
byteswap
()
if
byteswapped
:
dna_size
.
byteswap
()
d
[
name
]
=
dna_size
[
0
]
return
d
class
TwoBitSequence
(
object
):
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.items(): d[k] = str(v)
"""
def
__init__
(
self
,
file_handle
,
offset
,
byteswapped
=
False
):
def
__init__
(
self
,
file_handle
,
offset
,
file_size
,
byteswapped
=
False
):
self
.
_file_size
=
file_size
self
.
_file_handle
=
file_handle
self
.
_original_offset
=
offset
self
.
_byteswapped
=
byteswapped
file_handle
.
seek
(
offset
)
header
=
array
(
LONG
)
header
.
fromfile
(
file_handle
,
2
)
if
byteswapped
:
header
.
byteswap
()
if
byteswapped
:
header
.
byteswap
()
dna_size
,
n_block_count
=
header
self
.
_dna_size
=
dna_size
self
.
_packed_dna_size
=
(
dna_size
+
15
)
/
16
# this is 32-bit fragments
self
.
_dna_size
=
dna_size
# number of characters, 2 bits each
self
.
_n_bytes
=
(
dna_size
+
3
)
/
4
# number of bytes
# number of 32-bit fragments
self
.
_packed_dna_size
=
(
dna_size
+
15
)
/
16
n_block_starts
=
array
(
LONG
)
n_block_sizes
=
array
(
LONG
)
n_block_starts
.
fromfile
(
file_handle
,
n_block_count
)
if
byteswapped
:
n_block_starts
.
byteswap
()
if
byteswapped
:
n_block_starts
.
byteswap
()
n_block_sizes
.
fromfile
(
file_handle
,
n_block_count
)
if
byteswapped
:
n_block_sizes
.
byteswap
()
if
byteswapped
:
n_block_sizes
.
byteswap
()
self
.
_n_block_starts
=
n_block_starts
self
.
_n_block_sizes
=
n_block_sizes
self
.
_n_block_sizes
=
n_block_sizes
mask_rawc
=
array
(
LONG
)
mask_rawc
.
fromfile
(
file_handle
,
1
)
if
byteswapped
:
mask_rawc
.
byteswap
()
if
byteswapped
:
mask_rawc
.
byteswap
()
mask_block_count
=
mask_rawc
[
0
]
mask_block_starts
=
array
(
LONG
)
mask_block_starts
.
fromfile
(
file_handle
,
mask_block_count
)
if
byteswapped
:
mask_block_starts
.
byteswap
()
if
byteswapped
:
mask_block_starts
.
byteswap
()
mask_block_sizes
=
array
(
LONG
)
mask_block_sizes
.
fromfile
(
file_handle
,
mask_block_count
)
if
byteswapped
:
mask_block_sizes
.
byteswap
()
if
byteswapped
:
mask_block_sizes
.
byteswap
()
self
.
_mask_block_starts
=
mask_block_starts
self
.
_mask_block_sizes
=
mask_block_sizes
file_handle
.
read
(
4
)
...
...
@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
def
__len__
(
self
):
return
self
.
_dna_size
def
__getslice__
(
self
,
min_
,
max_
=
None
):
return
self
.
get_slice
(
min_
,
max_
)
def
__getitem__
(
self
,
slice_or_key
):
"""
return a sub-sequence, given a slice object
"""
step
=
None
if
isinstance
(
slice_or_key
,
slice
):
step
=
slice_or_key
.
step
if
step
is
not
None
:
raise
ValueError
(
"Slicing by step not currently supported"
)
return
self
.
get_slice
(
min_
=
slice_or_key
.
start
,
max_
=
slice_or_key
.
stop
)
elif
isinstance
(
slice_or_key
,
int
):
max_
=
slice_or_key
+
1
if
max_
==
0
:
max_
=
None
return
self
.
get_slice
(
min_
=
slice_or_key
,
max_
=
max_
)
def
get_slice
(
self
,
min_
,
max_
=
None
):
"""
...
...
@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
"""
# handle negative coordinates
dna_size
=
self
.
_dna_size
if
max_
<
0
:
if
max_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
max_
=
dna_size
+
1
+
max_
if
min_
<
0
:
if
max_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
min_
=
dna_size
+
1
+
min_
# Find out if the reverse complement is sought
reverse
=
False
# assume not RC
if
min_
>
max_
and
max_
is
not
None
:
reverse
=
True
mymax
=
max_
max_
=
min_
min_
=
mymax
if
max_
==
0
:
return
''
if
min_
is
None
:
# for slicing e.g. [:]
min_
=
0
if
max_
is
not
None
and
max_
<
0
:
if
max_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
max_
=
dna_size
+
max_
if
min_
is
not
None
and
min_
<
0
:
if
min_
<
-
dna_size
:
raise
IndexError
(
'index out of range'
)
min_
=
dna_size
+
min_
# make sure there's a proper range
if
max_
is
not
None
and
min_
>
max_
:
return
''
if
max_
==
0
or
max_
==
min_
:
return
''
# load all the data
if
max_
>
dna_size
:
max_
=
dna_size
if
max_
is
None
or
max_
>
dna_size
:
max_
=
dna_size
file_handle
=
self
.
_file_handle
byteswapped
=
self
.
_byteswapped
n_block_starts
=
self
.
_n_block_starts
...
...
@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
mask_block_sizes
=
self
.
_mask_block_sizes
offset
=
self
.
_offset
packed_dna_size
=
self
.
_packed_dna_size
# n_bytes = self._n_bytes
# region_size is how many bases the region is
if
max_
is
None
:
region_size
=
dna_size
-
min_
else
:
region_size
=
max_
-
min_
if
max_
is
None
:
region_size
=
dna_size
-
min_
else
:
region_size
=
max_
-
min_
# start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0
start_block
=
min_
/
16
end_block
=
max_
/
16
start_block
=
min_
//
16
# jump directly to desired file location
local_offset
=
offset
+
(
start_block
*
4
)
end_block
=
(
max_
-
1
+
16
)
//
16
# don't read past seq end
if
end_block
>=
packed_dna_size
:
end_block
=
packed_dna_size
-
1
# +1 we still need to read block
blocks_to_read
=
end_block
-
start_block
+
1
# jump directly to desired file location
local_offset
=
offset
+
start_block
*
4
file_handle
.
seek
(
local_offset
)
# note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset
=
min_
%
16
last_base_offset
=
max_
%
16
if
last_base_offset
==
0
:
last_base_offset
=
16
# +1 we still need to read end_block maybe
blocks_to_read
=
end_block
-
start_block
if
(
blocks_to_read
+
start_block
)
>
packed_dna_size
:
blocks_to_read
=
packed_dna_size
-
start_block
fourbyte_dna
=
array
(
LONG
)
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
)
if
byteswapped
:
fourbyte_dna
.
byteswap
()
string_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
last_base_offset
,
region_size
)
for
start
,
size
in
zip
(
n_block_starts
,
n_block_sizes
):
# remainder_seq = None
if
(
blocks_to_read
*
4
+
local_offset
)
>
self
.
_file_size
:
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
-
1
)
morebytes
=
file_handle
.
read
()
# read the remaining characters
# if byteswapped:
# morebytes = ''.join(reversed(morebytes))
else
:
fourbyte_dna
.
fromfile
(
file_handle
,
blocks_to_read
)
morebytes
=
None
if
byteswapped
:
fourbyte_dna
.
byteswap
()
str_as_array
=
longs_to_char_array
(
fourbyte_dna
,
first_base_offset
,
last_base_offset
,
region_size
,
more_bytes
=
morebytes
)
for
start
,
size
in
izip
(
n_block_starts
,
n_block_sizes
):
end
=
start
+
size
if
end
<=
min_
:
continue
if
start
>
max_
:
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
if
end
<=
min_
:
continue
if
start
>
max_
:
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
start
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'b'
,
'N'
*
(
end
-
start
))
# this should actually be decoded, 00=N, 01=n
str_as_array
[
start
:
end
]
=
array
(
_CHAR_CODE
,
'N'
*
(
end
-
start
))
lower
=
str
.
lower
first_masked_region
=
max
(
0
,
bisect_right
(
mask_block_starts
,
min_
)
-
1
)
last_masked_region
=
min
(
len
(
mask_block_starts
),
1
+
bisect_right
(
mask_block_starts
,
max_
,
lo
=
first_masked_region
))
for
start
,
size
in
zip
(
mask_block_starts
[
first_masked_region
:
last_masked_region
],
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
for
start
,
size
in
izip
(
mask_block_starts
[
first_masked_region
:
last_masked_region
],
mask_block_sizes
[
first_masked_region
:
last_masked_region
]):
end
=
start
+
size
if
end
<=
min_
:
continue
if
start
>
max_
:
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
if
end
<=
min_
:
continue
if
start
>
max_
:
break
if
start
<
min_
:
start
=
min_
if
end
>
max_
:
end
=
max_
start
-=
min_
end
-=
min_
string_as_array
[
start
:
end
]
=
array
(
'b'
,
lower
(
string_as_array
[
start
:
end
]
.
tostring
()))
if
not
len
(
string_as_array
)
==
max_
-
min_
:
raise
RuntimeError
(
"Sequence was longer than it should be"
)
if
reverse
:
return
self
.
reverseComplement
(
string_as_array
.
tostring
())
return
string_as_array
.
tostring
()
def
reverseComplement
(
self
,
dna
):
""" Return a new sequence: the reverse complement of this sequence. """
newseq
=
''
symbols
=
{
'A'
:
'T'
,
'C'
:
'G'
,
'T'
:
'A'
,
'G'
:
'C'
,
'a'
:
't'
,
'c'
:
'g'
,
't'
:
'a'
,
'g'
:
'c'
,
'n'
:
'n'
,
'N'
:
'N'
}
# reverse complement dictionary
for
symbol
in
dna
[::
-
1
]:
newsymbol
=
symbols
[
symbol
]
# uses the reverse complement symbols in dictionary
newseq
+=
newsymbol
return
newseq
# returns RC sequences
str_as_array
[
start
:
end
]
=
array
(
_CHAR_CODE
,
lower
(
safe_tostring
(
str_as_array
[
start
:
end
])))
if
not
len
(
str_as_array
)
==
max_
-
min_
:
raise
RuntimeError
(
"Sequence was the wrong size"
)
return
safe_tostring
(
str_as_array
)
def
__str__
(
self
):
"""
returns the entire chromosome
"""
return
self
.
__getslice__
(
0
,
None
)
return
self
.
get_slice
(
0
,
None
)
class
TwoBitFileError
(
Exception
):
"""
Base exception for TwoBit module
"""
def
__init__
(
self
,
msg
):
errtext
=
'Invalid 2-bit file. '
+
msg
return
super
(
TwoBitFileError
,
self
)
.
__init__
(
errtext
)
def
print_specification
():
"""
Prints the twoBit file format specification I got from the Internet.
This is only here for reference
"""
return
"""
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact
\
randomly accessible format. The two bit files contain masking information as
\
well as the DNA itself. The file begins with a 16 byte header containing the
\
following fields:
signature - the number 0x1A412743 in the architecture of the machine that
\
created the file.
version - zero for now. Readers should abort if they see a version number
\
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given,
\
the reader program should byte swap the signature and see if the swapped
\
version matches. If so all multiple-byte entities in the file will need to be
\
byte-swapped. This enables these binary files to be used unchanged on
\
different architectures.
The header is followed by a file index. There is one entry in the index for
\
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length
\
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown
\
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,
\
11 - G. The first base is in the most significant 2 bits byte, and the last
\
base in the least significant 2 bits, so that the sequence TCAG would be
\
represented as 00011011. The packedDna field will be padded with 0 bits as
\
necessary so that it takes an even multiple of 32 bit in the file, as this
\
improves i/o performance on some machines.
.nib files
"""
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact
\
randomly accessible format. The two bit files contain masking information as
\
well as the DNA itself. The file begins with a 16 byte header containing the
\
following fields:
signature - the number 0x1A412743 in the architecture of the machine that
\
created the file.
version - zero for now. Readers should abort if they see a version number
\
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given,
\
the reader program should byte swap the signature and see if the swapped
\
version matches. If so all multiple-byte entities in the file will need to be
\
byte-swapped. This enables these binary files to be used unchanged on
\
different architectures.
The header is followed by a file index. There is one entry in the index for
\
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length
\
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown
\
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,
\
11 - G. The first base is in the most significant 2 bits byte, and the last
\
base in the least significant 2 bits, so that the sequence TCAG would be
\
represented as 00011011. The packedDna field will be padded with 0 bits as
\
necessary so that it takes an even multiple of 32 bit in the file, as this
\
improves i/o performance on some machines.
.nib files
"""
if
__name__
==
'__main__'
:
hg19
=
TwoBitFile
(
'/Users/mikael/simhome/share/hg19.2bit'
)
# assumes that the genome is stored in your current directory
for
key
in
hg19
:
print
(
key
)
print
(
hg19
[
'chrX'
][
1000000
:
1000060
])
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment