Commit a236ba40 authored by Mikael Boden's avatar Mikael Boden

update_seqdata_to_python_3

parent bb25ec08
e a t################################################### ###################################################
# This module is a supplement to the Python guide # # This module is a supplement to the Python guide #
# Version 2017.3 (10/03/2017) # # Version 2017.3 (10/03/2017) #
################################################### ###################################################
......
...@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2): ...@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
Return 0 in case of NO overlap. Return 0 in case of NO overlap.
""" """
if chromLoc1[0] == chromLoc2[0]: if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2 halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2 halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2) minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1) minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2) maxWidth = max(halfWidth1, halfWidth2)
...@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True): ...@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
minimum: if True (default), then use minimum distance, if False, use centre to centre minimum: if True (default), then use minimum distance, if False, use centre to centre
""" """
if chromLoc1[0] == chromLoc2[0]: if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2 halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2 halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2) minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1) minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2) maxWidth = max(halfWidth1, halfWidth2)
...@@ -151,33 +151,33 @@ class BedEntry(): ...@@ -151,33 +151,33 @@ class BedEntry():
end = self.chromEnd end = self.chromEnd
start = self.chromStart start = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart) mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = start + (self.chromEnd - self.chromStart) / 2 mycentre = start + (self.chromEnd - self.chromStart) // 2
if usesummit: if usesummit:
mycentre = self.summit mycentre = self.summit
if useshift: if useshift:
mycentre = mycentre + useshift mycentre = mycentre + useshift
if fixedwidth: # we need to re-calculate start and end if fixedwidth: # we need to re-calculate start and end
if genome: if genome:
end = min(len(genome[self.chrom]), mycentre + (mywidth / 2)) end = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else: else:
end = mycentre + (mywidth / 2) end = mycentre + (mywidth // 2)
start = max(0, mycentre - (mywidth / 2)) start = max(0, mycentre - (mywidth // 2))
else: # other strand else: # other strand
start = self.chromEnd start = self.chromEnd
end = self.chromStart end = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart) mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2 mycentre = self.chromStart + (self.chromEnd - self.chromStart) // 2
if usesummit: if usesummit:
mycentre = self.summit mycentre = self.summit
if useshift: if useshift:
mycentre = mycentre - useshift # shift is reversed on other strand mycentre = mycentre - useshift # shift is reversed on other strand
if fixedwidth: # we need to re-calculate start and end if fixedwidth: # we need to re-calculate start and end
end = max(0, mycentre - (mywidth / 2)) end = max(0, mycentre - (mywidth // 2))
if genome: if genome:
start = min(len(genome[self.chrom]), mycentre + (mywidth / 2)) start = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else: else:
start = mycentre + (mywidth / 2) start = mycentre + (mywidth // 2)
if genome: # refer to the genome sequence if genome: # refer to the genome sequence
return genome[self.chrom][start : end] return genome[self.chrom][start : end]
...@@ -187,9 +187,9 @@ class BedEntry(): ...@@ -187,9 +187,9 @@ class BedEntry():
def setwidth(self, fixedwidth = None, usesummit = False): def setwidth(self, fixedwidth = None, usesummit = False):
if fixedwidth: if fixedwidth:
if usesummit: if usesummit:
diff = self.summit - fixedwidth / 2 diff = self.summit - fixedwidth // 2
else: else:
diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2 diff = (self.chromEnd - self.chromStart) // 2 - fixedwidth // 2
self.chromStart += diff self.chromStart += diff
self.chromStart += diff + fixedwidth self.chromStart += diff + fixedwidth
return (self.chrom, self.chromStart, self.chromEnd) return (self.chrom, self.chromStart, self.chromEnd)
...@@ -365,11 +365,8 @@ class BedFile(): ...@@ -365,11 +365,8 @@ class BedFile():
def __iter__(self): def __iter__(self):
return self.rows.__iter__() return self.rows.__iter__()
def __getslice__(self, i, j): def __getitem__(self, key):
return self.rows.__getslice__(i, j) return self.rows[key]
def __getitem__(self, i):
return self.rows[i]
def __len__(self): def __len__(self):
return len(self.rows) return len(self.rows)
...@@ -388,7 +385,7 @@ class BedFile(): ...@@ -388,7 +385,7 @@ class BedFile():
if not row.chrom in index_end: # seeing chromosome entry first time if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = [] index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i)) index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i)) index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) // 2, (row.chromEnd - row.chromStart) // 2, i))
index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i)) index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
if row.name: if row.name:
index_name[row.name] = row index_name[row.name] = row
...@@ -407,7 +404,7 @@ class BedFile(): ...@@ -407,7 +404,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle inspect = (upper - lower) // 2 # start by looking in the middle
while True: while True:
entry = self.rows[entries[inspect][2]] entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True) d = distance(entry.loc(), elem, minimum = True)
...@@ -416,11 +413,11 @@ class BedFile(): ...@@ -416,11 +413,11 @@ class BedFile():
return True return True
elif d > 0: elif d > 0:
lower = inspect + 1 lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic? delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta inspect += delta
else: else:
upper = inspect upper = inspect
delta = (inspect - lower + 1) / 2 delta = (inspect - lower + 1) // 2
inspect -= delta inspect -= delta
if delta == 0: if delta == 0:
return False return False
...@@ -436,7 +433,7 @@ class BedFile(): ...@@ -436,7 +433,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle inspect = (upper - lower) // 2 # start by looking in the middle
while True: while True:
entry = self.rows[entries[inspect][2]] entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True) d = distance(entry.loc(), elem, minimum = True)
...@@ -461,11 +458,11 @@ class BedFile(): ...@@ -461,11 +458,11 @@ class BedFile():
return False return False
elif d > 0: elif d > 0:
lower = inspect + 1 lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic? delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta inspect += delta
else: else:
upper = inspect upper = inspect
delta = (inspect - lower + 1) / 2 delta = (inspect - lower + 1) // 2
inspect -= delta inspect -= delta
if delta == 0: if delta == 0:
return False return False
...@@ -494,7 +491,7 @@ class BedFile(): ...@@ -494,7 +491,7 @@ class BedFile():
entries = self.indices[0][myloc[0]] # use start index entries = self.indices[0][myloc[0]] # use start index
upper = len(entries) # keep an upper boundary upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle inspect = (upper - lower) // 2 # start by looking in the middle
delta = None delta = None
while not delta == 0: while not delta == 0:
entry = self.rows[entries[inspect][2]] entry = self.rows[entries[inspect][2]]
...@@ -509,11 +506,11 @@ class BedFile(): ...@@ -509,11 +506,11 @@ class BedFile():
return (mindist, minentry) return (mindist, minentry)
elif d > 0: elif d > 0:
lower = inspect + 1 lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic? delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta inspect += delta
else: else:
upper = inspect upper = inspect
delta = (inspect - lower + 1) / 2 delta = (inspect - lower + 1) // 2
inspect -= delta inspect -= delta
# we may have missed the closest, so need to look around this point # we may have missed the closest, so need to look around this point
for i_dn in range(inspect + 1, len(entries)): # Look downstream since for i_dn in range(inspect + 1, len(entries)): # Look downstream since
...@@ -528,7 +525,7 @@ class BedFile(): ...@@ -528,7 +525,7 @@ class BedFile():
entries = self.indices[2][myloc[0]] # use end index entries = self.indices[2][myloc[0]] # use end index
upper = len(entries) # keep an upper boundary upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle inspect = (upper - lower) // 2 # start by looking in the middle
delta = None delta = None
while not delta == 0: while not delta == 0:
entry = self.rows[entries[inspect][2]] entry = self.rows[entries[inspect][2]]
...@@ -540,11 +537,11 @@ class BedFile(): ...@@ -540,11 +537,11 @@ class BedFile():
return (mindist, minentry) return (mindist, minentry)
elif d > 0: elif d > 0:
lower = inspect + 1 lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic? delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta inspect += delta
else: else:
upper = inspect upper = inspect
delta = (inspect - lower + 1) / 2 delta = (inspect - lower + 1) // 2
inspect -= delta inspect -= delta
# we may have missed the closest, so need to look around this point # we may have missed the closest, so need to look around this point
for i_up in range(inspect - 1, 0, -1): # Look upstream since for i_up in range(inspect - 1, 0, -1): # Look upstream since
...@@ -560,7 +557,7 @@ class BedFile(): ...@@ -560,7 +557,7 @@ class BedFile():
entries = self.indices[1][myloc[0]] # use centre index entries = self.indices[1][myloc[0]] # use centre index
upper = len(entries) # keep an upper boundary upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle inspect = (upper - lower) // 2 # start by looking in the middle
delta = None delta = None
while not delta == 0: while not delta == 0:
entry = self.rows[entries[inspect][2]] entry = self.rows[entries[inspect][2]]
...@@ -575,11 +572,11 @@ class BedFile(): ...@@ -575,11 +572,11 @@ class BedFile():
return (mindist, minentry) return (mindist, minentry)
elif d > 0: elif d > 0:
lower = inspect + 1 lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic? delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta inspect += delta
else: else:
upper = inspect upper = inspect
delta = (inspect - lower + 1) / 2 delta = (inspect - lower + 1) // 2
inspect -= delta inspect -= delta
# at bottom of search # at bottom of search
return (mindist, minentry) return (mindist, minentry)
...@@ -751,30 +748,64 @@ Modifications to package: ...@@ -751,30 +748,64 @@ Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors. - removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere - removed command-line interface because the BED file functionality is implemented more extensively elsewhere
""" """
"""
twobitreader
Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from array import array from array import array
from bisect import bisect_right from bisect import bisect_right
from errno import ENOENT, EACCES from errno import ENOENT, EACCES
from os import R_OK, access from os import R_OK, access
try: try:
from os import strerror from os import strerror
except ImportError: except ImportError:
strerror = lambda x: 'strerror not supported' strerror = lambda x: 'strerror not supported'
from os.path import exists from os.path import exists, getsize
from itertools import chain import logging
import textwrap
import sys
if sys.version_info > (3,):
izip = zip
xrange = range
_CHAR_CODE = 'u'
iteritems = dict.items
else:
from itertools import izip
_CHAR_CODE = 'c'
iteritems = dict.iteritems
def safe_tostring(ary):
"""
convert arrays to strings in a Python 2.x / 3.x safe way
"""
if sys.version_info > (3,):
return ary.tounicode().encode("ascii").decode()
else:
return ary.tostring()
def true_long_type(): def true_long_type():
""" """
OS X uses an 8-byte long, so make sure L (long) is the right size OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed and switch to I (int) if needed
""" """
for type_ in ['L', 'I']: for type_ in ['L', 'I']:
test_array = array(type_, [0]) test_array = array(type_, [0])
long_size = test_array.itemsize long_size = test_array.itemsize
if long_size == 4: return type_ if long_size == 4:
return type_
raise ImportError("Couldn't determine a valid 4-byte long type to use \ raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG") as equivalent to LONG")
LONG = true_long_type() LONG = true_long_type()
def byte_to_bases(x): def byte_to_bases(x):
"""convert one byte to the four bases it encodes""" """convert one byte to the four bases it encodes"""
c = (x >> 4) & 0xf c = (x >> 4) & 0xf
...@@ -783,100 +814,162 @@ def byte_to_bases(x): ...@@ -783,100 +814,162 @@ def byte_to_bases(x):
cf = c & 0x3 cf = c & 0x3
fc = (f >> 2) & 0x3 fc = (f >> 2) & 0x3
ff = f & 0x3 ff = f & 0x3
return map(bits_to_base, (cc, cf, fc, ff)) return [bits_to_base(X) for X in (cc, cf, fc, ff)]
def bits_to_base(x): def bits_to_base(x):
"""convert integer representation of two bits to correct base""" """convert integer representation of two bits to correct base"""
if x is 0: return 'T' if x is 0:
if x is 1: return 'C' return 'T'
if x is 2: return 'A' elif x is 1:
if x is 3: return 'G' return 'C'
elif x is 2:
return 'A'
elif x is 3:
return 'G'
else:
raise ValueError('Only integers 0-3 are valid inputs')
def base_to_bin(x): def base_to_bin(x):
""" """
provided for user convenience provided for user convenience
convert a nucleotide to its bit representation convert a nucleotide to its bit representation
""" """
if x == 'T': return '00' if x == 'T':
if x == 'C': return '01' return '00'
if x == 'A': return '10' elif x == 'C':
if x == 'G': return '11' return '01'
elif x == 'A':
return '10'
elif x == 'G':
return '11'
else:
raise ValueError('Only characters \'ATGC\' are valid inputs')
def create_byte_table(): def create_byte_table():
"""create BYTE_TABLE""" """create BYTE_TABLE"""
d = {} d = {}
for x in range(2**8): for x in xrange(2 ** 8):
d[x] = byte_to_bases(x) d[x] = byte_to_bases(x)
return d return d
def split16(x): def split16(x):
""" """
split a 16-bit number into integer representation split a 16-bit number into integer representation
of its course and fine parts in binary representation of its course and fine parts in binary representation
""" """
c = (x >> 8) & 0xff c = (x >> 8) & 0xff
f = x & 0xff f = x & 0xff
return c, f return c, f
def create_twobyte_table(): def create_twobyte_table():
"""create TWOBYTE_TABLE""" """create TWOBYTE_TABLE"""
d = {} d = {}
for x in range(2**16): for x in xrange(2 ** 16):
c, f = split16(x) c, f = split16(x)
d[x] = chain(byte_to_bases(c), byte_to_bases(f)) d[x] = list(byte_to_bases(c)) + list(byte_to_bases(f))
return d return d
BYTE_TABLE = create_byte_table() BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table() TWOBYTE_TABLE = create_twobyte_table()
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size,
more_bytes=None):
""" """
takes in a iterable of longs and converts them to bases in a char array takes in an array of longs (4 bytes) and converts them to bases in
returns a ctypes string buffer a char array
you must also provide the offset in the first and last block
(note these offsets are pythonic. last_offset is not included)
and the desired array_size
If you have less than a long worth of bases at the end, you can provide
them as a string with more_bytes=
NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
is not None
returns the correct subset of the array based on provided offsets
""" """
if array_size == 0:
return array(_CHAR_CODE)
elif array_size < 0:
raise ValueError('array_size must be at least 0')
if not first_base_offset in range(16):
raise ValueError('first_base_offset must be in range(16)')
if not last_base_offset in range(1, 17):
raise ValueError('last_base_offset must be in range(1, 17)')
longs_len = len(longs) longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size) if more_bytes is None:
dna = array('b', 'N' * longs_len) shorts_length = 0
else:
shorts_length = len(more_bytes)
if array_size > longs_len * 16 + 4 * shorts_length:
raise ValueError('array_size exceeds maximum possible for input')
dna = array(_CHAR_CODE, 'N' * (longs_len * 16 + 4 * shorts_length))
# translate from 32-bit blocks to bytes # translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed) # this method ensures correct endianess (byteswap as neeed)
bytes = array('B') i = 0
bytes.fromstring(longs.tostring()) if longs_len > 0:
# first block bytes_ = array('B')
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)]) bytes_.fromstring(longs.tostring())
i = 16 - first_base_offset # first block
if array_size < i: i = array_size first_block = ''.join([''.join(BYTE_TABLE[bytes_[x]]) for x in range(4)])
dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i]) i = 16 - first_base_offset
if longs_len == 1: return dna if array_size < i:
# middle blocks (implicitly skipped if they don't exist) i = array_size
for byte in bytes[4:-4]: dna[0:i] = array(_CHAR_CODE, first_block[first_base_offset:first_base_offset + i])
dna[i:i + 4] = array('b', BYTE_TABLE[byte]) if longs_len > 1:
i += 4 # middle blocks (implicitly skipped if they don't exist)
# last block for byte in bytes_[4:-4]:
last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)])) dna[i:i + 4] = array(_CHAR_CODE, BYTE_TABLE[byte])
dna[i:i + last_base_offset] = last_block[0:last_base_offset] i += 4
return dna # last block
last_block = array(_CHAR_CODE, ''.join([''.join(BYTE_TABLE[bytes_[x]])
for x in range(-4, 0)]))
if more_bytes is None:
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
else: # if there are more bytes, we need the whole last block
dna[i:i + 16] = last_block[0:16]
i += 16
if more_bytes is not None:
bytes_ = array('B')
bytes_.fromstring(more_bytes)
j = i
for byte in bytes_:
j = i + 4
if j > array_size:
dnabytes = array(_CHAR_CODE, BYTE_TABLE[byte])[0:(array_size - i)]
dna[i:array_size] = dnabytes
break
dna[i:i + last_base_offset] = array(_CHAR_CODE, BYTE_TABLE[byte])
i += 4
return dna[0:array_size]
class TwoBitFile(dict): class TwoBitFile(dict):
""" """
python-level reader for .2bit files (i.e., from UCSC genome browser) python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support) (note: no writing support)
TwoBitFile inherits from dict
TwoBitFile inherits from dict You may access sequences by name, e.g.
You may access sequences by name, e.g. >>> genome = TwoBitFile('hg18.2bit')
>>> genome = TwoBitFile('hg18.2bit') >>> chr20 = genome['chr20']
>>> chr20 = genome['chr20'] Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
Sequences are returned as TwoBitSequence objects e.g.
You may access intervals by slicing or using str() to dump the entire entry >>> chr20[100100:100120]
e.g. 'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
>>> chr20[100100:100200] tttgcatgttaagtttttttcc'
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat >>> whole_chr20 = str(chr20)
ttgcatgttaagtttttttcc' Fair warning: dumping the entire chromosome requires a lot of memory
>>> whole_chr20 = str(chr20) See TwoBitSequence for more info
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
""" """
def __init__(self, foo): def __init__(self, foo):
...@@ -886,14 +979,19 @@ class TwoBitFile(dict): ...@@ -886,14 +979,19 @@ class TwoBitFile(dict):
if not access(foo, R_OK): if not access(foo, R_OK):
raise IOError(EACCES, strerror(EACCES), foo) raise IOError(EACCES, strerror(EACCES), foo)
self._filename = foo self._filename = foo
self._file_size = getsize(foo)
self._file_handle = open(foo, 'rb') self._file_handle = open(foo, 'rb')
self._load_header() self._load_header()
self._load_index() self._load_index()
for name, offset in self._offset_dict.items(): for name, offset in iteritems(self._offset_dict):
self[name] = TwoBitSequence(self._file_handle, offset, self[name] = TwoBitSequence(self._file_handle, offset,
self._file_size,
self._byteswapped) self._byteswapped)
return return
def __reduce__(self): # enables pickling
return (TwoBitFile, (self._filename,))
def _load_header(self): def _load_header(self):
file_handle = self._file_handle file_handle = self._file_handle
header = array(LONG) header = array(LONG)
...@@ -907,8 +1005,9 @@ class TwoBitFile(dict): ...@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
header.byteswap() header.byteswap()
(signature2, version, sequence_count, reserved) = header (signature2, version, sequence_count, reserved) = header
if not signature2 == 0x1A412743: if not signature2 == 0x1A412743:
raise TwoBitFileError('Signature in header should be 0x1A412743' raise TwoBitFileError('Signature in header should be ' +
+ ', instead found 0x%X' % signature) '0x1A412743, instead found 0x%X' %
signature)
if not version == 0: if not version == 0:
raise TwoBitFileError('File version in header should be 0.') raise TwoBitFileError('File version in header should be 0.')
if not reserved == 0: if not reserved == 0:
...@@ -922,21 +1021,23 @@ class TwoBitFile(dict): ...@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
remaining = self._sequence_count remaining = self._sequence_count
sequence_offsets = [] sequence_offsets = []
file_handle.seek(16) file_handle.seek(16)
while True: while remaining > 0:
if remaining == 0: break
name_size = array('B') name_size = array('B')
name_size.fromfile(file_handle, 1) name_size.fromfile(file_handle, 1)
if byteswapped: if byteswapped:
name_size.byteswap() name_size.byteswap()
name = array('b') # name = array(_CHAR_CODE)
name = array('B')
name.fromfile(file_handle, name_size[0])
name = "".join([chr(X) for X in name])
if byteswapped: if byteswapped:
name.byteswap() name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG) offset = array(LONG)
offset.fromfile(file_handle, 1) offset.fromfile(file_handle, 1)
if byteswapped: if byteswapped:
offset.byteswap() offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0])) sequence_offsets.append((name, offset[0]))
remaining -= 1 remaining -= 1
self._sequence_offsets = sequence_offsets self._sequence_offsets = sequence_offsets
self._offset_dict = dict(sequence_offsets) self._offset_dict = dict(sequence_offsets)
...@@ -946,71 +1047,78 @@ class TwoBitFile(dict): ...@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
d = {} d = {}
file_handle = self._file_handle file_handle = self._file_handle
byteswapped = self._byteswapped byteswapped = self._byteswapped
for name, offset in self._offset_dict.items(): for name, offset in iteritems(self._offset_dict):
file_handle.seek(offset) file_handle.seek(offset)
dna_size = array(LONG) dna_size = array(LONG)
dna_size.fromfile(file_handle, 1) dna_size.fromfile(file_handle, 1)
if byteswapped: dna_size.byteswap() if byteswapped:
dna_size.byteswap()
d[name] = dna_size[0] d[name] = dna_size[0]
return d return d
class TwoBitSequence(object): class TwoBitSequence(object):
""" """
A TwoBitSequence object refers to an entry in a TwoBitFile A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
You may access intervals by slicing or using str() to dump the entire entry e.g.
e.g. >>> genome = TwoBitFile('hg18.2bit')
>>> genome = TwoBitFile('hg18.2bit') >>> chr20 = genome['chr20']
>>> chr20 = genome['chr20'] >>> chr20[100100:100200] # slicing returns a string
>>> chr20[100100:100200] # slicing returns a string 'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat tttgcatgttaagtttttttcc'
ttgcatgttaagtttttttcc' >>> whole_chr20 = str(chr20) # get whole chr as string
>>> whole_chr20 = str(chr20) # get whole chr as string Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Fair warning: dumping the entire chromosome requires a lot of memory Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
Note that we follow python/UCSC conventions: If you attempt to access a slice past the end of the sequence,
Coordinates are 0-based, end-open it will be truncated at the end.
(Note: The UCSC web-based genome browser uses 1-based closed coordinates) Your computer probably doesn't have enough memory to load a whole genome
If you attempt to access a slice past the end of the sequence, but if you want to string-ize your TwoBitFile, here's a recipe:
it will be truncated at the end. x = TwoBitFile('my.2bit')
d = x.dict()
Your computer probably doesn't have enough memory to load a whole genome for k,v in d.items(): d[k] = str(v)
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
""" """
def __init__(self, file_handle, offset, byteswapped=False):
def __init__(self, file_handle, offset, file_size, byteswapped=False):
self._file_size = file_size
self._file_handle = file_handle self._file_handle = file_handle
self._original_offset = offset self._original_offset = offset
self._byteswapped = byteswapped self._byteswapped = byteswapped
file_handle.seek(offset) file_handle.seek(offset)
header = array(LONG) header = array(LONG)
header.fromfile(file_handle, 2) header.fromfile(file_handle, 2)
if byteswapped: header.byteswap() if byteswapped:
header.byteswap()
dna_size, n_block_count = header dna_size, n_block_count = header
self._dna_size = dna_size self._dna_size = dna_size # number of characters, 2 bits each
self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments self._n_bytes = (dna_size + 3) / 4 # number of bytes
# number of 32-bit fragments
self._packed_dna_size = (dna_size + 15) / 16
n_block_starts = array(LONG) n_block_starts = array(LONG)
n_block_sizes = array(LONG) n_block_sizes = array(LONG)
n_block_starts.fromfile(file_handle, n_block_count) n_block_starts.fromfile(file_handle, n_block_count)
if byteswapped: n_block_starts.byteswap() if byteswapped:
n_block_starts.byteswap()
n_block_sizes.fromfile(file_handle, n_block_count) n_block_sizes.fromfile(file_handle, n_block_count)
if byteswapped: n_block_sizes.byteswap() if byteswapped:
n_block_sizes.byteswap()
self._n_block_starts = n_block_starts self._n_block_starts = n_block_starts
self._n_block_sizes= n_block_sizes self._n_block_sizes = n_block_sizes
mask_rawc = array(LONG) mask_rawc = array(LONG)
mask_rawc.fromfile(file_handle, 1) mask_rawc.fromfile(file_handle, 1)
if byteswapped: mask_rawc.byteswap() if byteswapped:
mask_rawc.byteswap()
mask_block_count = mask_rawc[0] mask_block_count = mask_rawc[0]
mask_block_starts = array(LONG) mask_block_starts = array(LONG)
mask_block_starts.fromfile(file_handle, mask_block_count) mask_block_starts.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_starts.byteswap() if byteswapped:
mask_block_starts.byteswap()
mask_block_sizes = array(LONG) mask_block_sizes = array(LONG)
mask_block_sizes.fromfile(file_handle, mask_block_count) mask_block_sizes.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_sizes.byteswap() if byteswapped:
mask_block_sizes.byteswap()
self._mask_block_starts = mask_block_starts self._mask_block_starts = mask_block_starts
self._mask_block_sizes = mask_block_sizes self._mask_block_sizes = mask_block_sizes
file_handle.read(4) file_handle.read(4)
...@@ -1019,8 +1127,22 @@ class TwoBitSequence(object): ...@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
def __len__(self): def __len__(self):
return self._dna_size return self._dna_size
def __getslice__(self, min_, max_=None): def __getitem__(self, slice_or_key):
return self.get_slice(min_, max_) """
return a sub-sequence, given a slice object
"""
step = None
if isinstance(slice_or_key, slice):
step = slice_or_key.step
if step is not None:
raise ValueError("Slicing by step not currently supported")
return self.get_slice(min_=slice_or_key.start, max_=slice_or_key.stop)
elif isinstance(slice_or_key, int):
max_ = slice_or_key + 1
if max_ == 0:
max_ = None
return self.get_slice(min_=slice_or_key, max_=max_)
def get_slice(self, min_, max_=None): def get_slice(self, min_, max_=None):
""" """
...@@ -1028,22 +1150,26 @@ class TwoBitSequence(object): ...@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
""" """
# handle negative coordinates # handle negative coordinates
dna_size = self._dna_size dna_size = self._dna_size
if max_ < 0: if min_ is None: # for slicing e.g. [:]
if max_ < -dna_size: raise IndexError('index out of range') min_ = 0
max_ = dna_size + 1 + max_ if max_ is not None and max_ < 0:
if min_ < 0: if max_ < -dna_size:
if max_ < -dna_size: raise IndexError('index out of range') raise IndexError('index out of range')
min_ = dna_size + 1 + min_ max_ = dna_size + max_
# Find out if the reverse complement is sought if min_ is not None and min_ < 0:
reverse = False # assume not RC if min_ < -dna_size:
if min_ > max_ and max_ is not None: raise IndexError('index out of range')
reverse = True min_ = dna_size + min_
mymax = max_ # make sure there's a proper range
max_ = min_ if max_ is not None and min_ > max_:
min_ = mymax return ''
if max_ == 0: return '' if max_ == 0 or max_ == min_:
return ''
# load all the data # load all the data
if max_ > dna_size: max_ = dna_size if max_ is None or max_ > dna_size:
max_ = dna_size
file_handle = self._file_handle file_handle = self._file_handle
byteswapped = self._byteswapped byteswapped = self._byteswapped
n_block_starts = self._n_block_starts n_block_starts = self._n_block_starts
...@@ -1052,140 +1178,158 @@ class TwoBitSequence(object): ...@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
mask_block_sizes = self._mask_block_sizes mask_block_sizes = self._mask_block_sizes
offset = self._offset offset = self._offset
packed_dna_size = self._packed_dna_size packed_dna_size = self._packed_dna_size
# n_bytes = self._n_bytes
# region_size is how many bases the region is # region_size is how many bases the region is
if max_ is None: region_size = dna_size - min_ if max_ is None:
else: region_size = max_ - min_ region_size = dna_size - min_
else:
region_size = max_ - min_
# start_block, end_block are the first/last 32-bit blocks we need # start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0 # blocks start at 0
start_block = min_ / 16 start_block = min_ // 16
end_block = max_ / 16 # jump directly to desired file location
local_offset = offset + (start_block * 4)
end_block = (max_ - 1 + 16) // 16
# don't read past seq end # don't read past seq end
if end_block >= packed_dna_size: end_block = packed_dna_size - 1
# +1 we still need to read block
blocks_to_read = end_block - start_block + 1
# jump directly to desired file location
local_offset = offset + start_block * 4
file_handle.seek(local_offset) file_handle.seek(local_offset)
# note we won't actually read the last base # note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset # this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset = min_ % 16 first_base_offset = min_ % 16
last_base_offset = max_ % 16 last_base_offset = max_ % 16
if last_base_offset == 0:
last_base_offset = 16
# +1 we still need to read end_block maybe
blocks_to_read = end_block - start_block
if (blocks_to_read + start_block) > packed_dna_size:
blocks_to_read = packed_dna_size - start_block
fourbyte_dna = array(LONG) fourbyte_dna = array(LONG)
fourbyte_dna.fromfile(file_handle, blocks_to_read) # remainder_seq = None
if byteswapped: fourbyte_dna.byteswap() if (blocks_to_read * 4 + local_offset) > self._file_size:
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset, fourbyte_dna.fromfile(file_handle, blocks_to_read - 1)
last_base_offset, region_size) morebytes = file_handle.read() # read the remaining characters
for start, size in zip(n_block_starts, n_block_sizes): # if byteswapped:
# morebytes = ''.join(reversed(morebytes))
else:
fourbyte_dna.fromfile(file_handle, blocks_to_read)
morebytes = None
if byteswapped:
fourbyte_dna.byteswap()
str_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size,
more_bytes=morebytes)
for start, size in izip(n_block_starts, n_block_sizes):
end = start + size end = start + size
if end <= min_: continue if end <= min_:
if start > max_: break continue
if start < min_: start = min_ if start > max_:
if end > max_: end = max_ break
if start < min_:
start = min_
if end > max_:
end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('b', 'N'*(end-start)) # this should actually be decoded, 00=N, 01=n
str_as_array[start:end] = array(_CHAR_CODE, 'N' * (end - start))
lower = str.lower lower = str.lower
first_masked_region = max(0, first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1) bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts), last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_, 1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region)) lo=first_masked_region))
for start, size in zip(mask_block_starts[first_masked_region:last_masked_region], for start, size in izip(mask_block_starts[first_masked_region:
mask_block_sizes[first_masked_region:last_masked_region]): last_masked_region],
mask_block_sizes[first_masked_region:
last_masked_region]):
end = start + size end = start + size
if end <= min_: continue if end <= min_:
if start > max_: break continue
if start < min_: start = min_ if start > max_:
if end > max_: end = max_ break
if start < min_:
start = min_
if end > max_:
end = max_
start -= min_ start -= min_
end -= min_ end -= min_
string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring())) str_as_array[start:end] = array(_CHAR_CODE,
if not len(string_as_array) == max_ - min_: lower(safe_tostring(str_as_array[start:end])))
raise RuntimeError("Sequence was longer than it should be") if not len(str_as_array) == max_ - min_:
if reverse: raise RuntimeError("Sequence was the wrong size")
return self.reverseComplement(string_as_array.tostring()) return safe_tostring(str_as_array)
return string_as_array.tostring()
def reverseComplement(self, dna):
""" Return a new sequence: the reverse complement of this sequence. """
newseq=''
symbols={'A':'T','C':'G','T':'A','G':'C','a':'t','c':'g','t':'a','g':'c','n':'n','N':'N'} # reverse complement dictionary
for symbol in dna[::-1]:
newsymbol=symbols[symbol] # uses the reverse complement symbols in dictionary
newseq+=newsymbol
return newseq # returns RC sequences
def __str__(self): def __str__(self):
""" """
returns the entire chromosome returns the entire chromosome
""" """
return self.__getslice__(0, None) return self.get_slice(0, None)
class TwoBitFileError(Exception): class TwoBitFileError(Exception):
""" """
Base exception for TwoBit module Base exception for TwoBit module
""" """
def __init__(self, msg): def __init__(self, msg):
errtext = 'Invalid 2-bit file. ' + msg errtext = 'Invalid 2-bit file. ' + msg
return super(TwoBitFileError, self).__init__(errtext) return super(TwoBitFileError, self).__init__(errtext)
def print_specification(): def print_specification():
""" """
Prints the twoBit file format specification I got from the Internet. Prints the twoBit file format specification I got from the Internet.
This is only here for reference This is only here for reference
""" """
return """ return """
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
.2bit files A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
randomly accessible format. The two bit files contain masking information as \
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \ well as the DNA itself. The file begins with a 16 byte header containing the \
randomly accessible format. The two bit files contain masking information as \ following fields:
well as the DNA itself. The file begins with a 16 byte header containing the \ signature - the number 0x1A412743 in the architecture of the machine that \
following fields: created the file.
version - zero for now. Readers should abort if they see a version number \
signature - the number 0x1A412743 in the architecture of the machine that \ higher than 0.
created the file. sequenceCount - the number of sequences in the file
version - zero for now. Readers should abort if they see a version number \ reserved - always zero for now.
higher than 0. All fields are 32 bits unless noted. If the signature value is not as given, \
sequenceCount - the number of sequences in the file the reader program should byte swap the signature and see if the swapped \
reserved - always zero for now. version matches. If so all multiple-byte entities in the file will need to be \
All fields are 32 bits unless noted. If the signature value is not as given, \ byte-swapped. This enables these binary files to be used unchanged on \
the reader program should byte swap the signature and see if the swapped \ different architectures.
version matches. If so all multiple-byte entities in the file will need to be \ The header is followed by a file index. There is one entry in the index for \
byte-swapped. This enables these binary files to be used unchanged on \ each sequence. Each index entry contains three fields:
different architectures. nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length \
The header is followed by a file index. There is one entry in the index for \ depending on nameSize.
each sequence. Each index entry contains three fields: offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
nameSize - a byte containing the length of the name field dnaSize - number of bases of DNA in the sequence.
name - this contains the sequence name itself, and is variable length \ nBlockCount - the number of blocks of N's in the file (representing unknown \
depending on nameSize. sequence).
offset - 32 bit offset of the sequence data relative to the start of the file nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
The index is followed by the sequence records. These contain 9 fields: maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
dnaSize - number of bases of DNA in the sequence. maskBlockSizes - the size of each masked block
nBlockCount - the number of blocks of N's in the file (representing unknown \ packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,\
sequence). 11 - G. The first base is in the most significant 2 bits byte, and the last \
nBlockStarts - a starting position for each block of N's base in the least significant 2 bits, so that the sequence TCAG would be \
nBlockSizes - the size of each block of N's represented as 00011011. The packedDna field will be padded with 0 bits as \
maskBlockCount - the number of masked (lower case) blocks necessary so that it takes an even multiple of 32 bit in the file, as this \
maskBlockStarts - starting position for each masked block improves i/o performance on some machines.
maskBlockSizes - the size of each masked block .nib files
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A, \ """
11 - G. The first base is in the most significant 2 bits byte, and the last \
base in the least significant 2 bits, so that the sequence TCAG would be \
represented as 00011011. The packedDna field will be padded with 0 bits as \ if __name__ == '__main__':
hg19 = TwoBitFile('/Users/mikael/simhome/share/hg19.2bit') # assumes that the genome is stored in your current directory
necessary so that it takes an even multiple of 32 bit in the file, as this \ for key in hg19:
improves i/o performance on some machines. print(key)
.nib files print(hg19['chrX'][1000000:1000060])
""" \ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment