Commit a236ba40 authored by Mikael Boden's avatar Mikael Boden

update_seqdata_to_python_3

parent bb25ec08
e a t###################################################
###################################################
# This module is a supplement to the Python guide #
# Version 2017.3 (10/03/2017) #
###################################################
......
......@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
Return 0 in case of NO overlap.
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
......@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
......@@ -151,33 +151,33 @@ class BedEntry():
end = self.chromEnd
start = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = start + (self.chromEnd - self.chromStart) / 2
mycentre = start + (self.chromEnd - self.chromStart) // 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre + useshift
if fixedwidth: # we need to re-calculate start and end
if genome:
end = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
end = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else:
end = mycentre + (mywidth / 2)
start = max(0, mycentre - (mywidth / 2))
end = mycentre + (mywidth // 2)
start = max(0, mycentre - (mywidth // 2))
else: # other strand
start = self.chromEnd
end = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2
mycentre = self.chromStart + (self.chromEnd - self.chromStart) // 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre - useshift # shift is reversed on other strand
if fixedwidth: # we need to re-calculate start and end
end = max(0, mycentre - (mywidth / 2))
end = max(0, mycentre - (mywidth // 2))
if genome:
start = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
start = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else:
start = mycentre + (mywidth / 2)
start = mycentre + (mywidth // 2)
if genome: # refer to the genome sequence
return genome[self.chrom][start : end]
......@@ -187,9 +187,9 @@ class BedEntry():
def setwidth(self, fixedwidth = None, usesummit = False):
if fixedwidth:
if usesummit:
diff = self.summit - fixedwidth / 2
diff = self.summit - fixedwidth // 2
else:
diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2
diff = (self.chromEnd - self.chromStart) // 2 - fixedwidth // 2
self.chromStart += diff
self.chromStart += diff + fixedwidth
return (self.chrom, self.chromStart, self.chromEnd)
......@@ -365,11 +365,8 @@ class BedFile():
def __iter__(self):
return self.rows.__iter__()
def __getslice__(self, i, j):
return self.rows.__getslice__(i, j)
def __getitem__(self, i):
return self.rows[i]
def __getitem__(self, key):
return self.rows[key]
def __len__(self):
return len(self.rows)
......@@ -388,7 +385,7 @@ class BedFile():
if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) // 2, (row.chromEnd - row.chromStart) // 2, i))
index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
if row.name:
index_name[row.name] = row
......@@ -407,7 +404,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
......@@ -416,11 +413,11 @@ class BedFile():
return True
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
if delta == 0:
return False
......@@ -436,7 +433,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
......@@ -461,11 +458,11 @@ class BedFile():
return False
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
if delta == 0:
return False
......@@ -494,7 +491,7 @@ class BedFile():
entries = self.indices[0][myloc[0]] # use start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -509,11 +506,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# we may have missed the closest, so need to look around this point
for i_dn in range(inspect + 1, len(entries)): # Look downstream since
......@@ -528,7 +525,7 @@ class BedFile():
entries = self.indices[2][myloc[0]] # use end index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -540,11 +537,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# we may have missed the closest, so need to look around this point
for i_up in range(inspect - 1, 0, -1): # Look upstream since
......@@ -560,7 +557,7 @@ class BedFile():
entries = self.indices[1][myloc[0]] # use centre index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -575,11 +572,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# at bottom of search
return (mindist, minentry)
......@@ -751,30 +748,64 @@ Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
"""
twobitreader
Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from array import array
from bisect import bisect_right
from errno import ENOENT, EACCES
from os import R_OK, access
try:
from os import strerror
except ImportError:
strerror = lambda x: 'strerror not supported'
from os.path import exists
from itertools import chain
from os.path import exists, getsize
import logging
import textwrap
import sys
if sys.version_info > (3,):
izip = zip
xrange = range
_CHAR_CODE = 'u'
iteritems = dict.items
else:
from itertools import izip
_CHAR_CODE = 'c'
iteritems = dict.iteritems
def safe_tostring(ary):
"""
convert arrays to strings in a Python 2.x / 3.x safe way
"""
if sys.version_info > (3,):
return ary.tounicode().encode("ascii").decode()
else:
return ary.tostring()
def true_long_type():
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
"""
for type_ in ['L', 'I']:
test_array = array(type_, [0])
long_size = test_array.itemsize
if long_size == 4: return type_
if long_size == 4:
return type_
raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG")
LONG = true_long_type()
def byte_to_bases(x):
"""convert one byte to the four bases it encodes"""
c = (x >> 4) & 0xf
......@@ -783,100 +814,162 @@ def byte_to_bases(x):
cf = c & 0x3
fc = (f >> 2) & 0x3
ff = f & 0x3
return map(bits_to_base, (cc, cf, fc, ff))
return [bits_to_base(X) for X in (cc, cf, fc, ff)]
def bits_to_base(x):
"""convert integer representation of two bits to correct base"""
if x is 0: return 'T'
if x is 1: return 'C'
if x is 2: return 'A'
if x is 3: return 'G'
if x is 0:
return 'T'
elif x is 1:
return 'C'
elif x is 2:
return 'A'
elif x is 3:
return 'G'
else:
raise ValueError('Only integers 0-3 are valid inputs')
def base_to_bin(x):
"""
provided for user convenience
convert a nucleotide to its bit representation
provided for user convenience
convert a nucleotide to its bit representation
"""
if x == 'T': return '00'
if x == 'C': return '01'
if x == 'A': return '10'
if x == 'G': return '11'
if x == 'T':
return '00'
elif x == 'C':
return '01'
elif x == 'A':
return '10'
elif x == 'G':
return '11'
else:
raise ValueError('Only characters \'ATGC\' are valid inputs')
def create_byte_table():
"""create BYTE_TABLE"""
d = {}
for x in range(2**8):
for x in xrange(2 ** 8):
d[x] = byte_to_bases(x)
return d
def split16(x):
"""
split a 16-bit number into integer representation
of its course and fine parts in binary representation
split a 16-bit number into integer representation
of its course and fine parts in binary representation
"""
c = (x >> 8) & 0xff
f = x & 0xff
return c, f
def create_twobyte_table():
"""create TWOBYTE_TABLE"""
d = {}
for x in range(2**16):
for x in xrange(2 ** 16):
c, f = split16(x)
d[x] = chain(byte_to_bases(c), byte_to_bases(f))
d[x] = list(byte_to_bases(c)) + list(byte_to_bases(f))
return d
BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table()
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size,
more_bytes=None):
"""
takes in a iterable of longs and converts them to bases in a char array
returns a ctypes string buffer
takes in an array of longs (4 bytes) and converts them to bases in
a char array
you must also provide the offset in the first and last block
(note these offsets are pythonic. last_offset is not included)
and the desired array_size
If you have less than a long worth of bases at the end, you can provide
them as a string with more_bytes=
NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
is not None
returns the correct subset of the array based on provided offsets
"""
if array_size == 0:
return array(_CHAR_CODE)
elif array_size < 0:
raise ValueError('array_size must be at least 0')
if not first_base_offset in range(16):
raise ValueError('first_base_offset must be in range(16)')
if not last_base_offset in range(1, 17):
raise ValueError('last_base_offset must be in range(1, 17)')
longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size)
dna = array('b', 'N' * longs_len)
if more_bytes is None:
shorts_length = 0
else:
shorts_length = len(more_bytes)
if array_size > longs_len * 16 + 4 * shorts_length:
raise ValueError('array_size exceeds maximum possible for input')
dna = array(_CHAR_CODE, 'N' * (longs_len * 16 + 4 * shorts_length))
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
bytes = array('B')
bytes.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i: i = array_size
dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]:
dna[i:i + 4] = array('b', BYTE_TABLE[byte])
i += 4
# last block
last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna
i = 0
if longs_len > 0:
bytes_ = array('B')
bytes_.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes_[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i:
i = array_size
dna[0:i] = array(_CHAR_CODE, first_block[first_base_offset:first_base_offset + i])
if longs_len > 1:
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes_[4:-4]:
dna[i:i + 4] = array(_CHAR_CODE, BYTE_TABLE[byte])
i += 4
# last block
last_block = array(_CHAR_CODE, ''.join([''.join(BYTE_TABLE[bytes_[x]])
for x in range(-4, 0)]))
if more_bytes is None:
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
else: # if there are more bytes, we need the whole last block
dna[i:i + 16] = last_block[0:16]
i += 16
if more_bytes is not None:
bytes_ = array('B')
bytes_.fromstring(more_bytes)
j = i
for byte in bytes_:
j = i + 4
if j > array_size:
dnabytes = array(_CHAR_CODE, BYTE_TABLE[byte])[0:(array_size - i)]
dna[i:array_size] = dnabytes
break
dna[i:i + last_base_offset] = array(_CHAR_CODE, BYTE_TABLE[byte])
i += 4
return dna[0:array_size]
class TwoBitFile(dict):
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100120]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
def __init__(self, foo):
......@@ -886,14 +979,19 @@ class TwoBitFile(dict):
if not access(foo, R_OK):
raise IOError(EACCES, strerror(EACCES), foo)
self._filename = foo
self._file_size = getsize(foo)
self._file_handle = open(foo, 'rb')
self._load_header()
self._load_index()
for name, offset in self._offset_dict.items():
for name, offset in iteritems(self._offset_dict):
self[name] = TwoBitSequence(self._file_handle, offset,
self._file_size,
self._byteswapped)
return
def __reduce__(self): # enables pickling
return (TwoBitFile, (self._filename,))
def _load_header(self):
file_handle = self._file_handle
header = array(LONG)
......@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
header.byteswap()
(signature2, version, sequence_count, reserved) = header
if not signature2 == 0x1A412743:
raise TwoBitFileError('Signature in header should be 0x1A412743'
+ ', instead found 0x%X' % signature)
raise TwoBitFileError('Signature in header should be ' +
'0x1A412743, instead found 0x%X' %
signature)
if not version == 0:
raise TwoBitFileError('File version in header should be 0.')
if not reserved == 0:
......@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
remaining = self._sequence_count
sequence_offsets = []
file_handle.seek(16)
while True:
if remaining == 0: break
while remaining > 0:
name_size = array('B')
name_size.fromfile(file_handle, 1)
if byteswapped:
name_size.byteswap()
name = array('b')
# name = array(_CHAR_CODE)
name = array('B')
name.fromfile(file_handle, name_size[0])
name = "".join([chr(X) for X in name])
if byteswapped:
name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG)
offset.fromfile(file_handle, 1)
if byteswapped:
offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0]))
sequence_offsets.append((name, offset[0]))
remaining -= 1
self._sequence_offsets = sequence_offsets
self._offset_dict = dict(sequence_offsets)
......@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
d = {}
file_handle = self._file_handle
byteswapped = self._byteswapped
for name, offset in self._offset_dict.items():
for name, offset in iteritems(self._offset_dict):
file_handle.seek(offset)
dna_size = array(LONG)
dna_size.fromfile(file_handle, 1)
if byteswapped: dna_size.byteswap()
if byteswapped:
dna_size.byteswap()
d[name] = dna_size[0]
return d
class TwoBitSequence(object):
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.items(): d[k] = str(v)
"""
def __init__(self, file_handle, offset, byteswapped=False):
def __init__(self, file_handle, offset, file_size, byteswapped=False):
self._file_size = file_size
self._file_handle = file_handle
self._original_offset = offset
self._byteswapped = byteswapped
file_handle.seek(offset)
header = array(LONG)
header.fromfile(file_handle, 2)
if byteswapped: header.byteswap()
if byteswapped:
header.byteswap()
dna_size, n_block_count = header
self._dna_size = dna_size
self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments
self._dna_size = dna_size # number of characters, 2 bits each
self._n_bytes = (dna_size + 3) / 4 # number of bytes
# number of 32-bit fragments
self._packed_dna_size = (dna_size + 15) / 16
n_block_starts = array(LONG)
n_block_sizes = array(LONG)
n_block_starts.fromfile(file_handle, n_block_count)
if byteswapped: n_block_starts.byteswap()
if byteswapped:
n_block_starts.byteswap()
n_block_sizes.fromfile(file_handle, n_block_count)
if byteswapped: n_block_sizes.byteswap()
if byteswapped:
n_block_sizes.byteswap()
self._n_block_starts = n_block_starts
self._n_block_sizes= n_block_sizes
self._n_block_sizes = n_block_sizes
mask_rawc = array(LONG)
mask_rawc.fromfile(file_handle, 1)
if byteswapped: mask_rawc.byteswap()
if byteswapped:
mask_rawc.byteswap()
mask_block_count = mask_rawc[0]
mask_block_starts = array(LONG)
mask_block_starts.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_starts.byteswap()
if byteswapped:
mask_block_starts.byteswap()
mask_block_sizes = array(LONG)
mask_block_sizes.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_sizes.byteswap()
if byteswapped:
mask_block_sizes.byteswap()
self._mask_block_starts = mask_block_starts
self._mask_block_sizes = mask_block_sizes
file_handle.read(4)
......@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
def __len__(self):
return self._dna_size
def __getslice__(self, min_, max_=None):
return self.get_slice(min_, max_)
def __getitem__(self, slice_or_key):
"""
return a sub-sequence, given a slice object
"""
step = None
if isinstance(slice_or_key, slice):
step = slice_or_key.step
if step is not None:
raise ValueError("Slicing by step not currently supported")
return self.get_slice(min_=slice_or_key.start, max_=slice_or_key.stop)
elif isinstance(slice_or_key, int):
max_ = slice_or_key + 1
if max_ == 0:
max_ = None
return self.get_slice(min_=slice_or_key, max_=max_)
def get_slice(self, min_, max_=None):
"""
......@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
"""
# handle negative coordinates
dna_size = self._dna_size
if max_ < 0:
if max_ < -dna_size: raise IndexError('index out of range')
max_ = dna_size + 1 + max_
if min_ < 0:
if max_ < -dna_size: raise IndexError('index out of range')
min_ = dna_size + 1 + min_
# Find out if the reverse complement is sought
reverse = False # assume not RC
if min_ > max_ and max_ is not None:
reverse = True
mymax = max_
max_ = min_
min_ = mymax
if max_ == 0: return ''
if min_ is None: # for slicing e.g. [:]
min_ = 0
if max_ is not None and max_ < 0:
if max_ < -dna_size:
raise IndexError('index out of range')
max_ = dna_size + max_
if min_ is not None and min_ < 0:
if min_ < -dna_size:
raise IndexError('index out of range')
min_ = dna_size + min_
# make sure there's a proper range
if max_ is not None and min_ > max_:
return ''
if max_ == 0 or max_ == min_:
return ''
# load all the data
if max_ > dna_size: max_ = dna_size
if max_ is None or max_ > dna_size:
max_ = dna_size
file_handle = self._file_handle
byteswapped = self._byteswapped
n_block_starts = self._n_block_starts
......@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
mask_block_sizes = self._mask_block_sizes
offset = self._offset
packed_dna_size = self._packed_dna_size
# n_bytes = self._n_bytes
# region_size is how many bases the region is
if max_ is None: region_size = dna_size - min_
else: region_size = max_ - min_
if max_ is None:
region_size = dna_size - min_
else:
region_size = max_ - min_
# start_block, end_block are the first/last 32-bit blocks we need
# note: end_block is not read
# blocks start at 0
start_block = min_ / 16
end_block = max_ / 16
start_block = min_ // 16
# jump directly to desired file location
local_offset = offset + (start_block * 4)
end_block = (max_ - 1 + 16) // 16
# don't read past seq end
if end_block >= packed_dna_size: end_block = packed_dna_size - 1
# +1 we still need to read block
blocks_to_read = end_block - start_block + 1
# jump directly to desired file location
local_offset = offset + start_block * 4
file_handle.seek(local_offset)
# note we won't actually read the last base
# this is a python slice first_base_offset:16*blocks+last_base_offset
first_base_offset = min_ % 16
last_base_offset = max_ % 16
if last_base_offset == 0:
last_base_offset = 16
# +1 we still need to read end_block maybe
blocks_to_read = end_block - start_block
if (blocks_to_read + start_block) > packed_dna_size:
blocks_to_read = packed_dna_size - start_block
fourbyte_dna = array(LONG)
fourbyte_dna.fromfile(file_handle, blocks_to_read)
if byteswapped: fourbyte_dna.byteswap()
string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size)
for start, size in zip(n_block_starts, n_block_sizes):
# remainder_seq = None
if (blocks_to_read * 4 + local_offset) > self._file_size:
fourbyte_dna.fromfile(file_handle, blocks_to_read - 1)
morebytes = file_handle.read() # read the remaining characters
# if byteswapped:
# morebytes = ''.join(reversed(morebytes))
else:
fourbyte_dna.fromfile(file_handle, blocks_to_read)
morebytes = None
if byteswapped:
fourbyte_dna.byteswap()
str_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
last_base_offset, region_size,
more_bytes=morebytes)
for start, size in izip(n_block_starts, n_block_sizes):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
if end <= min_:
continue
if start > max_:
break
if start < min_:
start = min_
if end > max_:
end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('b', 'N'*(end-start))
# this should actually be decoded, 00=N, 01=n
str_as_array[start:end] = array(_CHAR_CODE, 'N' * (end - start))
lower = str.lower
first_masked_region = max(0,
bisect_right(mask_block_starts, min_) - 1)
last_masked_region = min(len(mask_block_starts),
1 + bisect_right(mask_block_starts, max_,
lo=first_masked_region))
for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
mask_block_sizes[first_masked_region:last_masked_region]):
for start, size in izip(mask_block_starts[first_masked_region:
last_masked_region],
mask_block_sizes[first_masked_region:
last_masked_region]):
end = start + size
if end <= min_: continue
if start > max_: break
if start < min_: start = min_
if end > max_: end = max_
if end <= min_:
continue
if start > max_:
break
if start < min_:
start = min_
if end > max_:
end = max_
start -= min_
end -= min_
string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
if not len(string_as_array) == max_ - min_:
raise RuntimeError("Sequence was longer than it should be")
if reverse:
return self.reverseComplement(string_as_array.tostring())
return string_as_array.tostring()
def reverseComplement(self, dna):
""" Return a new sequence: the reverse complement of this sequence. """
newseq=''
symbols={'A':'T','C':'G','T':'A','G':'C','a':'t','c':'g','t':'a','g':'c','n':'n','N':'N'} # reverse complement dictionary
for symbol in dna[::-1]:
newsymbol=symbols[symbol] # uses the reverse complement symbols in dictionary
newseq+=newsymbol
return newseq # returns RC sequences
str_as_array[start:end] = array(_CHAR_CODE,
lower(safe_tostring(str_as_array[start:end])))
if not len(str_as_array) == max_ - min_:
raise RuntimeError("Sequence was the wrong size")
return safe_tostring(str_as_array)
def __str__(self):
"""
returns the entire chromosome
"""
return self.__getslice__(0, None)
return self.get_slice(0, None)
class TwoBitFileError(Exception):
"""
Base exception for TwoBit module
"""
def __init__(self, msg):
errtext = 'Invalid 2-bit file. ' + msg
return super(TwoBitFileError, self).__init__(errtext)
def print_specification():
"""
Prints the twoBit file format specification I got from the Internet.
This is only here for reference
"""
return """
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
randomly accessible format. The two bit files contain masking information as \
well as the DNA itself. The file begins with a 16 byte header containing the \
following fields:
signature - the number 0x1A412743 in the architecture of the machine that \
created the file.
version - zero for now. Readers should abort if they see a version number \
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given, \
the reader program should byte swap the signature and see if the swapped \
version matches. If so all multiple-byte entities in the file will need to be \
byte-swapped. This enables these binary files to be used unchanged on \
different architectures.
The header is followed by a file index. There is one entry in the index for \
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length \
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown \
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A, \
11 - G. The first base is in the most significant 2 bits byte, and the last \
base in the least significant 2 bits, so that the sequence TCAG would be \
represented as 00011011. The packedDna field will be padded with 0 bits as \
necessary so that it takes an even multiple of 32 bit in the file, as this \
improves i/o performance on some machines.
.nib files
"""
From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
.2bit files
A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
randomly accessible format. The two bit files contain masking information as \
well as the DNA itself. The file begins with a 16 byte header containing the \
following fields:
signature - the number 0x1A412743 in the architecture of the machine that \
created the file.
version - zero for now. Readers should abort if they see a version number \
higher than 0.
sequenceCount - the number of sequences in the file
reserved - always zero for now.
All fields are 32 bits unless noted. If the signature value is not as given, \
the reader program should byte swap the signature and see if the swapped \
version matches. If so all multiple-byte entities in the file will need to be \
byte-swapped. This enables these binary files to be used unchanged on \
different architectures.
The header is followed by a file index. There is one entry in the index for \
each sequence. Each index entry contains three fields:
nameSize - a byte containing the length of the name field
name - this contains the sequence name itself, and is variable length \
depending on nameSize.
offset - 32 bit offset of the sequence data relative to the start of the file
The index is followed by the sequence records. These contain 9 fields:
dnaSize - number of bases of DNA in the sequence.
nBlockCount - the number of blocks of N's in the file (representing unknown \
sequence).
nBlockStarts - a starting position for each block of N's
nBlockSizes - the size of each block of N's
maskBlockCount - the number of masked (lower case) blocks
maskBlockStarts - starting position for each masked block
maskBlockSizes - the size of each masked block
packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,\
11 - G. The first base is in the most significant 2 bits byte, and the last \
base in the least significant 2 bits, so that the sequence TCAG would be \
represented as 00011011. The packedDna field will be padded with 0 bits as \
necessary so that it takes an even multiple of 32 bit in the file, as this \
improves i/o performance on some machines.
.nib files
"""
if __name__ == '__main__':
hg19 = TwoBitFile('/Users/mikael/simhome/share/hg19.2bit') # assumes that the genome is stored in your current directory
for key in hg19:
print(key)
print(hg19['chrX'][1000000:1000060])
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment