Commit a236ba40 authored by Mikael Boden's avatar Mikael Boden

update_seqdata_to_python_3

parent bb25ec08
e a t###################################################
###################################################
# This module is a supplement to the Python guide #
# Version 2017.3 (10/03/2017) #
###################################################
......
......@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
Return 0 in case of NO overlap.
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
......@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
minimum: if True (default), then use minimum distance, if False, use centre to centre
"""
if chromLoc1[0] == chromLoc2[0]:
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
minWidth = min(halfWidth1, halfWidth2)
minWidth = max(minWidth, 1)
maxWidth = max(halfWidth1, halfWidth2)
......@@ -151,33 +151,33 @@ class BedEntry():
end = self.chromEnd
start = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = start + (self.chromEnd - self.chromStart) / 2
mycentre = start + (self.chromEnd - self.chromStart) // 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre + useshift
if fixedwidth: # we need to re-calculate start and end
if genome:
end = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
end = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else:
end = mycentre + (mywidth / 2)
start = max(0, mycentre - (mywidth / 2))
end = mycentre + (mywidth // 2)
start = max(0, mycentre - (mywidth // 2))
else: # other strand
start = self.chromEnd
end = self.chromStart
mywidth = fixedwidth or (self.chromEnd - self.chromStart)
mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2
mycentre = self.chromStart + (self.chromEnd - self.chromStart) // 2
if usesummit:
mycentre = self.summit
if useshift:
mycentre = mycentre - useshift # shift is reversed on other strand
if fixedwidth: # we need to re-calculate start and end
end = max(0, mycentre - (mywidth / 2))
end = max(0, mycentre - (mywidth // 2))
if genome:
start = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
start = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
else:
start = mycentre + (mywidth / 2)
start = mycentre + (mywidth // 2)
if genome: # refer to the genome sequence
return genome[self.chrom][start : end]
......@@ -187,9 +187,9 @@ class BedEntry():
def setwidth(self, fixedwidth = None, usesummit = False):
if fixedwidth:
if usesummit:
diff = self.summit - fixedwidth / 2
diff = self.summit - fixedwidth // 2
else:
diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2
diff = (self.chromEnd - self.chromStart) // 2 - fixedwidth // 2
self.chromStart += diff
self.chromStart += diff + fixedwidth
return (self.chrom, self.chromStart, self.chromEnd)
......@@ -365,11 +365,8 @@ class BedFile():
def __iter__(self):
return self.rows.__iter__()
def __getslice__(self, i, j):
return self.rows.__getslice__(i, j)
def __getitem__(self, i):
return self.rows[i]
def __getitem__(self, key):
return self.rows[key]
def __len__(self):
return len(self.rows)
......@@ -388,7 +385,7 @@ class BedFile():
if not row.chrom in index_end: # seeing chromosome entry first time
index_end[row.chrom] = []
index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) // 2, (row.chromEnd - row.chromStart) // 2, i))
index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
if row.name:
index_name[row.name] = row
......@@ -407,7 +404,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
......@@ -416,11 +413,11 @@ class BedFile():
return True
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
if delta == 0:
return False
......@@ -436,7 +433,7 @@ class BedFile():
entries = self.indices[0][elem[0]] # use the start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
while True:
entry = self.rows[entries[inspect][2]]
d = distance(entry.loc(), elem, minimum = True)
......@@ -461,11 +458,11 @@ class BedFile():
return False
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
if delta == 0:
return False
......@@ -494,7 +491,7 @@ class BedFile():
entries = self.indices[0][myloc[0]] # use start index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -509,11 +506,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# we may have missed the closest, so need to look around this point
for i_dn in range(inspect + 1, len(entries)): # Look downstream since
......@@ -528,7 +525,7 @@ class BedFile():
entries = self.indices[2][myloc[0]] # use end index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -540,11 +537,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# we may have missed the closest, so need to look around this point
for i_up in range(inspect - 1, 0, -1): # Look upstream since
......@@ -560,7 +557,7 @@ class BedFile():
entries = self.indices[1][myloc[0]] # use centre index
upper = len(entries) # keep an upper boundary
lower = 0 # and a lower boundary
inspect = (upper - lower) / 2 # start by looking in the middle
inspect = (upper - lower) // 2 # start by looking in the middle
delta = None
while not delta == 0:
entry = self.rows[entries[inspect][2]]
......@@ -575,11 +572,11 @@ class BedFile():
return (mindist, minentry)
elif d > 0:
lower = inspect + 1
delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
inspect += delta
else:
upper = inspect
delta = (inspect - lower + 1) / 2
delta = (inspect - lower + 1) // 2
inspect -= delta
# at bottom of search
return (mindist, minentry)
......@@ -751,30 +748,64 @@ Modifications to package:
- removed download.py and __main__ because they were not used and __main__ had errors.
- removed command-line interface because the BED file functionality is implemented more extensively elsewhere
"""
"""
twobitreader
Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from array import array
from bisect import bisect_right
from errno import ENOENT, EACCES
from os import R_OK, access
try:
from os import strerror
except ImportError:
strerror = lambda x: 'strerror not supported'
from os.path import exists
from itertools import chain
from os.path import exists, getsize
import logging
import textwrap
import sys
if sys.version_info > (3,):
izip = zip
xrange = range
_CHAR_CODE = 'u'
iteritems = dict.items
else:
from itertools import izip
_CHAR_CODE = 'c'
iteritems = dict.iteritems
def safe_tostring(ary):
"""
convert arrays to strings in a Python 2.x / 3.x safe way
"""
if sys.version_info > (3,):
return ary.tounicode().encode("ascii").decode()
else:
return ary.tostring()
def true_long_type():
"""
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
OS X uses an 8-byte long, so make sure L (long) is the right size
and switch to I (int) if needed
"""
for type_ in ['L', 'I']:
test_array = array(type_, [0])
long_size = test_array.itemsize
if long_size == 4: return type_
if long_size == 4:
return type_
raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG")
LONG = true_long_type()
def byte_to_bases(x):
"""convert one byte to the four bases it encodes"""
c = (x >> 4) & 0xf
......@@ -783,100 +814,162 @@ def byte_to_bases(x):
cf = c & 0x3
fc = (f >> 2) & 0x3
ff = f & 0x3
return map(bits_to_base, (cc, cf, fc, ff))
return [bits_to_base(X) for X in (cc, cf, fc, ff)]
def bits_to_base(x):
"""convert integer representation of two bits to correct base"""
if x is 0: return 'T'
if x is 1: return 'C'
if x is 2: return 'A'
if x is 3: return 'G'
if x is 0:
return 'T'
elif x is 1:
return 'C'
elif x is 2:
return 'A'
elif x is 3:
return 'G'
else:
raise ValueError('Only integers 0-3 are valid inputs')
def base_to_bin(x):
"""
provided for user convenience
convert a nucleotide to its bit representation
provided for user convenience
convert a nucleotide to its bit representation
"""
if x == 'T': return '00'
if x == 'C': return '01'
if x == 'A': return '10'
if x == 'G': return '11'
if x == 'T':
return '00'
elif x == 'C':
return '01'
elif x == 'A':
return '10'
elif x == 'G':
return '11'
else:
raise ValueError('Only characters \'ATGC\' are valid inputs')
def create_byte_table():
"""create BYTE_TABLE"""
d = {}
for x in range(2**8):
for x in xrange(2 ** 8):
d[x] = byte_to_bases(x)
return d
def split16(x):
"""
split a 16-bit number into integer representation
of its course and fine parts in binary representation
split a 16-bit number into integer representation
of its course and fine parts in binary representation
"""
c = (x >> 8) & 0xff
f = x & 0xff
return c, f
def create_twobyte_table():
"""create TWOBYTE_TABLE"""
d = {}
for x in range(2**16):
for x in xrange(2 ** 16):
c, f = split16(x)
d[x] = chain(byte_to_bases(c), byte_to_bases(f))
d[x] = list(byte_to_bases(c)) + list(byte_to_bases(f))
return d
BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table()
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size,
more_bytes=None):
"""
takes in a iterable of longs and converts them to bases in a char array
returns a ctypes string buffer
takes in an array of longs (4 bytes) and converts them to bases in
a char array
you must also provide the offset in the first and last block
(note these offsets are pythonic. last_offset is not included)
and the desired array_size
If you have less than a long worth of bases at the end, you can provide
them as a string with more_bytes=
NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
is not None
returns the correct subset of the array based on provided offsets
"""
if array_size == 0:
return array(_CHAR_CODE)
elif array_size < 0:
raise ValueError('array_size must be at least 0')
if not first_base_offset in range(16):
raise ValueError('first_base_offset must be in range(16)')
if not last_base_offset in range(1, 17):
raise ValueError('last_base_offset must be in range(1, 17)')
longs_len = len(longs)
# dna = ctypes.create_string_buffer(array_size)
dna = array('b', 'N' * longs_len)
if more_bytes is None:
shorts_length = 0
else:
shorts_length = len(more_bytes)
if array_size > longs_len * 16 + 4 * shorts_length:
raise ValueError('array_size exceeds maximum possible for input')
dna = array(_CHAR_CODE, 'N' * (longs_len * 16 + 4 * shorts_length))
# translate from 32-bit blocks to bytes
# this method ensures correct endianess (byteswap as neeed)
bytes = array('B')
bytes.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i: i = array_size
dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
if longs_len == 1: return dna
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes[4:-4]:
dna[i:i + 4] = array('b', BYTE_TABLE[byte])
i += 4
# last block
last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
return dna
i = 0
if longs_len > 0:
bytes_ = array('B')
bytes_.fromstring(longs.tostring())
# first block
first_block = ''.join([''.join(BYTE_TABLE[bytes_[x]]) for x in range(4)])
i = 16 - first_base_offset
if array_size < i:
i = array_size
dna[0:i] = array(_CHAR_CODE, first_block[first_base_offset:first_base_offset + i])
if longs_len > 1:
# middle blocks (implicitly skipped if they don't exist)
for byte in bytes_[4:-4]:
dna[i:i + 4] = array(_CHAR_CODE, BYTE_TABLE[byte])
i += 4
# last block
last_block = array(_CHAR_CODE, ''.join([''.join(BYTE_TABLE[bytes_[x]])
for x in range(-4, 0)]))
if more_bytes is None:
dna[i:i + last_base_offset] = last_block[0:last_base_offset]
else: # if there are more bytes, we need the whole last block
dna[i:i + 16] = last_block[0:16]
i += 16
if more_bytes is not None:
bytes_ = array('B')
bytes_.fromstring(more_bytes)
j = i
for byte in bytes_:
j = i + 4
if j > array_size:
dnabytes = array(_CHAR_CODE, BYTE_TABLE[byte])[0:(array_size - i)]
dna[i:array_size] = dnabytes
break
dna[i:i + last_base_offset] = array(_CHAR_CODE, BYTE_TABLE[byte])
i += 4
return dna[0:array_size]
class TwoBitFile(dict):
"""
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)
TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100120]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)
Fair warning: dumping the entire chromosome requires a lot of memory
See TwoBitSequence for more info
"""
def __init__(self, foo):
......@@ -886,14 +979,19 @@ class TwoBitFile(dict):
if not access(foo, R_OK):
raise IOError(EACCES, strerror(EACCES), foo)
self._filename = foo
self._file_size = getsize(foo)
self._file_handle = open(foo, 'rb')
self._load_header()
self._load_index()
for name, offset in self._offset_dict.items():
for name, offset in iteritems(self._offset_dict):
self[name] = TwoBitSequence(self._file_handle, offset,
self._file_size,
self._byteswapped)
return
def __reduce__(self): # enables pickling
return (TwoBitFile, (self._filename,))
def _load_header(self):
file_handle = self._file_handle
header = array(LONG)
......@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
header.byteswap()
(signature2, version, sequence_count, reserved) = header
if not signature2 == 0x1A412743:
raise TwoBitFileError('Signature in header should be 0x1A412743'
+ ', instead found 0x%X' % signature)
raise TwoBitFileError('Signature in header should be ' +
'0x1A412743, instead found 0x%X' %
signature)
if not version == 0:
raise TwoBitFileError('File version in header should be 0.')
if not reserved == 0:
......@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
remaining = self._sequence_count
sequence_offsets = []
file_handle.seek(16)
while True:
if remaining == 0: break
while remaining > 0:
name_size = array('B')
name_size.fromfile(file_handle, 1)
if byteswapped:
name_size.byteswap()
name = array('b')
# name = array(_CHAR_CODE)
name = array('B')
name.fromfile(file_handle, name_size[0])
name = "".join([chr(X) for X in name])
if byteswapped:
name.byteswap()
name.fromfile(file_handle, name_size[0])
offset = array(LONG)
offset.fromfile(file_handle, 1)
if byteswapped:
offset.byteswap()
sequence_offsets.append((name.tostring(), offset[0]))
sequence_offsets.append((name, offset[0]))
remaining -= 1
self._sequence_offsets = sequence_offsets
self._offset_dict = dict(sequence_offsets)
......@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
d = {}
file_handle = self._file_handle
byteswapped = self._byteswapped
for name, offset in self._offset_dict.items():
for name, offset in iteritems(self._offset_dict):
file_handle.seek(offset)
dna_size = array(LONG)
dna_size.fromfile(file_handle, 1)
if byteswapped: dna_size.byteswap()
if byteswapped:
dna_size.byteswap()
d[name] = dna_size[0]
return d
class TwoBitSequence(object):
"""
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
A TwoBitSequence object refers to an entry in a TwoBitFile
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
tttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string
Fair warning: dumping the entire chromosome requires a lot of memory
Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.
Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:
x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.items(): d[k] = str(v)
"""
def __init__(self, file_handle, offset, byteswapped=False):
def __init__(self, file_handle, offset, file_size, byteswapped=False):
self._file_size = file_size
self._file_handle = file_handle
self._original_offset = offset
self._byteswapped = byteswapped
file_handle.seek(offset)
header = array(LONG)
header.fromfile(file_handle, 2)
if byteswapped: header.byteswap()
if byteswapped:
header.byteswap()
dna_size, n_block_count = header
self._dna_size = dna_size
self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments
self._dna_size = dna_size # number of characters, 2 bits each
self._n_bytes = (dna_size + 3) / 4 # number of bytes
# number of 32-bit fragments
self._packed_dna_size = (dna_size + 15) / 16
n_block_starts = array(LONG)
n_block_sizes = array(LONG)
n_block_starts.fromfile(file_handle, n_block_count)
if byteswapped: n_block_starts.byteswap()
if byteswapped:
n_block_starts.byteswap()
n_block_sizes.fromfile(file_handle, n_block_count)
if byteswapped: n_block_sizes.byteswap()
if byteswapped:
n_block_sizes.byteswap()
self._n_block_starts = n_block_starts
self._n_block_sizes= n_block_sizes
self._n_block_sizes = n_block_sizes
mask_rawc = array(LONG)
mask_rawc.fromfile(file_handle, 1)
if byteswapped: mask_rawc.byteswap()
if byteswapped:
mask_rawc.byteswap()
mask_block_count = mask_rawc[0]
mask_block_starts = array(LONG)
mask_block_starts.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_starts.byteswap()
if byteswapped:
mask_block_starts.byteswap()
mask_block_sizes = array(LONG)
mask_block_sizes.fromfile(file_handle, mask_block_count)
if byteswapped: mask_block_sizes.byteswap()
if byteswapped:
mask_block_sizes.byteswap()
self._mask_block_starts = mask_block_starts
self._mask_block_sizes = mask_block_sizes
file_handle.read(4)
......@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
def __len__(self):
return self._dna_size
def __getslice__(self, min_, max_=None):
return self.get_slice(min_, max_)
def __getitem__(self, slice_or_key):
"""
return a sub-sequence, given a slice object
"""
step = None
if isinstance(slice_or_key, slice):
step = slice_or_key.step
if step is not None:
raise ValueError("Slicing by step not currently supported")
return self.get_slice(min_=slice_or_key.start, max_=slice_or_key.stop)
elif isinstance(slice_or_key, int):
max_ = slice_or_key + 1
if max_ == 0:
max_ = None
return self.get_slice(min_=slice_or_key, max_=max_)
def get_slice(self, min_, max_=None):
"""
......@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
"""
# handle negative coordinates
dna_size = self._dna_size
if max_ < 0:
if max_ < -dna_size: raise IndexError('index out of range')
max_ = dna_size + 1 + max_
if min_ < 0:
if max_ < -dna_size: raise IndexError('index out of range')
min_ = dna_size + 1 + min_
# Find out if the reverse complement is sought
reverse = False # assume not RC
if min_ > max_ and max_ is not None:
reverse = True
mymax = max_
max_ = min_
min_ = mymax
if max_ == 0: return ''
if min_ is None: # for slicing e.g. [:]
min_ = 0
if max_ is not None and max_ < 0:
if max_ < -dna_size:
raise IndexError('index out of range')
max_ = dna_size + max_
if min_ is not None and min_ < 0:
if min_ < -dna_size:
raise IndexError('index out of range')
min_ = dna_size + min_