update_seqdata_to_python_3

a236ba40 · Mikael Boden · bb25ec08 · a236ba40 · a236ba40
Commit a236ba40 authored Aug 24, 2017 by Mikael Boden
Hide whitespace changes
Inline Side-by-side

Showing with 393 additions and 249 deletions

guide.py guide.py +1 -1

seqdata.py seqdata.py +392 -248

No files found.
--- a/guide.py
+++ b/guide.py
-e a t###################################################
+###################################################
 # This module is a supplement to the Python guide #
 # Version 2017.3  (10/03/2017)                    #
 ###################################################

--- a/seqdata.py
+++ b/seqdata.py
@@ -13,8 +13,8 @@ def overlap(chromLoc1, chromLoc2):
        Return 0 in case of NO overlap.
    """
    if chromLoc1[0] == chromLoc2[0]:
-        halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
+        halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
-        halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
+        halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
        minWidth = min(halfWidth1, halfWidth2)
        minWidth = max(minWidth, 1)
        maxWidth = max(halfWidth1, halfWidth2)
@@ -37,8 +37,8 @@ def distance(chromLoc1, chromLoc2, minimum = True):
        minimum: if True (default), then use minimum distance, if False, use centre to centre
    """
    if chromLoc1[0] == chromLoc2[0]:
-        halfWidth1 = (chromLoc1[2] - chromLoc1[1]) / 2
+        halfWidth1 = (chromLoc1[2] - chromLoc1[1]) // 2
-        halfWidth2 = (chromLoc2[2] - chromLoc2[1]) / 2
+        halfWidth2 = (chromLoc2[2] - chromLoc2[1]) // 2
        minWidth = min(halfWidth1, halfWidth2)
        minWidth = max(minWidth, 1)
        maxWidth = max(halfWidth1, halfWidth2)
@@ -151,33 +151,33 @@ class BedEntry():
            end = self.chromEnd
            start = self.chromStart
            mywidth = fixedwidth or (self.chromEnd - self.chromStart)
-            mycentre = start + (self.chromEnd - self.chromStart) / 2
+            mycentre = start + (self.chromEnd - self.chromStart) // 2
            if usesummit:
                mycentre = self.summit
            if useshift:
                mycentre = mycentre + useshift
            if fixedwidth: # we need to re-calculate start and end
                if genome:
-                    end = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
+                    end = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
                else:
-                    end = mycentre + (mywidth / 2)
+                    end = mycentre + (mywidth // 2)
-                start = max(0, mycentre - (mywidth / 2))
+                start = max(0, mycentre - (mywidth // 2))
        else: # other strand
            start = self.chromEnd
            end = self.chromStart
            mywidth = fixedwidth or (self.chromEnd - self.chromStart)
-            mycentre = self.chromStart + (self.chromEnd - self.chromStart) / 2
+            mycentre = self.chromStart + (self.chromEnd - self.chromStart) // 2
            if usesummit:
                mycentre = self.summit
            if useshift:
                mycentre = mycentre - useshift # shift is reversed on other strand
            if fixedwidth: # we need to re-calculate start and end
-                end = max(0, mycentre - (mywidth / 2))
+                end = max(0, mycentre - (mywidth // 2))
                if genome:
-                    start = min(len(genome[self.chrom]), mycentre + (mywidth / 2))
+                    start = min(len(genome[self.chrom]), mycentre + (mywidth // 2))
                else:
-                    start = mycentre + (mywidth / 2)
+                    start = mycentre + (mywidth // 2)
        if genome: # refer to the genome sequence
            return genome[self.chrom][start : end]
@@ -187,9 +187,9 @@ class BedEntry():
    def setwidth(self, fixedwidth = None, usesummit = False):
        if fixedwidth:
            if usesummit:
-                diff = self.summit - fixedwidth / 2
+                diff = self.summit - fixedwidth // 2
            else:
-                diff = (self.chromEnd - self.chromStart) / 2 - fixedwidth / 2
+                diff = (self.chromEnd - self.chromStart) // 2 - fixedwidth // 2
            self.chromStart += diff
            self.chromStart += diff + fixedwidth
        return (self.chrom, self.chromStart, self.chromEnd)
@@ -365,11 +365,8 @@ class BedFile():
    def __iter__(self):
        return self.rows.__iter__()
-    def __getslice__(self, i, j):
+    def __getitem__(self, key):
-        return self.rows.__getslice__(i, j)
+        return self.rows[key]
-    def __getitem__(self, i):
-        return self.rows[i]
    def __len__(self):
        return len(self.rows)
@@ -388,7 +385,7 @@ class BedFile():
            if not row.chrom in index_end: # seeing chromosome entry first time
                index_end[row.chrom] = []
            index_start[row.chrom].append((row.chromStart, row.chromEnd - row.chromStart, i))
-            index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) / 2, (row.chromEnd - row.chromStart) / 2, i))
+            index_centre[row.chrom].append((row.chromStart + (row.chromEnd - row.chromStart) // 2, (row.chromEnd - row.chromStart) // 2, i))
            index_end[row.chrom].append((row.chromEnd, row.chromEnd - row.chromStart, i))
            if row.name:
                index_name[row.name] = row
@@ -407,7 +404,7 @@ class BedFile():
            entries = self.indices[0][elem[0]] # use the start index
            upper = len(entries)    # keep an upper boundary
            lower = 0               # and a lower boundary
-            inspect = (upper - lower) / 2 # start by looking in the middle
+            inspect = (upper - lower) // 2 # start by looking in the middle
            while True:
                entry = self.rows[entries[inspect][2]]
                d = distance(entry.loc(), elem, minimum = True)
@@ -416,11 +413,11 @@ class BedFile():
                    return True
                elif d > 0:
                    lower = inspect + 1
-                    delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
+                    delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
                    inspect += delta
                else:
                    upper = inspect
-                    delta = (inspect - lower + 1) / 2
+                    delta = (inspect - lower + 1) // 2
                    inspect -= delta
                if delta == 0:
                    return False
@@ -436,7 +433,7 @@ class BedFile():
            entries = self.indices[0][elem[0]] # use the start index
            upper = len(entries)    # keep an upper boundary
            lower = 0               # and a lower boundary
-            inspect = (upper - lower) / 2 # start by looking in the middle
+            inspect = (upper - lower) // 2 # start by looking in the middle
            while True:
                entry = self.rows[entries[inspect][2]]
                d = distance(entry.loc(), elem, minimum = True)
@@ -461,11 +458,11 @@ class BedFile():
                    return False
                elif d > 0:
                    lower = inspect + 1
-                    delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
+                    delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
                    inspect += delta
                else:
                    upper = inspect
-                    delta = (inspect - lower + 1) / 2
+                    delta = (inspect - lower + 1) // 2
                    inspect -= delta
                if delta == 0:
                    return False
@@ -494,7 +491,7 @@ class BedFile():
                entries = self.indices[0][myloc[0]] # use start index
                upper = len(entries)    # keep an upper boundary
                lower = 0               # and a lower boundary
-                inspect = (upper - lower) / 2 # start by looking in the middle
+                inspect = (upper - lower) // 2 # start by looking in the middle
                delta = None
                while not delta == 0:
                    entry = self.rows[entries[inspect][2]]
@@ -509,11 +506,11 @@ class BedFile():
                        return (mindist, minentry)
                    elif d > 0:
                        lower = inspect + 1
-                        delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
+                        delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
                        inspect += delta
                    else:
                        upper = inspect
-                        delta = (inspect - lower + 1) / 2
+                        delta = (inspect - lower + 1) // 2
                        inspect -= delta
                # we may have missed the closest, so need to look around this point
                for i_dn in range(inspect + 1, len(entries)): # Look downstream since
@@ -528,7 +525,7 @@ class BedFile():
                entries = self.indices[2][myloc[0]] # use end index
                upper = len(entries)    # keep an upper boundary
                lower = 0               # and a lower boundary
-                inspect = (upper - lower) / 2 # start by looking in the middle
+                inspect = (upper - lower) // 2 # start by looking in the middle
                delta = None
                while not delta == 0:
                    entry = self.rows[entries[inspect][2]]
@@ -540,11 +537,11 @@ class BedFile():
                        return (mindist, minentry)
                    elif d > 0:
                        lower = inspect + 1
-                        delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
+                        delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
                        inspect += delta
                    else:
                        upper = inspect
-                        delta = (inspect - lower + 1) / 2
+                        delta = (inspect - lower + 1) // 2
                        inspect -= delta
                # we may have missed the closest, so need to look around this point
                for i_up in range(inspect - 1, 0, -1): # Look upstream since
@@ -560,7 +557,7 @@ class BedFile():
                entries = self.indices[1][myloc[0]] # use centre index
                upper = len(entries)    # keep an upper boundary
                lower = 0               # and a lower boundary
-                inspect = (upper - lower) / 2 # start by looking in the middle
+                inspect = (upper - lower) // 2 # start by looking in the middle
                delta = None
                while not delta == 0:
                    entry = self.rows[entries[inspect][2]]
@@ -575,11 +572,11 @@ class BedFile():
                        return (mindist, minentry)
                    elif d > 0:
                        lower = inspect + 1
-                        delta = (upper - inspect) / 2 # splitting in half, potential speed improvements with some heuristic?
+                        delta = (upper - inspect) // 2 # splitting in half, potential speed improvements with some heuristic?
                        inspect += delta
                    else:
                        upper = inspect
-                        delta = (inspect - lower + 1) / 2
+                        delta = (inspect - lower + 1) // 2
                        inspect -= delta
                # at bottom of search
                return (mindist, minentry)
@@ -751,30 +748,64 @@ Modifications to package:
 - removed download.py and __main__ because they were not used and __main__ had errors.
 - removed command-line interface because the BED file functionality is implemented more extensively elsewhere
 """
+"""
+twobitreader
+Licensed under Perl Artistic License 2.0
+No warranty is provided, express or implied
+"""
 from array import array
 from bisect import bisect_right
 from errno import ENOENT, EACCES
 from os import R_OK, access
 try:
    from os import strerror
 except ImportError:
    strerror = lambda x: 'strerror not supported'
-from os.path import exists
+from os.path import exists, getsize
-from itertools import chain
+import logging
+import textwrap
+import sys
+if sys.version_info > (3,):
+    izip = zip
+    xrange = range
+    _CHAR_CODE = 'u'
+    iteritems = dict.items
+else:
+    from itertools import izip
+    _CHAR_CODE = 'c'
+    iteritems = dict.iteritems
+def safe_tostring(ary):
+    """
+    convert arrays to strings in a Python 2.x / 3.x safe way
+    """
+    if sys.version_info > (3,):
+        return ary.tounicode().encode("ascii").decode()
+    else:
+        return ary.tostring()
 def true_long_type():
    """
-        OS X uses an 8-byte long, so make sure L (long) is the right size
+    OS X uses an 8-byte long, so make sure L (long) is the right size
-        and switch to I (int) if needed
+    and switch to I (int) if needed
    """
    for type_ in ['L', 'I']:
        test_array = array(type_, [0])
        long_size = test_array.itemsize
-        if long_size == 4: return type_
+        if long_size == 4:
+            return type_
    raise ImportError("Couldn't determine a valid 4-byte long type to use \
 as equivalent to LONG")
 LONG = true_long_type()
 def byte_to_bases(x):
    """convert one byte to the four bases it encodes"""
    c = (x >> 4) & 0xf
@@ -783,100 +814,162 @@ def byte_to_bases(x):
    cf = c & 0x3
    fc = (f >> 2) & 0x3
    ff = f & 0x3
-    return map(bits_to_base, (cc, cf, fc, ff))
+    return [bits_to_base(X) for X in (cc, cf, fc, ff)]
 def bits_to_base(x):
    """convert integer representation of two bits to correct base"""
-    if x is 0: return 'T'
+    if x is 0:
-    if x is 1: return 'C'
+        return 'T'
-    if x is 2: return 'A'
+    elif x is 1:
-    if x is 3: return 'G'
+        return 'C'
+    elif x is 2:
+        return 'A'
+    elif x is 3:
+        return 'G'
+    else:
+        raise ValueError('Only integers 0-3 are valid inputs')
 def base_to_bin(x):
    """
-        provided for user convenience
+    provided for user convenience
-        convert a nucleotide to its bit representation
+    convert a nucleotide to its bit representation
    """
-    if x == 'T': return '00'
+    if x == 'T':
-    if x == 'C': return '01'
+        return '00'
-    if x == 'A': return '10'
+    elif x == 'C':
-    if x == 'G': return '11'
+        return '01'
+    elif x == 'A':
+        return '10'
+    elif x == 'G':
+        return '11'
+    else:
+        raise ValueError('Only characters \'ATGC\' are valid inputs')
 def create_byte_table():
    """create BYTE_TABLE"""
    d = {}
-    for x in range(2**8):
+    for x in xrange(2 ** 8):
        d[x] = byte_to_bases(x)
    return d
 def split16(x):
    """
-        split a 16-bit number into integer representation
+    split a 16-bit number into integer representation
-        of its course and fine parts in binary representation
+    of its course and fine parts in binary representation
    """
    c = (x >> 8) & 0xff
    f = x & 0xff
    return c, f
 def create_twobyte_table():
    """create TWOBYTE_TABLE"""
    d = {}
-    for x in range(2**16):
+    for x in xrange(2 ** 16):
        c, f = split16(x)
-        d[x] = chain(byte_to_bases(c), byte_to_bases(f))
+        d[x] = list(byte_to_bases(c)) + list(byte_to_bases(f))
    return d
 BYTE_TABLE = create_byte_table()
 TWOBYTE_TABLE = create_twobyte_table()
-def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
+def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size,
+                        more_bytes=None):
    """
-        takes in a iterable of longs and converts them to bases in a char array
+    takes in an array of longs (4 bytes) and converts them to bases in
-        returns a ctypes string buffer
+    a char array
+    you must also provide the offset in the first and last block
+    (note these offsets are pythonic. last_offset is not included)
+    and the desired array_size
+    If you have less than a long worth of bases at the end, you can provide
+    them as a string with more_bytes=
+    NOTE: last_base_offset is inside more_bytes not the last long, if more_bytes
+          is not None
+    returns the correct subset of the array based on provided offsets
    """
+    if array_size == 0:
+        return array(_CHAR_CODE)
+    elif array_size < 0:
+        raise ValueError('array_size must be at least 0')
+    if not first_base_offset in range(16):
+        raise ValueError('first_base_offset must be in range(16)')
+    if not last_base_offset in range(1, 17):
+        raise ValueError('last_base_offset must be in range(1, 17)')
    longs_len = len(longs)
-    # dna = ctypes.create_string_buffer(array_size)
+    if more_bytes is None:
-    dna = array('b', 'N' * longs_len)
+        shorts_length = 0
+    else:
+        shorts_length = len(more_bytes)
+    if array_size > longs_len * 16 + 4 * shorts_length:
+        raise ValueError('array_size exceeds maximum possible for input')
+    dna = array(_CHAR_CODE, 'N' * (longs_len * 16 + 4 * shorts_length))
    # translate from 32-bit blocks to bytes
    # this method ensures correct endianess (byteswap as neeed)
-    bytes = array('B')
+    i = 0
-    bytes.fromstring(longs.tostring())
+    if longs_len > 0:
-    # first block
+        bytes_ = array('B')
-    first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
+        bytes_.fromstring(longs.tostring())
-    i = 16 - first_base_offset
+        # first block
-    if array_size < i: i = array_size
+        first_block = ''.join([''.join(BYTE_TABLE[bytes_[x]]) for x in range(4)])
-    dna[0:i] = array('b', first_block[first_base_offset:first_base_offset + i])
+        i = 16 - first_base_offset
-    if longs_len == 1: return dna
+        if array_size < i:
-    # middle blocks (implicitly skipped if they don't exist)
+            i = array_size
-    for byte in bytes[4:-4]:
+        dna[0:i] = array(_CHAR_CODE, first_block[first_base_offset:first_base_offset + i])
-        dna[i:i + 4] = array('b', BYTE_TABLE[byte])
+    if longs_len > 1:
-        i += 4
+        # middle blocks (implicitly skipped if they don't exist)
-    # last block
+        for byte in bytes_[4:-4]:
-    last_block = array('b', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4,0)]))
+            dna[i:i + 4] = array(_CHAR_CODE, BYTE_TABLE[byte])
-    dna[i:i + last_base_offset] = last_block[0:last_base_offset]
+            i += 4
-    return dna
+        # last block
+        last_block = array(_CHAR_CODE, ''.join([''.join(BYTE_TABLE[bytes_[x]])
+                                                for x in range(-4, 0)]))
+        if more_bytes is None:
+            dna[i:i + last_base_offset] = last_block[0:last_base_offset]
+        else:  # if there are more bytes, we need the whole last block
+            dna[i:i + 16] = last_block[0:16]
+        i += 16
+    if more_bytes is not None:
+        bytes_ = array('B')
+        bytes_.fromstring(more_bytes)
+        j = i
+        for byte in bytes_:
+            j = i + 4
+            if j > array_size:
+                dnabytes = array(_CHAR_CODE, BYTE_TABLE[byte])[0:(array_size - i)]
+                dna[i:array_size] = dnabytes
+                break
+            dna[i:i + last_base_offset] = array(_CHAR_CODE, BYTE_TABLE[byte])
+            i += 4
+    return dna[0:array_size]
 class TwoBitFile(dict):
    """
-        python-level reader for .2bit files (i.e., from UCSC genome browser)
+python-level reader for .2bit files (i.e., from UCSC genome browser)
-        (note: no writing support)
+(note: no writing support)
+TwoBitFile inherits from dict
-        TwoBitFile inherits from dict
+You may access sequences by name, e.g.
-        You may access sequences by name, e.g.
+>>> genome = TwoBitFile('hg18.2bit')
-        >>> genome = TwoBitFile('hg18.2bit')
+>>> chr20 = genome['chr20']
-        >>> chr20 = genome['chr20']
+Sequences are returned as TwoBitSequence objects
+You may access intervals by slicing or using str() to dump the entire entry
-        Sequences are returned as TwoBitSequence objects
+e.g.
-        You may access intervals by slicing or using str() to dump the entire entry
+>>> chr20[100100:100120]
-        e.g.
+'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
-        >>> chr20[100100:100200]
+tttgcatgttaagtttttttcc'
-        'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
+>>> whole_chr20 = str(chr20)
-        ttgcatgttaagtttttttcc'
+Fair warning: dumping the entire chromosome requires a lot of memory
-        >>> whole_chr20 = str(chr20)
+See TwoBitSequence for more info
-        Fair warning: dumping the entire chromosome requires a lot of memory
-        See TwoBitSequence for more info
    """
    def __init__(self, foo):
@@ -886,14 +979,19 @@ class TwoBitFile(dict):
        if not access(foo, R_OK):
            raise IOError(EACCES, strerror(EACCES), foo)
        self._filename = foo
+        self._file_size = getsize(foo)
        self._file_handle = open(foo, 'rb')
        self._load_header()
        self._load_index()
-        for name, offset in self._offset_dict.items():
+        for name, offset in iteritems(self._offset_dict):
            self[name] = TwoBitSequence(self._file_handle, offset,
+                                        self._file_size,
                                        self._byteswapped)
        return
+    def __reduce__(self):  # enables pickling
+        return (TwoBitFile, (self._filename,))
    def _load_header(self):
        file_handle = self._file_handle
        header = array(LONG)
@@ -907,8 +1005,9 @@ class TwoBitFile(dict):
            header.byteswap()
            (signature2, version, sequence_count, reserved) = header
            if not signature2 == 0x1A412743:
-                raise TwoBitFileError('Signature in header should be 0x1A412743'
+                raise TwoBitFileError('Signature in header should be ' +
-                                    + ', instead found 0x%X' % signature)
+                                      '0x1A412743, instead found 0x%X' %
+                                      signature)
        if not version == 0:
            raise TwoBitFileError('File version in header should be 0.')
        if not reserved == 0:
@@ -922,21 +1021,23 @@ class TwoBitFile(dict):
        remaining = self._sequence_count
        sequence_offsets = []
        file_handle.seek(16)
-        while True:
+        while remaining > 0:
-            if remaining == 0: break
            name_size = array('B')
            name_size.fromfile(file_handle, 1)
            if byteswapped:
                name_size.byteswap()
-            name = array('b')
+            # name = array(_CHAR_CODE)
+            name = array('B')
+            name.fromfile(file_handle, name_size[0])
+            name = "".join([chr(X) for X in name])
            if byteswapped:
                name.byteswap()
-            name.fromfile(file_handle, name_size[0])
            offset = array(LONG)
            offset.fromfile(file_handle, 1)
            if byteswapped:
                offset.byteswap()
-            sequence_offsets.append((name.tostring(), offset[0]))
+            sequence_offsets.append((name, offset[0]))
            remaining -= 1
        self._sequence_offsets = sequence_offsets
        self._offset_dict = dict(sequence_offsets)
@@ -946,71 +1047,78 @@ class TwoBitFile(dict):
        d = {}
        file_handle = self._file_handle
        byteswapped = self._byteswapped
-        for name, offset in self._offset_dict.items():
+        for name, offset in iteritems(self._offset_dict):
            file_handle.seek(offset)
            dna_size = array(LONG)
            dna_size.fromfile(file_handle, 1)
-            if byteswapped: dna_size.byteswap()
+            if byteswapped:
+                dna_size.byteswap()
            d[name] = dna_size[0]
        return d
 class TwoBitSequence(object):
    """
-        A TwoBitSequence object refers to an entry in a TwoBitFile
+A TwoBitSequence object refers to an entry in a TwoBitFile
+You may access intervals by slicing or using str() to dump the entire entry
-        You may access intervals by slicing or using str() to dump the entire entry
+e.g.
-        e.g.
+>>> genome = TwoBitFile('hg18.2bit')
-        >>> genome = TwoBitFile('hg18.2bit')
+>>> chr20 = genome['chr20']
-        >>> chr20 = genome['chr20']
+>>> chr20[100100:100200] # slicing returns a string
-        >>> chr20[100100:100200] # slicing returns a string
+'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggta
-        'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
+tttgcatgttaagtttttttcc'
-        ttgcatgttaagtttttttcc'
+>>> whole_chr20 = str(chr20) # get whole chr as string
-        >>> whole_chr20 = str(chr20) # get whole chr as string
+Fair warning: dumping the entire chromosome requires a lot of memory
+Note that we follow python/UCSC conventions:
-        Fair warning: dumping the entire chromosome requires a lot of memory
+Coordinates are 0-based, end-open
+(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
-        Note that we follow python/UCSC conventions:
+If you attempt to access a slice past the end of the sequence,
-        Coordinates are 0-based, end-open
+it will be truncated at the end.
-        (Note: The UCSC web-based genome browser uses 1-based closed coordinates)
+Your computer probably doesn't have enough memory to load a whole genome
-        If you attempt to access a slice past the end of the sequence,
+but if you want to string-ize your TwoBitFile, here's a recipe:
-        it will be truncated at the end.
+x = TwoBitFile('my.2bit')
+d = x.dict()
-        Your computer probably doesn't have enough memory to load a whole genome
+for k,v in d.items(): d[k] = str(v)
-        but if you want to string-ize your TwoBitFile, here's a recipe:
-        x = TwoBitFile('my.2bit')
-        d = x.dict()
-        for k,v in d.iteritems(): d[k] = str(v)
    """
-    def __init__(self, file_handle, offset, byteswapped=False):
+    def __init__(self, file_handle, offset, file_size, byteswapped=False):
+        self._file_size = file_size
        self._file_handle = file_handle
        self._original_offset = offset
        self._byteswapped = byteswapped
        file_handle.seek(offset)
        header = array(LONG)
        header.fromfile(file_handle, 2)
-        if byteswapped: header.byteswap()
+        if byteswapped:
+            header.byteswap()
        dna_size, n_block_count = header
-        self._dna_size = dna_size
+        self._dna_size = dna_size  # number of characters, 2 bits each
-        self._packed_dna_size = (dna_size + 15) / 16 # this is 32-bit fragments
+        self._n_bytes = (dna_size + 3) / 4  # number of bytes
+        # number of 32-bit fragments
+        self._packed_dna_size = (dna_size + 15) / 16
        n_block_starts = array(LONG)
        n_block_sizes = array(LONG)
        n_block_starts.fromfile(file_handle, n_block_count)
-        if byteswapped: n_block_starts.byteswap()
+        if byteswapped:
+            n_block_starts.byteswap()
        n_block_sizes.fromfile(file_handle, n_block_count)
-        if byteswapped: n_block_sizes.byteswap()
+        if byteswapped:
+            n_block_sizes.byteswap()
        self._n_block_starts = n_block_starts
-        self._n_block_sizes= n_block_sizes
+        self._n_block_sizes = n_block_sizes
        mask_rawc = array(LONG)
        mask_rawc.fromfile(file_handle, 1)
-        if byteswapped: mask_rawc.byteswap()
+        if byteswapped:
+            mask_rawc.byteswap()
        mask_block_count = mask_rawc[0]
        mask_block_starts = array(LONG)
        mask_block_starts.fromfile(file_handle, mask_block_count)
-        if byteswapped: mask_block_starts.byteswap()
+        if byteswapped:
+            mask_block_starts.byteswap()
        mask_block_sizes = array(LONG)
        mask_block_sizes.fromfile(file_handle, mask_block_count)
-        if byteswapped: mask_block_sizes.byteswap()
+        if byteswapped:
+            mask_block_sizes.byteswap()
        self._mask_block_starts = mask_block_starts
        self._mask_block_sizes = mask_block_sizes
        file_handle.read(4)
@@ -1019,8 +1127,22 @@ class TwoBitSequence(object):
    def __len__(self):
        return self._dna_size
-    def __getslice__(self, min_, max_=None):
+    def __getitem__(self, slice_or_key):
-        return self.get_slice(min_, max_)
+        """
+        return a sub-sequence, given a slice object
+        """
+        step = None
+        if isinstance(slice_or_key, slice):
+            step = slice_or_key.step
+            if step is not None:
+                raise ValueError("Slicing by step not currently supported")
+            return self.get_slice(min_=slice_or_key.start, max_=slice_or_key.stop)
+        elif isinstance(slice_or_key, int):
+            max_ = slice_or_key + 1
+            if max_ == 0:
+                max_ = None
+            return self.get_slice(min_=slice_or_key, max_=max_)
    def get_slice(self, min_, max_=None):
        """
@@ -1028,22 +1150,26 @@ class TwoBitSequence(object):
        """
        # handle negative coordinates
        dna_size = self._dna_size
-        if max_ < 0:
+        if min_ is None:  # for slicing e.g. [:]
-            if max_ < -dna_size: raise IndexError('index out of range')
+            min_ = 0
-            max_ = dna_size + 1 + max_
+        if max_ is not None and max_ < 0:
-        if min_ < 0:
+            if max_ < -dna_size:
-            if max_ < -dna_size: raise IndexError('index out of range')
+                raise IndexError('index out of range')
-            min_ = dna_size + 1 + min_
+            max_ = dna_size + max_
-        # Find out if the reverse complement is sought
+        if min_ is not None and min_ < 0:
-        reverse = False # assume not RC
+            if min_ < -dna_size:
-        if min_ > max_ and max_ is not None:
+                raise IndexError('index out of range')
-            reverse = True
+            min_ = dna_size + min_
-            mymax = max_
+        # make sure there's a proper range
-            max_ = min_
+        if max_ is not None and min_ > max_:
-            min_ = mymax
+            return ''
-        if max_ == 0: return ''
+        if max_ == 0 or max_ == min_:
+            return ''
        # load all the data
-        if max_ > dna_size: max_ = dna_size
+        if max_ is None or max_ > dna_size:
+            max_ = dna_size
        file_handle = self._file_handle
        byteswapped = self._byteswapped
        n_block_starts = self._n_block_starts
@@ -1052,140 +1178,158 @@ class TwoBitSequence(object):
        mask_block_sizes = self._mask_block_sizes
        offset = self._offset
        packed_dna_size = self._packed_dna_size
+        # n_bytes = self._n_bytes
        # region_size is how many bases the region is
-        if max_ is None: region_size = dna_size - min_
+        if max_ is None:
-        else: region_size = max_ - min_
+            region_size = dna_size - min_
+        else:
+            region_size = max_ - min_
        # start_block, end_block are the first/last 32-bit blocks we need
-        # note: end_block is not read
        # blocks start at 0
-        start_block = min_ / 16
+        start_block = min_ // 16
-        end_block = max_ / 16
+        # jump directly to desired file location
+        local_offset = offset + (start_block * 4)
+        end_block = (max_ - 1 + 16) // 16
        # don't read past seq end
-        if end_block >= packed_dna_size: end_block = packed_dna_size - 1
-        # +1 we still need to read block
-        blocks_to_read = end_block - start_block + 1
-        # jump directly to desired file location
-        local_offset = offset + start_block * 4
        file_handle.seek(local_offset)
        # note we won't actually read the last base
        # this is a python slice first_base_offset:16*blocks+last_base_offset
        first_base_offset = min_ % 16
        last_base_offset = max_ % 16
+        if last_base_offset == 0:
+            last_base_offset = 16
+        # +1 we still need to read end_block maybe
+        blocks_to_read = end_block - start_block
+        if (blocks_to_read + start_block) > packed_dna_size:
+            blocks_to_read = packed_dna_size - start_block
        fourbyte_dna = array(LONG)
-        fourbyte_dna.fromfile(file_handle, blocks_to_read)
+        # remainder_seq = None
-        if byteswapped: fourbyte_dna.byteswap()
+        if (blocks_to_read * 4 + local_offset) > self._file_size:
-        string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
+            fourbyte_dna.fromfile(file_handle, blocks_to_read - 1)
-                                              last_base_offset, region_size)
+            morebytes = file_handle.read()  # read the remaining characters
-        for start, size in zip(n_block_starts, n_block_sizes):
+        # if byteswapped:
+        #                morebytes = ''.join(reversed(morebytes))
+        else:
+            fourbyte_dna.fromfile(file_handle, blocks_to_read)
+            morebytes = None
+        if byteswapped:
+            fourbyte_dna.byteswap()
+        str_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
+                                           last_base_offset, region_size,
+                                           more_bytes=morebytes)
+        for start, size in izip(n_block_starts, n_block_sizes):
            end = start + size
-            if end <= min_: continue
+            if end <= min_:
-            if start > max_: break
+                continue
-            if start < min_: start = min_
+            if start > max_:
-            if end > max_: end = max_
+                break
+            if start < min_:
+                start = min_
+            if end > max_:
+                end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('b', 'N'*(end-start))
+            # this should actually be decoded, 00=N, 01=n
+            str_as_array[start:end] = array(_CHAR_CODE, 'N' * (end - start))
        lower = str.lower
        first_masked_region = max(0,
                                  bisect_right(mask_block_starts, min_) - 1)
        last_masked_region = min(len(mask_block_starts),
                                 1 + bisect_right(mask_block_starts, max_,
                                                  lo=first_masked_region))
-        for start, size in zip(mask_block_starts[first_masked_region:last_masked_region],
+        for start, size in izip(mask_block_starts[first_masked_region:
-                                mask_block_sizes[first_masked_region:last_masked_region]):
+        last_masked_region],
+                                mask_block_sizes[first_masked_region:
+                                last_masked_region]):
            end = start + size
-            if end <= min_: continue
+            if end <= min_:
-            if start > max_: break
+                continue
-            if start < min_: start = min_
+            if start > max_:
-            if end > max_: end = max_
+                break
+            if start < min_:
+                start = min_
+            if end > max_:
+                end = max_
            start -= min_
            end -= min_
-            string_as_array[start:end] = array('b', lower(string_as_array[start:end].tostring()))
+            str_as_array[start:end] = array(_CHAR_CODE,
-        if not len(string_as_array) == max_ - min_:
+                                            lower(safe_tostring(str_as_array[start:end])))
-            raise RuntimeError("Sequence was longer than it should be")
+        if not len(str_as_array) == max_ - min_:
-        if reverse:
+            raise RuntimeError("Sequence was the wrong size")
-            return self.reverseComplement(string_as_array.tostring())
+        return safe_tostring(str_as_array)
-        return string_as_array.tostring()
-    def reverseComplement(self, dna):
-        """ Return a new sequence: the reverse complement of this sequence. """
-        newseq=''
-        symbols={'A':'T','C':'G','T':'A','G':'C','a':'t','c':'g','t':'a','g':'c','n':'n','N':'N'} # reverse complement dictionary
-        for symbol in dna[::-1]:
-            newsymbol=symbols[symbol] # uses the reverse complement symbols in dictionary
-            newseq+=newsymbol
-        return newseq  # returns RC sequences
    def __str__(self):
        """
        returns the entire chromosome
        """
-        return self.__getslice__(0, None)
+        return self.get_slice(0, None)
 class TwoBitFileError(Exception):
    """
    Base exception for TwoBit module
    """
    def __init__(self, msg):
        errtext = 'Invalid 2-bit file. ' + msg
        return super(TwoBitFileError, self).__init__(errtext)
 def print_specification():
    """
    Prints the twoBit file format specification I got from the Internet.
    This is only here for reference
    """
    return """
-        From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
+From http://www.its.caltech.edu/~alok/reviews/blatSpecs.html
+.2bit files
-        .2bit files
+A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
+randomly accessible format. The two bit files contain masking information as \
-        A .2bit file can store multiple DNA sequence (up to 4 gig total) in a compact \
+well as the DNA itself. The file begins with a 16 byte header containing the \
-        randomly accessible format. The two bit files contain masking information as \
+following fields:
-        well as the DNA itself. The file begins with a 16 byte header containing the \
+signature - the number 0x1A412743 in the architecture of the machine that \
-        following fields:
+created the file.
+version - zero for now. Readers should abort if they see a version number \
-        signature - the number 0x1A412743 in the architecture of the machine that \
+higher than 0.
-        created the file.
+sequenceCount - the number of sequences in the file
-        version - zero for now. Readers should abort if they see a version number \
+reserved - always zero for now.
-        higher than 0.
+All fields are 32 bits unless noted. If the signature value is not as given, \
-        sequenceCount - the number of sequences in the file
+the reader program should byte swap the signature and see if the swapped \
-        reserved - always zero for now.
+version matches. If so all multiple-byte entities in the file will need to be \
-        All fields are 32 bits unless noted. If the signature value is not as given, \
+byte-swapped. This enables these binary files to be used unchanged on \
-        the reader program should byte swap the signature and see if the swapped \
+different architectures.
-        version matches. If so all multiple-byte entities in the file will need to be \
+The header is followed by a file index. There is one entry in the index for \
-        byte-swapped. This enables these binary files to be used unchanged on \
+each sequence. Each index entry contains three fields:
-        different architectures.
+nameSize - a byte containing the length of the name field
+name - this contains the sequence name itself, and is variable length \
-        The header is followed by a file index. There is one entry in the index for \
+depending on nameSize.
-        each sequence. Each index entry contains three fields:
+offset - 32 bit offset of the sequence data relative to the start of the file
+The index is followed by the sequence records. These contain 9 fields:
-        nameSize - a byte containing the length of the name field
+dnaSize - number of bases of DNA in the sequence.
-        name - this contains the sequence name itself, and is variable length \
+nBlockCount - the number of blocks of N's in the file (representing unknown \
-        depending on nameSize.
+sequence).
-        offset - 32 bit offset of the sequence data relative to the start of the file
+nBlockStarts - a starting position for each block of N's
+nBlockSizes - the size of each block of N's
-        The index is followed by the sequence records. These contain 9 fields:
+maskBlockCount - the number of masked (lower case) blocks
+maskBlockStarts - starting position for each masked block
-        dnaSize - number of bases of DNA in the sequence.
+maskBlockSizes - the size of each masked block
-        nBlockCount - the number of blocks of N's in the file (representing unknown \
+packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A,\
-        sequence).
+11 - G. The first base is in the most significant 2 bits byte, and the last \
-        nBlockStarts - a starting position for each block of N's
+base in the least significant 2 bits, so that the sequence TCAG would be \
-        nBlockSizes - the size of each block of N's
+represented as 00011011. The packedDna field will be padded with 0 bits as \
-        maskBlockCount - the number of masked (lower case) blocks
+necessary so that it takes an even multiple of 32 bit in the file, as this \
-        maskBlockStarts - starting position for each masked block
+improves i/o performance on some machines.
-        maskBlockSizes - the size of each masked block
+.nib files
-        packedDna - the dna packed to two bits per base as so: 00 - T, 01 - C, 10 - A, \
+"""
-        11 - G. The first base is in the most significant 2 bits byte, and the last \
-        base in the least significant 2 bits, so that the sequence TCAG would be \
-        represented as 00011011. The packedDna field will be padded with 0 bits as \
+if __name__ == '__main__':
+    hg19 = TwoBitFile('/Users/mikael/simhome/share/hg19.2bit')  # assumes that the genome is stored in your current directory
-        necessary so that it takes an even multiple of 32 bit in the file, as this \
+    for key in hg19:
-        improves i/o performance on some machines.
+        print(key)
-        .nib files
+    print(hg19['chrX'][1000000:1000060])
-    """
\ No newline at end of file