
Commit

Merge pull request #2 from Sentieon/dev
Add support for the csi index
DonFreed authored Apr 9, 2024
2 parents 39c476e + 10b8cb5 commit 631e669
Showing 11 changed files with 203 additions and 87 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,33 @@
name: CI
on:
  push:
  pull_request:

jobs:
  ci:
    strategy:
      fail-fast: true
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        os: [ubuntu-22.04] #, macos-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Test
        run: |
          PYTHONPATH=$(pwd) python example/filter_dp.py \
            --input_vcf tests/hc_subset.vcf.gz \
            --output_vcf tests/hc_subset_dp.vcf.gz
          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
          if [ ! -f tests/hc_subset_dp.vcf.gz.tbi ]; then exit 1; fi
      - name: Test csi
        run: |
          rm tests/hc_subset.vcf.gz.tbi
          PYTHONPATH=$(pwd) VCF_INDEX_TYPE=2 python example/filter_dp.py \
            --input_vcf tests/hc_subset.vcf.gz \
            --output_vcf tests/hc_subset_dp.vcf.gz
          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
          if [ ! -f tests/hc_subset_dp.vcf.gz.csi ]; then exit 1; fi
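
A note on the new "Test csi" step: the index type is selected through the VCF_INDEX_TYPE environment variable, which Tabix.init() in vcflib/tabix.py (further down in this diff) parses as a colon-separated list of integers. A leading 2 requests a .csi index; the default, 1, keeps the .tbi index. Below is a minimal sketch of that parsing, assuming the optional second and third fields override min_shift and depth only when a CSI index is requested (the exact nesting is not recoverable from this flattened view); the function name is illustrative and not part of vcflib.

import os

TBI_MAGIC = 0x01494254
CSI_MAGIC = 0x01495343

def parse_vcf_index_type(value=None):
    """Sketch: map a VCF_INDEX_TYPE-style string to (magic, min_shift, depth)."""
    value = value if value is not None else os.getenv("VCF_INDEX_TYPE", "1")
    fields = list(map(int, value.split(":")))
    magic, min_shift, depth = TBI_MAGIC, 14, 5     # .tbi defaults from the diff
    if fields and fields[0] == 2:                  # "2" selects the .csi format
        magic = CSI_MAGIC
        if len(fields) > 1:
            min_shift = fields[1]                  # e.g. "2:14"
        if len(fields) > 2:
            depth = fields[2]                      # e.g. "2:14:6"
    return magic, min_shift, depth

# The CI step exports VCF_INDEX_TYPE=2, i.e. a CSI index with 14/5 binning.
print(parse_vcf_index_type("2"))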
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
tmp/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Binary file added tests/hc_subset.vcf.gz
Binary file added tests/hc_subset.vcf.gz.csi
Binary file added tests/hc_subset.vcf.gz.tbi
2 changes: 1 addition & 1 deletion vcflib/bgzf.py
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import io
import struct
import zlib
2 changes: 1 addition & 1 deletion vcflib/compat.py
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import sys

if sys.version_info[0] == 2:
2 changes: 1 addition & 1 deletion vcflib/sharder.py
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
from abc import ABCMeta, abstractmethod
import copy
import heapq
213 changes: 146 additions & 67 deletions vcflib/tabix.py
@@ -1,5 +1,6 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import collections
import os
import struct
import sys

@@ -9,8 +10,7 @@
__all__ = ['Tabix']

class Header(object):
__slots__ = ('magic', 'n_ref', 'format',
'col_seq', 'col_beg', 'col_end', 'meta', 'skip', 'l_nm')
__slots__ = ('format', 'col_seq', 'col_beg', 'col_end', 'meta', 'skip')
def __init__(self, *args):
for k,v in zip(self.__slots__, args):
setattr(self, k, v)
@@ -19,13 +19,12 @@ def __iter__(self):
yield getattr(self, k)

class Tabix(object):
SHIFTS = (14, 17, 20, 23, 26, 29)
MAXBIN = ((1 << SHIFTS[-1]-SHIFTS[0]+3) - 1) // 7 + 1
MAGIC = 0x01494254
TBI_MAGIC = 0x01494254
CSI_MAGIC = 0x01495343
FMT_GENERIC, FMT_SAM, FMT_VCF, FMT_ZERO_BASED = 0, 1, 2, 0x10000

def __init__(self, idxf, mode='r'):
self.path = idxf
def __init__(self, path, mode='r'):
self.path = path
self.mode = mode
if mode[0:1] == 'r':
self.load()
@@ -37,72 +36,116 @@ def __init__(self, idxf, mode='r'):
def load(self):
self.indices = collections.OrderedDict()

with bgzf.open(self.path, 'rb') as fp:
if os.path.exists(self.path + '.csi'):
idxf = self.path + '.csi'
magic = self.CSI_MAGIC
else:
idxf = self.path + '.tbi'
magic = self.TBI_MAGIC

with bgzf.open(idxf, 'rb') as fp:
s4 = struct.Struct('<L')
s8 = struct.Struct('<Q')
sh = struct.Struct('<9L')
data = fp.read()
off = 0
h = Header(*sh.unpack_from(data, off)); off += sh.size
if h.magic != self.MAGIC:
sh = struct.Struct('<6L')
data = fp.read(); off = 0
self.magic, = s4.unpack_from(data, off); off += s4.size
if self.magic != magic:
raise RuntimeError('Not a tabix file')
self.header = h
names, l_nm = [], 0
for i in xrange(h.n_ref):
eos = data.find(b'\0', off)
if eos < 0: break
names.append(data[off:eos].decode())
l_nm += eos + 1 - off
off = eos + 1
if h.l_nm != l_nm:
if self.magic == self.TBI_MAGIC:
self.min_shift, self.depth = 14, 5
n_ref, = s4.unpack_from(data, off); off += s4.size
aux = sh.unpack_from(data, off); off += sh.size
l_nm, = s4.unpack_from(data, off); off += s4.size
names = data[off:off+l_nm].split(b'\0'); off += l_nm
else:
self.min_shift, = s4.unpack_from(data, off); off += s4.size
self.depth, = s4.unpack_from(data, off); off += s4.size
l_aux, = s4.unpack_from(data, off); off += s4.size
if l_aux < sh.size + s4.size:
raise RuntimeError('Invalid header')
aux = sh.unpack_from(data, off); off += sh.size
l_nm, = s4.unpack_from(data, off); off += s4.size
names = data[off:off+l_nm].split(b'\0'); off += l_nm
off += l_aux - (sh.size + s4.size + l_nm)
n_ref, = s4.unpack_from(data, off); off += s4.size
if len(names) != n_ref+1 or len(names[-1]) != 0:
raise RuntimeError('Header sequence name length mismatch')
for i in xrange(h.n_ref):
self.header = Header(*aux)
for i in xrange(n_ref):
bins = {}
n_bin, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_bin):
bin, = s4.unpack_from(data, off); off += s4.size
if self.magic == self.TBI_MAGIC:
loffset = 0
else:
loffset, = s8.unpack_from(data, off); off += s8.size
chunks = []
n_chunk, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_chunk):
s, = s8.unpack_from(data, off); off += s8.size
e, = s8.unpack_from(data, off); off += s8.size
chunks.append((s, e))
bins[bin] = chunks
bins[bin] = (loffset, chunks)
intvs = []
n_intv, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_intv):
o, = s8.unpack_from(data, off); off += s8.size
intvs.append(o)
if n_intv == 0:
intvs.append(0)
self.indices[names[i]] = (bins, intvs)
if self.magic == self.TBI_MAGIC:
n_intv, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_intv):
o, = s8.unpack_from(data, off); off += s8.size
intvs.append(o)
if n_intv == 0:
intvs.append(0)
self.indices[names[i].decode()] = (bins, intvs)
self.max_shift = self.min_shift + self.depth * 3

def save(self):
if self.header is None:
return
self.add(None, 0, 0, self.end)
h = self.header
h.n_ref = len(self.indices)
nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
h.l_nm = len(nms)
with bgzf.open(self.path, 'wb') as fp:

for ext in ('.tbi', '.csi'):
f = self.path + ext
if os.path.exists(f): os.remove(f)

if self.magic == self.TBI_MAGIC:
idxf = self.path + '.tbi'
else:
idxf = self.path + '.csi'

with bgzf.open(idxf, 'wb') as fp:
s4 = struct.Struct('<L')
s8 = struct.Struct('<Q')
sh = struct.Struct('<9L')
fp.write(sh.pack(*h))
fp.write(nms)
sh = struct.Struct('<6L')
nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
fp.write(s4.pack(self.magic))
if self.magic == self.TBI_MAGIC:
fp.write(s4.pack(len(self.indices)))
fp.write(sh.pack(*self.header))
fp.write(s4.pack(len(nms)))
fp.write(nms)
else:
fp.write(s4.pack(self.min_shift))
fp.write(s4.pack(self.depth))
fp.write(s4.pack(sh.size + s4.size + len(nms)))
fp.write(sh.pack(*self.header))
fp.write(s4.pack(len(nms)))
fp.write(nms)
fp.write(s4.pack(len(self.indices)))
for c, (bins, intvs) in iteritems(self.indices):
fp.write(s4.pack(len(bins)))
for bin in sorted(bins.keys()):
chunks = bins[bin]
loffset, chunks = bins[bin]
fp.write(s4.pack(bin))
if self.magic != self.TBI_MAGIC:
fp.write(s8.pack(loffset))
fp.write(s4.pack(len(chunks)))
for s,e in chunks:
fp.write(s8.pack(s))
fp.write(s8.pack(e))
fp.write(s4.pack(len(intvs)))
for o in intvs:
fp.write(s8.pack(o))
if self.magic == self.TBI_MAGIC:
fp.write(s4.pack(len(intvs)))
for o in intvs:
fp.write(s8.pack(o))
self.header = None

def query(self, c, s, e):
@@ -111,19 +154,24 @@ def query(self, c, s, e):
if ci is None:
return ranges
s = max(s, 0)
i = s >> self.SHIFTS[0]
minoff = ci[1][i] if i < len(ci[1]) else ci[1][-1]
for shift in reversed(self.SHIFTS):
bo = ((1 << 29-shift) - 1) // 7
i = s >> self.min_shift
minoff = ci[1][min(i,len(ci[1])-1)] if ci[1] else 0
for shift in range(self.max_shift, self.min_shift-3, -3):
bo = ((1 << self.max_shift - shift) - 1) // 7
bs = bo + (s >> shift)
be = bo + (e-1 >> shift)
be = min(be, self.MAXBIN-1)
if not ci[1]:
for bi in xrange(bs, bo-1, -1):
b = ci[0].get(bi)
if b is not None:
minoff = max(minoff, b[0])
break
for bi in xrange(bs, be+1):
if bi not in ci[0]:
continue
for chunk in ci[0][bi]:
if chunk[1] > minoff:
ranges.append(chunk)
b = ci[0].get(bi)
if b is not None:
ranges.extend(b[1])
if minoff > 0:
ranges = [(max(s,minoff), e) for s,e in ranges if e > minoff]
return self.merge(ranges, 16)

@staticmethod
@@ -141,14 +189,37 @@ def merge(ranges, shift):
yield p

def init(self):
h = Header(self.MAGIC, 0, self.FMT_VCF, 1, 2, 2, ord('#'), 0, 0)
self.header = h
self.magic = self.TBI_MAGIC
self.min_shift = 14
self.depth = 5
type = list(map(int, os.getenv('VCF_INDEX_TYPE', '1').split(':')))
if len(type) > 0 and type[0] == 2:
self.magic = self.CSI_MAGIC
if len(type) > 1:
self.min_shift = type[1]
if len(type) > 2:
self.depth = type[2]
self.max_shift = self.min_shift + self.depth * 3
self.header = Header(self.FMT_VCF, 1, 2, 2, ord('#'), 0)
self.indices = collections.OrderedDict()
self.ci = None
self.pos = 0
self.end = 0

def add(self, c, s, e, off):
if c is None and s > 0:
# s is the max contig length
shift = self.min_shift
limit = 1 << shift
while s > limit:
limit <<= 1
shift += 1
if shift >= 32:
raise RuntimeError('Some contigs are too long')
if shift > self.min_shift + self.depth * 3:
self.magic = self.CSI_MAGIC;
self.depth = (shift - self.min_shift + 2) // 3;
self.max_shift = self.min_shift + self.depth * 3
if self.ci and self.ci[0] != c:
self.optimize(self.ci)
self.ci = None
@@ -159,17 +230,19 @@ def add(self, c, s, e, off):
if self.ci:
chrom, bins, intvs = self.ci
assert chrom == c and s >= self.pos
be = e-1 >> self.SHIFTS[0]
be = e-1 >> self.min_shift
if be >= len(intvs):
intvs += [self.end] * (be+1 - len(intvs))
bin = 0
for shift in self.SHIFTS:
for shift in range(self.min_shift, self.max_shift+3, 3):
bs, be = s >> shift, e-1 >> shift
if bs == be:
bo = ((1 << 29-shift) - 1) // 7
bo = ((1 << self.max_shift - shift) - 1) // 7
bin = bo + bs
break
chunks = bins.setdefault(bin,[])
b = bins.setdefault(bin,[])
if not b: b.extend((0, []))
chunks = b[1]
if chunks and chunks[-1][1] == self.end:
chunks[-1] = (chunks[-1][0], off)
else:
@@ -179,25 +252,31 @@

def optimize(self, ci):
bins = ci[1]
for shift in self.SHIFTS[:-1]:
bo = ((1 << 29-shift) - 1) // 7
for shift in range(self.min_shift, self.max_shift+3, 3):
bo = ((1 << self.max_shift - shift) - 1) // 7
for bin in sorted(bins.keys()):
if bin < bo:
continue
if bin > bo << 3:
break
chunks = bins.get(bin)
if chunks is None:
b = bins.get(bin)
if b is None:
continue
chunks = b[1]
if len(chunks) == 0:
del bins[bin]
continue
bs = chunks[0][0] >> 16
be = chunks[-1][1] >> 16
if be - bs < 65536:
if be - bs < 65536 and bo > 0:
del bins[bin]
bin = bin-1 >> 3
chunks += bins.get(bin,[])
bins[bin] = list(self.merge(chunks, 16))
b = bins.setdefault(bin,[])
if not b: b.extend((0, []))
b[1] = list(self.merge(chunks + b[1], 16))
elif ci[2]:
intv = (bin - bo) << (shift - self.min_shift)
intv = min(intv, len(ci[2])-1)
b[0] = ci[2][intv]

# vim: ts=4 sw=4 expandtab
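
The tabix.py changes replace the fixed SHIFTS/MAXBIN constants with min_shift and depth so that the TBI layout (14/5) and the configurable CSI layout share one binning code path. As a rough illustration of the bin arithmetic used by the new add() and query() paths, the sketch below computes the smallest bin that fully contains an interval; the function name and the example values are illustrative, not part of vcflib.

def region_to_bin(start, end, min_shift=14, depth=5):
    """Sketch: smallest bin fully containing the 0-based interval [start, end)."""
    max_shift = min_shift + depth * 3
    end -= 1
    for shift in range(min_shift, max_shift + 3, 3):
        if start >> shift == end >> shift:
            # Offset of the first bin at this level, as in the diff:
            # bo = ((1 << max_shift - shift) - 1) // 7
            bo = ((1 << (max_shift - shift)) - 1) // 7
            return bo + (start >> shift)
    return 0  # root bin

# A short interval maps to a leaf (2^min_shift-sized) bin; a whole-chromosome
# interval maps to bin 0, matching the loop added in Tabix.add().
print(region_to_bin(100_000, 100_100))   # leaf-level bin
print(region_to_bin(0, 200_000_000))     # 0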