Skip to content

Commit 33c9764

Browse files
committed
Gf2
1 parent 40a4b14 commit 33c9764

12 files changed

+197
-293
lines changed

CHANGELOG.md

-4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
## [1.6.1](https://github.com/GMOD/tabix-js/compare/v1.6.0...v1.6.1) (2024-12-07)
22

3-
4-
53
# [1.6.0](https://github.com/GMOD/tabix-js/compare/v1.5.15...v1.6.0) (2024-11-30)
64

7-
8-
95
## [1.5.15](https://github.com/GMOD/tabix-js/compare/v1.5.14...v1.5.15) (2024-08-30)
106

117
## [1.5.14](https://github.com/GMOD/tabix-js/compare/v1.5.13...v1.5.14) (2024-07-23)

package.json

+3-6
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
],
4343
"dependencies": {
4444
"@gmod/abortable-promise-cache": "^2.0.0",
45-
"@gmod/bgzf-filehandle": "^1.3.3",
46-
"generic-filehandle": "^3.0.0",
45+
"@gmod/bgzf-filehandle": "^2.0.0",
46+
"generic-filehandle2": "^0.0.1",
4747
"long": "^4.0.0",
4848
"quick-lru": "^4.0.0"
4949
},
@@ -55,16 +55,13 @@
5555
"@typescript-eslint/eslint-plugin": "^8.0.1",
5656
"@typescript-eslint/parser": "^8.0.1",
5757
"@vitest/coverage-v8": "^2.0.5",
58-
"buffer": "^6.0.3",
5958
"documentation": "^14.0.3",
6059
"eslint": "^9.9.0",
61-
"eslint-config-prettier": "^9.1.0",
62-
"eslint-plugin-prettier": "^5.0.1",
6360
"eslint-plugin-unicorn": "^56.0.0",
6461
"prettier": "^3.3.3",
6562
"rimraf": "^6.0.1",
6663
"standard-changelog": "^6.0.0",
67-
"typescript": "~5.6.0",
64+
"typescript": "^5.7.0",
6865
"typescript-eslint": "^8.0.1",
6966
"vitest": "^2.0.5",
7067
"webpack": "^5.93.0",

src/csi.ts

+49-40
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import Long from 'long'
2-
import { Buffer } from 'buffer'
32
import { unzip } from '@gmod/bgzf-filehandle'
43

54
import VirtualOffset, { fromBytes } from './virtualOffset'
@@ -11,6 +10,12 @@ import IndexFile, { Options } from './indexFile'
1110
const CSI1_MAGIC = 21582659 // CSI\1
1211
const CSI2_MAGIC = 38359875 // CSI\2
1312

13+
const formats = {
14+
0: 'generic',
15+
1: 'SAM',
16+
2: 'VCF',
17+
}
18+
1419
function lshift(num: number, bits: number) {
1520
return num * 2 ** bits
1621
}
@@ -49,26 +54,27 @@ export default class CSI extends IndexFile {
4954
throw new Error('CSI indexes do not support indexcov')
5055
}
5156

52-
parseAuxData(bytes: Buffer, offset: number) {
53-
const formatFlags = bytes.readInt32LE(offset)
57+
parseAuxData(bytes: Uint8Array, offset: number) {
58+
const dataView = new DataView(bytes.buffer)
59+
const formatFlags = dataView.getInt32(offset, true)
5460
const coordinateType =
5561
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
56-
const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
62+
const format = formats[(formatFlags & 0xf) as 0 | 1 | 2]
5763
if (!format) {
5864
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
5965
}
6066
const columnNumbers = {
61-
ref: bytes.readInt32LE(offset + 4),
62-
start: bytes.readInt32LE(offset + 8),
63-
end: bytes.readInt32LE(offset + 12),
67+
ref: dataView.getInt32(offset + 4, true),
68+
start: dataView.getInt32(offset + 8, true),
69+
end: dataView.getInt32(offset + 12, true),
6470
}
65-
const metaValue = bytes.readInt32LE(offset + 16)
71+
const metaValue = dataView.getInt32(offset + 16, true)
6672
const metaChar = metaValue ? String.fromCharCode(metaValue) : null
67-
const skipLines = bytes.readInt32LE(offset + 20)
68-
const nameSectionLength = bytes.readInt32LE(offset + 24)
73+
const skipLines = dataView.getInt32(offset + 20, true)
74+
const nameSectionLength = dataView.getInt32(offset + 24, true)
6975

7076
const { refIdToName, refNameToId } = this._parseNameBytes(
71-
bytes.slice(offset + 28, offset + 28 + nameSectionLength),
77+
bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
7278
)
7379

7480
return {
@@ -82,47 +88,52 @@ export default class CSI extends IndexFile {
8288
}
8389
}
8490

85-
_parseNameBytes(namesBytes: Buffer) {
91+
_parseNameBytes(namesBytes: Uint8Array) {
8692
let currRefId = 0
8793
let currNameStart = 0
8894
const refIdToName = []
8995
const refNameToId: Record<string, number> = {}
96+
const decoder = new TextDecoder('utf8')
9097
for (let i = 0; i < namesBytes.length; i += 1) {
9198
if (!namesBytes[i]) {
9299
if (currNameStart < i) {
93-
let refName = namesBytes.toString('utf8', currNameStart, i)
94-
refName = this.renameRefSeq(refName)
100+
const refName = this.renameRefSeq(
101+
decoder.decode(namesBytes.subarray(currNameStart, i)),
102+
)
95103
refIdToName[currRefId] = refName
96104
refNameToId[refName] = currRefId
97105
}
98106
currNameStart = i + 1
99107
currRefId += 1
100108
}
101109
}
102-
return { refNameToId, refIdToName }
110+
return {
111+
refNameToId,
112+
refIdToName,
113+
}
103114
}
104115

105116
// fetch and parse the index
106117

107118
async _parse(opts: Options = {}) {
108119
const bytes = await unzip(await this.filehandle.readFile(opts))
120+
const dataView = new DataView(bytes.buffer)
109121

110122
// check CSI magic numbers
111123
let csiVersion
112-
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
124+
if (dataView.getUint32(0, true) === CSI1_MAGIC) {
113125
csiVersion = 1
114-
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
126+
} else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
115127
csiVersion = 2
116128
} else {
117129
throw new Error('Not a CSI file')
118-
// TODO: do we need to support big-endian CSI files?
119130
}
120131

121-
this.minShift = bytes.readInt32LE(4)
122-
this.depth = bytes.readInt32LE(8)
132+
this.minShift = dataView.getInt32(4, true)
133+
this.depth = dataView.getInt32(8, true)
123134
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
124135
const maxRefLength = 2 ** (this.minShift + this.depth * 3)
125-
const auxLength = bytes.readInt32LE(12)
136+
const auxLength = dataView.getInt32(12, true)
126137
const aux =
127138
auxLength && auxLength >= 30
128139
? this.parseAuxData(bytes, 16)
@@ -134,35 +145,33 @@ export default class CSI extends IndexFile {
134145
coordinateType: 'zero-based-half-open',
135146
format: 'generic',
136147
}
137-
const refCount = bytes.readInt32LE(16 + auxLength)
148+
const refCount = dataView.getInt32(16 + auxLength, true)
138149

139150
// read the indexes for each reference sequence
140151
let firstDataLine: VirtualOffset | undefined
141152
let currOffset = 16 + auxLength + 4
142153
const indices = new Array(refCount).fill(0).map(() => {
143-
// the binning index
144-
const binCount = bytes.readInt32LE(currOffset)
154+
const binCount = dataView.getInt32(currOffset, true)
145155
currOffset += 4
146156
const binIndex: Record<string, Chunk[]> = {}
147-
let stats // < provided by parsing a pseudo-bin, if present
157+
let stats
148158
for (let j = 0; j < binCount; j += 1) {
149-
const bin = bytes.readUInt32LE(currOffset)
159+
const bin = dataView.getUint32(currOffset, true)
150160
if (bin > this.maxBinNumber) {
151-
// this is a fake bin that actually has stats information
152-
// about the reference sequence in it
161+
// this is a fake bin that actually has stats information about the
162+
// reference sequence in it
153163
stats = this.parsePseudoBin(bytes, currOffset + 4)
154164
currOffset += 4 + 8 + 4 + 16 + 16
155165
} else {
156166
const loffset = fromBytes(bytes, currOffset + 4)
157167
firstDataLine = this._findFirstData(firstDataLine, loffset)
158-
const chunkCount = bytes.readInt32LE(currOffset + 12)
168+
const chunkCount = dataView.getInt32(currOffset + 12, true)
159169
currOffset += 16
160170
const chunks = new Array(chunkCount)
161171
for (let k = 0; k < chunkCount; k += 1) {
162172
const u = fromBytes(bytes, currOffset)
163173
const v = fromBytes(bytes, currOffset + 8)
164174
currOffset += 16
165-
// this._findFirstData(data, u)
166175
chunks[k] = new Chunk(u, v, bin)
167176
}
168177
binIndex[bin] = chunks
@@ -186,14 +195,15 @@ export default class CSI extends IndexFile {
186195
}
187196
}
188197

189-
parsePseudoBin(bytes: Buffer, offset: number) {
190-
const lineCount = longToNumber(
191-
Long.fromBytesLE(
192-
bytes.slice(offset + 28, offset + 36) as unknown as number[],
193-
true,
198+
parsePseudoBin(bytes: Uint8Array, offset: number) {
199+
return {
200+
lineCount: longToNumber(
201+
Long.fromBytesLE(
202+
bytes.subarray(offset + 28, offset + 36) as unknown as number[],
203+
true,
204+
),
194205
),
195-
)
196-
return { lineCount }
206+
}
197207
}
198208

199209
async blocksForRange(
@@ -216,9 +226,8 @@ export default class CSI extends IndexFile {
216226
return []
217227
}
218228

219-
// const { linearIndex, binIndex } = indexes
220-
221-
const overlappingBins = this.reg2bins(min, max) // List of bin #s that overlap min, max
229+
// List of bin #s that overlap min, max
230+
const overlappingBins = this.reg2bins(min, max)
222231
const chunks: Chunk[] = []
223232

224233
// Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned

src/indexFile.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { GenericFilehandle } from 'generic-filehandle'
1+
import { GenericFilehandle } from 'generic-filehandle2'
22
import VirtualOffset from './virtualOffset'
33
import Chunk from './chunk'
44

src/tabixIndexedFile.ts

+15-33
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
22
import LRU from 'quick-lru'
3-
import { Buffer } from 'buffer'
4-
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
3+
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
54
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
65
import { checkAbortSignal } from './util'
76
import IndexFile, { Options, IndexData } from './indexFile'
@@ -17,17 +16,14 @@ function isASCII(str: string) {
1716

1817
type GetLinesCallback = (line: string, fileOffset: number) => void
1918

20-
const decoder =
21-
typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined
22-
2319
interface GetLinesOpts {
2420
[key: string]: unknown
2521
signal?: AbortSignal
2622
lineCallback: GetLinesCallback
2723
}
2824

2925
interface ReadChunk {
30-
buffer: Buffer
26+
buffer: Uint8Array
3127
cpositions: number[]
3228
dpositions: number[]
3329
}
@@ -196,6 +192,7 @@ export default class TabixIndexedFile {
196192

197193
const chunks = await this.index.blocksForRange(refName, start, end, options)
198194
checkAbortSignal(signal)
195+
const decoder = new TextDecoder('utf8')
199196

200197
// now go through each chunk and parse and filter the lines out of it
201198
for (const c of chunks) {
@@ -209,11 +206,11 @@ export default class TabixIndexedFile {
209206
let blockStart = 0
210207
let pos = 0
211208

212-
const str = decoder?.decode(buffer) ?? buffer.toString()
213209
// fast path, Buffer is just ASCII chars and not gigantor, can be
214210
// converted to string and processed directly. if it is not ASCII or
215211
// gigantic (chrome max str len is 512Mb), we have to decode line by line
216-
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
212+
const str = decoder.decode(buffer)
213+
const strIsASCII = isASCII(str)
217214
while (blockStart < str.length) {
218215
let line: string
219216
let n: number
@@ -224,12 +221,12 @@ export default class TabixIndexedFile {
224221
}
225222
line = str.slice(blockStart, n)
226223
} else {
227-
n = buffer.indexOf('\n', blockStart)
224+
n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
228225
if (n === -1) {
229226
break
230227
}
231228
const b = buffer.slice(blockStart, n)
232-
line = decoder?.decode(b) ?? b.toString()
229+
line = decoder.decode(b)
233230
}
234231

235232
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
@@ -292,10 +289,10 @@ export default class TabixIndexedFile {
292289
checkAbortSignal(opts.signal)
293290

294291
const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
295-
// TODO: what if we don't have a firstDataLine, and the header
296-
// actually takes up more than one block? this case is not covered here
292+
// TODO: what if we don't have a firstDataLine, and the header actually
293+
// takes up more than one block? this case is not covered here
297294

298-
const buf = await this._readRegion(0, maxFetch, opts)
295+
const buf = await this.filehandle.read(maxFetch, 0, opts)
299296
const bytes = await unzip(buf)
300297

301298
// trim off lines after the last non-meta line
@@ -324,8 +321,9 @@ export default class TabixIndexedFile {
324321
* @returns {Promise} for a string
325322
*/
326323
async getHeader(opts: Options = {}) {
324+
const decoder = new TextDecoder('utf8')
327325
const bytes = await this.getHeaderBuffer(opts)
328-
return bytes.toString('utf8')
326+
return decoder.decode(bytes)
329327
}
330328

331329
/**
@@ -492,32 +490,16 @@ export default class TabixIndexedFile {
492490
return this.index.lineCount(refName, opts)
493491
}
494492

495-
async _readRegion(pos: number, size: number, opts: Options = {}) {
496-
const b = Buffer.alloc(size)
497-
const { bytesRead, buffer } = await this.filehandle.read(
498-
b,
499-
0,
500-
size,
501-
pos,
502-
opts,
503-
)
504-
505-
return buffer.subarray(0, bytesRead)
506-
}
507-
508493
/**
509494
* read and uncompress the data in a chunk (composed of one or more
510495
* contiguous bgzip blocks) of the file
511496
*/
512497
async readChunk(c: Chunk, opts: Options = {}) {
513-
// fetch the uncompressed data, uncompress carefully a block at a time, and
514-
// stop when done
515-
516-
const data = await this._readRegion(
517-
c.minv.blockPosition,
498+
const ret = await this.filehandle.read(
518499
c.fetchedSize(),
500+
c.minv.blockPosition,
519501
opts,
520502
)
521-
return unzipChunkSlice(data, c)
503+
return unzipChunkSlice(ret, c)
522504
}
523505
}

0 commit comments

Comments
 (0)