Skip to content

Commit 9e5a27d

Browse files
committed
Gf2
1 parent 40a4b14 commit 9e5a27d

8 files changed

+88
-178
lines changed

package.json

+3-6
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
],
4343
"dependencies": {
4444
"@gmod/abortable-promise-cache": "^2.0.0",
45-
"@gmod/bgzf-filehandle": "^1.3.3",
46-
"generic-filehandle": "^3.0.0",
45+
"@gmod/bgzf-filehandle": "^2.0.0",
46+
"generic-filehandle2": "^0.0.1",
4747
"long": "^4.0.0",
4848
"quick-lru": "^4.0.0"
4949
},
@@ -55,16 +55,13 @@
5555
"@typescript-eslint/eslint-plugin": "^8.0.1",
5656
"@typescript-eslint/parser": "^8.0.1",
5757
"@vitest/coverage-v8": "^2.0.5",
58-
"buffer": "^6.0.3",
5958
"documentation": "^14.0.3",
6059
"eslint": "^9.9.0",
61-
"eslint-config-prettier": "^9.1.0",
62-
"eslint-plugin-prettier": "^5.0.1",
6360
"eslint-plugin-unicorn": "^56.0.0",
6461
"prettier": "^3.3.3",
6562
"rimraf": "^6.0.1",
6663
"standard-changelog": "^6.0.0",
67-
"typescript": "~5.6.0",
64+
"typescript": "^5.7.0",
6865
"typescript-eslint": "^8.0.1",
6966
"vitest": "^2.0.5",
7067
"webpack": "^5.93.0",

src/csi.ts

+35-28
Original file line numberDiff line numberDiff line change
@@ -49,26 +49,27 @@ export default class CSI extends IndexFile {
4949
throw new Error('CSI indexes do not support indexcov')
5050
}
5151

52-
parseAuxData(bytes: Buffer, offset: number) {
53-
const formatFlags = bytes.readInt32LE(offset)
52+
parseAuxData(bytes: Uint8Array, offset: number) {
53+
const dataView = new DataView(bytes.buffer)
54+
const formatFlags = dataView.getInt32(offset, true)
5455
const coordinateType =
5556
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
5657
const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
5758
if (!format) {
5859
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
5960
}
6061
const columnNumbers = {
61-
ref: bytes.readInt32LE(offset + 4),
62-
start: bytes.readInt32LE(offset + 8),
63-
end: bytes.readInt32LE(offset + 12),
62+
ref: dataView.getInt32(offset + 4, true),
63+
start: dataView.getInt32(offset + 8, true),
64+
end: dataView.getInt32(offset + 12, true),
6465
}
65-
const metaValue = bytes.readInt32LE(offset + 16)
66+
const metaValue = dataView.getInt32(offset + 16, true)
6667
const metaChar = metaValue ? String.fromCharCode(metaValue) : null
67-
const skipLines = bytes.readInt32LE(offset + 20)
68-
const nameSectionLength = bytes.readInt32LE(offset + 24)
68+
const skipLines = dataView.getInt32(offset + 20, true)
69+
const nameSectionLength = dataView.getInt32(offset + 24, true)
6970

7071
const { refIdToName, refNameToId } = this._parseNameBytes(
71-
bytes.slice(offset + 28, offset + 28 + nameSectionLength),
72+
bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
7273
)
7374

7475
return {
@@ -82,15 +83,16 @@ export default class CSI extends IndexFile {
8283
}
8384
}
8485

85-
_parseNameBytes(namesBytes: Buffer) {
86+
_parseNameBytes(namesBytes: Uint8Array) {
8687
let currRefId = 0
8788
let currNameStart = 0
8889
const refIdToName = []
8990
const refNameToId: Record<string, number> = {}
91+
const decoder = new TextDecoder('utf8')
9092
for (let i = 0; i < namesBytes.length; i += 1) {
9193
if (!namesBytes[i]) {
9294
if (currNameStart < i) {
93-
let refName = namesBytes.toString('utf8', currNameStart, i)
95+
let refName = decoder.decode(namesBytes.subarray(currNameStart, i))
9496
refName = this.renameRefSeq(refName)
9597
refIdToName[currRefId] = refName
9698
refNameToId[refName] = currRefId
@@ -99,30 +101,34 @@ export default class CSI extends IndexFile {
99101
currRefId += 1
100102
}
101103
}
102-
return { refNameToId, refIdToName }
104+
return {
105+
refNameToId,
106+
refIdToName,
107+
}
103108
}
104109

105110
// fetch and parse the index
106111

107112
async _parse(opts: Options = {}) {
108113
const bytes = await unzip(await this.filehandle.readFile(opts))
114+
const dataView = new DataView(bytes.buffer)
109115

110116
// check TBI magic numbers
111117
let csiVersion
112-
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
118+
if (dataView.getUint32(0, true) === CSI1_MAGIC) {
113119
csiVersion = 1
114-
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
120+
} else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
115121
csiVersion = 2
116122
} else {
117123
throw new Error('Not a CSI file')
118124
// TODO: do we need to support big-endian CSI files?
119125
}
120126

121-
this.minShift = bytes.readInt32LE(4)
122-
this.depth = bytes.readInt32LE(8)
127+
this.minShift = dataView.getInt32(4, true)
128+
this.depth = dataView.getInt32(8, true)
123129
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
124130
const maxRefLength = 2 ** (this.minShift + this.depth * 3)
125-
const auxLength = bytes.readInt32LE(12)
131+
const auxLength = dataView.getInt32(12, true)
126132
const aux =
127133
auxLength && auxLength >= 30
128134
? this.parseAuxData(bytes, 16)
@@ -134,19 +140,19 @@ export default class CSI extends IndexFile {
134140
coordinateType: 'zero-based-half-open',
135141
format: 'generic',
136142
}
137-
const refCount = bytes.readInt32LE(16 + auxLength)
143+
const refCount = dataView.getInt32(16 + auxLength, true)
138144

139145
// read the indexes for each reference sequence
140146
let firstDataLine: VirtualOffset | undefined
141147
let currOffset = 16 + auxLength + 4
142148
const indices = new Array(refCount).fill(0).map(() => {
143149
// the binning index
144-
const binCount = bytes.readInt32LE(currOffset)
150+
const binCount = dataView.getInt32(currOffset, true)
145151
currOffset += 4
146152
const binIndex: Record<string, Chunk[]> = {}
147153
let stats // < provided by parsing a pseudo-bin, if present
148154
for (let j = 0; j < binCount; j += 1) {
149-
const bin = bytes.readUInt32LE(currOffset)
155+
const bin = dataView.getUint32(currOffset, true)
150156
if (bin > this.maxBinNumber) {
151157
// this is a fake bin that actually has stats information
152158
// about the reference sequence in it
@@ -155,7 +161,7 @@ export default class CSI extends IndexFile {
155161
} else {
156162
const loffset = fromBytes(bytes, currOffset + 4)
157163
firstDataLine = this._findFirstData(firstDataLine, loffset)
158-
const chunkCount = bytes.readInt32LE(currOffset + 12)
164+
const chunkCount = dataView.getInt32(currOffset + 12, true)
159165
currOffset += 16
160166
const chunks = new Array(chunkCount)
161167
for (let k = 0; k < chunkCount; k += 1) {
@@ -186,14 +192,15 @@ export default class CSI extends IndexFile {
186192
}
187193
}
188194

189-
parsePseudoBin(bytes: Buffer, offset: number) {
190-
const lineCount = longToNumber(
191-
Long.fromBytesLE(
192-
bytes.slice(offset + 28, offset + 36) as unknown as number[],
193-
true,
195+
parsePseudoBin(bytes: Uint8Array, offset: number) {
196+
return {
197+
lineCount: longToNumber(
198+
Long.fromBytesLE(
199+
bytes.slice(offset + 28, offset + 36) as unknown as number[],
200+
true,
201+
),
194202
),
195-
)
196-
return { lineCount }
203+
}
197204
}
198205

199206
async blocksForRange(

src/indexFile.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { GenericFilehandle } from 'generic-filehandle'
1+
import { GenericFilehandle } from 'generic-filehandle2'
22
import VirtualOffset from './virtualOffset'
33
import Chunk from './chunk'
44

src/tabixIndexedFile.ts

+13-34
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
22
import LRU from 'quick-lru'
3-
import { Buffer } from 'buffer'
4-
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
3+
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
54
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
65
import { checkAbortSignal } from './util'
76
import IndexFile, { Options, IndexData } from './indexFile'
@@ -17,17 +16,14 @@ function isASCII(str: string) {
1716

1817
type GetLinesCallback = (line: string, fileOffset: number) => void
1918

20-
const decoder =
21-
typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined
22-
2319
interface GetLinesOpts {
2420
[key: string]: unknown
2521
signal?: AbortSignal
2622
lineCallback: GetLinesCallback
2723
}
2824

2925
interface ReadChunk {
30-
buffer: Buffer
26+
buffer: Uint8Array
3127
cpositions: number[]
3228
dpositions: number[]
3329
}
@@ -196,6 +192,7 @@ export default class TabixIndexedFile {
196192

197193
const chunks = await this.index.blocksForRange(refName, start, end, options)
198194
checkAbortSignal(signal)
195+
const decoder = new TextDecoder('utf8')
199196

200197
// now go through each chunk and parse and filter the lines out of it
201198
for (const c of chunks) {
@@ -209,11 +206,11 @@ export default class TabixIndexedFile {
209206
let blockStart = 0
210207
let pos = 0
211208

212-
const str = decoder?.decode(buffer) ?? buffer.toString()
213209
// fast path, Buffer is just ASCII chars and not gigantor, can be
214210
// converted to string and processed directly. if it is not ASCII or
215211
// gigantic (chrome max str len is 512Mb), we have to decode line by line
216-
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
212+
const str = decoder.decode(buffer)
213+
const strIsASCII = isASCII(str)
217214
while (blockStart < str.length) {
218215
let line: string
219216
let n: number
@@ -224,12 +221,12 @@ export default class TabixIndexedFile {
224221
}
225222
line = str.slice(blockStart, n)
226223
} else {
227-
n = buffer.indexOf('\n', blockStart)
224+
n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
228225
if (n === -1) {
229226
break
230227
}
231228
const b = buffer.slice(blockStart, n)
232-
line = decoder?.decode(b) ?? b.toString()
229+
line = decoder.decode(b)
233230
}
234231

235232
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
@@ -292,10 +289,10 @@ export default class TabixIndexedFile {
292289
checkAbortSignal(opts.signal)
293290

294291
const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
295-
// TODO: what if we don't have a firstDataLine, and the header
296-
// actually takes up more than one block? this case is not covered here
292+
// TODO: what if we don't have a firstDataLine, and the header actually
293+
// takes up more than one block? this case is not covered here
297294

298-
const buf = await this._readRegion(0, maxFetch, opts)
295+
const buf = await this.filehandle.read(0, maxFetch, opts)
299296
const bytes = await unzip(buf)
300297

301298
// trim off lines after the last non-meta line
@@ -492,32 +489,14 @@ export default class TabixIndexedFile {
492489
return this.index.lineCount(refName, opts)
493490
}
494491

495-
async _readRegion(pos: number, size: number, opts: Options = {}) {
496-
const b = Buffer.alloc(size)
497-
const { bytesRead, buffer } = await this.filehandle.read(
498-
b,
499-
0,
500-
size,
501-
pos,
502-
opts,
503-
)
504-
505-
return buffer.subarray(0, bytesRead)
506-
}
507-
508492
/**
509493
* read and uncompress the data in a chunk (composed of one or more
510494
* contiguous bgzip blocks) of the file
511495
*/
512496
async readChunk(c: Chunk, opts: Options = {}) {
513-
// fetch the uncompressed data, uncompress carefully a block at a time, and
514-
// stop when done
515-
516-
const data = await this._readRegion(
517-
c.minv.blockPosition,
518-
c.fetchedSize(),
519-
opts,
497+
return unzipChunkSlice(
498+
await this.filehandle.read(c.minv.blockPosition, c.fetchedSize(), opts),
499+
c,
520500
)
521-
return unzipChunkSlice(data, c)
522501
}
523502
}

0 commit comments

Comments
 (0)