Skip to content

Commit 6687a1f

Browse files
authored
Small optimizations for large GWAS type data (#148)
1 parent a2828e4 commit 6687a1f

File tree

5 files changed

+1016
-1067
lines changed

5 files changed

+1016
-1067
lines changed

eslint.config.mjs

+10-4
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,17 @@ export default tseslint.config(
3636
},
3737
],
3838

39-
'no-underscore-dangle': 0,
39+
'no-console': [
40+
'warn',
41+
{
42+
allow: ['error', 'warn'],
43+
},
44+
],
45+
'no-underscore-dangle': 'off',
4046
curly: 'error',
41-
'@typescript-eslint/no-explicit-any': 0,
42-
'@typescript-eslint/explicit-module-boundary-types': 0,
43-
'@typescript-eslint/ban-ts-comment': 0,
47+
'@typescript-eslint/no-explicit-any': 'off',
48+
'@typescript-eslint/explicit-module-boundary-types': 'off',
49+
'@typescript-eslint/ban-ts-comment': 'off',
4450
semi: ['error', 'never'],
4551
'unicorn/no-new-array': 'off',
4652
'unicorn/no-empty-file': 'off',

package.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
"docs": "documentation readme --shallow src/tabixIndexedFile.ts --section TabixIndexedFile",
2727
"clean": "rimraf dist esm",
2828
"prebuild": "npm run clean && npm run lint",
29-
"build:esm": "tsc --target es2018 --outDir esm",
30-
"build:es5": "tsc --target es2015 --module commonjs --outDir dist",
29+
"build:esm": "tsc --outDir esm",
30+
"build:es5": "tsc --module commonjs --outDir dist",
3131
"build": "npm run build:esm && npm run build:es5",
3232
"postbuild": "webpack",
3333
"preversion": "npm run lint && npm test run && npm run build",
@@ -60,7 +60,7 @@
6060
"eslint": "^9.9.0",
6161
"eslint-config-prettier": "^9.1.0",
6262
"eslint-plugin-prettier": "^5.0.1",
63-
"eslint-plugin-unicorn": "^55.0.0",
63+
"eslint-plugin-unicorn": "^56.0.0",
6464
"prettier": "^3.3.3",
6565
"rimraf": "^6.0.1",
6666
"standard-changelog": "^6.0.0",

src/tabixIndexedFile.ts

+64-58
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ import Chunk from './chunk'
1010
import TBI from './tbi'
1111
import CSI from './csi'
1212

13+
/**
 * Tests whether a string consists solely of ASCII characters.
 *
 * @param str the string to test
 * @returns true when every UTF-16 code unit of `str` lies in the range
 * U+0000 to U+007F. An empty string also returns true, because the pattern
 * accepts zero characters.
 */
function isASCII(str: string) {
14+
// anchored at both ends, so one character outside the range fails the test
return /^[\u0000-\u007F]*$/.test(str)
15+
}
16+
1317
type GetLinesCallback = (line: string, fileOffset: number) => void
1418

1519
const decoder =
@@ -27,13 +31,9 @@ interface ReadChunk {
2731
dpositions: number[]
2832
}
2933

30-
/**
 * Returns a promise that resolves once `setTimeout` fires after `time`
 * milliseconds. The promise never rejects.
 *
 * @param time delay in milliseconds passed to `setTimeout`
 */
function timeout(time: number) {
31-
return new Promise(resolve => setTimeout(resolve, time))
32-
}
3334
export default class TabixIndexedFile {
3435
private filehandle: GenericFilehandle
3536
private index: IndexFile
36-
private yieldTime: number
3737
private renameRefSeq: (n: string) => string
3838
private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>
3939

@@ -58,9 +58,6 @@ export default class TabixIndexedFile {
5858
*
5959
* @param {tbiUrl} [args.tbiUrl]
6060
*
61-
* @param {number} [args.yieldTime] yield to main thread after N milliseconds
62-
* if reading features is taking a long time to avoid hanging main thread
63-
*
6461
* @param {function} [args.renameRefSeqs] optional function with sig `string
6562
* => string` to transform reference sequence names for the purpose of
6663
* indexing and querying. note that the data that is returned is not altered,
@@ -76,7 +73,6 @@ export default class TabixIndexedFile {
7673
csiPath,
7774
csiUrl,
7875
csiFilehandle,
79-
yieldTime = 500,
8076
renameRefSeqs = n => n,
8177
chunkCacheSize = 5 * 2 ** 20,
8278
}: {
@@ -89,7 +85,6 @@ export default class TabixIndexedFile {
8985
csiPath?: string
9086
csiUrl?: string
9187
csiFilehandle?: GenericFilehandle
92-
yieldTime?: number
9388
renameRefSeqs?: (n: string) => string
9489
chunkCacheSize?: number
9590
}) {
@@ -147,7 +142,6 @@ export default class TabixIndexedFile {
147142
}
148143

149144
this.renameRefSeq = renameRefSeqs
150-
this.yieldTime = yieldTime
151145
this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
152146
cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
153147
fill: (args: Chunk, signal?: AbortSignal) =>
@@ -203,9 +197,7 @@ export default class TabixIndexedFile {
203197
checkAbortSignal(signal)
204198

205199
// now go through each chunk and parse and filter the lines out of it
206-
let last = Date.now()
207200
for (const c of chunks) {
208-
let previousStartCoordinate: number | undefined
209201
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
210202
c.toString(),
211203
c,
@@ -215,13 +207,29 @@ export default class TabixIndexedFile {
215207
checkAbortSignal(signal)
216208
let blockStart = 0
217209
let pos = 0
218-
while (blockStart < buffer.length) {
219-
const n = buffer.indexOf('\n', blockStart)
220-
if (n === -1) {
221-
break
210+
211+
const str = decoder?.decode(buffer) ?? buffer.toString()
212+
// fast path: if the Buffer holds only ASCII chars and is not gigantic, it
213+
// can be converted to a string and processed directly. if it is not ASCII
214+
// or is gigantic (Chrome's max string length is ~512MB), we have to decode line by line
215+
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
216+
while (blockStart < str.length) {
217+
let line: string
218+
let n: number
219+
if (strIsASCII) {
220+
n = str.indexOf('\n', blockStart)
221+
if (n === -1) {
222+
break
223+
}
224+
line = str.slice(blockStart, n)
225+
} else {
226+
n = buffer.indexOf('\n', blockStart)
227+
if (n === -1) {
228+
break
229+
}
230+
const b = buffer.slice(blockStart, n)
231+
line = decoder?.decode(b) ?? b.toString()
222232
}
223-
const b = buffer.slice(blockStart, n)
224-
const line = decoder?.decode(b) ?? b.toString()
225233

226234
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
227235
if (dpositions) {
@@ -238,48 +246,31 @@ export default class TabixIndexedFile {
238246
line,
239247
)
240248

241-
// do a small check just to make sure that the lines are really sorted
242-
// by start coordinate
243-
if (
244-
previousStartCoordinate !== undefined &&
245-
startCoordinate !== undefined &&
246-
previousStartCoordinate > startCoordinate
247-
) {
248-
throw new Error(
249-
`Lines not sorted by start coordinate (${previousStartCoordinate} > ${startCoordinate}), this file is not usable with Tabix.`,
250-
)
251-
}
252-
previousStartCoordinate = startCoordinate
253-
254249
if (overlaps) {
255250
callback(
256-
line.trim(),
257-
// cpositions[pos] refers to actual file offset of a bgzip block boundaries
251+
line,
252+
// cpositions[pos] refers to actual file offset of a bgzip block
253+
// boundaries
258254
//
259-
// we multiply by (1 <<8) in order to make sure each block has a "unique"
260-
// address space so that data in that block could never overlap
255+
// we multiply by (1 <<8) in order to make sure each block has a
256+
// "unique" address space so that data in that block could never
257+
// overlap
261258
//
262-
// then the blockStart-dpositions is an uncompressed file offset from
263-
// that bgzip block boundary, and since the cpositions are multiplied by
264-
// (1 << 8) these uncompressed offsets get a unique space
259+
// then the blockStart-dpositions is an uncompressed file offset
260+
// from that bgzip block boundary, and since the cpositions are
261+
// multiplied by (1 << 8) these uncompressed offsets get a unique
262+
// space
265263
cpositions[pos]! * (1 << 8) +
266264
(blockStart - dpositions[pos]!) +
267265
c.minv.dataPosition +
268266
1,
269267
)
270268
} else if (startCoordinate !== undefined && startCoordinate >= end) {
271-
// the lines were overlapping the region, but now have stopped, so
272-
// we must be at the end of the relevant data and we can stop
273-
// processing data now
269+
// the lines were overlapping the region, but now have stopped, so we
270+
// must be at the end of the relevant data and we can stop processing
271+
// data now
274272
return
275273
}
276-
277-
// yield if we have emitted beyond the yield limit
278-
if (this.yieldTime && last - Date.now() > this.yieldTime) {
279-
last = Date.now()
280-
checkAbortSignal(signal)
281-
await timeout(1)
282-
}
283274
blockStart = n + 1
284275
}
285276
}
@@ -296,6 +287,7 @@ export default class TabixIndexedFile {
296287
async getHeaderBuffer(opts: Options = {}) {
297288
const { firstDataLine, metaChar, maxBlockSize } =
298289
await this.getMetadata(opts)
290+
299291
checkAbortSignal(opts.signal)
300292

301293
// eslint-disable-next-line @typescript-eslint/restrict-plus-operands
@@ -320,7 +312,7 @@ export default class TabixIndexedFile {
320312
lastNewline = i
321313
}
322314
}
323-
return bytes.slice(0, lastNewline + 1)
315+
return bytes.subarray(0, lastNewline + 1)
324316
}
325317
return bytes
326318
}
@@ -397,14 +389,17 @@ export default class TabixIndexedFile {
397389
let currentColumnStart = 0
398390
let refSeq = ''
399391
let startCoordinate = -Infinity
400-
for (let i = 0; i < line.length + 1; i += 1) {
401-
if (line[i] === '\t' || i === line.length) {
392+
const l = line.length
393+
for (let i = 0; i < l + 1; i++) {
394+
if (line[i] === '\t' || i === l) {
402395
if (currentColumnNumber === ref) {
403396
if (
404397
this.renameRefSeq(line.slice(currentColumnStart, i)) !==
405398
regionRefName
406399
) {
407-
return { overlaps: false }
400+
return {
401+
overlaps: false,
402+
}
408403
}
409404
} else if (currentColumnNumber === start) {
410405
startCoordinate = parseInt(line.slice(currentColumnStart, i), 10)
@@ -413,12 +408,18 @@ export default class TabixIndexedFile {
413408
startCoordinate -= 1
414409
}
415410
if (startCoordinate >= regionEnd) {
416-
return { startCoordinate, overlaps: false }
411+
return {
412+
startCoordinate,
413+
overlaps: false,
414+
}
417415
}
418416
if (end === 0 || end === start) {
419417
// if we have no end, we assume the feature is 1 bp long
420418
if (startCoordinate + 1 <= regionStart) {
421-
return { startCoordinate, overlaps: false }
419+
return {
420+
startCoordinate,
421+
overlaps: false,
422+
}
422423
}
423424
}
424425
} else if (format === 'VCF' && currentColumnNumber === 4) {
@@ -432,9 +433,11 @@ export default class TabixIndexedFile {
432433
refSeq,
433434
line.slice(currentColumnStart, i),
434435
)
435-
: parseInt(line.slice(currentColumnStart, i), 10)
436+
: Number.parseInt(line.slice(currentColumnStart, i), 10)
436437
if (endCoordinate <= regionStart) {
437-
return { overlaps: false }
438+
return {
439+
overlaps: false,
440+
}
438441
}
439442
}
440443
currentColumnStart = i + 1
@@ -444,7 +447,10 @@ export default class TabixIndexedFile {
444447
}
445448
}
446449
}
447-
return { startCoordinate, overlaps: true }
450+
return {
451+
startCoordinate,
452+
overlaps: true,
453+
}
448454
}
449455

450456
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
@@ -496,7 +502,7 @@ export default class TabixIndexedFile {
496502
opts,
497503
)
498504

499-
return buffer.slice(0, bytesRead)
505+
return buffer.subarray(0, bytesRead)
500506
}
501507

502508
/**

tsconfig.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"compilerOptions": {
33
"moduleResolution": "node",
4-
"lib": ["es2017", "es7", "es6", "dom"],
4+
"target": "es2018",
55
"declaration": true,
66
"noUncheckedIndexedAccess": true,
77
"outDir": "dist",

0 commit comments

Comments
 (0)