@@ -10,6 +10,10 @@ import Chunk from './chunk'
 import TBI from './tbi'
 import CSI from './csi'

+function isASCII(str: string) {
+  return /^[\u0000-\u007F]*$/.test(str)
+}
+
 type GetLinesCallback = (line: string, fileOffset: number) => void

 const decoder =
@@ -27,13 +31,9 @@ interface ReadChunk {
   dpositions: number[]
 }

-function timeout(time: number) {
-  return new Promise(resolve => setTimeout(resolve, time))
-}
 export default class TabixIndexedFile {
   private filehandle: GenericFilehandle
   private index: IndexFile
-  private yieldTime: number
   private renameRefSeq: (n: string) => string
   private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>

@@ -58,9 +58,6 @@ export default class TabixIndexedFile {
   * 
   * @param {tbiUrl} [args.tbiUrl]
   * 
-  * @param {number} [args.yieldTime] yield to main thread after N milliseconds
-  * if reading features is taking a long time to avoid hanging main thread
-  *
   * @param {function} [args.renameRefSeqs] optional function with sig `string
   * => string` to transform reference sequence names for the purpose of
   * indexing and querying. note that the data that is returned is not altered,
@@ -76,7 +73,6 @@ export default class TabixIndexedFile {
     csiPath,
     csiUrl,
     csiFilehandle,
-    yieldTime = 500,
     renameRefSeqs = n => n,
     chunkCacheSize = 5 * 2 ** 20,
   }: {
@@ -89,7 +85,6 @@ export default class TabixIndexedFile {
     csiPath?: string
     csiUrl?: string
     csiFilehandle?: GenericFilehandle
-    yieldTime?: number
     renameRefSeqs?: (n: string) => string
     chunkCacheSize?: number
   }) {
@@ -147,7 +142,6 @@ export default class TabixIndexedFile {
     }

     this.renameRefSeq = renameRefSeqs
-    this.yieldTime = yieldTime
     this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
       cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
       fill: (args: Chunk, signal?: AbortSignal) =>
@@ -203,9 +197,7 @@ export default class TabixIndexedFile {
     checkAbortSignal(signal)

     // now go through each chunk and parse and filter the lines out of it
-    let last = Date.now()
     for (const c of chunks) {
-      let previousStartCoordinate: number | undefined
       const { buffer, cpositions, dpositions } = await this.chunkCache.get(
         c.toString(),
         c,
@@ -215,13 +207,29 @@ export default class TabixIndexedFile {
       checkAbortSignal(signal)
       let blockStart = 0
       let pos = 0
-      while (blockStart < buffer.length) {
-        const n = buffer.indexOf('\n', blockStart)
-        if (n === -1) {
-          break
+
+      const str = decoder?.decode(buffer) ?? buffer.toString()
+      // fast path, Buffer is just ASCII chars and not gigantor, can be
+      // converted to string and processed directly. if it is not ASCII or
+      // gigantic (chrome max str len is 512Mb), we have to decode line by line
+      const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
+      while (blockStart < str.length) {
+        let line: string
+        let n: number
+        if (strIsASCII) {
+          n = str.indexOf('\n', blockStart)
+          if (n === -1) {
+            break
+          }
+          line = str.slice(blockStart, n)
+        } else {
+          n = buffer.indexOf('\n', blockStart)
+          if (n === -1) {
+            break
+          }
+          const b = buffer.slice(blockStart, n)
+          line = decoder?.decode(b) ?? b.toString()
         }
-        const b = buffer.slice(blockStart, n)
-        const line = decoder?.decode(b) ?? b.toString()

         // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
         if (dpositions) {
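The fast path in this hunk works because pure-ASCII text has string indices equal to byte offsets, so the chunk can be decoded once and scanned with String#indexOf; for non-ASCII (or very large) buffers, blockStart must stay a byte offset, so each line is sliced from the raw bytes and decoded separately. A minimal standalone sketch of the same idea, assuming a global TextDecoder (names like eachLine and chunk are illustrative, not the library's API):

const utf8 = new TextDecoder('utf8')

function isASCII(str: string) {
  return /^[\u0000-\u007F]*$/.test(str)
}

function* eachLine(chunk: Uint8Array): Generator<string> {
  const str = utf8.decode(chunk)
  // ASCII and well under engine string limits: string indices are byte
  // offsets, so we can scan the decoded string directly
  const fast = chunk.length < 500_000_000 && isASCII(str)
  let blockStart = 0
  while (blockStart < chunk.length) {
    if (fast) {
      const n = str.indexOf('\n', blockStart)
      if (n === -1) {
        break
      }
      yield str.slice(blockStart, n)
      blockStart = n + 1
    } else {
      // otherwise find the newline in the raw bytes and decode one line
      const n = chunk.indexOf(0x0a, blockStart) // 0x0a is '\n'
      if (n === -1) {
        break
      }
      yield utf8.decode(chunk.subarray(blockStart, n))
      blockStart = n + 1
    }
  }
}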
@@ -238,48 +246,31 @@ export default class TabixIndexedFile {
           line,
         )

-        // do a small check just to make sure that the lines are really sorted
-        // by start coordinate
-        if (
-          previousStartCoordinate !== undefined &&
-          startCoordinate !== undefined &&
-          previousStartCoordinate > startCoordinate
-        ) {
-          throw new Error(
-            `Lines not sorted by start coordinate (${previousStartCoordinate} > ${startCoordinate}), this file is not usable with Tabix.`,
-          )
-        }
-        previousStartCoordinate = startCoordinate
-
         if (overlaps) {
           callback(
-            line.trim(),
-            // cpositions[pos] refers to actual file offset of a bgzip block boundaries
+            line,
+            // cpositions[pos] refers to actual file offset of a bgzip block
+            // boundaries
             //
-            // we multiply by (1 << 8) in order to make sure each block has a "unique"
-            // address space so that data in that block could never overlap
+            // we multiply by (1 << 8) in order to make sure each block has a
+            // "unique" address space so that data in that block could never
+            // overlap
             //
-            // then the blockStart-dpositions is an uncompressed file offset from
-            // that bgzip block boundary, and since the cpositions are multiplied by
-            // (1 << 8) these uncompressed offsets get a unique space
+            // then the blockStart-dpositions is an uncompressed file offset
+            // from that bgzip block boundary, and since the cpositions are
+            // multiplied by (1 << 8) these uncompressed offsets get a unique
+            // space
            cpositions[pos]! * (1 << 8) +
              (blockStart - dpositions[pos]!) +
              c.minv.dataPosition +
              1,
          )
        } else if (startCoordinate !== undefined && startCoordinate >= end) {
-          // the lines were overlapping the region, but now have stopped, so
-          // we must be at the end of the relevant data and we can stop
-          // processing data now
+          // the lines were overlapping the region, but now have stopped, so we
+          // must be at the end of the relevant data and we can stop processing
+          // data now
           return
         }
-
-        // yield if we have emitted beyond the yield limit
-        if (this.yieldTime && last - Date.now() > this.yieldTime) {
-          last = Date.now()
-          checkAbortSignal(signal)
-          await timeout(1)
-        }
         blockStart = n + 1
       }
     }
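For reference, the fileOffset handed to the callback is a synthetic, stable per-line coordinate rather than a literal byte position. A hedged restatement of the arithmetic as a standalone helper (the parameter names are illustrative, not the library's API):

function virtualLineOffset(
  cposition: number, // compressed file offset of the line's bgzip block
  dposition: number, // uncompressed offset corresponding to that block start
  blockStart: number, // uncompressed offset of the line within the chunk
  minvDataPosition: number, // the chunk's starting data position
): number {
  // scale the block's file offset by (1 << 8) so each block gets its own
  // address range, then add the line's distance from the block boundary
  return cposition * (1 << 8) + (blockStart - dposition) + minvDataPosition + 1
}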
@@ -296,6 +287,7 @@ export default class TabixIndexedFile {
   async getHeaderBuffer(opts: Options = {}) {
     const { firstDataLine, metaChar, maxBlockSize } =
       await this.getMetadata(opts)
+
     checkAbortSignal(opts.signal)

     // eslint-disable-next-line @typescript-eslint/restrict-plus-operands
@@ -320,7 +312,7 @@ export default class TabixIndexedFile {
          lastNewline = i
        }
      }
-      return bytes.slice(0, lastNewline + 1)
+      return bytes.subarray(0, lastNewline + 1)
    }
    return bytes
  }
@@ -397,14 +389,17 @@ export default class TabixIndexedFile {
     let currentColumnStart = 0
     let refSeq = ''
     let startCoordinate = -Infinity
-    for (let i = 0; i < line.length + 1; i += 1) {
-      if (line[i] === '\t' || i === line.length) {
+    const l = line.length
+    for (let i = 0; i < l + 1; i++) {
+      if (line[i] === '\t' || i === l) {
         if (currentColumnNumber === ref) {
           if (
             this.renameRefSeq(line.slice(currentColumnStart, i)) !==
             regionRefName
           ) {
-            return { overlaps: false }
+            return {
+              overlaps: false,
+            }
           }
         } else if (currentColumnNumber === start) {
           startCoordinate = parseInt(line.slice(currentColumnStart, i), 10)
@@ -413,12 +408,18 @@ export default class TabixIndexedFile {
             startCoordinate -= 1
           }
           if (startCoordinate >= regionEnd) {
-            return { startCoordinate, overlaps: false }
+            return {
+              startCoordinate,
+              overlaps: false,
+            }
           }
           if (end === 0 || end === start) {
             // if we have no end, we assume the feature is 1 bp long
             if (startCoordinate + 1 <= regionStart) {
-              return { startCoordinate, overlaps: false }
+              return {
+                startCoordinate,
+                overlaps: false,
+              }
             }
           }
         } else if (format === 'VCF' && currentColumnNumber === 4) {
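The early exits in this hunk amount to a half-open interval overlap test: with a 0-based feature [start, end) and a query region [regionStart, regionEnd), the feature overlaps iff it starts before the region ends and ends after the region starts, with 1 bp features treated as end = start + 1. A compact sketch of that test (the function name is illustrative):

function overlapsRegion(
  featureStart: number, // 0-based inclusive start
  featureEnd: number, // exclusive end; pass featureStart + 1 for 1 bp features
  regionStart: number,
  regionEnd: number,
): boolean {
  // disjoint iff the feature starts at/after the region end, or ends
  // at/before the region start; overlap is the negation of both
  return featureStart < regionEnd && featureEnd > regionStart
}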
@@ -432,9 +433,11 @@ export default class TabixIndexedFile {
                 refSeq,
                 line.slice(currentColumnStart, i),
               )
-            : parseInt(line.slice(currentColumnStart, i), 10)
+            : Number.parseInt(line.slice(currentColumnStart, i), 10)
           if (endCoordinate <= regionStart) {
-            return { overlaps: false }
+            return {
+              overlaps: false,
+            }
           }
         }
         currentColumnStart = i + 1
@@ -444,7 +447,10 @@ export default class TabixIndexedFile {
        }
      }
    }
-    return { startCoordinate, overlaps: true }
+    return {
+      startCoordinate,
+      overlaps: true,
+    }
   }

   _getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
@@ -496,7 +502,7 @@ export default class TabixIndexedFile {
       opts,
     )

-    return buffer.slice(0, bytesRead)
+    return buffer.subarray(0, bytesRead)
   }

   /**
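A note on the slice-to-subarray changes in the hunks above: on a plain Uint8Array, slice() copies the bytes while subarray() returns a view over the same memory, so trimming the header and chunk buffers this way avoids an allocation; Node's Buffer#slice (which already returned a view) is also deprecated in favor of subarray. A small sketch of the difference:

const bytes = new Uint8Array([1, 2, 3, 4])
const view = bytes.subarray(0, 2) // shares memory with bytes, no copy
const copy = bytes.slice(0, 2) // independent copy
bytes[0] = 9
console.log(view[0]) // 9: the view sees the write
console.log(copy[0]) // 1: the copy does not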