Small optimizations for large GWAS type data (#148)

cmdcolin · web-flow · commit 6687a1f48c41 · 2024-11-30T17:29:56.000-05:00
diff --git a/eslint.config.mjs b/eslint.config.mjs
@@ -36,11 +36,17 @@ export default tseslint.config(
         },
       ],
 
-      'no-underscore-dangle': 0,
+      'no-console': [
+        'warn',
+        {
+          allow: ['error', 'warn'],
+        },
+      ],
+      'no-underscore-dangle': 'off',
       curly: 'error',
-      '@typescript-eslint/no-explicit-any': 0,
-      '@typescript-eslint/explicit-module-boundary-types': 0,
-      '@typescript-eslint/ban-ts-comment': 0,
+      '@typescript-eslint/no-explicit-any': 'off',
+      '@typescript-eslint/explicit-module-boundary-types': 'off',
+      '@typescript-eslint/ban-ts-comment': 'off',
       semi: ['error', 'never'],
       'unicorn/no-new-array': 'off',
       'unicorn/no-empty-file': 'off',
diff --git a/package.json b/package.json
@@ -26,8 +26,8 @@
     "docs": "documentation readme --shallow src/tabixIndexedFile.ts --section TabixIndexedFile",
     "clean": "rimraf dist esm",
     "prebuild": "npm run clean && npm run lint",
-    "build:esm": "tsc --target es2018 --outDir esm",
-    "build:es5": "tsc --target es2015 --module commonjs --outDir dist",
+    "build:esm": "tsc --outDir esm",
+    "build:es5": "tsc --module commonjs --outDir dist",
     "build": "npm run build:esm && npm run build:es5",
     "postbuild": "webpack",
     "preversion": "npm run lint && npm test run && npm run build",
@@ -60,7 +60,7 @@
     "eslint": "^9.9.0",
     "eslint-config-prettier": "^9.1.0",
     "eslint-plugin-prettier": "^5.0.1",
-    "eslint-plugin-unicorn": "^55.0.0",
+    "eslint-plugin-unicorn": "^56.0.0",
     "prettier": "^3.3.3",
     "rimraf": "^6.0.1",
     "standard-changelog": "^6.0.0",
diff --git a/src/tabixIndexedFile.ts b/src/tabixIndexedFile.ts
@@ -10,6 +10,10 @@ import Chunk from './chunk'
 import TBI from './tbi'
 import CSI from './csi'
 
+function isASCII(str: string) {
+  return /^[\u0000-\u007F]*$/.test(str)
+}
+
 type GetLinesCallback = (line: string, fileOffset: number) => void
 
 const decoder =
@@ -27,13 +31,9 @@ interface ReadChunk {
   dpositions: number[]
 }
 
-function timeout(time: number) {
-  return new Promise(resolve => setTimeout(resolve, time))
-}
 export default class TabixIndexedFile {
   private filehandle: GenericFilehandle
   private index: IndexFile
-  private yieldTime: number
   private renameRefSeq: (n: string) => string
   private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>
 
@@ -58,9 +58,6 @@ export default class TabixIndexedFile {
    *
    * @param {tbiUrl} [args.tbiUrl]
    *
-   * @param {number} [args.yieldTime] yield to main thread after N milliseconds
-   * if reading features is taking a long time to avoid hanging main thread
-   *
    * @param {function} [args.renameRefSeqs] optional function with sig `string
    * => string` to transform reference sequence names for the purpose of
    * indexing and querying. note that the data that is returned is not altered,
@@ -76,7 +73,6 @@ export default class TabixIndexedFile {
     csiPath,
     csiUrl,
     csiFilehandle,
-    yieldTime = 500,
     renameRefSeqs = n => n,
     chunkCacheSize = 5 * 2 ** 20,
   }: {
@@ -89,7 +85,6 @@ export default class TabixIndexedFile {
     csiPath?: string
     csiUrl?: string
     csiFilehandle?: GenericFilehandle
-    yieldTime?: number
     renameRefSeqs?: (n: string) => string
     chunkCacheSize?: number
   }) {
@@ -147,7 +142,6 @@ export default class TabixIndexedFile {
     }
 
     this.renameRefSeq = renameRefSeqs
-    this.yieldTime = yieldTime
     this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
       cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
       fill: (args: Chunk, signal?: AbortSignal) =>
@@ -203,9 +197,7 @@ export default class TabixIndexedFile {
     checkAbortSignal(signal)
 
     // now go through each chunk and parse and filter the lines out of it
-    let last = Date.now()
     for (const c of chunks) {
-      let previousStartCoordinate: number | undefined
       const { buffer, cpositions, dpositions } = await this.chunkCache.get(
         c.toString(),
         c,
@@ -215,13 +207,29 @@ export default class TabixIndexedFile {
       checkAbortSignal(signal)
       let blockStart = 0
       let pos = 0
-      while (blockStart < buffer.length) {
-        const n = buffer.indexOf('\n', blockStart)
-        if (n === -1) {
-          break
+
+      const str = decoder?.decode(buffer) ?? buffer.toString()
+      // fast path, Buffer is just ASCII chars and not gigantor, can be
+      // converted to string and processed directly. if it is not ASCII or
+      // gigantic (chrome max str len is 512Mb), we have to decode line by line
+      const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
+      while (blockStart < str.length) {
+        let line: string
+        let n: number
+        if (strIsASCII) {
+          n = str.indexOf('\n', blockStart)
+          if (n === -1) {
+            break
+          }
+          line = str.slice(blockStart, n)
+        } else {
+          n = buffer.indexOf('\n', blockStart)
+          if (n === -1) {
+            break
+          }
+          const b = buffer.slice(blockStart, n)
+          line = decoder?.decode(b) ?? b.toString()
         }
-        const b = buffer.slice(blockStart, n)
-        const line = decoder?.decode(b) ?? b.toString()
 
         // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
         if (dpositions) {
@@ -238,48 +246,31 @@ export default class TabixIndexedFile {
           line,
         )
 
-        // do a small check just to make sure that the lines are really sorted
-        // by start coordinate
-        if (
-          previousStartCoordinate !== undefined &&
-          startCoordinate !== undefined &&
-          previousStartCoordinate > startCoordinate
-        ) {
-          throw new Error(
-            `Lines not sorted by start coordinate (${previousStartCoordinate} > ${startCoordinate}), this file is not usable with Tabix.`,
-          )
-        }
-        previousStartCoordinate = startCoordinate
-
         if (overlaps) {
           callback(
-            line.trim(),
-            // cpositions[pos] refers to actual file offset of a bgzip block boundaries
+            line,
+            // cpositions[pos] refers to actual file offset of a bgzip block
+            // boundaries
             //
-            // we multiply by (1 <<8) in order to make sure each block has a "unique"
-            // address space so that data in that block could never overlap
+            // we multiply by (1 <<8) in order to make sure each block has a
+            // "unique" address space so that data in that block could never
+            // overlap
             //
-            // then the blockStart-dpositions is an uncompressed file offset from
-            // that bgzip block boundary, and since the cpositions are multiplied by
-            // (1 << 8) these uncompressed offsets get a unique space
+            // then the blockStart-dpositions is an uncompressed file offset
+            // from that bgzip block boundary, and since the cpositions are
+            // multiplied by (1 << 8) these uncompressed offsets get a unique
+            // space
             cpositions[pos]! * (1 << 8) +
               (blockStart - dpositions[pos]!) +
               c.minv.dataPosition +
               1,
           )
         } else if (startCoordinate !== undefined && startCoordinate >= end) {
-          // the lines were overlapping the region, but now have stopped, so
-          // we must be at the end of the relevant data and we can stop
-          // processing data now
+          // the lines were overlapping the region, but now have stopped, so we
+          // must be at the end of the relevant data and we can stop processing
+          // data now
           return
         }
-
-        // yield if we have emitted beyond the yield limit
-        if (this.yieldTime && last - Date.now() > this.yieldTime) {
-          last = Date.now()
-          checkAbortSignal(signal)
-          await timeout(1)
-        }
         blockStart = n + 1
       }
     }
@@ -296,6 +287,7 @@ export default class TabixIndexedFile {
   async getHeaderBuffer(opts: Options = {}) {
     const { firstDataLine, metaChar, maxBlockSize } =
       await this.getMetadata(opts)
+
     checkAbortSignal(opts.signal)
 
     // eslint-disable-next-line @typescript-eslint/restrict-plus-operands
@@ -320,7 +312,7 @@ export default class TabixIndexedFile {
           lastNewline = i
         }
       }
-      return bytes.slice(0, lastNewline + 1)
+      return bytes.subarray(0, lastNewline + 1)
     }
     return bytes
   }
@@ -397,14 +389,17 @@ export default class TabixIndexedFile {
     let currentColumnStart = 0
     let refSeq = ''
     let startCoordinate = -Infinity
-    for (let i = 0; i < line.length + 1; i += 1) {
-      if (line[i] === '\t' || i === line.length) {
+    const l = line.length
+    for (let i = 0; i < l + 1; i++) {
+      if (line[i] === '\t' || i === l) {
         if (currentColumnNumber === ref) {
           if (
             this.renameRefSeq(line.slice(currentColumnStart, i)) !==
             regionRefName
           ) {
-            return { overlaps: false }
+            return {
+              overlaps: false,
+            }
           }
         } else if (currentColumnNumber === start) {
           startCoordinate = parseInt(line.slice(currentColumnStart, i), 10)
@@ -413,12 +408,18 @@ export default class TabixIndexedFile {
             startCoordinate -= 1
           }
           if (startCoordinate >= regionEnd) {
-            return { startCoordinate, overlaps: false }
+            return {
+              startCoordinate,
+              overlaps: false,
+            }
           }
           if (end === 0 || end === start) {
             // if we have no end, we assume the feature is 1 bp long
             if (startCoordinate + 1 <= regionStart) {
-              return { startCoordinate, overlaps: false }
+              return {
+                startCoordinate,
+                overlaps: false,
+              }
             }
           }
         } else if (format === 'VCF' && currentColumnNumber === 4) {
@@ -432,9 +433,11 @@ export default class TabixIndexedFile {
                   refSeq,
                   line.slice(currentColumnStart, i),
                 )
-              : parseInt(line.slice(currentColumnStart, i), 10)
+              : Number.parseInt(line.slice(currentColumnStart, i), 10)
           if (endCoordinate <= regionStart) {
-            return { overlaps: false }
+            return {
+              overlaps: false,
+            }
           }
         }
         currentColumnStart = i + 1
@@ -444,7 +447,10 @@ export default class TabixIndexedFile {
         }
       }
     }
-    return { startCoordinate, overlaps: true }
+    return {
+      startCoordinate,
+      overlaps: true,
+    }
   }
 
   _getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
@@ -496,7 +502,7 @@ export default class TabixIndexedFile {
       opts,
     )
 
-    return buffer.slice(0, bytesRead)
+    return buffer.subarray(0, bytesRead)
   }
 
   /**
diff --git a/tsconfig.json b/tsconfig.json
@@ -1,7 +1,7 @@
 {
   "compilerOptions": {
     "moduleResolution": "node",
-    "lib": ["es2017", "es7", "es6", "dom"],
+    "target": "es2018",
     "declaration": true,
     "noUncheckedIndexedAccess": true,
     "outDir": "dist",
diff --git a/yarn.lock b/yarn.lock

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"compilerOptions": {`
`3`	`3`	`"moduleResolution": "node",`
`4`		`- "lib": ["es2017", "es7", "es6", "dom"],`
	`4`	`+ "target": "es2018",`
`5`	`5`	`"declaration": true,`
`6`	`6`	`"noUncheckedIndexedAccess": true,`
`7`	`7`	`"outDir": "dist",`