From a4e71c6d6178ad28680dd4e723c64ea3d8828cb7 Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Sat, 20 Jan 2024 17:12:40 +0100
Subject: [PATCH] feat: add support to BOM at the beginning of the stream

---
 packages/node/test/bom.ts               | 12 ++++
 packages/plainjs/dist/deno/tokenizer.ts | 73 +++++++++++++++++++++++--
 packages/plainjs/src/tokenizer.ts       | 73 +++++++++++++++++++++++--
 packages/plainjs/test/bom.ts            | 44 +++++++++++++++
 packages/plainjs/test/inputs.ts         |  4 +-
 packages/whatwg/test/bom.ts             | 44 +++++++++++++++
 packages/whatwg/test/inputs.ts          |  4 +-
 7 files changed, 240 insertions(+), 14 deletions(-)
 create mode 100644 packages/node/test/bom.ts
 create mode 100644 packages/plainjs/test/bom.ts
 create mode 100644 packages/whatwg/test/bom.ts

diff --git a/packages/node/test/bom.ts b/packages/node/test/bom.ts
new file mode 100644
index 0000000..0fba27a
--- /dev/null
+++ b/packages/node/test/bom.ts
@@ -0,0 +1,12 @@
+import JSONParser from "../src/jsonparser.js";
+import { runJSONParserTest } from "./utils/testRunner.js";
+
+describe("BOM", () => {
+  test("should support UTF-8 BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint8Array([0xef, 0xbb, 0xbf, 0x31]),
+      ({ value }) => expect(value).toBe(1),
+    );
+  });
+});
diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index 6e8d2f8..e7da82f 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -38,6 +38,8 @@ const enum TokenizerStates {
   NUMBER_AFTER_E_AND_SIGN,
   NUMBER_AFTER_E_AND_DIGIT,
   SEPARATOR,
+  BOM_OR_START,
+  BOM,
 }
 
 function TokenizerStateToString(tokenizerState: TokenizerStates): string {
@@ -71,6 +73,8 @@ function TokenizerStateToString(tokenizerState: TokenizerStates): string {
     "NUMBER_AFTER_E_AND_SIGN",
     "NUMBER_AFTER_E_AND_DIGIT",
     "SEPARATOR",
+    "BOM_OR_START",
+    "BOM",
   ][tokenizerState];
 }
 
@@ -97,7 +101,10 @@ export class TokenizerError extends Error {
 }
 
 export default class Tokenizer {
-  private state = TokenizerStates.START;
+  private state = TokenizerStates.BOM_OR_START;
+
+  private bom?: number[];
+  private bomIndex = 0;
 
   private emitPartialTokens: boolean;
   private separator?: string;
@@ -144,11 +151,14 @@
       buffer = input;
     } else if (typeof input === "string") {
       buffer = this.encoder.encode(input);
-    } else if (
-      (typeof input === "object" && "buffer" in input) ||
-      Array.isArray(input)
-    ) {
+    } else if (Array.isArray(input)) {
       buffer = Uint8Array.from(input);
+    } else if (ArrayBuffer.isView(input)) {
+      buffer = new Uint8Array(
+        input.buffer,
+        input.byteOffset,
+        input.byteLength,
+      );
     } else {
       throw new TypeError(
         "Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings.",
@@ -158,6 +168,45 @@
     for (let i = 0; i < buffer.length; i += 1) {
       const n = buffer[i]; // get current byte from buffer
       switch (this.state) {
+        // @ts-ignore fall through case
+        case TokenizerStates.BOM_OR_START:
+          if (input instanceof Uint8Array && n === 0xef) {
+            this.bom = [0xef, 0xbb, 0xbf];
+            this.bomIndex += 1;
+            this.state = TokenizerStates.BOM;
+            continue;
+          }
+
+          if (input instanceof Uint16Array) {
+            if (n === 0xfe) {
+              this.bom = [0xfe, 0xff];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+            if (n === 0xff) {
+              this.bom = [0xff, 0xfe];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+          }
+
+          if (input instanceof Uint32Array) {
+            if (n === 0x00) {
+              this.bom = [0x00, 0x00, 0xfe, 0xff];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+            if (n === 0xff) {
+              this.bom = [0xff, 0xfe, 0x00, 0x00];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+          }
+          // Allow cascading
         case TokenizerStates.START:
           this.offset += 1;
 
@@ -629,6 +678,19 @@
             this.separatorIndex = 0;
           }
           continue;
+        // BOM support
+        case TokenizerStates.BOM:
+          if (n === this.bom![this.bomIndex]) {
+            if (this.bomIndex === this.bom!.length - 1) {
+              this.state = TokenizerStates.START;
+              this.bom = undefined;
+              this.bomIndex = 0;
+              continue;
+            }
+            this.bomIndex += 1;
+            continue;
+          }
+          break;
         case TokenizerStates.ENDED:
           if (
             n === charset.SPACE ||
@@ -745,6 +807,7 @@
         this.emitNumber();
         this.onEnd();
         break;
+      case TokenizerStates.BOM_OR_START:
       case TokenizerStates.START:
       case TokenizerStates.ERROR:
       case TokenizerStates.SEPARATOR:
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index 0880240..52b66b8 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -38,6 +38,8 @@ const enum TokenizerStates {
   NUMBER_AFTER_E_AND_SIGN,
   NUMBER_AFTER_E_AND_DIGIT,
   SEPARATOR,
+  BOM_OR_START,
+  BOM,
 }
 
 function TokenizerStateToString(tokenizerState: TokenizerStates): string {
@@ -71,6 +73,8 @@ function TokenizerStateToString(tokenizerState: TokenizerStates): string {
     "NUMBER_AFTER_E_AND_SIGN",
     "NUMBER_AFTER_E_AND_DIGIT",
     "SEPARATOR",
+    "BOM_OR_START",
+    "BOM",
   ][tokenizerState];
 }
 
@@ -97,7 +101,10 @@ export class TokenizerError extends Error {
 }
 
 export default class Tokenizer {
-  private state = TokenizerStates.START;
+  private state = TokenizerStates.BOM_OR_START;
+
+  private bom?: number[];
+  private bomIndex = 0;
 
   private emitPartialTokens: boolean;
   private separator?: string;
@@ -144,11 +151,14 @@
       buffer = input;
     } else if (typeof input === "string") {
       buffer = this.encoder.encode(input);
-    } else if (
-      (typeof input === "object" && "buffer" in input) ||
-      Array.isArray(input)
-    ) {
+    } else if (Array.isArray(input)) {
       buffer = Uint8Array.from(input);
+    } else if (ArrayBuffer.isView(input)) {
+      buffer = new Uint8Array(
+        input.buffer,
+        input.byteOffset,
+        input.byteLength,
+      );
     } else {
       throw new TypeError(
         "Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings.",
@@ -158,6 +168,45 @@
     for (let i = 0; i < buffer.length; i += 1) {
       const n = buffer[i]; // get current byte from buffer
       switch (this.state) {
+        // @ts-ignore fall through case
+        case TokenizerStates.BOM_OR_START:
+          if (input instanceof Uint8Array && n === 0xef) {
+            this.bom = [0xef, 0xbb, 0xbf];
+            this.bomIndex += 1;
+            this.state = TokenizerStates.BOM;
+            continue;
+          }
+
+          if (input instanceof Uint16Array) {
+            if (n === 0xfe) {
+              this.bom = [0xfe, 0xff];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+            if (n === 0xff) {
+              this.bom = [0xff, 0xfe];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+          }
+
+          if (input instanceof Uint32Array) {
+            if (n === 0x00) {
+              this.bom = [0x00, 0x00, 0xfe, 0xff];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+            if (n === 0xff) {
+              this.bom = [0xff, 0xfe, 0x00, 0x00];
+              this.bomIndex += 1;
+              this.state = TokenizerStates.BOM;
+              continue;
+            }
+          }
+          // Allow cascading
         case TokenizerStates.START:
           this.offset += 1;
 
@@ -629,6 +678,19 @@
             this.separatorIndex = 0;
           }
           continue;
+        // BOM support
+        case TokenizerStates.BOM:
+          if (n === this.bom![this.bomIndex]) {
+            if (this.bomIndex === this.bom!.length - 1) {
+              this.state = TokenizerStates.START;
+              this.bom = undefined;
+              this.bomIndex = 0;
+              continue;
+            }
+            this.bomIndex += 1;
+            continue;
+          }
+          break;
         case TokenizerStates.ENDED:
           if (
             n === charset.SPACE ||
@@ -745,6 +807,7 @@
         this.emitNumber();
        this.onEnd();
         break;
+      case TokenizerStates.BOM_OR_START:
       case TokenizerStates.START:
       case TokenizerStates.ERROR:
       case TokenizerStates.SEPARATOR:
diff --git a/packages/plainjs/test/bom.ts b/packages/plainjs/test/bom.ts
new file mode 100644
index 0000000..e4d21ff
--- /dev/null
+++ b/packages/plainjs/test/bom.ts
@@ -0,0 +1,44 @@
+import JSONParser from "../src/jsonparser.js";
+import { runJSONParserTest } from "./utils/testRunner.js";
+
+describe("BOM", () => {
+  test("should support UTF-8 BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint8Array([0xef, 0xbb, 0xbf, 0x31]),
+      ({ value }) => expect(value).toBe(1),
+    );
+  });
+
+  test("should support UTF-16 BE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint16Array([0xfeff, 0x3131]),
+      ({ value }) => expect(value).toBe(11),
+    );
+  });
+
+  test("should support UTF-16 LE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint16Array([0xfffe, 0x3131]),
+      ({ value }) => expect(value).toBe(11),
+    );
+  });
+
+  test("should support UTF-32 BE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint32Array([0x0000feff, 0x31313131]),
+      ({ value }) => expect(value).toBe(1111),
+    );
+  });
+
+  test("should support UTF-32 LE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint32Array([0xfffe0000, 0x31313131]),
+      ({ value }) => expect(value).toBe(1111),
+    );
+  });
+});
diff --git a/packages/plainjs/test/inputs.ts b/packages/plainjs/test/inputs.ts
index 1278295..ad8c92c 100644
--- a/packages/plainjs/test/inputs.ts
+++ b/packages/plainjs/test/inputs.ts
@@ -15,11 +15,11 @@ describe("inputs", () => {
       expected: ["test"],
     },
     {
-      value: new Uint16Array([116, 101, 115, 116]),
+      value: new Uint16Array([25972, 29811]),
       expected: ["test"],
     },
     {
-      value: new Uint32Array([116, 101, 115, 116]),
+      value: new Uint32Array([1953719668]),
       expected: ["test"],
     },
     {
diff --git a/packages/whatwg/test/bom.ts b/packages/whatwg/test/bom.ts
new file mode 100644
index 0000000..cb78bb1
--- /dev/null
+++ b/packages/whatwg/test/bom.ts
@@ -0,0 +1,44 @@
+import JSONParser from "../src/jsonparser.js";
+import { runJSONParserTest } from "./utils/testRunner.js";
+
+describe("BOM", () => {
+  test("should support UTF-8 BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint8Array([0xef, 0xbb, 0xbf, 0x31]),
+      ({ value }) => expect(value).toBe(1),
+    );
+  });
+
+  test("should support UTF-16 BE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint16Array([0xfeff, 0x3131]),
+      ({ value }) => expect(value).toBe(11),
+    );
+  });
+
+  test("should support UTF-16 LE BOM", () => {
+    runJSONParserTest(
+      new JSONParser(),
+      new Uint16Array([0xfffe, 0x3131]),
+      ({ value }) => expect(value).toBe(11),
+    );
+  });
+
+  // test("should support UTF-32 BE BOM", () => {
+  //   runJSONParserTest(
+  //     new JSONParser(),
+  //     new Uint32Array([0x0000feff, 0x31313131]),
+  //     ({ value }) => expect(value).toBe(1111),
+  //   );
+  // });
+
+  // test("should support UTF-32 LE BOM", () => {
+  //   runJSONParserTest(
+  //     new JSONParser(),
+  //     new Uint32Array([0xfffe0000, 0x31313131]),
+  //     ({ value }) => expect(value).toBe(1111),
+  //   );
+  // });
+});
diff --git a/packages/whatwg/test/inputs.ts b/packages/whatwg/test/inputs.ts
index 8eb2b15..8615276 100644
--- a/packages/whatwg/test/inputs.ts
+++ b/packages/whatwg/test/inputs.ts
@@ -15,11 +15,11 @@ describe("inputs", () => {
       expected: ["test"],
     },
     {
-      value: new Uint16Array([116, 101, 115, 116]),
+      value: new Uint16Array([25972, 29811]),
       expected: ["test"],
     },
     {
-      value: new Uint32Array([116, 101, 115, 116]),
+      value: new Uint32Array([1953719668]),
       expected: ["test"],
     },
     {
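
For context, a minimal usage sketch of the behaviour this patch enables (not part of the patch itself). It assumes the published `@streamparser/json` entry point and its `JSONParser` API with `onValue`, `write`, and `end`, mirroring the UTF-8 BOM test above:

```ts
import { JSONParser } from "@streamparser/json";

// Assumed API: onValue receives the parsed element info ({ value, ... }).
const parser = new JSONParser();
parser.onValue = ({ value }) => {
  console.log(value); // logs 1
};

// 0xef 0xbb 0xbf is the UTF-8 BOM; 0x31 is the ASCII digit "1".
// With this patch the tokenizer consumes the BOM bytes instead of failing on them.
parser.write(new Uint8Array([0xef, 0xbb, 0xbf, 0x31]));
// end() flushes the trailing top-level number so onValue fires.
parser.end();
```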