Skip to content

Commit

Permalink
feat: add support for BOM at the beginning of the stream
Browse files Browse the repository at this point in the history
  • Loading branch information
juanjoDiaz committed Jan 20, 2024
1 parent cda44fb commit a4e71c6
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 14 deletions.
12 changes: 12 additions & 0 deletions packages/node/test/bom.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import JSONParser from "../src/jsonparser.js";
import { runJSONParserTest } from "./utils/testRunner.js";

describe("BOM", () => {
  test("should support UTF-8 BOM", () => {
    // UTF-8 BOM bytes (EF BB BF) followed by the ASCII digit "1".
    const utf8WithBom = new Uint8Array([0xef, 0xbb, 0xbf, 0x31]);
    runJSONParserTest(new JSONParser(), utf8WithBom, (result) =>
      expect(result.value).toBe(1),
    );
  });
});
73 changes: 68 additions & 5 deletions packages/plainjs/dist/deno/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const enum TokenizerStates {
NUMBER_AFTER_E_AND_SIGN,
NUMBER_AFTER_E_AND_DIGIT,
SEPARATOR,
BOM_OR_START,
BOM,
}

function TokenizerStateToString(tokenizerState: TokenizerStates): string {
Expand Down Expand Up @@ -71,6 +73,8 @@ function TokenizerStateToString(tokenizerState: TokenizerStates): string {
"NUMBER_AFTER_E_AND_SIGN",
"NUMBER_AFTER_E_AND_DIGIT",
"SEPARATOR",
"BOM_OR_START",
"BOM",
][tokenizerState];
}

Expand All @@ -97,7 +101,10 @@ export class TokenizerError extends Error {
}

export default class Tokenizer {
private state = TokenizerStates.START;
private state = TokenizerStates.BOM_OR_START;

private bom?: number[];
private bomIndex = 0;

private emitPartialTokens: boolean;
private separator?: string;
Expand Down Expand Up @@ -144,11 +151,14 @@ export default class Tokenizer {
buffer = input;
} else if (typeof input === "string") {
buffer = this.encoder.encode(input);
} else if (
(typeof input === "object" && "buffer" in input) ||
Array.isArray(input)
) {
} else if (Array.isArray(input)) {
buffer = Uint8Array.from(input);
} else if (ArrayBuffer.isView(input)) {
buffer = new Uint8Array(
input.buffer,
input.byteOffset,
input.byteLength,
);
} else {
throw new TypeError(
"Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings.",
Expand All @@ -158,6 +168,45 @@ export default class Tokenizer {
for (let i = 0; i < buffer.length; i += 1) {
const n = buffer[i]; // get current byte from buffer
switch (this.state) {
// @ts-ignore fall through case
case TokenizerStates.BOM_OR_START:
if (input instanceof Uint8Array && n === 0xef) {
this.bom = [0xef, 0xbb, 0xbf];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}

if (input instanceof Uint16Array) {
if (n === 0xfe) {
this.bom = [0xfe, 0xff];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
if (n === 0xff) {
this.bom = [0xff, 0xfe];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
}

if (input instanceof Uint32Array) {
if (n === 0x00) {
this.bom = [0x00, 0x00, 0xfe, 0xff];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
if (n === 0xff) {
this.bom = [0xff, 0xfe, 0x00, 0x00];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
}
// Allow cascading
case TokenizerStates.START:
this.offset += 1;

Expand Down Expand Up @@ -629,6 +678,19 @@ export default class Tokenizer {
this.separatorIndex = 0;
}
continue;
// BOM support
case TokenizerStates.BOM:
if (n === this.bom![this.bomIndex]) {
if (this.bomIndex === this.bom!.length - 1) {
this.state = TokenizerStates.START;
this.bom = undefined;
this.bomIndex = 0;
continue;
}
this.bomIndex += 1;
continue;
}
break;
case TokenizerStates.ENDED:
if (
n === charset.SPACE ||
Expand Down Expand Up @@ -745,6 +807,7 @@ export default class Tokenizer {
this.emitNumber();
this.onEnd();
break;
case TokenizerStates.BOM_OR_START:
case TokenizerStates.START:
case TokenizerStates.ERROR:
case TokenizerStates.SEPARATOR:
Expand Down
73 changes: 68 additions & 5 deletions packages/plainjs/src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const enum TokenizerStates {
NUMBER_AFTER_E_AND_SIGN,
NUMBER_AFTER_E_AND_DIGIT,
SEPARATOR,
BOM_OR_START,
BOM,
}

function TokenizerStateToString(tokenizerState: TokenizerStates): string {
Expand Down Expand Up @@ -71,6 +73,8 @@ function TokenizerStateToString(tokenizerState: TokenizerStates): string {
"NUMBER_AFTER_E_AND_SIGN",
"NUMBER_AFTER_E_AND_DIGIT",
"SEPARATOR",
"BOM_OR_START",
"BOM",
][tokenizerState];
}

Expand All @@ -97,7 +101,10 @@ export class TokenizerError extends Error {
}

export default class Tokenizer {
private state = TokenizerStates.START;
private state = TokenizerStates.BOM_OR_START;

private bom?: number[];
private bomIndex = 0;

private emitPartialTokens: boolean;
private separator?: string;
Expand Down Expand Up @@ -144,11 +151,14 @@ export default class Tokenizer {
buffer = input;
} else if (typeof input === "string") {
buffer = this.encoder.encode(input);
} else if (
(typeof input === "object" && "buffer" in input) ||
Array.isArray(input)
) {
} else if (Array.isArray(input)) {
buffer = Uint8Array.from(input);
} else if (ArrayBuffer.isView(input)) {
buffer = new Uint8Array(
input.buffer,
input.byteOffset,
input.byteLength,
);
} else {
throw new TypeError(
"Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings.",
Expand All @@ -158,6 +168,45 @@ export default class Tokenizer {
for (let i = 0; i < buffer.length; i += 1) {
const n = buffer[i]; // get current byte from buffer
switch (this.state) {
// @ts-ignore fall through case
case TokenizerStates.BOM_OR_START:
if (input instanceof Uint8Array && n === 0xef) {
this.bom = [0xef, 0xbb, 0xbf];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}

if (input instanceof Uint16Array) {
if (n === 0xfe) {
this.bom = [0xfe, 0xff];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
if (n === 0xff) {
this.bom = [0xff, 0xfe];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
}

if (input instanceof Uint32Array) {
if (n === 0x00) {
this.bom = [0x00, 0x00, 0xfe, 0xff];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
if (n === 0xff) {
this.bom = [0xff, 0xfe, 0x00, 0x00];
this.bomIndex += 1;
this.state = TokenizerStates.BOM;
continue;
}
}
// Allow cascading
case TokenizerStates.START:
this.offset += 1;

Expand Down Expand Up @@ -629,6 +678,19 @@ export default class Tokenizer {
this.separatorIndex = 0;
}
continue;
// BOM support
case TokenizerStates.BOM:
if (n === this.bom![this.bomIndex]) {
if (this.bomIndex === this.bom!.length - 1) {
this.state = TokenizerStates.START;
this.bom = undefined;
this.bomIndex = 0;
continue;
}
this.bomIndex += 1;
continue;
}
break;
case TokenizerStates.ENDED:
if (
n === charset.SPACE ||
Expand Down Expand Up @@ -745,6 +807,7 @@ export default class Tokenizer {
this.emitNumber();
this.onEnd();
break;
case TokenizerStates.BOM_OR_START:
case TokenizerStates.START:
case TokenizerStates.ERROR:
case TokenizerStates.SEPARATOR:
Expand Down
44 changes: 44 additions & 0 deletions packages/plainjs/test/bom.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import JSONParser from "../src/jsonparser.js";
import { runJSONParserTest } from "./utils/testRunner.js";

describe("BOM", () => {
test("should support UTF-8 BOM", () => {
runJSONParserTest(
new JSONParser(),
new Uint8Array([0xef, 0xbb, 0xbf, 0x31]),
({ value }) => expect(value).toBe(1),
);
});

test("should support UTF-16 BE BOM", () => {
runJSONParserTest(
new JSONParser(),
new Uint16Array([0xfeff, 0x3131]),
({ value }) => expect(value).toBe(11),
);
});

test("should support UTF-16 LE BOM", () => {
runJSONParserTest(
new JSONParser(),
new Uint16Array([0xfffe, 0x3131]),
({ value }) => expect(value).toBe(11),
);
});

test("should support UTF-32 BE BOM", () => {
runJSONParserTest(
new JSONParser(),
new Uint32Array([0x0000feff, 0x31313131]),
({ value }) => expect(value).toBe(1111),
);
});

test("should support UTF-32 LE BOM", () => {
runJSONParserTest(
new JSONParser(),
new Uint32Array([0xfffe0000, 0x31313131]),
({ value }) => expect(value).toBe(1111),
);
});
});
4 changes: 2 additions & 2 deletions packages/plainjs/test/inputs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ describe("inputs", () => {
expected: ["test"],
},
{
value: new Uint16Array([116, 101, 115, 116]),
value: new Uint16Array([25972, 29811]),
expected: ["test"],
},
{
value: new Uint32Array([116, 101, 115, 116]),
value: new Uint32Array([1953719668]),
expected: ["test"],
},
{
Expand Down
44 changes: 44 additions & 0 deletions packages/whatwg/test/bom.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import JSONParser from "../src/jsonparser.js";
import { runJSONParserTest } from "./utils/testRunner.js";

describe("BOM", () => {
  test("should support UTF-8 BOM", () => {
    // UTF-8 BOM bytes (EF BB BF) followed by the ASCII digit "1".
    const input = new Uint8Array([0xef, 0xbb, 0xbf, 0x31]);
    runJSONParserTest(new JSONParser(), input, (result) =>
      expect(result.value).toBe(1),
    );
  });

  test("should support UTF-16 BE BOM", () => {
    // 0xFEFF BOM element, then two "1" code units packed as 0x3131.
    const input = new Uint16Array([0xfeff, 0x3131]);
    runJSONParserTest(new JSONParser(), input, (result) =>
      expect(result.value).toBe(11),
    );
  });

  test("should support UTF-16 LE BOM", () => {
    // 0xFFFE BOM element, then two "1" code units packed as 0x3131.
    const input = new Uint16Array([0xfffe, 0x3131]);
    runJSONParserTest(new JSONParser(), input, (result) =>
      expect(result.value).toBe(11),
    );
  });

  // NOTE(review): the UTF-32 cases below are disabled in this package —
  // presumably unsupported here; confirm before enabling.
  // test("should support UTF-32 BE BOM", () => {
  //   runJSONParserTest(
  //     new JSONParser(),
  //     new Uint32Array([0x0000feff, 0x31313131]),
  //     ({ value }) => expect(value).toBe(1111),
  //   );
  // });

  // test("should support UTF-32 LE BOM", () => {
  //   runJSONParserTest(
  //     new JSONParser(),
  //     new Uint32Array([0xfffe0000, 0x31313131]),
  //     ({ value }) => expect(value).toBe(1111),
  //   );
  // });
});
4 changes: 2 additions & 2 deletions packages/whatwg/test/inputs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ describe("inputs", () => {
expected: ["test"],
},
{
value: new Uint16Array([116, 101, 115, 116]),
value: new Uint16Array([25972, 29811]),
expected: ["test"],
},
{
value: new Uint32Array([116, 101, 115, 116]),
value: new Uint32Array([1953719668]),
expected: ["test"],
},
{
Expand Down

0 comments on commit a4e71c6

Please sign in to comment.