forked from webtoon/psd
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feature: implement EngineData parsing
fixes webtoon#6
- Loading branch information
1 parent
462dc9f
commit 4909aae
Showing
18 changed files
with
1,193 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
// @webtoon/psd | ||
// Copyright 2021-present NAVER WEBTOON | ||
// MIT License | ||
|
||
export * from "./lexer"; | ||
export * from "./parser"; | ||
export * from "./validator"; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
// @webtoon/psd | ||
// Copyright 2021-present NAVER WEBTOON | ||
// MIT License | ||
|
||
// Based on PDF grammar: https://web.archive.org/web/20220226063926/https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf | ||
// Section 7.2 - Lexical Conventions | ||
|
||
import { | ||
Cursor, | ||
InvalidEngineDataBoolean, | ||
InvalidEngineDataNumber, | ||
InvalidEngineDataTextBOM, | ||
} from "../utils"; | ||
|
||
/**
 * Kinds of tokens the EngineData lexer can emit.
 *
 * Members are numbered in declaration order starting at 0, so
 * `TokenType[n]` gives the member name back (used in error messages).
 */
export enum TokenType {
  /** Text literal written between `(` and `)`. */
  String = 0,
  /** Opening of a dictionary. */
  DictBeg = 1,
  /** Closing of a dictionary. */
  DictEnd = 2,
  /** Opening of an array (`[`). */
  ArrBeg = 3,
  /** Closing of an array (`]`). */
  ArrEnd = 4,
  /** Name introduced by `/`. */
  Name = 5,
  /** Numeric literal. */
  Number = 6,
  /** `true` or `false`. */
  Boolean = 7,
}
|
||
/** A token that carries a payload of type `V`. */
type ValueToken<T extends TokenType, V> = {type: T; value: V};

/** A purely structural token (bracket-like marker) with no payload. */
type MarkerToken<T extends TokenType> = {type: T};

/**
 * A single lexical token, discriminated by `type`.
 * Only String, Name, Number and Boolean tokens have a `value`.
 */
export type Token =
  | ValueToken<TokenType.String, string>
  | MarkerToken<TokenType.DictBeg>
  | MarkerToken<TokenType.DictEnd>
  | MarkerToken<TokenType.ArrBeg>
  | MarkerToken<TokenType.ArrEnd>
  | ValueToken<TokenType.Name, string>
  | ValueToken<TokenType.Number, number>
  | ValueToken<TokenType.Boolean, boolean>;
|
||
/** Byte values the lexer skips as whitespace between tokens. */
const WhitespaceCharacters = new Set<number>([
  0x00, // NUL
  0x09, // \t
  0x0c, // \f
  0x20, // ' '
  0x0a, // \n
  0x0d, // \r
]);
|
||
/** Byte values that can begin a boolean literal: the first letters of `false` and `true`. */
const BooleanStartCharacters = new Set<number>(
  ["f", "t"].map((letter) => letter.charCodeAt(0))
);
|
||
/** Byte value of every character with lexical meaning, keyed by the character itself. */
const Delimiters = {
  "(": 40, // 0x28
  ")": 41, // 0x29
  "<": 60, // 0x3c
  ">": 62, // 0x3e
  "[": 91, // 0x5b
  "]": 93, // 0x5d
  "/": 47, // 0x2f
  "\\": 92, // 0x5c
  // NOTE: These have meaning within PDF. Are they used here?
  // "{": 123,
  // "}": 125,
  // "%": 37,
};

/** The same byte values as a set, for quick "is this a delimiter?" checks. */
const DelimiterCharacters = new Set<number>(Object.values(Delimiters));
|
||
/**
 * Turns raw EngineData bytes into a lazy stream of tokens.
 *
 * Bytes are pulled from the supplied cursor; the lexical rules are modelled
 * on PDF's (see the reference at the top of this file).
 */
export class Lexer {
  constructor(private cursor: Cursor) {}

  /**
   * Lazily tokenizes everything between the cursor's current position and
   * its end.
   *
   * @throws InvalidEngineDataTextBOM when a text literal starts with 0xFF or
   *   0xFE but its first two bytes are not a UTF-16 byte order mark.
   * @throws InvalidEngineDataNumber when a bare word cannot be converted with
   *   `Number()`.
   * @throws InvalidEngineDataBoolean when a word starting with `f`/`t` is
   *   neither `true` nor `false`.
   */
  *tokens(): Generator<Token> {
    while (!this.done()) {
      const val = this.cursor.read("u8");

      if (WhitespaceCharacters.has(val)) {
        // Swallow the rest of the whitespace run in one go.
        while (!this.done() && WhitespaceCharacters.has(this.cursor.peek())) {
          this.cursor.pass(1);
        }
        continue;
      }

      if (DelimiterCharacters.has(val)) {
        switch (val) {
          case Delimiters["("]:
            yield {type: TokenType.String, value: this.text()};
            continue;
          case Delimiters["["]:
            yield {type: TokenType.ArrBeg};
            continue;
          case Delimiters["]"]:
            yield {type: TokenType.ArrEnd};
            continue;
          case Delimiters["<"]:
            // NOTE: assert that it is < indeed?
            // The byte after the first `<` is skipped without being checked.
            this.cursor.pass(1);
            yield {type: TokenType.DictBeg};
            continue;
          case Delimiters[">"]:
            // NOTE: assert that it is > indeed?
            // The byte after the first `>` is skipped without being checked.
            this.cursor.pass(1);
            yield {type: TokenType.DictEnd};
            continue;
          case Delimiters["/"]:
            yield {type: TokenType.Name, value: this.string()};
            continue;
        }
        // `)` and `\` only make sense inside a text literal; outside of one
        // they are reported on the console and skipped.
        console.assert(
          false,
          "Unhandled delimiter: '%s'",
          String.fromCharCode(val)
        );
        continue;
      }

      // only two types left: number or boolean
      // `val` is the first character of the word, so give it back first
      this.cursor.unpass(1);
      if (BooleanStartCharacters.has(val)) {
        yield {type: TokenType.Boolean, value: this.boolean()};
      } else {
        yield {type: TokenType.Number, value: this.number()};
      }
    }
  }

  /** True once the cursor has consumed all of its bytes. */
  private done(): boolean {
    return this.cursor.position >= this.cursor.length;
  }

  /**
   * Reads the body of a text literal; the opening `(` has already been
   * consumed. Consumes up to and including the closing `)`.
   *
   * The body is UTF-16. A leading byte order mark selects the endianness;
   * without one, big-endian is assumed.
   *
   * A backslash escapes the single *byte* that follows it, and that byte can
   * be either half of a UTF-16 code unit. The escapes are therefore removed
   * at byte level first and the result is decoded in one pass. Decoding the
   * pieces around an escape separately splits code units and garbles the
   * text whenever the escaped byte is not an ASCII low byte after a 0x00
   * high byte.
   *
   * If the input ends before the closing `)`, whatever was collected so far
   * is decoded and returned, so the lexer always terminates.
   *
   * @throws InvalidEngineDataTextBOM see {@link textDecoderFromBOM}.
   */
  private text(): string {
    const firstByte = this.cursor.peek();
    if (firstByte === Delimiters[")"]) {
      this.cursor.pass(1);
      return "";
    }
    const hasBom = firstByte === 0xff || firstByte === 0xfe;
    const decoder = hasBom
      ? this.textDecoderFromBOM()
      : new TextDecoder("utf-16be");

    const unescaped: number[] = [];
    while (!this.done()) {
      let byte = this.cursor.read("u8");
      if (byte === Delimiters[")"]) {
        // unescaped closing parenthesis: end of the literal
        break;
      }
      if (byte === Delimiters["\\"]) {
        if (this.done()) {
          // dangling backslash at the very end of input
          break;
        }
        // take the escaped byte literally, whatever it is
        byte = this.cursor.read("u8");
      }
      unescaped.push(byte);
    }
    return decoder.decode(new Uint8Array(unescaped));
  }

  /**
   * Consumes a two-byte UTF-16 byte order mark and returns the matching
   * decoder.
   *
   * @throws InvalidEngineDataTextBOM when the two bytes are neither FF FE
   *   nor FE FF.
   */
  private textDecoderFromBOM(): TextDecoder {
    const firstBomPart = this.cursor.read("u8");
    const sndBomPart = this.cursor.read("u8");
    // https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16
    // LE is FF FE
    if (firstBomPart === 0xff && sndBomPart === 0xfe) {
      return new TextDecoder("utf-16le");
    }
    // BE is FE FF
    if (firstBomPart === 0xfe && sndBomPart === 0xff) {
      return new TextDecoder("utf-16be");
    }
    throw new InvalidEngineDataTextBOM(
      `Unknown BOM value: [${firstBomPart}, ${sndBomPart}]`
    );
  }

  /**
   * Reads a bare word: every byte up to (not including) the next whitespace
   * or delimiter byte, or the end of input. Decoded with the "ascii" label.
   */
  private string(): string {
    // Remember where the word starts so its bytes can be taken afterwards.
    const wordStart = this.cursor.clone();
    while (
      !this.done() &&
      !WhitespaceCharacters.has(this.cursor.peek()) &&
      !DelimiterCharacters.has(this.cursor.peek())
    ) {
      this.cursor.pass(1);
    }
    const byteCount = this.cursor.position - wordStart.position;
    return new TextDecoder("ascii").decode(wordStart.take(byteCount));
  }

  /**
   * Reads a bare word and converts it with `Number()`.
   *
   * @throws InvalidEngineDataNumber when the conversion yields NaN.
   */
  private number(): number {
    const text = this.string();
    const value = Number(text);
    if (Number.isNaN(value)) {
      throw new InvalidEngineDataNumber(`parsing '${text}' as Number failed`);
    }
    return value;
  }

  /**
   * Reads a bare word that must be exactly `true` or `false`.
   *
   * @throws InvalidEngineDataBoolean for any other word.
   */
  private boolean(): boolean {
    const text = this.string();
    if (text === "true") {
      return true;
    }
    if (text === "false") {
      return false;
    }
    throw new InvalidEngineDataBoolean(
      `'${text}' is neither 'true' nor 'false'`
    );
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
// @webtoon/psd | ||
// Copyright 2021-present NAVER WEBTOON | ||
// MIT License | ||
|
||
import { | ||
InvalidEngineDataDictKey, | ||
InvalidTopLevelEngineDataValue, | ||
UnexpectedEndOfEngineData, | ||
UnexpectedEngineDataToken, | ||
} from "../utils"; | ||
import {Token, TokenType} from "./lexer"; | ||
|
||
/**
 * Any value that can appear in parsed EngineData: a scalar, an array of
 * values, or a nested dictionary.
 */
export type RawEngineValue =
  | RawEngineData
  | RawEngineValue[]
  | boolean
  | number
  | string;

/** A parsed EngineData dictionary: string keys mapped to arbitrary values. */
export type RawEngineData = {[key: string]: RawEngineValue};
|
||
/**
 * Builds a plain JavaScript value tree out of a stream of EngineData tokens.
 */
export class Parser {
  // private done: boolean = false
  constructor(private tokens: Generator<Token>) {}

  /**
   * Parses one value from the token stream and returns it if it is a
   * dictionary.
   *
   * Tokens after that first value are not inspected.
   *
   * @throws InvalidTopLevelEngineDataValue when the first value is not a dict.
   * @throws UnexpectedEndOfEngineData when the stream ends inside a value.
   * @throws UnexpectedEngineDataToken when a closing token appears where a
   *   value is expected.
   * @throws InvalidEngineDataDictKey when a dict key is not a Name token.
   */
  parse(): RawEngineData {
    const value = this.value();
    // TODO: for this to be true we'd need to force lexer somehow into reaching end-of-file
    // console.assert(this.done, "not all tokens from engine data were consumed")
    if (typeof value === "object" && !Array.isArray(value)) {
      return value;
    }
    throw new InvalidTopLevelEngineDataValue(
      `EngineData top-level value is not a dict; is ${typeof value}`
    );
  }

  /**
   * Parses a single value.
   *
   * @param it first token of the value, when the caller has already pulled
   *   it from the stream; otherwise the next token is pulled here.
   */
  private value(it?: Token): RawEngineValue {
    /**
     * NOTE: this is recursive descent parser - simplest solution in terms of code complexity
     * In case we ever start to run into stack-depth issues
     * ("RangeError: Maximum call stack size exceeded" )
     * due to parsing data that's too big, this can be re-written into stack-based one.
     * That's because EngineData can be thought about as reverse-polish notation:
     * ] - end of array requires popping values from stack until you hit [
     * (and pushing new value - an array - onto stack)
     * same for << and >>.
     */
    const current = it ?? this.advance();
    switch (current.type) {
      case TokenType.Name:
      case TokenType.Number:
      case TokenType.Boolean:
      case TokenType.String:
        return current.value;
      case TokenType.DictBeg:
        return this.dict();
      case TokenType.ArrBeg:
        return this.arr();
    }
    // Only the closing tokens (DictEnd / ArrEnd) can reach this point.
    throw new UnexpectedEngineDataToken(
      `Unexpected token: ${TokenType[current.type]}`
    );
  }

  /**
   * Pulls the next token from the stream.
   *
   * @throws UnexpectedEndOfEngineData when the stream is exhausted.
   */
  private advance(): Token {
    const step = this.tokens.next();
    // this.done = Boolean(it.done);
    if (step.done) {
      throw new UnexpectedEndOfEngineData("End of stream");
    }
    return step.value;
  }

  /**
   * Parses the entries of a dictionary whose opening token has already been
   * consumed, up to and including the matching DictEnd.
   *
   * Entries are alternating Name keys and values. When a key repeats, the
   * last value wins.
   */
  private dict(): RawEngineData {
    const val: RawEngineData = {};
    for (;;) {
      const it = this.advance();
      if (it.type === TokenType.DictEnd) {
        return val;
      }
      if (it.type !== TokenType.Name) {
        throw new InvalidEngineDataDictKey(
          `Dict key is not Name; is ${TokenType[it.type]}`
        );
      }
      const value = this.value();
      // Keys come straight from the file. Plain assignment would treat a key
      // named "__proto__" as a request to replace the object's prototype, so
      // every entry is defined as an own data property instead.
      Object.defineProperty(val, it.value, {
        value,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
  }

  /**
   * Parses the elements of an array whose opening token has already been
   * consumed, up to and including the matching ArrEnd.
   */
  private arr(): RawEngineValue[] {
    const val: RawEngineValue[] = [];
    for (;;) {
      const it = this.advance();
      if (it.type === TokenType.ArrEnd) {
        return val;
      }
      val.push(this.value(it));
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// @webtoon/psd | ||
// Copyright 2021-present NAVER WEBTOON | ||
// MIT License | ||
|
||
import {EngineData} from "../interfaces"; | ||
|
||
/** Top-level keys that validated engine data must own. */
const REQUIRED_KEYS = new Set<string>(
  ["DocumentResources", "EngineDict", "ResourceDict"]
);
|
||
/**
 * Type guard: tells whether `obj` has `prop` as an own (non-inherited)
 * property, without relying on `obj` having a `hasOwnProperty` method itself.
 */
function hasOwnProperty<K extends string>(
  obj: unknown,
  prop: K
): obj is Record<K, unknown> {
  const ownsProperty: boolean = Reflect.apply(
    Object.prototype.hasOwnProperty,
    obj,
    [prop]
  );
  return ownsProperty;
}
|
||
/**
 * Type guard for parsed engine data.
 *
 * Passes only when `engineData` is a non-null object that owns every key in
 * `REQUIRED_KEYS`, and each of those values is itself a non-null, non-array
 * object.
 */
export function validateEngineData(
  engineData: unknown
): engineData is EngineData {
  if (typeof engineData !== "object" || !engineData) {
    return false;
  }
  let everyValueIsDict = true;
  for (const key of REQUIRED_KEYS) {
    if (!hasOwnProperty(engineData, key)) {
      // a required key is missing altogether
      return false;
    }
    const entry = engineData[key];
    const entryIsDict =
      typeof entry === "object" && !Array.isArray(entry) && Boolean(entry);
    if (!entryIsDict) {
      // keep looping: later keys are still checked for presence
      everyValueIsDict = false;
    }
  }
  return everyValueIsDict;
}
Oops, something went wrong.