Skip to content

Commit

Permalink
Improve performance, fix HEX HTML entities decoding in some cases.
Browse files Browse the repository at this point in the history
  • Loading branch information
mdevils committed Mar 28, 2021
1 parent 290e224 commit 454914c
Show file tree
Hide file tree
Showing 9 changed files with 166 additions and 134 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
2.3.1
-----

* Improve performance of `encode()`, `decode()` and `decodeEntity()` by using function inlining.
* Fix decoding HEX HTML entities in some cases.

2.3.0
-----

Expand Down
66 changes: 33 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,68 +124,68 @@ Common
Initialization / Load speed
* #1: html-entities x 2,941,745 ops/sec ±1.87% (81 runs sampled)
#2: entities x 2,061,661 ops/sec ±1.16% (82 runs sampled)
#3: he x 1,861,758 ops/sec ±1.15% (86 runs sampled)
* #1: html-entities x 2,544,400 ops/sec ±4.52% (77 runs sampled)
#2: entities x 1,757,526 ops/sec ±3.99% (81 runs sampled)
#3: he x 1,281,542 ops/sec ±9.31% (74 runs sampled)
HTML5
Encode test
* #1: html-entities.encode - html5, nonAscii x 439,350 ops/sec ±0.21% (96 runs sampled)
* #2: html-entities.encode - html5, nonAsciiPrintable x 410,462 ops/sec ±0.22% (93 runs sampled)
#3: entities.encodeNonAsciiHTML x 332,966 ops/sec ±0.54% (92 runs sampled)
* #4: html-entities.encode - html5, extensive x 280,865 ops/sec ±0.22% (95 runs sampled)
#5: entities.encodeHTML x 125,338 ops/sec ±0.30% (92 runs sampled)
#6: he.encode x 112,572 ops/sec ±0.25% (97 runs sampled)
* #1: html-entities.encode - html5, nonAscii x 402,711 ops/sec ±0.61% (92 runs sampled)
* #2: html-entities.encode - html5, nonAsciiPrintable x 402,631 ops/sec ±2.99% (92 runs sampled)
* #3: html-entities.encode - html5, extensive x 269,162 ops/sec ±0.26% (97 runs sampled)
#4: entities.encodeNonAsciiHTML x 260,447 ops/sec ±2.53% (95 runs sampled)
#5: entities.encodeHTML x 101,059 ops/sec ±3.99% (91 runs sampled)
#6: he.encode x 93,180 ops/sec ±3.17% (92 runs sampled)
Decode test
* #1: html-entities.decode - html5, body x 428,051 ops/sec ±0.22% (98 runs sampled)
* #2: html-entities.decode - html5, strict x 402,821 ops/sec ±0.22% (91 runs sampled)
* #3: html-entities.decode - html5, attribute x 391,007 ops/sec ±0.33% (90 runs sampled)
#4: entities.decodeHTMLStrict x 332,909 ops/sec ±0.56% (95 runs sampled)
#5: entities.decodeHTML x 274,700 ops/sec ±0.29% (97 runs sampled)
#6: he.decode x 184,440 ops/sec ±0.27% (95 runs sampled)
* #1: html-entities.decode - html5, attribute x 340,043 ops/sec ±2.82% (92 runs sampled)
* #2: html-entities.decode - html5, body x 330,002 ops/sec ±1.52% (87 runs sampled)
* #3: html-entities.decode - html5, strict x 320,582 ops/sec ±5.34% (88 runs sampled)
#4: entities.decodeHTMLStrict x 286,294 ops/sec ±3.14% (89 runs sampled)
#5: entities.decodeHTML x 232,856 ops/sec ±3.05% (90 runs sampled)
#6: he.decode x 163,300 ops/sec ±0.62% (92 runs sampled)
HTML4
Encode test
* #1: html-entities.encode - html4, nonAscii x 419,600 ops/sec ±0.65% (94 runs sampled)
* #2: html-entities.encode - html4, nonAsciiPrintable x 413,954 ops/sec ±0.83% (91 runs sampled)
* #3: html-entities.encode - html4, extensive x 216,838 ops/sec ±0.22% (96 runs sampled)
* #1: html-entities.encode - html4, nonAsciiPrintable x 391,885 ops/sec ±0.27% (95 runs sampled)
* #2: html-entities.encode - html4, nonAscii x 400,086 ops/sec ±2.54% (94 runs sampled)
* #3: html-entities.encode - html4, extensive x 193,623 ops/sec ±2.70% (92 runs sampled)
Decode test
* #1: html-entities.decode - html4, strict x 420,850 ops/sec ±0.23% (92 runs sampled)
* #2: html-entities.decode - html4, body x 413,042 ops/sec ±0.49% (94 runs sampled)
* #3: html-entities.decode - html4, attribute x 408,538 ops/sec ±2.59% (92 runs sampled)
* #1: html-entities.decode - html4, attribute x 356,174 ops/sec ±0.49% (96 runs sampled)
* #2: html-entities.decode - html4, body x 342,666 ops/sec ±2.38% (91 runs sampled)
* #3: html-entities.decode - html4, strict x 341,667 ops/sec ±4.46% (87 runs sampled)
XML
Encode test
* #1: html-entities.encode - xml, nonAscii x 511,788 ops/sec ±0.21% (97 runs sampled)
* #2: html-entities.encode - xml, nonAsciiPrintable x 482,136 ops/sec ±0.40% (93 runs sampled)
#3: entities.encodeXML x 353,189 ops/sec ±0.57% (95 runs sampled)
* #4: html-entities.encode - xml, extensive x 291,091 ops/sec ±0.23% (96 runs sampled)
* #1: html-entities.encode - xml, nonAscii x 450,968 ops/sec ±2.73% (92 runs sampled)
* #2: html-entities.encode - xml, nonAsciiPrintable x 432,058 ops/sec ±4.12% (93 runs sampled)
* #3: html-entities.encode - xml, extensive x 265,336 ops/sec ±3.41% (93 runs sampled)
#4: entities.encodeXML x 254,862 ops/sec ±3.01% (95 runs sampled)
Decode test
* #1: html-entities.decode - xml, body x 543,327 ops/sec ±0.25% (89 runs sampled)
* #2: html-entities.decode - xml, attribute x 533,470 ops/sec ±0.22% (94 runs sampled)
* #3: html-entities.decode - xml, strict x 528,014 ops/sec ±2.27% (95 runs sampled)
#4: entities.decodeXML x 421,154 ops/sec ±0.32% (96 runs sampled)
* #1: html-entities.decode - xml, strict x 432,820 ops/sec ±0.53% (89 runs sampled)
* #2: html-entities.decode - xml, attribute x 426,037 ops/sec ±0.75% (94 runs sampled)
* #3: html-entities.decode - xml, body x 424,618 ops/sec ±3.47% (93 runs sampled)
#4: entities.decodeXML x 378,536 ops/sec ±2.48% (93 runs sampled)
Escaping
Escape test
* #1: html-entities.encode - xml, specialChars x 1,583,074 ops/sec ±0.24% (95 runs sampled)
#2: he.escape x 1,131,879 ops/sec ±1.65% (94 runs sampled)
#3: entities.escapeUTF8 x 736,205 ops/sec ±0.28% (94 runs sampled)
#4: entities.escape x 314,225 ops/sec ±0.24% (93 runs sampled)
* #1: html-entities.encode - xml, specialChars x 1,424,362 ops/sec ±0.55% (95 runs sampled)
#2: he.escape x 962,420 ops/sec ±3.12% (94 runs sampled)
#3: entities.escapeUTF8 x 443,138 ops/sec ±1.06% (90 runs sampled)
#4: entities.escape x 197,515 ops/sec ±2.73% (91 runs sampled)
```

License
Expand Down
10 changes: 6 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
"mocha": "^7.1.2",
"prettier": "^2.1.2",
"ts-node": "^8.9.1",
"typescript": "^3.8.3"
"ttypescript": "^1.5.12",
"typescript": "^3.8.3",
"typescript-transform-macros": "^1.1.1"
},
"repository": {
"type": "git",
Expand All @@ -47,12 +49,12 @@
"typings": "./lib/index.d.ts",
"types": "./lib/index.d.ts",
"scripts": {
"test": "mocha --recursive -r ts-node/register test/**/*.ts",
"test": "TS_NODE_COMPILER=ttypescript mocha --recursive -r ts-node/register test/**/*.ts",
"test:lib": "TEST_LIB=1 yarn test",
"benchmark": "ts-node benchmark/benchmark",
"benchmark": "TS_NODE_COMPILER=ttypescript ts-node benchmark/benchmark",
"lint": "eslint src/**.ts",
"flow-type-gen": "flowgen --add-flow-header lib/index.d.ts -o lib/index.js.flow",
"build": "rm -Rf lib/* && tsc && yarn flow-type-gen && yarn test:lib",
"build": "rm -Rf lib/* && ttsc && yarn flow-type-gen && yarn test:lib",
"prepublishOnly": "yarn build"
},
"files": [
Expand Down
173 changes: 84 additions & 89 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,36 @@ const allNamedReferences = {
all: namedReferences.html5
};

// MACRO from https://github.com/LeDDGroup/typescript-transform-macros
declare function MACRO<T>(t: T): T;

/**
 * Returns `macroText` with every match of `macroRegExp` replaced by the
 * result of calling `macroReplacer` on the matched substring.
 *
 * Wrapped in `MACRO` (typescript-transform-macros), so it is presumably
 * inlined at each call site by the build; the `macro*` / `replace*` prefixes on
 * parameters and locals look intended to avoid name clashes once inlined.
 *
 * `macroRegExp` must carry the `g` flag: the loop relies on `exec()` advancing
 * `lastIndex`, and would never terminate on a non-global expression that matches.
 *
 * @param macroText text to scan
 * @param macroRegExp global regular expression selecting the parts to replace
 * @param macroReplacer maps each matched substring to its replacement
 * @returns the rebuilt string, or `macroText` itself when nothing matches
 */
const replaceUsingRegExp = MACRO(
(macroText: string, macroRegExp: RegExp, macroReplacer: (input: string) => string): string => {
// The expression object is reused between calls, so rewind it before scanning.
macroRegExp.lastIndex = 0;
let replaceMatch = macroRegExp.exec(macroText);
let replaceResult;
if (replaceMatch) {
replaceResult = '';
// Index just past the previously consumed match.
let replaceLastIndex = 0;
do {
// Copy the untouched text between the previous match and this one.
if (replaceLastIndex !== replaceMatch.index) {
replaceResult += macroText.substring(replaceLastIndex, replaceMatch.index);
}
const replaceInput = replaceMatch[0];
replaceResult += macroReplacer(replaceInput);
replaceLastIndex = replaceMatch.index + replaceInput.length;
} while ((replaceMatch = macroRegExp.exec(macroText)));

// Append whatever follows the last match.
if (replaceLastIndex !== macroText.length) {
replaceResult += macroText.substring(replaceLastIndex);
}
} else {
// No match at all: hand back the input without building a new string.
replaceResult = macroText;
}
return replaceResult;
}
);

export type Level = 'xml' | 'html4' | 'html5' | 'all';

interface CommonOptions {
Expand Down Expand Up @@ -39,6 +69,9 @@ const defaultEncodeOptions: EncodeOptions = {
numeric: 'decimal'
};

/**
* Encodes all the necessary (specified by `level`) characters in the text.
*/
export function encode(
text: string | undefined | null,
{mode = 'specialChars', numeric = 'decimal', level = 'all'}: EncodeOptions = defaultEncodeOptions
Expand All @@ -48,49 +81,26 @@ export function encode(
}

const encodeRegExp = encodeRegExps[mode];
encodeRegExp.lastIndex = 0;

let match = encodeRegExp.exec(text);

if (!match) {
return text;
}

const references = allNamedReferences[level].characters;
const isHex = numeric === 'hexadecimal';

let lastIndex = 0;
let result = '';

do {
if (lastIndex !== match.index) {
result += text.substring(lastIndex, match.index);
}
const input = match[0];
const entity = references[input];
if (entity) {
result += entity;
} else {
return replaceUsingRegExp(text, encodeRegExp, (input) => {
let result = references[input];
if (!result) {
const code = input.length > 1 ? getCodePoint(input, 0)! : input.charCodeAt(0);
result += (isHex ? '&#x' + code.toString(16) : '&#' + code) + ';';
result = (isHex ? '&#x' + code.toString(16) : '&#' + code) + ';';
}
lastIndex = match.index + input.length;
} while ((match = encodeRegExp.exec(text)));

if (lastIndex !== text.length) {
result += text.substring(lastIndex, text.length);
}

return result;
return result;
});
}

const defaultDecodeOptions: DecodeOptions = {
scope: 'body',
level: 'all'
};

const strict = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);/g;
const attribute = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g;
const strict = /&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);/g;
const attribute = /&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g;

const baseDecodeRegExps: Record<Exclude<Level, 'all'>, Record<DecodeScope, RegExp>> = {
xml: {
Expand Down Expand Up @@ -122,82 +132,67 @@ const defaultDecodeEntityOptions: CommonOptions = {
level: 'all'
};

/**
 * Resolves a single matched entity (for example `&amp;`, `&#60;` or `&#x3C;`)
 * to the text it stands for.
 *
 * Wrapped in `MACRO` (typescript-transform-macros) so the build can inline it
 * at each call site. The body is deliberately kept as one result variable with
 * a single trailing `return`, and locals carry a `decode` prefix.
 *
 * @param entity the entity text as matched, including the leading `&`
 * @param references lookup from named entity text to its replacement
 * @param isAttribute when true, an entity ending in `=` is left untouched
 * @param isStrict when true, an entity without a trailing `;` is left untouched
 * @returns the decoded text, or `entity` unchanged when it cannot be decoded
 */
const getDecodedEntity = MACRO(
    (entity: string, references: Record<string, string>, isAttribute: boolean, isStrict: boolean): string => {
        let decodeResult = entity;
        const decodeEntityLastChar = entity[entity.length - 1];
        if (isAttribute && decodeEntityLastChar === '=') {
            // Attribute scope: `&name=` is part of a query string, not an entity.
            decodeResult = entity;
        } else if (isStrict && decodeEntityLastChar !== ';') {
            // Strict scope: only semicolon-terminated entities are decoded.
            decodeResult = entity;
        } else {
            const decodeResultByReference = references[entity];
            if (decodeResultByReference) {
                decodeResult = decodeResultByReference;
            } else if (entity[0] === '&' && entity[1] === '#') {
                const decodeSecondChar = entity[2];
                // parseInt stops at the first non-digit, so a trailing `;` is ignored.
                // The decimal branch pins radix 10 so `&#0x10;` is never read as hex.
                const decodeCode =
                    decodeSecondChar === 'x' || decodeSecondChar === 'X'
                        ? parseInt(entity.substring(3), 16)
                        : parseInt(entity.substring(2), 10);

                // U+10FFFF is the last valid code point; only values beyond it are
                // replaced with the out-of-bounds character.
                decodeResult =
                    decodeCode > 0x10ffff
                        ? outOfBoundsChar
                        : decodeCode > 65535
                        ? fromCodePoint(decodeCode)
                        : fromCharCode(numericUnicodeMap[decodeCode] || decodeCode);
            }
        }
        return decodeResult;
    }
);

/**
* Decodes a single entity.
*/
export function decodeEntity(
entity: string | undefined | null,
{level = 'all'}: CommonOptions = defaultDecodeEntityOptions
): string {
if (!entity) {
return '';
}

const references = allNamedReferences[level].entities;
const resultByReference = references[entity];
if (resultByReference) {
return resultByReference;
}
if (entity[0] === '&' && entity[1] === '#') {
const secondChar = entity[2];
const code =
secondChar == 'x' || secondChar == 'X' ? parseInt(entity.substr(3), 16) : parseInt(entity.substr(2));

return code >= 0x10ffff
? outOfBoundsChar
: code > 65535
? fromCodePoint(code)
: fromCharCode(numericUnicodeMap[code] || code);
}
return entity;
return getDecodedEntity(entity, allNamedReferences[level].entities, false, false);
}

/**
* Decodes all entities in the text.
*/
export function decode(
text: string | undefined | null,
{level = 'all', scope = level === 'xml' ? 'strict' : 'body'}: DecodeOptions = defaultDecodeOptions
) {
if (!text) {
return '';
}
const decodeRegExp = decodeRegExps[level][scope];

let match = decodeRegExp.exec(text);

if (!match) {
return text;
}

const decodeRegExp = decodeRegExps[level][scope];
const references = allNamedReferences[level].entities;
const isAttribute = scope === 'attribute';
const isStrict = scope === 'strict';

let lastIndex = 0;
let result = '';

do {
const entity = match[0];
if (lastIndex !== match.index) {
result += text.substring(lastIndex, match.index);
}
if (isAttribute && entity[entity.length - 1] === '=') {
result += entity;
} else if (entity[1] != '#') {
result += references[entity] || entity;
} else {
const secondChar = entity[2];
const code =
secondChar == 'x' || secondChar == 'X' ? parseInt(entity.substr(3), 16) : parseInt(entity.substr(2));

result +=
code >= 0x10ffff
? outOfBoundsChar
: code > 65535
? fromCodePoint(code)
: fromCharCode(numericUnicodeMap[code] || code);
}

lastIndex = match.index + entity.length;
} while ((match = decodeRegExp.exec(text)));

if (lastIndex !== text.length) {
result += text.substring(lastIndex, text.length);
}

return result;
return replaceUsingRegExp(text, decodeRegExp, (entity) =>
getDecodedEntity(entity, references, isAttribute, isStrict)
);
}
6 changes: 3 additions & 3 deletions src/named-references.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ export type NamedReferences = {
}
};
export const bodyRegExps = {
xml: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html4: /&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html5: /&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g
xml: /&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html4: /&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html5: /&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g
};
export const namedReferences: NamedReferences = {
"xml": {
Expand Down
Loading

0 comments on commit 454914c

Please sign in to comment.