# Patch summary (review notes). This text precedes the first "diff --git" header and is not part
# of any hunk; everything from that header onward is unchanged.
#
# Purpose, as far as the hunks show: decode() now selects its regular expression by level AND
# scope (decodeRegExps[level][scope]) instead of by scope only. The added test title says the goal
# is to decode "incomplete named entities followed by alphanumeric characters".
#
# src/index.ts
#   - Imports bodyRegExps alongside namedReferences from './named-references'.
#   - Replaces the flat decodeRegExps map (strict / body / attribute) with baseDecodeRegExps, keyed
#     by xml / html4 / html5. The `strict` and `attribute` patterns are hoisted into shared
#     constants; each level's `body` pattern comes from bodyRegExps.
#   - decodeRegExps spreads baseDecodeRegExps and maps `all` to the html5 entry.
#
# src/named-references.ts (the tool's template marks this file as autogenerated)
#   - Adds bodyRegExps. `xml` keeps the previous generic body pattern. `html4` and `html5` list
#     entity names as alternatives ahead of the numeric and generic `[0-9a-zA-Z]+` alternatives;
#     all three end in an optional `;`.
#
# test/index.test.ts
#   - Adds one decode() case for a named entity written without its semicolon.
#
# tools/process-named-references.ts
#   - For each level, collects the entity keys that do not end with ';', drops their first
#     character with slice(1) (presumably the leading '&' - confirm against the input data), then
#     appends the common numeric/generic alternatives and wraps the result in '/&(?:' ... ';?/g'.
#   - Emits the collected patterns as `export const bodyRegExps` in the generated
#     src/named-references.ts.
#
# NOTE(review): in the html4/html5 body patterns the named alternatives are tried before
#   `[0-9a-zA-Z]+`, and the trailing `;?` may match nothing. A longer reference whose name starts
#   with a listed name therefore matches only the short prefix: for input `&notin;` the match is
#   `&not`. Verify whether such references are expected to decode as a whole in body scope.
# NOTE(review): in the generator template, the map callback parameter `regExpStart` shadows the
#   outer `regExpStart` constant and actually holds the complete pattern text; a rename would
#   avoid confusion.
# NOTE(review): as captured here, the type arguments after `Record` are missing (`Record = {`,
#   `Record, Record> = {`, `Record> = {`) and the test inputs look already entity-decoded
#   (e.g. decode('über')). Presumably this was lost in extraction - confirm against the original
#   commit before applying.
#
diff --git a/src/index.ts b/src/index.ts index e2617a8..7559797 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,4 @@ -import {namedReferences} from './named-references'; +import {bodyRegExps, namedReferences} from './named-references'; import {numericUnicodeMap} from './numeric-unicode-map'; import {fromCodePoint, getCodePoint} from './surrogate-pairs'; @@ -63,10 +63,30 @@ const defaultDecodeOptions: DecodeOptions = { level: 'all' }; -const decodeRegExps: Record = { - strict: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);/g, - body: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g, - attribute: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g +const strict = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);/g; +const attribute = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g; + +const baseDecodeRegExps: Record, Record> = { + xml: { + strict, + attribute, + body: bodyRegExps.xml + }, + html4: { + strict, + attribute, + body: bodyRegExps.html4 + }, + html5: { + strict, + attribute, + body: bodyRegExps.html5 + } +}; + +const decodeRegExps: Record> = { + ...baseDecodeRegExps, + all: baseDecodeRegExps.html5 }; const fromCharCode = String.fromCharCode; @@ -81,7 +101,7 @@ export function decode( const references = allNamedReferences[level].entities; const isAttribute = scope === 'attribute'; - return text.replace(decodeRegExps[scope], function (entity) { + return text.replace(decodeRegExps[level][scope], function (entity) { if (isAttribute && entity[entity.length - 1] === '=') { return entity; } diff --git a/src/named-references.ts b/src/named-references.ts index fc66567..308e3d9 100644 --- a/src/named-references.ts +++ b/src/named-references.ts @@ -6,6 +6,11 @@ export type NamedReferences = { characters: Record; } }; +export const bodyRegExps = { + xml: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g, + html4: 
/&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g, + html5: /&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g +}; export const namedReferences: NamedReferences = { "xml": { "entities": { diff --git a/test/index.test.ts b/test/index.test.ts index 5562771..06aa902 100644 --- a/test/index.test.ts +++ b/test/index.test.ts @@ -71,6 +71,9 @@ describe('decode()', () => { it('should decode numeric entities without semicolon', () => { expect(decode('"C"')).to.equal('"C"'); }); + it('should decode incomplete named entities followed by alphanumeric characters', () => { + expect(decode('über')).to.equal('über'); + }); describe('level', () => { it('should decode according to the level', () => { expect(decode('a\n<>"'&©∆℞😂�', {level: 'all'})).to.equal( diff --git a/tools/process-named-references.ts 
b/tools/process-named-references.ts index 91fc442..323ebdc 100644 --- a/tools/process-named-references.ts +++ b/tools/process-named-references.ts @@ -12,13 +12,25 @@ interface LevelData { const result: {[key in Level]?: LevelData} = {}; +const regExpStart = '/&(?:'; +const regExpCommon = '#\\d+|#x[\\da-fA-F]+|[0-9a-zA-Z]+)'; +const regExpEndBody = ';?/g'; + +const bodyRegExps: [string, string][] = []; + for (const [level, entityInfos] of getObjectEntries(namedReferences)) { + const bodyRegExpNamedReferences: string[] = []; const levelData: LevelData = {entities: {}, characters: {}}; for (const [entity, {characters}] of getObjectEntries(entityInfos)) { levelData.entities[entity] = characters; levelData.characters[characters] = entity; + if (!entity.endsWith(';')) { + bodyRegExpNamedReferences.push(entity.slice(1)); + } } result[level] = levelData; + bodyRegExpNamedReferences.push(regExpCommon); + bodyRegExps.push([level, regExpStart + bodyRegExpNamedReferences.join('|') + regExpEndBody]); } const processedNamedReferences = `// This file is autogenerated by tools/process-named-references.ts @@ -31,6 +43,9 @@ export type NamedReferences = { characters: Record; } }; +export const bodyRegExps = { + ${bodyRegExps.map(([level, regExpStart]) => `${level}: ${regExpStart}`).join(',\n ')} +}; export const namedReferences: NamedReferences = ${JSON.stringify(result, null, 4)};\n`; fs.writeFileSync(path.join(__dirname, '..', 'src', 'named-references.ts'), processedNamedReferences);