decode.ts 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620
  1. import { htmlDecodeTree } from "./generated/decode-data-html.js";
  2. import { xmlDecodeTree } from "./generated/decode-data-xml.js";
  3. import { replaceCodePoint, fromCodePoint } from "./decode-codepoint.js";
  4. const enum CharCodes {
  5. NUM = 35, // "#"
  6. SEMI = 59, // ";"
  7. EQUALS = 61, // "="
  8. ZERO = 48, // "0"
  9. NINE = 57, // "9"
  10. LOWER_A = 97, // "a"
  11. LOWER_F = 102, // "f"
  12. LOWER_X = 120, // "x"
  13. LOWER_Z = 122, // "z"
  14. UPPER_A = 65, // "A"
  15. UPPER_F = 70, // "F"
  16. UPPER_Z = 90, // "Z"
  17. }
  18. /** Bit that needs to be set to convert an upper case ASCII character to lower case */
  19. const TO_LOWER_BIT = 0b10_0000;
  20. export enum BinTrieFlags {
  21. VALUE_LENGTH = 0b1100_0000_0000_0000,
  22. BRANCH_LENGTH = 0b0011_1111_1000_0000,
  23. JUMP_TABLE = 0b0000_0000_0111_1111,
  24. }
  25. function isNumber(code: number): boolean {
  26. return code >= CharCodes.ZERO && code <= CharCodes.NINE;
  27. }
  28. function isHexadecimalCharacter(code: number): boolean {
  29. return (
  30. (code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F) ||
  31. (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F)
  32. );
  33. }
  34. function isAsciiAlphaNumeric(code: number): boolean {
  35. return (
  36. (code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z) ||
  37. (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z) ||
  38. isNumber(code)
  39. );
  40. }
  41. /**
  42. * Checks if the given character is a valid end character for an entity in an attribute.
  43. *
  44. * Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
  45. * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
  46. */
  47. function isEntityInAttributeInvalidEnd(code: number): boolean {
  48. return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
  49. }
  50. const enum EntityDecoderState {
  51. EntityStart,
  52. NumericStart,
  53. NumericDecimal,
  54. NumericHex,
  55. NamedEntity,
  56. }
  57. export enum DecodingMode {
  58. /** Entities in text nodes that can end with any character. */
  59. Legacy = 0,
  60. /** Only allow entities terminated with a semicolon. */
  61. Strict = 1,
  62. /** Entities in attributes have limitations on ending characters. */
  63. Attribute = 2,
  64. }
  65. /**
  66. * Producers for character reference errors as defined in the HTML spec.
  67. */
  68. export interface EntityErrorProducer {
  69. missingSemicolonAfterCharacterReference(): void;
  70. absenceOfDigitsInNumericCharacterReference(
  71. consumedCharacters: number,
  72. ): void;
  73. validateNumericCharacterReference(code: number): void;
  74. }
  75. /**
  76. * Token decoder with support of writing partial entities.
  77. */
  78. export class EntityDecoder {
  79. constructor(
  80. /** The tree used to decode entities. */
  81. private readonly decodeTree: Uint16Array,
  82. /**
  83. * The function that is called when a codepoint is decoded.
  84. *
  85. * For multi-byte named entities, this will be called multiple times,
  86. * with the second codepoint, and the same `consumed` value.
  87. *
  88. * @param codepoint The decoded codepoint.
  89. * @param consumed The number of bytes consumed by the decoder.
  90. */
  91. private readonly emitCodePoint: (cp: number, consumed: number) => void,
  92. /** An object that is used to produce errors. */
  93. private readonly errors?: EntityErrorProducer | undefined,
  94. ) {}
  95. /** The current state of the decoder. */
  96. private state = EntityDecoderState.EntityStart;
  97. /** Characters that were consumed while parsing an entity. */
  98. private consumed = 1;
  99. /**
  100. * The result of the entity.
  101. *
  102. * Either the result index of a numeric entity, or the codepoint of a
  103. * numeric entity.
  104. */
  105. private result = 0;
  106. /** The current index in the decode tree. */
  107. private treeIndex = 0;
  108. /** The number of characters that were consumed in excess. */
  109. private excess = 1;
  110. /** The mode in which the decoder is operating. */
  111. private decodeMode = DecodingMode.Strict;
  112. /** Resets the instance to make it reusable. */
  113. startEntity(decodeMode: DecodingMode): void {
  114. this.decodeMode = decodeMode;
  115. this.state = EntityDecoderState.EntityStart;
  116. this.result = 0;
  117. this.treeIndex = 0;
  118. this.excess = 1;
  119. this.consumed = 1;
  120. }
  121. /**
  122. * Write an entity to the decoder. This can be called multiple times with partial entities.
  123. * If the entity is incomplete, the decoder will return -1.
  124. *
  125. * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
  126. * entity is incomplete, and resume when the next string is written.
  127. *
  128. * @param input The string containing the entity (or a continuation of the entity).
  129. * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
  130. * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
  131. */
  132. write(input: string, offset: number): number {
  133. switch (this.state) {
  134. case EntityDecoderState.EntityStart: {
  135. if (input.charCodeAt(offset) === CharCodes.NUM) {
  136. this.state = EntityDecoderState.NumericStart;
  137. this.consumed += 1;
  138. return this.stateNumericStart(input, offset + 1);
  139. }
  140. this.state = EntityDecoderState.NamedEntity;
  141. return this.stateNamedEntity(input, offset);
  142. }
  143. case EntityDecoderState.NumericStart: {
  144. return this.stateNumericStart(input, offset);
  145. }
  146. case EntityDecoderState.NumericDecimal: {
  147. return this.stateNumericDecimal(input, offset);
  148. }
  149. case EntityDecoderState.NumericHex: {
  150. return this.stateNumericHex(input, offset);
  151. }
  152. case EntityDecoderState.NamedEntity: {
  153. return this.stateNamedEntity(input, offset);
  154. }
  155. }
  156. }
  157. /**
  158. * Switches between the numeric decimal and hexadecimal states.
  159. *
  160. * Equivalent to the `Numeric character reference state` in the HTML spec.
  161. *
  162. * @param input The string containing the entity (or a continuation of the entity).
  163. * @param offset The current offset.
  164. * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
  165. */
  166. private stateNumericStart(input: string, offset: number): number {
  167. if (offset >= input.length) {
  168. return -1;
  169. }
  170. if ((input.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
  171. this.state = EntityDecoderState.NumericHex;
  172. this.consumed += 1;
  173. return this.stateNumericHex(input, offset + 1);
  174. }
  175. this.state = EntityDecoderState.NumericDecimal;
  176. return this.stateNumericDecimal(input, offset);
  177. }
  178. private addToNumericResult(
  179. input: string,
  180. start: number,
  181. end: number,
  182. base: number,
  183. ): void {
  184. if (start !== end) {
  185. const digitCount = end - start;
  186. this.result =
  187. this.result * Math.pow(base, digitCount) +
  188. Number.parseInt(input.substr(start, digitCount), base);
  189. this.consumed += digitCount;
  190. }
  191. }
  192. /**
  193. * Parses a hexadecimal numeric entity.
  194. *
  195. * Equivalent to the `Hexademical character reference state` in the HTML spec.
  196. *
  197. * @param input The string containing the entity (or a continuation of the entity).
  198. * @param offset The current offset.
  199. * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
  200. */
  201. private stateNumericHex(input: string, offset: number): number {
  202. const startIndex = offset;
  203. while (offset < input.length) {
  204. const char = input.charCodeAt(offset);
  205. if (isNumber(char) || isHexadecimalCharacter(char)) {
  206. offset += 1;
  207. } else {
  208. this.addToNumericResult(input, startIndex, offset, 16);
  209. return this.emitNumericEntity(char, 3);
  210. }
  211. }
  212. this.addToNumericResult(input, startIndex, offset, 16);
  213. return -1;
  214. }
  215. /**
  216. * Parses a decimal numeric entity.
  217. *
  218. * Equivalent to the `Decimal character reference state` in the HTML spec.
  219. *
  220. * @param input The string containing the entity (or a continuation of the entity).
  221. * @param offset The current offset.
  222. * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
  223. */
  224. private stateNumericDecimal(input: string, offset: number): number {
  225. const startIndex = offset;
  226. while (offset < input.length) {
  227. const char = input.charCodeAt(offset);
  228. if (isNumber(char)) {
  229. offset += 1;
  230. } else {
  231. this.addToNumericResult(input, startIndex, offset, 10);
  232. return this.emitNumericEntity(char, 2);
  233. }
  234. }
  235. this.addToNumericResult(input, startIndex, offset, 10);
  236. return -1;
  237. }
  238. /**
  239. * Validate and emit a numeric entity.
  240. *
  241. * Implements the logic from the `Hexademical character reference start
  242. * state` and `Numeric character reference end state` in the HTML spec.
  243. *
  244. * @param lastCp The last code point of the entity. Used to see if the
  245. * entity was terminated with a semicolon.
  246. * @param expectedLength The minimum number of characters that should be
  247. * consumed. Used to validate that at least one digit
  248. * was consumed.
  249. * @returns The number of characters that were consumed.
  250. */
  251. private emitNumericEntity(lastCp: number, expectedLength: number): number {
  252. // Ensure we consumed at least one digit.
  253. if (this.consumed <= expectedLength) {
  254. this.errors?.absenceOfDigitsInNumericCharacterReference(
  255. this.consumed,
  256. );
  257. return 0;
  258. }
  259. // Figure out if this is a legit end of the entity
  260. if (lastCp === CharCodes.SEMI) {
  261. this.consumed += 1;
  262. } else if (this.decodeMode === DecodingMode.Strict) {
  263. return 0;
  264. }
  265. this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
  266. if (this.errors) {
  267. if (lastCp !== CharCodes.SEMI) {
  268. this.errors.missingSemicolonAfterCharacterReference();
  269. }
  270. this.errors.validateNumericCharacterReference(this.result);
  271. }
  272. return this.consumed;
  273. }
  274. /**
  275. * Parses a named entity.
  276. *
  277. * Equivalent to the `Named character reference state` in the HTML spec.
  278. *
  279. * @param input The string containing the entity (or a continuation of the entity).
  280. * @param offset The current offset.
  281. * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
  282. */
  283. private stateNamedEntity(input: string, offset: number): number {
  284. const { decodeTree } = this;
  285. let current = decodeTree[this.treeIndex];
  286. // The mask is the number of bytes of the value, including the current byte.
  287. let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
  288. for (; offset < input.length; offset++, this.excess++) {
  289. const char = input.charCodeAt(offset);
  290. this.treeIndex = determineBranch(
  291. decodeTree,
  292. current,
  293. this.treeIndex + Math.max(1, valueLength),
  294. char,
  295. );
  296. if (this.treeIndex < 0) {
  297. return this.result === 0 ||
  298. // If we are parsing an attribute
  299. (this.decodeMode === DecodingMode.Attribute &&
  300. // We shouldn't have consumed any characters after the entity,
  301. (valueLength === 0 ||
  302. // And there should be no invalid characters.
  303. isEntityInAttributeInvalidEnd(char)))
  304. ? 0
  305. : this.emitNotTerminatedNamedEntity();
  306. }
  307. current = decodeTree[this.treeIndex];
  308. valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
  309. // If the branch is a value, store it and continue
  310. if (valueLength !== 0) {
  311. // If the entity is terminated by a semicolon, we are done.
  312. if (char === CharCodes.SEMI) {
  313. return this.emitNamedEntityData(
  314. this.treeIndex,
  315. valueLength,
  316. this.consumed + this.excess,
  317. );
  318. }
  319. // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
  320. if (this.decodeMode !== DecodingMode.Strict) {
  321. this.result = this.treeIndex;
  322. this.consumed += this.excess;
  323. this.excess = 0;
  324. }
  325. }
  326. }
  327. return -1;
  328. }
  329. /**
  330. * Emit a named entity that was not terminated with a semicolon.
  331. *
  332. * @returns The number of characters consumed.
  333. */
  334. private emitNotTerminatedNamedEntity(): number {
  335. const { result, decodeTree } = this;
  336. const valueLength =
  337. (decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
  338. this.emitNamedEntityData(result, valueLength, this.consumed);
  339. this.errors?.missingSemicolonAfterCharacterReference();
  340. return this.consumed;
  341. }
  342. /**
  343. * Emit a named entity.
  344. *
  345. * @param result The index of the entity in the decode tree.
  346. * @param valueLength The number of bytes in the entity.
  347. * @param consumed The number of characters consumed.
  348. *
  349. * @returns The number of characters consumed.
  350. */
  351. private emitNamedEntityData(
  352. result: number,
  353. valueLength: number,
  354. consumed: number,
  355. ): number {
  356. const { decodeTree } = this;
  357. this.emitCodePoint(
  358. valueLength === 1
  359. ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
  360. : decodeTree[result + 1],
  361. consumed,
  362. );
  363. if (valueLength === 3) {
  364. // For multi-byte values, we need to emit the second byte.
  365. this.emitCodePoint(decodeTree[result + 2], consumed);
  366. }
  367. return consumed;
  368. }
  369. /**
  370. * Signal to the parser that the end of the input was reached.
  371. *
  372. * Remaining data will be emitted and relevant errors will be produced.
  373. *
  374. * @returns The number of characters consumed.
  375. */
  376. end(): number {
  377. switch (this.state) {
  378. case EntityDecoderState.NamedEntity: {
  379. // Emit a named entity if we have one.
  380. return this.result !== 0 &&
  381. (this.decodeMode !== DecodingMode.Attribute ||
  382. this.result === this.treeIndex)
  383. ? this.emitNotTerminatedNamedEntity()
  384. : 0;
  385. }
  386. // Otherwise, emit a numeric entity if we have one.
  387. case EntityDecoderState.NumericDecimal: {
  388. return this.emitNumericEntity(0, 2);
  389. }
  390. case EntityDecoderState.NumericHex: {
  391. return this.emitNumericEntity(0, 3);
  392. }
  393. case EntityDecoderState.NumericStart: {
  394. this.errors?.absenceOfDigitsInNumericCharacterReference(
  395. this.consumed,
  396. );
  397. return 0;
  398. }
  399. case EntityDecoderState.EntityStart: {
  400. // Return 0 if we have no entity.
  401. return 0;
  402. }
  403. }
  404. }
  405. }
  406. /**
  407. * Creates a function that decodes entities in a string.
  408. *
  409. * @param decodeTree The decode tree.
  410. * @returns A function that decodes entities in a string.
  411. */
  412. function getDecoder(decodeTree: Uint16Array) {
  413. let returnValue = "";
  414. const decoder = new EntityDecoder(
  415. decodeTree,
  416. (data) => (returnValue += fromCodePoint(data)),
  417. );
  418. return function decodeWithTrie(
  419. input: string,
  420. decodeMode: DecodingMode,
  421. ): string {
  422. let lastIndex = 0;
  423. let offset = 0;
  424. while ((offset = input.indexOf("&", offset)) >= 0) {
  425. returnValue += input.slice(lastIndex, offset);
  426. decoder.startEntity(decodeMode);
  427. const length = decoder.write(
  428. input,
  429. // Skip the "&"
  430. offset + 1,
  431. );
  432. if (length < 0) {
  433. lastIndex = offset + decoder.end();
  434. break;
  435. }
  436. lastIndex = offset + length;
  437. // If `length` is 0, skip the current `&` and continue.
  438. offset = length === 0 ? lastIndex + 1 : lastIndex;
  439. }
  440. const result = returnValue + input.slice(lastIndex);
  441. // Make sure we don't keep a reference to the final string.
  442. returnValue = "";
  443. return result;
  444. };
  445. }
  446. /**
  447. * Determines the branch of the current node that is taken given the current
  448. * character. This function is used to traverse the trie.
  449. *
  450. * @param decodeTree The trie.
  451. * @param current The current node.
  452. * @param nodeIdx The index right after the current node and its value.
  453. * @param char The current character.
  454. * @returns The index of the next node, or -1 if no branch is taken.
  455. */
  456. export function determineBranch(
  457. decodeTree: Uint16Array,
  458. current: number,
  459. nodeIndex: number,
  460. char: number,
  461. ): number {
  462. const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
  463. const jumpOffset = current & BinTrieFlags.JUMP_TABLE;
  464. // Case 1: Single branch encoded in jump offset
  465. if (branchCount === 0) {
  466. return jumpOffset !== 0 && char === jumpOffset ? nodeIndex : -1;
  467. }
  468. // Case 2: Multiple branches encoded in jump table
  469. if (jumpOffset) {
  470. const value = char - jumpOffset;
  471. return value < 0 || value >= branchCount
  472. ? -1
  473. : decodeTree[nodeIndex + value] - 1;
  474. }
  475. // Case 3: Multiple branches encoded in dictionary
  476. // Binary search for the character.
  477. let lo = nodeIndex;
  478. let hi = lo + branchCount - 1;
  479. while (lo <= hi) {
  480. const mid = (lo + hi) >>> 1;
  481. const midValue = decodeTree[mid];
  482. if (midValue < char) {
  483. lo = mid + 1;
  484. } else if (midValue > char) {
  485. hi = mid - 1;
  486. } else {
  487. return decodeTree[mid + branchCount];
  488. }
  489. }
  490. return -1;
  491. }
  492. const htmlDecoder = /* #__PURE__ */ getDecoder(htmlDecodeTree);
  493. const xmlDecoder = /* #__PURE__ */ getDecoder(xmlDecodeTree);
  494. /**
  495. * Decodes an HTML string.
  496. *
  497. * @param htmlString The string to decode.
  498. * @param mode The decoding mode.
  499. * @returns The decoded string.
  500. */
  501. export function decodeHTML(
  502. htmlString: string,
  503. mode: DecodingMode = DecodingMode.Legacy,
  504. ): string {
  505. return htmlDecoder(htmlString, mode);
  506. }
  507. /**
  508. * Decodes an HTML string in an attribute.
  509. *
  510. * @param htmlAttribute The string to decode.
  511. * @returns The decoded string.
  512. */
  513. export function decodeHTMLAttribute(htmlAttribute: string): string {
  514. return htmlDecoder(htmlAttribute, DecodingMode.Attribute);
  515. }
  516. /**
  517. * Decodes an HTML string, requiring all entities to be terminated by a semicolon.
  518. *
  519. * @param htmlString The string to decode.
  520. * @returns The decoded string.
  521. */
  522. export function decodeHTMLStrict(htmlString: string): string {
  523. return htmlDecoder(htmlString, DecodingMode.Strict);
  524. }
  525. /**
  526. * Decodes an XML string, requiring all entities to be terminated by a semicolon.
  527. *
  528. * @param xmlString The string to decode.
  529. * @returns The decoded string.
  530. */
  531. export function decodeXML(xmlString: string): string {
  532. return xmlDecoder(xmlString, DecodingMode.Strict);
  533. }
  534. // Re-export for use by eg. htmlparser2
  535. export { htmlDecodeTree } from "./generated/decode-data-html.js";
  536. export { xmlDecodeTree } from "./generated/decode-data-xml.js";
  537. export {
  538. decodeCodePoint,
  539. replaceCodePoint,
  540. fromCodePoint,
  541. } from "./decode-codepoint.js";