html-encoding-sniffer.js 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. "use strict";
  2. const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js");
  3. // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
  4. module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => {
  5. if (defaultEncoding === undefined) {
  6. defaultEncoding = xml ? "UTF-8" : "windows-1252";
  7. }
  8. let encoding = labelToName(getBOMEncoding(uint8Array));
  9. if (encoding === null && transportLayerEncodingLabel !== undefined) {
  10. encoding = labelToName(transportLayerEncodingLabel);
  11. }
  12. if (encoding === null && !xml) {
  13. encoding = prescanMetaCharset(uint8Array);
  14. }
  15. if (encoding === null) {
  16. encoding = defaultEncoding;
  17. }
  18. return encoding;
  19. };
  20. // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
  21. function prescanMetaCharset(uint8Array) {
  22. const l = Math.min(uint8Array.byteLength, 1024);
  23. for (let i = 0; i < l; i++) {
  24. let c = uint8Array[i];
  25. if (c === 0x3C) {
  26. // "<"
  27. const c1 = uint8Array[i + 1];
  28. const c2 = uint8Array[i + 2];
  29. const c3 = uint8Array[i + 3];
  30. const c4 = uint8Array[i + 4];
  31. const c5 = uint8Array[i + 5];
  32. // !-- (comment start)
  33. if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
  34. i += 4;
  35. for (; i < l; i++) {
  36. c = uint8Array[i];
  37. const cMinus1 = uint8Array[i - 1];
  38. const cMinus2 = uint8Array[i - 2];
  39. // --> (comment end)
  40. if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) {
  41. break;
  42. }
  43. }
  44. } else if ((c1 === 0x4D || c1 === 0x6D) &&
  45. (c2 === 0x45 || c2 === 0x65) &&
  46. (c3 === 0x54 || c3 === 0x74) &&
  47. (c4 === 0x41 || c4 === 0x61) &&
  48. (isSpaceCharacter(c5) || c5 === 0x2F)) {
  49. // "meta" + space or /
  50. i += 6;
  51. const attributeList = new Set();
  52. let gotPragma = false;
  53. let needPragma = null;
  54. let charset = null;
  55. let attrRes;
  56. do {
  57. attrRes = getAttribute(uint8Array, i, l);
  58. if (attrRes.attr && !attributeList.has(attrRes.attr.name)) {
  59. attributeList.add(attrRes.attr.name);
  60. if (attrRes.attr.name === "http-equiv") {
  61. gotPragma = attrRes.attr.value === "content-type";
  62. } else if (attrRes.attr.name === "content" && !charset) {
  63. charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
  64. if (charset !== null) {
  65. needPragma = true;
  66. }
  67. } else if (attrRes.attr.name === "charset") {
  68. charset = labelToName(attrRes.attr.value);
  69. needPragma = false;
  70. }
  71. }
  72. i = attrRes.i;
  73. } while (attrRes.attr);
  74. if (needPragma === null) {
  75. continue;
  76. }
  77. if (needPragma === true && gotPragma === false) {
  78. continue;
  79. }
  80. if (charset === null) {
  81. continue;
  82. }
  83. if (charset === "UTF-16LE" || charset === "UTF-16BE") {
  84. charset = "UTF-8";
  85. }
  86. if (charset === "x-user-defined") {
  87. charset = "windows-1252";
  88. }
  89. return charset;
  90. } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
  91. // a-z or A-Z
  92. for (i += 2; i < l; i++) {
  93. c = uint8Array[i];
  94. // space or >
  95. if (isSpaceCharacter(c) || c === 0x3E) {
  96. break;
  97. }
  98. }
  99. let attrRes;
  100. do {
  101. attrRes = getAttribute(uint8Array, i, l);
  102. i = attrRes.i;
  103. } while (attrRes.attr);
  104. } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
  105. // ! or / or ?
  106. for (i += 2; i < l; i++) {
  107. c = uint8Array[i];
  108. // >
  109. if (c === 0x3E) {
  110. break;
  111. }
  112. }
  113. }
  114. }
  115. }
  116. return null;
  117. }
  118. // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
  119. function getAttribute(uint8Array, i, l) {
  120. for (; i < l; i++) {
  121. let c = uint8Array[i];
  122. // space or /
  123. if (isSpaceCharacter(c) || c === 0x2F) {
  124. continue;
  125. }
  126. // ">"
  127. if (c === 0x3E) {
  128. break;
  129. }
  130. let name = "";
  131. let value = "";
  132. nameLoop:for (; i < l; i++) {
  133. c = uint8Array[i];
  134. // "="
  135. if (c === 0x3D && name !== "") {
  136. i++;
  137. break;
  138. }
  139. // space
  140. if (isSpaceCharacter(c)) {
  141. for (i++; i < l; i++) {
  142. c = uint8Array[i];
  143. // space
  144. if (isSpaceCharacter(c)) {
  145. continue;
  146. }
  147. // not "="
  148. if (c !== 0x3D) {
  149. return { attr: { name, value }, i };
  150. }
  151. i++;
  152. break nameLoop;
  153. }
  154. break;
  155. }
  156. // / or >
  157. if (c === 0x2F || c === 0x3E) {
  158. return { attr: { name, value }, i };
  159. }
  160. // A-Z
  161. if (c >= 0x41 && c <= 0x5A) {
  162. name += String.fromCharCode(c + 0x20); // lowercase
  163. } else {
  164. name += String.fromCharCode(c);
  165. }
  166. }
  167. c = uint8Array[i];
  168. // space
  169. if (isSpaceCharacter(c)) {
  170. for (i++; i < l; i++) {
  171. c = uint8Array[i];
  172. // space
  173. if (isSpaceCharacter(c)) {
  174. continue;
  175. } else {
  176. break;
  177. }
  178. }
  179. }
  180. // " or '
  181. if (c === 0x22 || c === 0x27) {
  182. const quote = c;
  183. for (i++; i < l; i++) {
  184. c = uint8Array[i];
  185. if (c === quote) {
  186. i++;
  187. return { attr: { name, value }, i };
  188. }
  189. // A-Z
  190. if (c >= 0x41 && c <= 0x5A) {
  191. value += String.fromCharCode(c + 0x20); // lowercase
  192. } else {
  193. value += String.fromCharCode(c);
  194. }
  195. }
  196. }
  197. // >
  198. if (c === 0x3E) {
  199. return { attr: { name, value }, i };
  200. }
  201. // A-Z
  202. if (c >= 0x41 && c <= 0x5A) {
  203. value += String.fromCharCode(c + 0x20); // lowercase
  204. } else {
  205. value += String.fromCharCode(c);
  206. }
  207. for (i++; i < l; i++) {
  208. c = uint8Array[i];
  209. // space or >
  210. if (isSpaceCharacter(c) || c === 0x3E) {
  211. return { attr: { name, value }, i };
  212. }
  213. // A-Z
  214. if (c >= 0x41 && c <= 0x5A) {
  215. value += String.fromCharCode(c + 0x20); // lowercase
  216. } else {
  217. value += String.fromCharCode(c);
  218. }
  219. }
  220. }
  221. return { i };
  222. }
  223. function extractCharacterEncodingFromMeta(string) {
  224. let position = 0;
  225. while (true) {
  226. const indexOfCharset = string.substring(position).search(/charset/ui);
  227. if (indexOfCharset === -1) {
  228. return null;
  229. }
  230. let subPosition = position + indexOfCharset + "charset".length;
  231. while (isSpaceCharacter(string[subPosition].charCodeAt(0))) {
  232. ++subPosition;
  233. }
  234. if (string[subPosition] !== "=") {
  235. position = subPosition - 1;
  236. continue;
  237. }
  238. ++subPosition;
  239. while (isSpaceCharacter(string[subPosition].charCodeAt(0))) {
  240. ++subPosition;
  241. }
  242. position = subPosition;
  243. break;
  244. }
  245. if (string[position] === "\"" || string[position] === "'") {
  246. const nextIndex = string.indexOf(string[position], position + 1);
  247. if (nextIndex !== -1) {
  248. return labelToName(string.substring(position + 1, nextIndex));
  249. }
  250. // It is an unmatched quotation mark
  251. return null;
  252. }
  253. if (string.length === position + 1) {
  254. return null;
  255. }
  256. const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
  257. const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
  258. string.length :
  259. position + indexOfASCIIWhitespaceOrSemicolon + 1;
  260. return labelToName(string.substring(position, end));
  261. }
  262. function isSpaceCharacter(c) {
  263. return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20;
  264. }