single-byte.js 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. import { assertU8, E_STRING } from './fallback/_utils.js'
  2. import { nativeDecoderLatin1, nativeEncoder } from './fallback/platform.js'
  3. import { encodeAscii, encodeAsciiPrefix, encodeLatin1 } from './fallback/latin1.js'
  4. import { assertEncoding, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
  const { TextDecoder, btoa } = globalThis

  // Memoized result of the native windows-1252 probe; stays undefined until the first probe runs
  let windows1252works

  // Encodings that never go through the native TextDecoder
  // prettier-ignore
  const skipNative = new Set([
    'iso-8859-1', 'iso-8859-9', 'iso-8859-11', // non-WHATWG
    'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
    'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
  ])

  /**
   * Decides whether the native TextDecoder should be used for the given encoding.
   *
   * @param {string} enc - encoding name
   * @returns {boolean} false for anything listed in skipNative; for 'windows-1252' the
   *   (cached) outcome of a one-time decoding probe; true otherwise
   */
  function shouldUseNative(enc) {
    if (enc !== 'windows-1252') return !skipNative.has(enc)

    // https://issues.chromium.org/issues/468458388
    // Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
    // This is the most significant single-byte encoding, 'ascii' and 'latin1' alias to this
    // Even after Chrome bug is fixed, this should serve as a quick correctness check that it's actually windows-1252
    if (windows1252works !== undefined) return windows1252works

    let probeOk = false
    try {
      const probe = new Uint8Array(9) // using 9 bytes is significant to catch the bug
      probe[8] = 0x80
      // Byte 0x80 has to come out as U+20AC, anything else means the native decoder is not trusted
      probeOk = new TextDecoder(enc).decode(probe).codePointAt(8) === 0x20ac
    } catch {}

    windows1252works = probeOk
    return probeOk
  }
  /**
   * Builds a function that decodes a Uint8Array in the given single-byte encoding to a string.
   *
   * @param {string} encoding - encoding name, validated with assertEncoding
   * @param {boolean} [loose=false] - when false, the native decoder is created with `fatal: true`;
   *   the flag is also handed to the JS fallback decoder on every call
   * @returns {(arr: Uint8Array) => string} decoder; input is checked with assertU8 and
   *   zero-length input yields '' without touching the underlying decoder
   * @throws {TypeError} if `loose` is not a boolean
   */
  export function createSinglebyteDecoder(encoding, loose = false) {
    if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
    assertEncoding(encoding)

    let native = null
    if (nativeDecoderLatin1 && shouldUseNative(encoding)) {
      // Construction may throw, as not all encodings might be implemented in all engines
      // which have native TextDecoder; in that case the JS decoder below takes over
      try {
        native = new TextDecoder(encoding, { fatal: !loose })
      } catch {}
    }

    if (native !== null) {
      return (arr) => {
        assertU8(arr)
        return arr.byteLength === 0 ? '' : native.decode(arr)
      }
    }

    const fallback = encodingDecoder(encoding)
    return (arr) => {
      assertU8(arr)
      return arr.byteLength === 0 ? '' : fallback(arr, loose)
    }
  }
  // Matches any UTF-16 code unit above 0xFF
  const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex

  /**
   * Encodes a string into a new Uint8Array through a lookup table.
   *
   * @param {string} s - input string, processed one UTF-16 code unit at a time
   * @param {ArrayLike<number>} m - table indexed by code unit that yields the output byte;
   *   a falsy entry marks the code unit as unmapped, except for code unit 0 itself
   * @returns {Uint8Array|null} one byte per code unit, or null as soon as an unmapped unit is seen
   */
  function encode(s, m) {
    const total = s.length
    const out = new Uint8Array(total)
    // Without a native encoder, encodeAsciiPrefix handles the start of the string first
    // (presumably the leading ASCII run - confirm in latin1.js) and its return value is the resume index
    let pos = nativeEncoder ? 0 : encodeAsciiPrefix(out, s)

    // Main loop, four code units per iteration
    const unrolledEnd = total - 3
    while (pos < unrolledEnd) {
      const u0 = s.charCodeAt(pos)
      const u1 = s.charCodeAt(pos + 1)
      const u2 = s.charCodeAt(pos + 2)
      const u3 = s.charCodeAt(pos + 3)
      const b0 = m[u0]
      const b1 = m[u1]
      const b2 = m[u2]
      const b3 = m[u3]
      // A falsy byte is only legitimate for code unit 0
      if ((u0 && !b0) || (u1 && !b1) || (u2 && !b2) || (u3 && !b3)) return null
      out[pos] = b0
      out[pos + 1] = b1
      out[pos + 2] = b2
      out[pos + 3] = b3
      pos += 4
    }

    // Tail, at most three code units
    while (pos < total) {
      const unit = s.charCodeAt(pos)
      const byte = m[unit]
      if (unit && !byte) return null
      out[pos] = byte
      pos++
    }

    return out
  }
  // fromBase64+btoa path is faster on everything where fromBase64 is fast
  const useLatin1btoa = Uint8Array.fromBase64 && btoa

  /**
   * Builds a function that encodes a string into bytes of the given single-byte encoding.
   *
   * @param {string} encoding - encoding name; encodeMap(encoding) asserts it
   * @param {{ mode?: string }} [options] - only `mode: 'fatal'` (the default) is supported
   * @returns {(s: string) => Uint8Array} encoder; throws TypeError(E_STRING) on non-string input
   *   and TypeError(E_STRICT) when the input can not be represented
   * @throws {Error} 'Unsupported mode' for any mode other than 'fatal'
   */
  export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
    // TODO: replacement, truncate (replacement will need varying length)
    if (mode !== 'fatal') throw new Error('Unsupported mode')
    const m = encodeMap(encoding) // asserts

    // No single-byte encoder produces surrogate pairs, so any surrogate is invalid
    // This needs special treatment only to decide how many replacement chars to output, one or two
    // Not much use in running isWellFormed, most likely cause of error is unmapped chars, not surrogate pairs

    if (encoding === 'iso-8859-1') {
      return (s) => {
        if (typeof s !== 'string') throw new TypeError(E_STRING)
        const size = s.length
        // max limit is to not produce base64 strings that are too long
        if (useLatin1btoa && size >= 1024 && size < 1e8) {
          try {
            return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
          } catch {
            throw new TypeError(E_STRICT)
          }
        }

        if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
        return encodeLatin1(s)
      }
    }

    return (s) => {
      if (typeof s !== 'string') throw new TypeError(E_STRING)
      // Instead of an ASCII regex check, encode optimistically - this is faster
      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
      if (nativeEncoder && !NON_LATIN.test(s)) {
        try {
          return encodeAscii(s, E_STRICT)
        } catch {}
      }

      // Table-driven path; encode() signals an unmapped character with null
      const bytes = encode(s, m)
      if (bytes) return bytes
      throw new TypeError(E_STRICT)
    }
  }
  // Ready-made converters for the two most common encodings, created once at module load.
  // Each factory call carries a PURE annotation (a bundler hint that the call may be dropped when unused).
  const latin1toString = /* @__PURE__ */ createSinglebyteDecoder('iso-8859-1')
  const latin1fromString = /* @__PURE__ */ createSinglebyteEncoder('iso-8859-1')
  const windows1252toString = /* @__PURE__ */ createSinglebyteDecoder('windows-1252')
  const windows1252fromString = /* @__PURE__ */ createSinglebyteEncoder('windows-1252')

  export { latin1toString, latin1fromString, windows1252toString, windows1252fromString }