// latin1.js (5.7 KB)
  1. import {
  2. nativeEncoder,
  3. nativeDecoder,
  4. nativeDecoderLatin1,
  5. nativeBuffer,
  6. encodeCharcodes,
  7. isHermes,
  8. isDeno,
  9. isLE,
  10. } from './platform.js'
  // Globals are looked up once, at module load. The `@__PURE__` IIFE wrappers mark the lookups
  // as side-effect free so that bundlers may drop them when the results are unused.
  const atob = /* @__PURE__ */ (() => globalThis.atob)()
  const web64 = /* @__PURE__ */ (() => Uint8Array.prototype.toBase64)()

  // Largest number of elements handed to a single `String.fromCharCode.apply` call (0x2000 = 8192)
  // See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
  // On Hermes, actual max is 0x20_000 minus current stack depth, 1/16 of that should be safe
  const maxFunctionArgs = 0x20_00

  // toBase64+atob path is faster on everything where fromBase64 is fast
  // Truthy only when both `Uint8Array.prototype.toBase64` and `atob` are available
  const useLatin1atob = web64 && atob
  /**
   * Returns the length of the longest all-ASCII prefix of a byte array.
   *
   * Arrays longer than 64 bytes are scanned through an aligned `Uint32Array` view over the same
   * buffer (four words, then one word at a time); the exact position of the first non-ASCII byte
   * is then pinned down by a plain byte loop.
   *
   * @param {Uint8Array} arr - bytes to scan; `arr.buffer`, `arr.byteOffset` and `arr.byteLength`
   *   are read on the fast path, so one-byte elements are expected
   * @returns {number} index of the first byte >= 0x80, or `arr.length` when every byte is ASCII
   */
  export function asciiPrefix(arr) {
    let p = 0 // verified ascii bytes
    const length = arr.length
    // Threshold tested on Hermes (worse on <=48, better on >=52)
    // Also on v8 arrs of size <=64 might be on heap and using Uint32Array on them is unoptimal
    if (length > 64) {
      // Speedup with u32
      const u32start = (4 - (arr.byteOffset & 3)) % 4 // offset start by this many bytes for alignment
      for (; p < u32start; p++) if (arr[p] >= 0x80) return p
      const u32length = ((arr.byteLength - u32start) / 4) | 0
      const u32 = new Uint32Array(arr.buffer, arr.byteOffset + u32start, u32length)
      let i = 0
      // Four words (16 bytes) per iteration; on a hit, p and i stay at the start of the group
      for (const last3 = u32length - 3; ; p += 16, i += 4) {
        if (i >= last3) break // loop is fast enough for moving this here to be _very_ useful, likely due to array access checks
        const a = u32[i]
        const b = u32[i + 1]
        const c = u32[i + 2]
        const d = u32[i + 3]
        // "(a | b | c | d) & mask" is slower on Hermes though faster on v8
        if (a & 0x80_80_80_80 || b & 0x80_80_80_80 || c & 0x80_80_80_80 || d & 0x80_80_80_80) break
      }

      // One word at a time: covers the leftover words and narrows a hit down to a single word
      for (; i < u32length; p += 4, i++) if (u32[i] & 0x80_80_80_80) break
    }

    // Byte by byte: covers short inputs, the unaligned tail, and the word that contained a hit
    for (; p < length; p++) if (arr[p] >= 0x80) return p
    return length
  }
  /**
   * Turns every element of `arr[start..stop)` into the string character with that code.
   * Capable of decoding Uint16Array to UTF-16 as well as Uint8Array to Latin-1.
   *
   * @param {Uint8Array|Uint16Array} arr - source elements
   * @param {number} [start=0] - first index to decode (coerced to int32)
   * @param {number} [stop=arr.length] - index to stop before (coerced to int32)
   * @returns {string} one character per element; `''` when the range is empty
   */
  export function decodeLatin1(arr, start = 0, stop = arr.length) {
    start |= 0
    stop |= 0
    const total = stop - start
    if (total === 0) return ''

    // Native toBase64 + atob path: only for one-byte arrays whose toBase64 is the native method,
    // and only inside the size window [256, 1e8)
    if (
      useLatin1atob &&
      total >= 256 &&
      total < 1e8 &&
      arr.toBase64 === web64 &&
      arr.BYTES_PER_ELEMENT === 1
    ) {
      const sliced = start === 0 && stop === arr.length ? arr : arr.subarray(start, stop)
      return atob(sliced.toBase64())
    }

    // Too many elements for one apply() call: convert in chunks of at most maxFunctionArgs
    if (total > maxFunctionArgs) {
      let prefix = ''
      for (let i = start; i < stop; ) {
        const i1 = Math.min(stop, i + maxFunctionArgs)
        prefix += String.fromCharCode.apply(String, arr.subarray(i, i1))
        i = i1
      }

      return prefix
    }

    // subarray() is skipped when the whole array is requested
    const sliced = start === 0 && stop === arr.length ? arr : arr.subarray(start, stop)
    return String.fromCharCode.apply(String, sliced)
  }
  /**
   * Decodes the first `stop` UTF-16 code units of a Uint16Array into a string.
   * Unchecked for well-formedness, raw. Expects Uint16Array input.
   *
   * The implementation is picked once at module load: with a native Buffer on a little-endian,
   * non-Deno runtime, inputs longer than 32 elements go through `ucs2Slice()`; everything else
   * is delegated to `decodeLatin1`.
   *
   * @param {Uint16Array} u16 - code units to decode
   * @param {number} [stop=u16.length] - number of leading elements to decode
   * @returns {string} the decoded string
   */
  export const decodeUCS2 =
    nativeBuffer && isLE && !isDeno
      ? (u16, stop = u16.length) => {
          // TODO: fast path for BE, perhaps faster path for Deno. Note that decoder replaces, this function doesn't
          if (stop > 32) return nativeBuffer.from(u16.buffer, u16.byteOffset, stop * 2).ucs2Slice() // from 64 bytes, below are in heap
          return decodeLatin1(u16, 0, stop)
        }
      : (u16, stop = u16.length) => decodeLatin1(u16, 0, stop)
  /**
   * Decodes bytes that the caller already knows to be ASCII into a string.
   * Does not check input, uses best available method, chosen once at module load:
   *  - native Buffer: `latin1Slice` for inputs of at least 0x300 bytes outside Deno,
   *    otherwise the utf8 `nativeDecoder`
   *  - else `nativeDecoderLatin1`, when present
   *  - else `decodeLatin1` over a Uint8Array view of the input
   *
   * Building an array for this is only faster than proper string concatenation when TextDecoder
   * or native Buffer are available.
   *
   * @param {Uint8Array|ArrayBufferView} a - ASCII bytes
   * @returns {string} the decoded string
   */
  export const decodeAscii = nativeBuffer
    ? (a) =>
        // Buffer is faster on Node.js (but only for long enough data), if we know that output is ascii
        a.byteLength >= 0x3_00 && !isDeno
          ? nativeBuffer.from(a.buffer, a.byteOffset, a.byteLength).latin1Slice(0, a.byteLength) // .latin1Slice is faster than .asciiSlice
          : nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
    : nativeDecoderLatin1
      ? (a) => nativeDecoderLatin1.decode(a) // On browsers (specifically WebKit), latin1 decoder is faster than utf8
      : (a) =>
          decodeLatin1(
            // Non-Uint8Array views are re-wrapped so that decodeLatin1 sees individual bytes
            a instanceof Uint8Array ? a : new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
          )
  /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
  /**
   * Copies the leading ASCII char codes of `s` into `x`, four characters at a time.
   *
   * Stops at the first group of four that contains a code >= 128, and never touches the last
   * 1-3 characters that do not fill a whole group, so the caller has to handle whatever follows
   * the returned index.
   *
   * @param {Uint8Array} x - destination, written in place at the same indices as in `s`
   * @param {string} s - source string
   * @returns {number} number of characters copied (always a multiple of 4)
   */
  export function encodeAsciiPrefix(x, s) {
    let i = 0
    for (const len3 = s.length - 3; i < len3; i += 4) {
      const x0 = s.charCodeAt(i)
      const x1 = s.charCodeAt(i + 1)
      const x2 = s.charCodeAt(i + 2)
      const x3 = s.charCodeAt(i + 3)
      // A single OR is enough: any code >= 128 pushes the combined value to >= 128 as well
      if ((x0 | x1 | x2 | x3) >= 128) break
      x[i] = x0
      x[i + 1] = x1
      x[i + 2] = x2
      x[i + 3] = x3
    }

    return i
  }
  /* eslint-enable @exodus/mutable/no-param-reassign-prop-only */
  /**
   * Encodes a string into a new Uint8Array with one element per UTF-16 code unit,
   * by delegating to `encodeCharcodes`.
   * Warning: can be used only on checked strings, converts strings to 8-bit.
   *
   * @param {string} str - string to encode; the caller must have validated it beforehand
   * @returns {Uint8Array} whatever `encodeCharcodes` returns for the freshly allocated array
   */
  export const encodeLatin1 = (str) => encodeCharcodes(str, new Uint8Array(str.length))
  // Expects nativeEncoder to be present
  // encodeInto is only preferred on Hermes; the lookup runs once at module load
  const useEncodeInto = /* @__PURE__ */ (() => isHermes && nativeEncoder?.encodeInto)()

  /**
   * Encodes a string that must be pure ASCII into bytes, rejecting anything else.
   *
   * All three variants encode as UTF-8 and then compare lengths: a non-ASCII code unit never
   * encodes to exactly one byte, so any length mismatch means the input was not ASCII.
   *
   * @param {string} str - string to encode
   * @param {string} ERR - message for the error thrown on non-ASCII input
   * @returns {Uint8Array} one byte per character of `str`
   * @throws {SyntaxError} with message `ERR` when `str` contains non-ASCII characters
   */
  export const encodeAscii = useEncodeInto
    ? (str, ERR) => {
        // Much faster in Hermes
        const codes = new Uint8Array(str.length + 4) // overshoot by a full utf8 char
        const info = nativeEncoder.encodeInto(str, codes)
        if (info.read !== str.length || info.written !== str.length) throw new SyntaxError(ERR) // non-ascii
        // Drop the overshoot; the result is a view over the slightly larger allocation
        return codes.subarray(0, str.length)
      }
    : nativeBuffer
      ? (str, ERR) => {
          // TextEncoder is slow on Node.js 24 / 25 (was ok on 22)
          const codes = nativeBuffer.from(str, 'utf8') // ascii/latin1 coerces, we need to check
          if (codes.length !== str.length) throw new SyntaxError(ERR) // non-ascii
          // Hand back a plain Uint8Array view over the same memory rather than the Buffer itself
          return new Uint8Array(codes.buffer, codes.byteOffset, codes.byteLength)
        }
      : (str, ERR) => {
          const codes = nativeEncoder.encode(str)
          if (codes.length !== str.length) throw new SyntaxError(ERR) // non-ascii
          return codes
        }