// single-byte.node.js (4.4 KB)

  1. import { isAscii } from 'node:buffer'
  2. import { assertU8, toBuf, E_STRING } from './fallback/_utils.js'
  3. import { isDeno, isLE } from './fallback/platform.js'
  4. import { asciiPrefix } from './fallback/latin1.js'
  5. import { encodingMapper, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
  /**
   * Scan `arr` from `start` for the first byte in the 0x80–0x9F range, i.e. a
   * byte whose top three bits are `100`.
   *
   * @param {ArrayLike<number>} arr - Indexable sequence of byte values.
   * @param {number} start - Index to start scanning from; coerced with `| 0`.
   * @returns {number} Index of the first byte in 0x80–0x9F at or after `start`,
   *   or `arr.length` when there is none.
   */
  function latin1Prefix(arr, start) {
    let p = start | 0
    const length = arr.length
    // Main loop is unrolled to test four bytes per iteration
    for (const len3 = length - 3; p < len3; p += 4) {
      if ((arr[p] & 0xe0) === 0x80) return p
      if ((arr[p + 1] & 0xe0) === 0x80) return p + 1
      if ((arr[p + 2] & 0xe0) === 0x80) return p + 2
      if ((arr[p + 3] & 0xe0) === 0x80) return p + 3
    }

    // Tail: the remaining (up to three) bytes, one at a time
    for (; p < length; p++) {
      if ((arr[p] & 0xe0) === 0x80) return p
    }

    return length
  }
  /**
   * Build a decoder function for a single-byte encoding.
   *
   * The returned function validates its input with `assertU8`, returns '' for
   * empty input, and otherwise decodes the bytes to a string.
   *
   * @param {string} encoding - Encoding name; validated by `encodingDecoder`
   *   (Deno) or `encodingMapper` (otherwise).
   * @param {boolean} [loose=false] - When false, decoding throws if the mapping
   *   is flagged `incomplete` and the output contains U+FFFD.
   * @returns {(arr: Uint8Array) => string} The decoder.
   * @throws {TypeError} If `loose` is not a boolean.
   */
  export function createSinglebyteDecoder(encoding, loose = false) {
    if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')

    if (isDeno) {
      const jsDecoder = encodingDecoder(encoding) // asserts
      return (arr) => {
        assertU8(arr)
        if (arr.byteLength === 0) return ''
        if (isAscii(arr)) return toBuf(arr).toString()
        return jsDecoder(arr, loose) // somewhy faster on Deno anyway, TODO: optimize?
      }
    }

    const isLatin1 = encoding === 'iso-8859-1'
    // For windows-1252 the latin1-decodable prefix is extended past ASCII up to
    // the first byte in 0x80–0x9F (see latin1Prefix)
    const latin1path = encoding === 'windows-1252'
    const { incomplete, mapper } = encodingMapper(encoding) // asserts

    return (arr) => {
      assertU8(arr)
      if (arr.byteLength === 0) return ''
      if (isLatin1 || isAscii(arr)) return toBuf(arr).latin1Slice() // .latin1Slice is faster than .asciiSlice

      // Node.js TextDecoder is broken, so we can't use it. It's also slow anyway
      let prefixBytes = asciiPrefix(arr)
      let prefix = ''
      if (latin1path) prefixBytes = latin1Prefix(arr, prefixBytes)

      // Only slice the prefix natively when it is long enough (> 64 bytes) or covers the whole input
      if (prefixBytes > 64 || prefixBytes === arr.length) {
        prefix = toBuf(arr).latin1Slice(0, prefixBytes) // .latin1Slice is faster than .asciiSlice
        if (prefixBytes === arr.length) return prefix
      }

      const b = toBuf(mapper(arr, prefix.length)) // prefix.length can mismatch prefixBytes
      if (!isLE) b.swap16() // ucs2Slice reads little-endian code units
      const suffix = b.ucs2Slice(0, b.byteLength)
      if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
      return prefix + suffix
    }
  }
  // Matches any UTF-16 code unit outside the U+0000–U+00FF range
  const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
  /**
   * Map every UTF-16 code unit of `s` through the lookup table `m`.
   *
   * @param {string} s - String to encode.
   * @param {ArrayLike<number>} m - Table indexed by UTF-16 code unit. A falsy
   *   entry for a non-zero code unit is treated as "unmappable".
   * @returns {Uint8Array|null} The mapped bytes, or `null` if any non-zero code
   *   unit has a falsy entry in `m`.
   */
  function encode(s, m) {
    const len = s.length
    let i = 0
    const b = Buffer.from(s, 'utf-16le') // aligned
    if (!isLE) b.swap16() // make the bytes native-endian for the Uint16Array view
    const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)

    // Main loop is unrolled to map four code units per iteration, in place
    for (const len3 = len - 3; i < len3; i += 4) {
      const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
      const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
      // Cheap all-truthy check first; only on failure look for a falsy result from a non-zero unit
      if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
      x[i] = c0
      x[i + 1] = c1
      x[i + 2] = c2
      x[i + 3] = c3
    }

    // Tail: the remaining (up to three) code units
    for (; i < len; i++) {
      const x0 = x[i]
      const c0 = m[x0]
      if (!c0 && x0) return null
      x[i] = c0
    }

    // Copies element-wise, narrowing each 16-bit slot to one byte
    return new Uint8Array(x)
  }
  /**
   * Build an encoder function for a single-byte encoding.
   *
   * @param {string} encoding - Encoding name; validated by `encodeMap`.
   * @param {{ mode?: string }} [options] - Only `mode: 'fatal'` is supported.
   * @returns {(s: string) => Uint8Array} The encoder. It throws
   *   `TypeError(E_STRING)` for non-string input and `TypeError(E_STRICT)` when
   *   the string cannot be encoded.
   * @throws {Error} If `mode` is anything other than 'fatal'.
   */
  export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
    // TODO: replacement, truncate (replacement will need varying length)
    if (mode !== 'fatal') throw new Error('Unsupported mode')
    const m = encodeMap(encoding) // asserts
    const isLatin1 = encoding === 'iso-8859-1'

    return (s) => {
      if (typeof s !== 'string') throw new TypeError(E_STRING)

      if (isLatin1) {
        if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
        const b = Buffer.from(s, 'latin1')
        return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
      }

      // Instead of an ASCII regex check, encode optimistically - this is faster
      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
      if (!NON_LATIN.test(s)) {
        const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
        // Equal lengths mean every character took one UTF-8 byte, i.e. the string is pure ASCII
        if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
      }

      const res = encode(s, m)
      if (!res) throw new TypeError(E_STRICT)
      return res
    }
  }
  // Ready-made decoders/encoders for iso-8859-1 and windows-1252, built with the default options
  export const latin1toString = /* @__PURE__ */ createSinglebyteDecoder('iso-8859-1')
  export const latin1fromString = /* @__PURE__ */ createSinglebyteEncoder('iso-8859-1')
  export const windows1252toString = /* @__PURE__ */ createSinglebyteDecoder('windows-1252')
  export const windows1252fromString = /* @__PURE__ */ createSinglebyteEncoder('windows-1252')