utf8.js 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import { typedView } from './array.js'
  2. import { assertU8, E_STRING, E_STRICT_UNICODE } from './fallback/_utils.js'
  3. import { nativeDecoder, nativeEncoder } from './fallback/platform.js'
  4. import * as js from './fallback/utf8.auto.js'
  // Decoder instances shared by decode() below.
  // With ignoreBOM: true a leading BOM is kept in the decoded output rather than
  // being dropped; nothing should be stripped from the input unexpectedly.
  // The loose decoder is the platform-provided instance itself
  // (NOTE(review): presumably created with ignoreBOM: true as well; confirm in ./fallback/platform.js).
  const decoderLoose = nativeDecoder
  // The strict decoder additionally sets fatal: true; it only exists when a native decoder does.
  const decoderFatal = nativeDecoder ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : null
  // Captured once; undefined on engines without String.prototype.isWellFormed.
  const isWellFormed = String.prototype.isWellFormed
  /**
   * Post-validates bytes produced by the native encoder for strict mode.
   *
   * @param {string} str - The string that was encoded.
   * @param {boolean} loose - When truthy, `res` is returned without any validation.
   * @param {Uint8Array} res - The encoded bytes for `str`.
   * @returns {Uint8Array} `res`, unchanged.
   * @throws {TypeError} With E_STRICT_UNICODE when strict validation rejects `str`.
   */
  function deLoose(str, loose, res) {
    if (loose) return res
    // Equal lengths happen only for pure-ASCII input, which is always fine
    if (str.length === res.length) return res

    if (isWellFormed) {
      // Fast native check is available
      if (!isWellFormed.call(str)) throw new TypeError(E_STRICT_UNICODE)
      return res
    }

    // No native check: look for EF BF BD (the 3-byte encoding of U+FFFD) in the output
    const limit = res.length - 3 // last index at which a 3-byte sequence can start
    for (let from = 0; from <= limit; ) {
      const at = res.indexOf(0xef, from)
      if (at === -1 || at > limit) return res
      from = at + 1
      if (res[at + 1] !== 0xbf || res[at + 2] !== 0xbd) continue

      // A replacement char is present in the output; it may have been in the input already,
      // or it may have been substituted by the encoder, so recheck the input itself
      const viaEncodeURI = js.decodeFast && !nativeDecoder && str.length < 1e7
      if (viaEncodeURI) {
        // This is ~2x faster than decode in Hermes
        try {
          if (encodeURI(str) !== null) return res // comparison guards against optimizing the call out
        } catch {
          // Deliberately ignored: a failing encodeURI falls through to the TypeError below
        }
      } else if (str === decode(res)) {
        // Round-trip matched, so the replacement char was really part of the input
        return res
      }

      throw new TypeError(E_STRICT_UNICODE)
    }

    return res
  }
  /**
   * Encodes a string to UTF-8 bytes.
   *
   * @param {string} str - Input string.
   * @param {boolean} [loose=false] - Forwarded to deLoose / js.encode to select loose or strict handling.
   * @returns {Uint8Array} The encoded bytes (a fresh empty array for an empty string).
   * @throws {TypeError} With E_STRING when `str` is not a string.
   */
  function encode(str, loose = false) {
    if (typeof str !== 'string') throw new TypeError(E_STRING)
    if (str === '') return new Uint8Array() // faster than Uint8Array.of

    // The JS implementation is used only when there is no native encoder and a JS one is present.
    // No reason to use unescape + encodeURIComponent: it's slower than JS on normal engines,
    // and modern Hermes already has TextEncoder
    const preferJs = !nativeEncoder && js.encode
    if (preferJs) return js.encode(str, loose)

    const bytes = nativeEncoder.encode(str)
    return deLoose(str, loose, bytes)
  }
  /**
   * Decodes UTF-8 bytes to a string.
   *
   * @param {Uint8Array} arr - Bytes to decode; checked with assertU8 first.
   * @param {boolean} [loose=false] - Selects the loose decoder instead of the fatal one
   *   (or is forwarded to js.decodeFast).
   * @returns {string} The decoded string ('' for zero-length input).
   */
  function decode(arr, loose = false) {
    assertU8(arr)
    if (arr.byteLength === 0) return ''

    // The JS implementation is used only when there is no native decoder and a JS one is present
    const preferJs = !nativeDecoder && js.decodeFast
    if (preferJs) return js.decodeFast(arr, loose)

    // Node.js and browsers
    const decoder = loose ? decoderLoose : decoderFatal
    return decoder.decode(arr)
  }
  /**
   * Strictly encodes a string to UTF-8 (calls encode with loose = false) and hands the
   * bytes to typedView together with `format`.
   *
   * @param {string} str - String to encode.
   * @param {string} [format='uint8'] - Output format passed through to typedView.
   * @returns {*} Whatever typedView returns for the encoded bytes and `format`.
   */
  export const utf8fromString = (str, format = 'uint8') => {
    const bytes = encode(str, false)
    return typedView(bytes, format)
  }
  /**
   * Loosely encodes a string to UTF-8 (calls encode with loose = true) and hands the
   * bytes to typedView together with `format`.
   *
   * @param {string} str - String to encode.
   * @param {string} [format='uint8'] - Output format passed through to typedView.
   * @returns {*} Whatever typedView returns for the encoded bytes and `format`.
   */
  export const utf8fromStringLoose = (str, format = 'uint8') => {
    const bytes = encode(str, true)
    return typedView(bytes, format)
  }
  /**
   * Strictly decodes UTF-8 bytes to a string (calls decode with loose = false).
   *
   * @param {Uint8Array} arr - Bytes to decode.
   * @returns {string} The decoded string.
   */
  export const utf8toString = (arr) => {
    return decode(arr, false)
  }
  /**
   * Loosely decodes UTF-8 bytes to a string (calls decode with loose = true).
   *
   * @param {Uint8Array} arr - Bytes to decode.
   * @returns {string} The decoded string.
   */
  export const utf8toStringLoose = (arr) => {
    return decode(arr, true)
  }