// utf16.js (9.1 KB)

  1. import { decodeUCS2 } from './latin1.js'
  2. import { assertU8, E_STRING, E_STRICT_UNICODE } from './_utils.js'
  3. import { nativeDecoder, isLE, encodeCharcodes } from './platform.js'
  /** Message of the TypeError thrown by the decoding paths in this file for ill-formed UTF-16 input. */
  export const E_STRICT = 'Input is not well-formed utf16'

  // Native String.prototype.isWellFormed / toWellFormed, or undefined when the engine lacks them.
  const isWellFormedStr = /* @__PURE__ */ (function () {
    return String.prototype.isWellFormed
  })()
  const toWellFormedStr = /* @__PURE__ */ (function () {
    return String.prototype.toWellFormed
  })()

  // U+FFFD REPLACEMENT CHARACTER as a 16-bit unit, in native and in byte-swapped form.
  const replacementCodepoint = 0xfffd
  const replacementCodepointSwapped = 0xfdff

  // Views the memory behind `a` as 16-bit units, without copying.
  // Requires checked length and alignment!
  const to16 = (a) => {
    const { buffer, byteOffset, byteLength } = a
    return new Uint16Array(buffer, byteOffset, byteLength / 2)
  }
  /**
   * Encodes a string as UTF-16 in the requested representation.
   *
   * @param {string} str - String to encode.
   * @param {boolean} loose - Falsy: lone surrogates are rejected. Truthy: they are replaced with U+FFFD.
   * @param {'uint16' | 'uint8-le' | 'uint8-be'} format - Native-order 16-bit units, or bytes in the given byte order.
   * @returns {Uint16Array | Uint8Array} A Uint16Array for 'uint16', otherwise a Uint8Array view over the encoded units.
   * @throws {TypeError} If `str` is not a string, `format` is unknown, or strict input is not well-formed.
   */
  export function encodeApi(str, loose, format) {
    if (typeof str !== 'string') throw new TypeError(E_STRING)
    const knownFormat = format === 'uint16' || format === 'uint8-le' || format === 'uint8-be'
    if (!knownFormat) throw new TypeError('Unknown format')

    // On v8 and SpiderMonkey, check via isWellFormed is faster than js
    // On JSC, check during loop is faster than isWellFormed
    // If isWellFormed is available, strict input is validated here, before encoding, and the encoder skips its own check
    // If isWellFormed is unavailable, the encoder checks in js while copying
    const prechecked = !loose && isWellFormedStr
    if (prechecked && !isWellFormedStr.call(str)) throw new TypeError(E_STRICT_UNICODE)

    // Bytes inside each unit are swapped only when the requested byte order differs from the platform's
    const shouldSwap = format === (isLE ? 'uint8-be' : 'uint8-le')
    const u16 = encode(str, loose, prechecked, shouldSwap)

    // Bytes are already swapped and format is already checked, we need to just cast the view
    if (format === 'uint16') return u16
    return new Uint8Array(u16.buffer, u16.byteOffset, u16.byteLength)
  }
  // Shared UTF-16 TextDecoder instances, one per byte order and error mode.
  // All are created with ignoreBOM: true; all are null when nativeDecoder is falsy.
  const makeUtf16Decoder = (label, fatal) =>
    nativeDecoder ? new TextDecoder(label, { ignoreBOM: true, fatal }) : null
  const fatalLE = makeUtf16Decoder('utf-16le', true)
  const looseLE = makeUtf16Decoder('utf-16le', false)
  const fatalBE = makeUtf16Decoder('utf-16be', true)
  const looseBE = makeUtf16Decoder('utf-16be', false)
  /**
   * Decodes UTF-16 input to a string using the module-level TextDecoder instances.
   *
   * @param {Uint16Array | Uint8Array} input - 16-bit units for 'uint16', bytes for the 'uint8-*' formats.
   * @param {boolean} loose - Truthy selects the non-fatal decoder, falsy the fatal one.
   * @param {'uint16' | 'uint8-le' | 'uint8-be'} format - Representation of `input`.
   * @returns {string} The decoded string.
   * @throws {TypeError} On a wrong input type, an odd number of bytes, or an unknown format.
   */
  export function decodeApiDecoders(input, loose, format) {
    switch (format) {
      case 'uint16':
        if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
        break
      case 'uint8-le':
      case 'uint8-be':
        assertU8(input)
        if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
        break
      default:
        throw new TypeError('Unknown format')
    }

    // 'uint16' input is in platform byte order
    const littleEndian = format === 'uint8-le' || (format === 'uint16' && isLE)
    let decoder
    if (littleEndian) decoder = loose ? looseLE : fatalLE
    else decoder = loose ? looseBE : fatalBE
    return decoder.decode(input)
  }
  /**
   * Decodes UTF-16 input to a string in JavaScript, without TextDecoder.
   *
   * @param {Uint16Array | Uint8Array} input - 16-bit units for 'uint16', bytes for the 'uint8-*' formats.
   * @param {boolean} loose - Falsy: ill-formed input throws. Truthy: lone surrogates are replaced.
   * @param {'uint16' | 'uint8-le' | 'uint8-be'} format - Representation of `input`.
   * @returns {string} The decoded string.
   * @throws {TypeError} On a wrong input type, an odd number of bytes, an unknown format,
   *   or (strict mode) ill-formed UTF-16.
   */
  export function decodeApiJS(input, loose, format) {
    let u16
    if (format === 'uint16') {
      if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
      u16 = input
    } else if (format === 'uint8-le' || format === 'uint8-be') {
      assertU8(input)
      if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
      u16 = to16input(input, format === 'uint8-le')
    } else {
      throw new TypeError('Unknown format')
    }

    // With native string helpers the units are decoded unchecked and the resulting string is
    // verified (strict) or repaired (loose) afterwards; without them `decode` does the work in js.
    const verifyAfter = !loose && isWellFormedStr
    const repairAfter = loose && toWellFormedStr
    const str = decode(u16, loose, verifyAfter || repairAfter)
    if (verifyAfter && !isWellFormedStr.call(str)) throw new TypeError(E_STRICT)
    return repairAfter ? toWellFormedStr.call(str) : str
  }
  /**
   * Turns UTF-16 bytes into native-order 16-bit units.
   *
   * @param {Uint8Array} u8 - Input bytes. Assume even number of bytes.
   * @param {boolean} le - Whether `u8` holds little-endian units.
   * @returns {Uint16Array} A view over `u8` itself when the byte order already matches the
   *   platform and the offset is 2-byte aligned, otherwise a view over a fresh copy.
   */
  export function to16input(u8, le) {
    // Foreign byte order: swap inside a copy so the caller's bytes stay untouched
    if (le !== isLE) return to16(swap16(Uint8Array.from(u8)))
    // Native byte order: a copy is needed only to fix an odd byte offset
    const aligned = u8.byteOffset % 2 === 0
    return to16(aligned ? u8 : Uint8Array.from(u8))
  }
  /**
   * Decodes native-order UTF-16 units to a string.
   *
   * @param {Uint16Array} u16 - Units to decode; never modified.
   * @param {boolean} [loose=false] - Replace lone surrogates instead of throwing.
   * @param {boolean} [checked=false] - Truthy skips validation entirely (the caller handles it).
   * @returns {string} The decoded string.
   * @throws {TypeError} With message E_STRICT when the input is ill-formed and `loose` is falsy.
   */
  export const decode = (u16, loose = false, checked = false) => {
    const usable = checked || isWellFormed(u16)
    if (usable) return decodeUCS2(u16)
    if (loose) {
      const patched = toWellFormed(Uint16Array.from(u16)) // cloned for replacement
      return decodeUCS2(patched)
    }
    throw new TypeError(E_STRICT)
  }
  /**
   * Encodes a string into a new Uint16Array with one element per UTF-16 code unit.
   *
   * @param {string} str - String to encode.
   * @param {boolean} [loose=false] - Only used when not `checked`: replace lone surrogates instead of throwing.
   * @param {boolean} [checked=false] - Truthy when `str` needs no surrogate validation.
   * @param {boolean} [swapped=false] - Truthy to store every unit with its two bytes exchanged.
   * @returns {Uint16Array} The encoded units.
   */
  export function encode(str, loose = false, checked = false, swapped = false) {
    const units = new Uint16Array(str.length)
    if (checked) {
      if (swapped) return encodeCheckedSwapped(str, units)
      return encodeChecked(str, units)
    }
    if (swapped) return encodeUncheckedSwapped(str, units, loose)
    return encodeUnchecked(str, units, loose)
  }
  81. /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
  // Exchanges the two bytes of every 16-bit unit in place and returns the same array
  // Assumes checked length % 2 === 0, otherwise does not swap tail
  function swap16(u8) {
    const size = u8.length
    let pos = 0

    // Two units (four bytes) per iteration
    const quadEnd = size - 3
    while (pos < quadEnd) {
      const first = u8[pos]
      const third = u8[pos + 2]
      u8[pos] = u8[pos + 1]
      u8[pos + 1] = first
      u8[pos + 2] = u8[pos + 3]
      u8[pos + 3] = third
      pos += 4
    }

    // Whatever whole unit is left over
    const pairEnd = size - 1
    while (pos < pairEnd) {
      const first = u8[pos]
      u8[pos] = u8[pos + 1]
      u8[pos + 1] = first
      pos += 2
    }

    return u8
  }
  // Splitting paths into small functions helps (at least on SpiderMonkey)

  // Copies code units as-is, without validation. Same as encodeLatin1, but with Uint16Array
  function encodeChecked(str, arr) {
    return encodeCharcodes(str, arr)
  }

  // Copies code units without validation, storing each one with its two bytes exchanged
  function encodeCheckedSwapped(str, arr) {
    // TODO: faster path for Hermes? See encodeCharcodes
    const total = str.length
    let at = 0
    while (at < total) {
      const unit = str.charCodeAt(at)
      const low = unit & 0xff
      const high = unit >> 8
      arr[at] = (low << 8) | high
      at++
    }
    return arr
  }
  // lead: d800 - dbff, trail: dc00 - dfff

  /**
   * Copies the code units of `str` into `arr`, validating surrogate pairing on the way.
   *
   * @param {string} str - String to encode.
   * @param {Uint16Array} arr - Destination with at least `str.length` elements; filled in place.
   * @param {boolean} [loose=false] - Truthy: lone surrogates become U+FFFD. Falsy: they throw.
   * @returns {Uint16Array} `arr`.
   * @throws {TypeError} On a lone surrogate when `loose` is falsy.
   */
  function encodeUnchecked(str, arr, loose = false) {
    // TODO: faster path for Hermes? See encodeCharcodes
    const total = str.length
    for (let at = 0; at < total; at++) {
      const unit = str.charCodeAt(at)
      arr[at] = unit
      if (unit < 0xd8_00 || unit >= 0xe0_00) continue // not a surrogate

      // Only a lead that is not the last unit has a follower worth looking at
      const follower = unit <= 0xdb_ff && at + 1 < total ? str.charCodeAt(at + 1) : 0
      if (follower >= 0xdc_00 && follower < 0xe0_00) {
        at++ // consume the trail of a valid pair immediately
        arr[at] = follower
        continue
      }

      // An unexpected trail, a lead at the very end of input, or a lead without a trail after it
      if (!loose) throw new TypeError(E_STRICT_UNICODE)
      arr[at] = replacementCodepoint
    }
    return arr
  }
  /**
   * Copies the code units of `str` into `arr` with the two bytes of every unit exchanged,
   * validating surrogate pairing on the way.
   *
   * @param {string} str - String to encode.
   * @param {Uint16Array} arr - Destination with at least `str.length` elements; filled in place.
   * @param {boolean} [loose=false] - Truthy: lone surrogates become a byte-swapped U+FFFD. Falsy: they throw.
   * @returns {Uint16Array} `arr`.
   * @throws {TypeError} On a lone surrogate when `loose` is falsy.
   */
  function encodeUncheckedSwapped(str, arr, loose = false) {
    // TODO: faster path for Hermes? See encodeCharcodes
    const total = str.length
    for (let at = 0; at < total; at++) {
      const unit = str.charCodeAt(at)
      arr[at] = ((unit & 0xff) << 8) | (unit >> 8)
      if (unit < 0xd8_00 || unit >= 0xe0_00) continue // not a surrogate

      // Only a lead that is not the last unit has a follower worth looking at
      const follower = unit <= 0xdb_ff && at + 1 < total ? str.charCodeAt(at + 1) : 0
      if (follower >= 0xdc_00 && follower < 0xe0_00) {
        at++ // consume the trail of a valid pair immediately
        arr[at] = ((follower & 0xff) << 8) | (follower >> 8)
        continue
      }

      // An unexpected trail, a lead at the very end of input, or a lead without a trail after it
      if (!loose) throw new TypeError(E_STRICT_UNICODE)
      arr[at] = replacementCodepointSwapped
    }
    return arr
  }
  // Only needed on Hermes, everything else has native impl
  /**
   * Replaces every lone surrogate in `u16` with U+FFFD, in place.
   *
   * @param {Uint16Array} u16 - UTF-16 code units; modified in place.
   * @returns {Uint16Array} The same `u16`.
   */
  export function toWellFormed(u16) {
    const total = u16.length
    let at = 0
    while (at < total) {
      const unit = u16[at]
      if (unit < 0xd8_00 || unit >= 0xe0_00) {
        at++ // not a surrogate
        continue
      }

      // Only a lead that is not the last unit has a follower worth looking at
      const follower = unit <= 0xdb_ff && at + 1 < total ? u16[at + 1] : 0
      if (follower >= 0xdc_00 && follower < 0xe0_00) {
        at += 2 // valid pair, step over both halves
        continue
      }

      // An unexpected trail, a lead at the very end of input, or a lead without a trail after it
      u16[at] = replacementCodepoint
      at++
    }
    return u16
  }
  // Only needed on Hermes, everything else has native impl
  /**
   * Checks that every surrogate in `u16` belongs to a lead (d800 - dbff) + trail (dc00 - dfff) pair.
   *
   * @param {Uint16Array} u16 - UTF-16 code units; not modified.
   * @returns {boolean} true when there are no lone surrogates.
   */
  export function isWellFormed(u16) {
    const total = u16.length
    const highBits = 0x80_00_80_00 // top bit of both 16-bit halves of a 32-bit word
    const surrogateMin = 0xd8_00
    const surrogateEnd = 0xe0_00
    const trailMin = 0xdc_00
    let pos = 0

    // Speedup with u32: skip words in which no unit has its top bit set (every surrogate has it set)
    // Only implemented for aligned input for now, but almost all input is aligned (pooled Buffer or 0 offset)
    if (total > 32 && u16.byteOffset % 4 === 0) {
      const wordCount = (u16.byteLength / 4) | 0
      const words = new Uint32Array(u16.buffer, u16.byteOffset, wordCount)
      const wordQuadEnd = wordCount - 3
      for (;;) {
        // loop is fast enough for keeping this check in the body to be _very_ useful, likely due to array access checks
        if (pos >= wordQuadEnd) break
        const w0 = words[pos]
        const w1 = words[pos + 1]
        const w2 = words[pos + 2]
        const w3 = words[pos + 3]
        if (w0 & highBits || w1 & highBits || w2 & highBits || w3 & highBits) break // bitwise OR does not make this faster on Hermes
        pos += 4
      }
      // Narrow down to the exact word, then convert the word index into a unit index
      while (pos < wordCount && (words[pos] & highBits) === 0) pos++
      pos *= 2
    }

    // An extra loop gives ~30-40% speedup e.g. on English text without surrogates but with other symbols above 0x80_00
    const unitQuadEnd = total - 3
    for (;;) {
      if (pos >= unitQuadEnd) break
      const u0 = u16[pos]
      const u1 = u16[pos + 1]
      const u2 = u16[pos + 2]
      const u3 = u16[pos + 3]
      const found =
        (u0 >= surrogateMin && u0 < surrogateEnd) ||
        (u1 >= surrogateMin && u1 < surrogateEnd) ||
        (u2 >= surrogateMin && u2 < surrogateEnd) ||
        (u3 >= surrogateMin && u3 < surrogateEnd)
      if (found) break
      pos += 4
    }

    // Everything before pos is surrogate-free; validate pairing from here on
    while (pos < total) {
      const unit = u16[pos++]
      if (unit < surrogateMin || unit >= surrogateEnd) continue
      // An unexpected trail or a lead at the very end of input
      if (unit >= trailMin || pos >= total) return false
      const follower = u16[pos++] // the unit after a lead must be a trail
      if (follower < trailMin || follower >= surrogateEnd) return false
    }
    return true
  }