// utf8.js (8.4 KB)

  1. import { E_STRICT_UNICODE } from './_utils.js'
  2. import { isHermes } from './platform.js'
  3. import { asciiPrefix, decodeLatin1, encodeAsciiPrefix } from './latin1.js'
  // Message carried by the TypeError that strict decoding throws on malformed input
  export const E_STRICT = 'Input is not well-formed utf8'

  // U+FFFD, the char code the loose decoder writes in place of a malformed sequence
  const replacementPoint = 0xfffd

  // The escape() + decodeURIComponent() route is faster only on Hermes; the JS path beats it on normal engines
  const shouldUseEscapePath = isHermes

  // Module-local references to the two globals; decodeFast() checks both for presence before using them
  const decodeURIComponent = globalThis.decodeURIComponent
  const escape = globalThis.escape
  // Decodes UTF-8 bytes in `arr` to a string.
  // The leading ASCII run is decoded through decodeLatin1(); the remainder goes either through
  // escape() + decodeURIComponent() (when shouldUseEscapePath is set and both globals exist)
  // or through the manual decode() implementation.
  // With a falsy `loose`, input rejected by decodeURIComponent() throws TypeError(E_STRICT);
  // with a truthy `loose`, such input is handed to decode() instead.
  export function decodeFast(arr, loose) {
    // ASCII prefix first: this beats every alternative below (there is no native decoder to lean on here)
    const head = decodeLatin1(arr, 0, asciiPrefix(arr))
    const consumed = head.length
    const total = arr.length
    if (consumed === total) return head

    // ~3x perf boost on Hermes
    if (shouldUseEscapePath && escape && decodeURIComponent) {
      // Percent-escape the bytes (viewed as Latin1), then let decodeURIComponent read them back as UTF-8
      const escaped = escape(decodeLatin1(arr, consumed, total))
      let whole
      try {
        whole = head + decodeURIComponent(escaped)
      } catch {
        if (!loose) throw new TypeError(E_STRICT)
        // loose mode: the manual decoder below takes over
      }

      if (whole !== undefined) return whole
    }

    return head + decode(arr, loose, consumed)
  }
  // Manual UTF-8 decoder, see https://encoding.spec.whatwg.org/#utf-8-decoder
  // Decodes arr[start..] into a string of UTF-16 char codes.
  // With a falsy `loose`, any malformed or truncated sequence throws TypeError(E_STRICT);
  // with a truthy `loose`, each one is replaced by replacementPoint and scanning resumes
  // at the byte that was rejected.
  // decodeFast() calls this for the non-ASCII remainder; when its escape/decodeURIComponent path
  // is active, only loose-mode input which that path rejected ends up here.
  export function decode(arr, loose, start = 0) {
    start |= 0
    const end = arr.length
    const chunkSize = 0x2_00 // far below MAX_ARGUMENTS_LENGTH in npmjs.com/buffer, smaller chunks are used on purpose
    // One slot more than chunkSize: the last code point of a chunk may need two char codes
    const units = new Array(Math.min(end - start, chunkSize + 1)).fill(0)
    let text = ''
    let count = 0

    scan: for (let pos = start; pos < end; pos++) {
      if (count >= chunkSize) {
        units.length = count // may exceed chunkSize by 1 when the chunk ended with a surrogate pair
        text += String.fromCharCode.apply(String, units)
        if (units.length <= chunkSize) units.push(0) // bring back the spare slot
        count = 0
      }

      const lead = arr[pos]
      if (lead < 0x80) {
        // The ASCII fast path lives in decodeFast(); here ASCII is just copied one byte at a time
        units[count++] = lead
        continue
      }

      // Classify the lead byte: how many continuation bytes follow, the payload bits it carries,
      // and the allowed range for the first continuation byte
      let pending = 0
      let codePoint = 0
      let lower = 0x80
      let upper = 0xbf
      if (lead >= 0xc2) {
        if (lead < 0xe0) {
          pending = 1
          codePoint = lead & 0x1f
        } else if (lead < 0xf0) {
          pending = 2
          codePoint = lead & 0xf
          if (lead === 0xe0) lower = 0xa0
          if (lead === 0xed) upper = 0x9f
        } else if (lead <= 0xf4) {
          pending = 3
          codePoint = lead & 0xf
          if (lead === 0xf0) lower = 0x90
          if (lead === 0xf4) upper = 0x8f
        }
      }

      if (pending === 0) {
        // Not usable as a lead byte: 0x80..0xc1, or anything above 0xf4
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        continue
      }

      for (; pending > 0; pending--) {
        const nextPos = pos + 1
        if (nextPos >= end) {
          // Input ends in the middle of a sequence
          if (!loose) throw new TypeError(E_STRICT)
          units[count++] = replacementPoint
          break scan
        }

        const next = arr[nextPos]
        if (next < lower || next > upper) {
          // Bad continuation: it is not consumed, the outer loop examines it again as a lead byte
          if (!loose) throw new TypeError(E_STRICT)
          units[count++] = replacementPoint
          continue scan
        }

        pos = nextPos
        codePoint = (codePoint << 6) | (next & 0x3f)
        // Only the first continuation byte has a narrowed range
        lower = 0x80
        upper = 0xbf
      }

      if (codePoint > 0xff_ff) {
        // Emit a surrogate pair, String.fromCharCode is faster than String.fromCodePoint
        const offset = codePoint - 0x1_00_00
        units[count++] = 0xd8_00 + ((offset >> 10) & 0x3_ff)
        units[count++] = 0xdc_00 + (offset & 0x3_ff)
      } else {
        units[count++] = codePoint
      }
    }

    if (count === 0) return text
    units.length = count
    return text + String.fromCharCode.apply(String, units)
  }
  // Encodes `string` (a sequence of UTF-16 char codes) into UTF-8 and returns a Uint8Array.
  // Unpaired surrogates throw TypeError(E_STRICT_UNICODE) when `loose` is falsy and are written
  // as EF BF BD (U+FFFD) when `loose` is truthy.
  export function encode(string, loose) {
    const length = string.length
    let bytes = new Uint8Array(length) // optimistic: sized as if everything were ASCII
    let grown = false
    let i = encodeAsciiPrefix(bytes, string)
    let p = i
    for (; i < length; i++) {
      let code = string.charCodeAt(i)
      if (code < 0x80) {
        bytes[p++] = code
        // ASCII run, unrolled four times for speed
        for (;;) {
          if (++i >= length) break
          code = string.charCodeAt(i)
          if (code >= 0x80) break
          bytes[p++] = code

          if (++i >= length) break
          code = string.charCodeAt(i)
          if (code >= 0x80) break
          bytes[p++] = code

          if (++i >= length) break
          code = string.charCodeAt(i)
          if (code >= 0x80) break
          bytes[p++] = code

          if (++i >= length) break
          code = string.charCodeAt(i)
          if (code >= 0x80) break
          bytes[p++] = code
        }

        if (i >= length) break
        // from here on, `code` holds a char code >= 0x80
      }

      if (!grown) {
        // TODO: use resizable array buffers? will have to return a non-resizeable one
        // Up to this point only ASCII was written, so the read and write positions still coincide
        if (p !== i) /* c8 ignore next */ throw new Error('Unreachable')
        // Worst case for the rest of the input is 3 bytes per char code
        const widened = new Uint8Array(p + (length - i) * 3)
        widened.set(bytes)
        bytes = widened
        grown = true
      }

      // Surrogates: lead is d800..dbff, trail is dc00..dfff,
      // codePoint = 0x1_00_00 + ((lead - d800) << 10 | (trail - dc00))
      if (code >= 0xd8_00 && code < 0xe0_00) {
        // Only a lead that is not the final char code can start a pair; valid trails are consumed with their lead
        const mayPair = code <= 0xdb_ff && i + 1 < length
        const follower = mayPair ? string.charCodeAt(i + 1) : -1
        if (follower >= 0xdc_00 && follower < 0xe0_00) {
          // Always within 0x1_00_00..0x10_ff_ff, hence 4 bytes
          const codePoint = (((code - 0xd8_00) << 10) | (follower - 0xdc_00)) + 0x1_00_00
          bytes[p++] = (codePoint >> 18) | 0xf0
          bytes[p++] = ((codePoint >> 12) & 0x3f) | 0x80
          bytes[p++] = ((codePoint >> 6) & 0x3f) | 0x80
          bytes[p++] = (codePoint & 0x3f) | 0x80
          i++ // the trail has been consumed as well
        } else {
          // Stray trail, lead at the very end, or lead followed by a non-trail (which stays unconsumed)
          if (!loose) throw new TypeError(E_STRICT_UNICODE)
          bytes[p++] = 0xef
          bytes[p++] = 0xbf
          bytes[p++] = 0xbd
        }

        continue
      }

      // Remaining case: a non-surrogate char code above ASCII, 2 or 3 bytes
      if (code < 0x8_00) {
        bytes[p++] = (code >> 6) | 0xc0
        bytes[p++] = (code & 0x3f) | 0x80
      } else {
        bytes[p++] = (code >> 12) | 0xe0
        bytes[p++] = ((code >> 6) & 0x3f) | 0x80
        bytes[p++] = (code & 0x3f) | 0x80
      }
    }

    if (p === bytes.length) return bytes
    return bytes.slice(0, p)
  }