encoding.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. // We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
  2. // We are also faster than Node.js built-in on both TextEncoder and TextDecoder
  3. import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
  4. import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
  5. import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
  6. import labels from './encoding.labels.js'
  7. import { fromSource, getBOMEncoding } from './encoding.api.js'
  8. import { unfinishedBytes, mergePrefix } from './encoding.util.js'
  9. export { getBOMEncoding } from './encoding.api.js'
  // Error messages
  export const E_ENCODING = 'Unknown encoding'
  const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
  const E_OPTIONS = 'The "options" argument must be of type object'

  // U+FFFD, emitted in place of malformed input in non-fatal mode
  const replacementChar = '\uFFFD'

  // Encodings that are served by the optional multi-byte codec (see setMultibyte)
  // prettier-ignore
  const multibyteSet = new Set([
    'big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030',
  ])

  // Multi-byte codec factories, undefined until setMultibyte() is called
  let createMultibyteDecoder
  let multibyteEncoder

  // alias -> name lookup, built lazily by normalizeEncoding()
  let labelsMap
  17. // Warning: unlike whatwg-encoding, returns lowercased labels
  18. // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
  19. // https://encoding.spec.whatwg.org/#names-and-labels
  /**
   * Resolve an encoding label to the lowercased encoding name used as a key of `labels`.
   *
   * @param {*} label - Label to resolve; non-strings are stringified.
   * @returns {string|null} Lowercased encoding name, or null when the label is not recognized.
   */
  export function normalizeEncoding(label) {
    // Hot labels are answered by exact match, without touching the table
    switch (label) {
      case 'utf-8':
      case 'utf8':
      case 'UTF-8':
      case 'UTF8':
        return 'utf-8'
      case 'windows-1252':
      case 'ascii':
      case 'latin1':
        return 'windows-1252'
    }

    // Labels must be ASCII; only ASCII whitespace may surround them
    if (/[^\w\t\n\f\r .:-]/i.test(label)) return null

    const key = `${label}`.trim().toLowerCase()
    if (Object.hasOwn(labels, key)) return key // already an encoding name

    // The alias -> name table is built once, on the first lookup that needs it
    if (!labelsMap) {
      labelsMap = new Map()
      Object.entries(labels).forEach(([name, aliases]) => {
        for (const alias of aliases) labelsMap.set(alias, name)
      })
    }

    return labelsMap.get(key) || null
  }
  // Encoding names starting with one of these 3-letter prefixes are spelled fully uppercase
  const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])

  // Unlike normalizeEncoding, case-sensitive
  // https://encoding.spec.whatwg.org/#names-and-labels
  /**
   * Resolve a label to the cased encoding name (e.g. 'UTF-8', 'Big5', 'windows-1252').
   *
   * @param {*} label - Label accepted by normalizeEncoding.
   * @returns {string|null} Cased encoding name, or the falsy result of normalizeEncoding as-is.
   */
  export function labelToName(label) {
    const key = normalizeEncoding(label)
    if (key === 'utf-8') return 'UTF-8' // fast path
    if (!key) return key

    // The two mixed-case names
    switch (key) {
      case 'big5':
        return 'Big5'
      case 'shift_jis':
        return 'Shift_JIS'
    }

    const prefix = key.slice(0, 3)
    return uppercasePrefixes.has(prefix) ? key.toUpperCase() : key
  }
  /**
   * Tell whether an encoding name is listed in multibyteSet.
   * The name is compared as-is, so it is expected to be already normalized.
   *
   * @param {string} enc - Encoding name.
   * @returns {boolean}
   */
  export const isMultibyte = (enc) => {
    return multibyteSet.has(enc)
  }
  /**
   * Install the multi-byte codec factories used by this module.
   *
   * @param {Function} createDecoder - Stored as createMultibyteDecoder; called as (encoding, loose).
   * @param {Function} createEncoder - Stored as multibyteEncoder; handed out by getMultibyteEncoder().
   */
  export function setMultibyte(createDecoder, createEncoder) {
    multibyteEncoder = createEncoder
    createMultibyteDecoder = createDecoder
  }
  /**
   * Return the multi-byte encoder factory installed through setMultibyte().
   *
   * @returns {Function} The installed factory.
   * @throws {Error} When nothing has been installed yet.
   */
  export function getMultibyteEncoder() {
    if (multibyteEncoder) return multibyteEncoder
    throw new Error(E_MULTI)
  }
  /**
   * Define a read-only data property on `obj` (non-enumerable and non-configurable by default).
   *
   * @returns {object} `obj`, as returned by Object.defineProperty.
   */
  const define = (obj, key, value) => {
    const descriptor = { value, writable: false }
    return Object.defineProperty(obj, key, descriptor)
  }
  /**
   * Check for a Uint8Array, including instances that fail `instanceof`
   * (recognized through their `[object Uint8Array]` tag).
   *
   * @param {*} x - Value to test.
   * @returns {boolean}
   */
  function isAnyUint8Array(x) {
    if (x instanceof Uint8Array) return true
    // Cheap rejections first: must be a truthy view over single-byte elements
    const isByteView = Boolean(x) && ArrayBuffer.isView(x) && x.BYTES_PER_ELEMENT === 1
    return isByteView && Object.prototype.toString.call(x) === '[object Uint8Array]'
  }
  /**
   * Pick the bytes -> string function for a Unicode encoding.
   *
   * @param {string} encoding - 'utf-8', 'utf-16le', or anything else (treated as utf-16be).
   * @param {boolean} loose - Select the "Loose" variant of the decoder instead of the strict one.
   * @returns {Function} Function taking the bytes to decode.
   */
  function unicodeDecoder(encoding, loose) {
    // likely
    if (encoding === 'utf-8') {
      if (loose) return utf8toStringLoose
      return utf8toString
    }

    const layout = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
    if (loose) return (bytes) => utf16toStringLoose(bytes, layout)
    return (bytes) => utf16toString(bytes, layout)
  }
  /**
   * TextDecoder implementation following the WHATWG Encoding API shape.
   *
   * Exposes read-only `encoding`, `fatal` and `ignoreBOM` properties and a `decode()` method
   * with streaming support. Unicode encodings (utf-8, utf-16le, utf-16be) are handled here,
   * including BOM stripping and carrying unfinished trailing bytes between streaming calls.
   * Multi-byte and single-byte encodings are delegated to their decoder factories.
   */
  export class TextDecoder {
    #decode // decoding function, created lazily on the first decode() call that needs it
    #unicode // true for utf-8 / utf-16le / utf-16be
    #multibyte // true for non-Unicode encodings listed by isMultibyte()
    #chunk // copy of the unfinished trailing bytes from the previous streaming call, or null/undefined
    #canBOM // whether a leading BOM should still be looked for (and stripped) on the next decode

    /**
     * @param {*} [encoding='utf-8'] - Encoding label, resolved with normalizeEncoding.
     * @param {object|null} [options] - `{ fatal, ignoreBOM }`; null is treated as an empty dictionary.
     * @throws {TypeError} When `options` is not an object (or null).
     * @throws {RangeError} When the label is unknown or resolves to 'replacement'.
     */
    constructor(encoding = 'utf-8', options = {}) {
      if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
      const enc = normalizeEncoding(encoding)
      if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
      define(this, 'encoding', enc)
      // `typeof null === 'object'`, so null gets here; read members null-safely (empty dictionary)
      define(this, 'fatal', !!options?.fatal)
      define(this, 'ignoreBOM', !!options?.ignoreBOM)
      this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
      this.#multibyte = !this.#unicode && isMultibyte(enc)
      this.#canBOM = this.#unicode && !this.ignoreBOM
    }

    get [Symbol.toStringTag]() {
      return 'TextDecoder'
    }

    /**
     * Decode bytes into a string.
     *
     * @param {*} [input] - Bytes to decode (converted with fromSource); undefined means no bytes.
     * @param {object|null} [options] - `{ stream }`; null is treated as an empty dictionary.
     * @returns {string} Decoded text.
     * @throws {TypeError} When `options` is not an object (or null).
     * @throws {Error} When a multi-byte encoding is used without the multi-byte codec installed.
     *   Errors thrown by the underlying decoder are rethrown as-is.
     */
    decode(input, options = {}) {
      if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
      const stream = !!options?.stream // null-safe, see constructor
      let u = input === undefined ? new Uint8Array() : fromSource(input)
      const empty = u.length === 0 // also can't be streaming after next line
      if (empty && stream) return '' // no state change
      if (this.#unicode) {
        let prefix
        if (this.#chunk) {
          // Join the bytes left over from the previous streaming call with the start of this input
          const merged = mergePrefix(u, this.#chunk, this.encoding)
          if (u.length < 3) {
            u = merged // might be unfinished, but fully consumed old u
          } else {
            prefix = merged // stops at complete chunk
            const add = prefix.length - this.#chunk.length
            if (add > 0) u = u.subarray(add) // skip the input bytes that were moved into prefix
          }

          this.#chunk = null
        } else if (empty) {
          this.#canBOM = !this.ignoreBOM // not streaming
          return ''
        }

        // For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
        // For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
        let suffix = ''
        if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
          const trail = unfinishedBytes(u, u.byteLength, this.encoding)
          if (trail > 0) {
            if (stream) {
              this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
            } else {
              // non-fatal mode as already checked
              suffix = replacementChar
            }

            u = u.subarray(0, -trail)
          }
        }

        let seenBOM = false
        if (this.#canBOM) {
          // When a prefix exists, it holds the first bytes to be decoded
          const bom = this.#findBom(prefix ?? u)
          if (bom) {
            seenBOM = true
            if (prefix) {
              prefix = prefix.subarray(bom)
            } else {
              u = u.subarray(bom)
            }
          }
        } else if (!stream && !this.ignoreBOM) {
          this.#canBOM = true // this call ends the stream, so the next one starts fresh
        }

        if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
        try {
          const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
          // "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
          if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
          return res
        } catch (err) {
          this.#chunk = null // reset unfinished chunk on errors
          // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
          // See also multi-byte.js
          throw err
        }
        // eslint-disable-next-line no-else-return
      } else if (this.#multibyte) {
        if (!createMultibyteDecoder) throw new Error(E_MULTI)
        if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
        return this.#decode(u, stream)
      } else {
        if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
        return this.#decode(u)
      }
    }

    /**
     * Length in bytes of the BOM of this.encoding at the start of `u`, or 0 when absent.
     * Only called for Unicode encodings.
     */
    #findBom(u) {
      switch (this.encoding) {
        case 'utf-8':
          return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
        case 'utf-16le':
          return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
        case 'utf-16be':
          return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
      }

      /* c8 ignore next */
      throw new Error('Unreachable')
    }
  }
  /**
   * TextEncoder implementation following the WHATWG Encoding API shape: UTF-8 only,
   * with `encode()` and `encodeInto()`.
   */
  export class TextEncoder {
    constructor() {
      define(this, 'encoding', 'utf-8')
    }

    get [Symbol.toStringTag]() {
      return 'TextEncoder'
    }

    /**
     * Encode a string as UTF-8.
     *
     * @param {*} [str=''] - Value to encode; non-strings are stringified.
     * @returns {Uint8Array} Bytes that span their whole underlying buffer.
     */
    encode(str = '') {
      if (typeof str !== 'string') str = `${str}`
      const res = utf8fromStringLoose(str)
      // match new Uint8Array (per spec), which is non-pooled
      return res.byteOffset === 0 && res.length === res.buffer.byteLength ? res : res.slice(0)
    }

    /**
     * Encode as much of `str` as fits into `target`, never writing a partial UTF-8 sequence.
     *
     * @param {*} str - Value to encode; non-strings are stringified.
     * @param {Uint8Array} target - Destination bytes.
     * @returns {{ read: number, written: number }} UTF-16 code units consumed and bytes written.
     * @throws {TypeError} When `target` is not a Uint8Array.
     */
    encodeInto(str, target) {
      if (typeof str !== 'string') str = `${str}`
      if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
      if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
      const tlen = target.length
      // Every code unit encodes to at least one byte, so code units past tlen can never fit
      if (tlen < str.length) str = str.slice(0, tlen)
      let u8 = utf8fromStringLoose(str)
      let read
      if (tlen >= u8.length) {
        read = str.length
      } else {
        // Here u8.length > tlen >= str.length, so the input can't be pure ASCII:
        // cut at tlen, then drop the sequence that the cut might have split
        u8 = u8.subarray(0, tlen)
        const unfinished = unfinishedBytes(u8, u8.length, 'utf-8')
        if (unfinished > 0) u8 = u8.subarray(0, u8.length - unfinished)

        // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
        // Each unpaired surrogate (1 charcode) is replaced with a single charcode
        read = utf8toStringLoose(u8).length // FIXME: Converting back is very inefficient
      }

      try {
        target.set(u8)
      } catch {
        return { read: 0, written: 0 } // see above, likely detached but no .detached property support
      }

      return { read, written: u8.length }
    }
  }
  const E_NO_STREAMS = 'TransformStream global not present in the environment'

  // https://encoding.spec.whatwg.org/#interface-textdecoderstream
  /**
   * Transform stream that decodes byte chunks into strings with a TextDecoder in streaming mode.
   * Exposes read-only `encoding`, `fatal`, `ignoreBOM`, `readable` and `writable` properties.
   * The constructor throws an Error when the TransformStream global is missing.
   */
  export class TextDecoderStream {
    constructor(encoding = 'utf-8', options = {}) {
      if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
      const decoder = new TextDecoder(encoding, options)

      // Empty strings are never enqueued
      const emit = (controller, text) => {
        if (text) controller.enqueue(text)
      }

      const { readable, writable } = new TransformStream({
        transform(chunk, controller) {
          emit(controller, decoder.decode(fromSource(chunk), { stream: true }))
        },
        // https://streams.spec.whatwg.org/#dom-transformer-flush
        flush(controller) {
          // Non-streaming call with no input finishes the decoder
          emit(controller, decoder.decode())
          // No need to call .terminate() (Node.js is wrong)
        },
      })

      for (const key of ['encoding', 'fatal', 'ignoreBOM']) define(this, key, decoder[key])
      define(this, 'readable', readable)
      define(this, 'writable', writable)
    }

    get [Symbol.toStringTag]() {
      return 'TextDecoderStream'
    }
  }
  246. // https://encoding.spec.whatwg.org/#interface-textencoderstream
  247. // Only UTF-8 per spec
  /**
   * Transform stream that encodes string chunks into UTF-8 byte chunks.
   * A lead surrogate at the end of a chunk is held back so that it can pair with the start
   * of the next chunk; if the stream ends first, it is flushed as U+FFFD (EF BF BD).
   * Exposes read-only `encoding`, `readable` and `writable` properties.
   * The constructor throws an Error when the TransformStream global is missing.
   */
  export class TextEncoderStream {
    constructor() {
      if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
      let lead // lead surrogate held back from the previous chunk, if any
      const transform = new TransformStream({
        // https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
        // Not identical in code, but reuses loose mode to have identical behavior
        transform: (chunk, controller) => {
          // DOMString conversion, might contain unpaired surrogates
          // Template literal rather than String(): Symbol chunks must throw a TypeError
          // instead of being encoded as "Symbol(...)", same as in TextEncoder encode / encodeInto
          let s = `${chunk}`
          if (s.length === 0) return
          if (lead) {
            s = lead + s
            lead = null
          }

          const last = s.charCodeAt(s.length - 1) // Can't come from previous lead due to length check
          if ((last & 0xfc_00) === 0xd8_00) {
            // Trailing lead surrogate: keep it for the next chunk
            lead = s[s.length - 1]
            s = s.slice(0, -1)
          }

          if (s) controller.enqueue(utf8fromStringLoose(s))
        },
        // https://encoding.spec.whatwg.org/#encode-and-flush
        flush: (controller) => {
          if (lead) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
        },
      })
      define(this, 'encoding', 'utf-8')
      define(this, 'readable', transform.readable)
      define(this, 'writable', transform.writable)
    }

    get [Symbol.toStringTag]() {
      return 'TextEncoderStream'
    }
  }
  282. // https://encoding.spec.whatwg.org/#decode
  283. // Warning: encoding sniffed from BOM takes preference over the supplied one
  284. // Warning: lossy, performs replacement, no option of throwing
  285. // Completely ignores encoding and even skips validation when BOM is found
  286. // Unlike TextDecoder public API, additionally supports 'replacement' encoding
  /**
   * Decode bytes to a string, letting a BOM override the supplied encoding. Always lossy.
   *
   * @param {*} input - Bytes to decode (converted with fromSource).
   * @param {*} [fallbackEncoding='utf-8'] - Label used only when no BOM is found.
   * @returns {string} Decoded text.
   * @throws {RangeError} When no BOM is found and the fallback label is not a known encoding.
   * @throws {Error} When a multi-byte encoding is needed but the multi-byte codec is not installed.
   */
  export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
    let bytes = fromSource(input)
    const sniffed = getBOMEncoding(bytes)
    if (sniffed) bytes = bytes.subarray(sniffed === 'utf-8' ? 3 : 2) // drop the BOM itself
    const enc = sniffed ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"

    switch (enc) {
      case 'utf-8':
        return utf8toStringLoose(bytes)
      case 'utf-16le':
      case 'utf-16be': {
        const form = enc === 'utf-16le' ? 'uint8-le' : 'uint8-be'
        if (bytes.byteLength % 2 === 0) return utf16toStringLoose(bytes, form)
        // Odd length: decode without the unfinished tail and mark it with a single replacement char
        const whole = bytes.subarray(0, -unfinishedBytes(bytes, bytes.byteLength, enc))
        return utf16toStringLoose(whole, form) + replacementChar
      }
    }

    if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)

    if (isMultibyte(enc)) {
      if (!createMultibyteDecoder) throw new Error(E_MULTI)
      const decodeMultibyte = createMultibyteDecoder(enc, true)
      return decodeMultibyte(bytes)
    }

    // https://encoding.spec.whatwg.org/#replacement-decoder
    // On non-streaming non-fatal case, it just replaces any non-empty input with a single replacement char
    if (enc === 'replacement') return input.byteLength > 0 ? replacementChar : ''

    const decodeSinglebyte = createSinglebyteDecoder(enc, true)
    return decodeSinglebyte(bytes)
  }