multi-byte.js 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962
  1. import { E_STRING } from './_utils.js'
  2. import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2, encodeAscii } from './latin1.js'
  3. import { getTable } from './multi-byte.table.js'
  /** Message of the TypeError thrown when strict decoding or encoding meets input it cannot handle. */
  export const E_STRICT = 'Input is not well-formed for this encoding'
  5. /* Decoders */
  6. // If the decoder is not cleared properly, state can be preserved between non-streaming calls!
  7. // See comment about fatal stream
  8. // All except iso-2022-jp are ASCII supersets
  // When adding something that is not an ASCII superset, adjust the ASCII fast path
  10. const mappers = {
  11. // https://encoding.spec.whatwg.org/#euc-kr-decoder
  12. 'euc-kr': (err) => {
  13. const euc = getTable('euc-kr')
  14. let lead = 0
  15. let oi = 0
  16. let o16
  17. const decodeLead = (b) => {
  18. if (b < 0x41 || b > 0xfe) {
  19. lead = 0
  20. o16[oi++] = err()
  21. if (b < 128) o16[oi++] = b
  22. } else {
  23. const p = euc[(lead - 0x81) * 190 + b - 0x41]
  24. lead = 0
  25. if (p) {
  26. o16[oi++] = p
  27. } else {
  28. o16[oi++] = err()
  29. if (b < 128) o16[oi++] = b
  30. }
  31. }
  32. }
  33. const decode = (arr, start, end, stream) => {
  34. let i = start
  35. o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
  36. oi = 0
  37. // Fast path
  38. if (!lead) {
  39. for (const last1 = end - 1; i < last1; ) {
  40. const l = arr[i]
  41. if (l < 128) {
  42. o16[oi++] = l
  43. i++
  44. } else {
  45. if (l === 0x80 || l === 0xff) break
  46. const b = arr[i + 1]
  47. if (b < 0x41 || b === 0xff) break
  48. const p = euc[(l - 0x81) * 190 + b - 0x41]
  49. if (!p) break
  50. o16[oi++] = p
  51. i += 2
  52. }
  53. }
  54. }
  55. if (lead && i < end) decodeLead(arr[i++])
  56. while (i < end) {
  57. const b = arr[i++]
  58. if (b < 128) {
  59. o16[oi++] = b
  60. } else if (b === 0x80 || b === 0xff) {
  61. o16[oi++] = err()
  62. } else {
  63. lead = b
  64. if (i < end) decodeLead(arr[i++])
  65. }
  66. }
  67. if (lead && !stream) {
  68. lead = 0
  69. o16[oi++] = err()
  70. }
  71. const res = decodeUCS2(o16, oi)
  72. o16 = null
  73. return res
  74. }
  75. return { decode, isAscii: () => lead === 0 }
  76. },
  77. // https://encoding.spec.whatwg.org/#euc-jp-decoder
  78. 'euc-jp': (err) => {
  79. const jis0208 = getTable('jis0208')
  80. const jis0212 = getTable('jis0212')
  81. let j12 = false
  82. let lead = 0
  83. let oi = 0
  84. let o16
  85. const decodeLead = (b) => {
  86. if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
  87. lead = 0
  88. o16[oi++] = 0xfe_c0 + b
  89. } else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
  90. j12 = true
  91. lead = b
  92. } else {
  93. let cp
  94. if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
  95. cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
  96. }
  97. lead = 0
  98. j12 = false
  99. if (cp) {
  100. o16[oi++] = cp
  101. } else {
  102. o16[oi++] = err()
  103. if (b < 128) o16[oi++] = b
  104. }
  105. }
  106. }
  107. const decode = (arr, start, end, stream) => {
  108. let i = start
  109. o16 = new Uint16Array(end - start + (lead ? 1 : 0))
  110. oi = 0
  111. // Fast path, non-j12
  112. // lead = 0 means j12 = 0
  113. if (!lead) {
  114. for (const last1 = end - 1; i < last1; ) {
  115. const l = arr[i]
  116. if (l < 128) {
  117. o16[oi++] = l
  118. i++
  119. } else {
  120. const b = arr[i + 1]
  121. if (l === 0x8e && b >= 0xa1 && b <= 0xdf) {
  122. o16[oi++] = 0xfe_c0 + b
  123. i += 2
  124. } else {
  125. if (l < 0xa1 || l === 0xff || b < 0xa1 || b === 0xff) break
  126. const cp = jis0208[(l - 0xa1) * 94 + b - 0xa1]
  127. if (!cp) break
  128. o16[oi++] = cp
  129. i += 2
  130. }
  131. }
  132. }
  133. }
  134. if (lead && i < end) decodeLead(arr[i++])
  135. if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
  136. while (i < end) {
  137. const b = arr[i++]
  138. if (b < 128) {
  139. o16[oi++] = b
  140. } else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
  141. o16[oi++] = err()
  142. } else {
  143. lead = b
  144. if (i < end) decodeLead(arr[i++])
  145. if (lead && i < end) decodeLead(arr[i++]) // could be two leads
  146. }
  147. }
  148. if (lead && !stream) {
  149. lead = 0
  150. j12 = false // can be true only when lead is non-zero
  151. o16[oi++] = err()
  152. }
  153. const res = decodeUCS2(o16, oi)
  154. o16 = null
  155. return res
  156. }
  157. return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
  158. },
  159. // https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
  160. 'iso-2022-jp': (err) => {
  161. const jis0208 = getTable('jis0208')
  162. let dState = 1
  163. let oState = 1
  164. let lead = 0 // 0 or 0x21-0x7e
  165. let out = false
  166. const bytes = (pushback, b) => {
  167. if (dState < 5 && b === 0x1b) {
  168. dState = 6 // escape start
  169. return
  170. }
  171. switch (dState) {
  172. case 1:
  173. case 2:
  174. // ASCII, Roman (common)
  175. out = false
  176. if (dState === 2) {
  177. if (b === 0x5c) return 0xa5
  178. if (b === 0x7e) return 0x20_3e
  179. }
  180. if (b <= 0x7f && b !== 0x0e && b !== 0x0f) return b
  181. return err()
  182. case 3:
  183. // Katakana
  184. out = false
  185. if (b >= 0x21 && b <= 0x5f) return 0xff_40 + b
  186. return err()
  187. case 4:
  188. // Leading byte
  189. out = false
  190. if (b < 0x21 || b > 0x7e) return err()
  191. lead = b
  192. dState = 5
  193. return
  194. case 5:
  195. // Trailing byte
  196. out = false
  197. if (b === 0x1b) {
  198. dState = 6 // escape start
  199. return err()
  200. }
  201. dState = 4
  202. if (b >= 0x21 && b <= 0x7e) {
  203. const cp = jis0208[(lead - 0x21) * 94 + b - 0x21]
  204. if (cp) return cp
  205. }
  206. return err()
  207. case 6:
  208. // Escape start
  209. if (b === 0x24 || b === 0x28) {
  210. lead = b
  211. dState = 7
  212. return
  213. }
  214. out = false
  215. dState = oState
  216. pushback.push(b)
  217. return err()
  218. case 7: {
  219. // Escape
  220. const l = lead
  221. lead = 0
  222. let s
  223. if (l === 0x28) {
  224. // eslint-disable-next-line unicorn/prefer-switch
  225. if (b === 0x42) {
  226. s = 1
  227. } else if (b === 0x4a) {
  228. s = 2
  229. } else if (b === 0x49) {
  230. s = 3
  231. }
  232. } else if (l === 0x24 && (b === 0x40 || b === 0x42)) {
  233. s = 4
  234. }
  235. if (s) {
  236. dState = oState = s
  237. const output = out
  238. out = true
  239. return output ? err() : undefined
  240. }
  241. out = false
  242. dState = oState
  243. pushback.push(b, l)
  244. return err()
  245. }
  246. }
  247. }
  248. const eof = (pushback) => {
  249. if (dState < 5) return null
  250. out = false
  251. switch (dState) {
  252. case 5:
  253. dState = 4
  254. return err()
  255. case 6:
  256. dState = oState
  257. return err()
  258. case 7: {
  259. dState = oState
  260. pushback.push(lead)
  261. lead = 0
  262. return err()
  263. }
  264. }
  265. }
  266. const decode = (arr, start, end, stream) => {
  267. const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
  268. let oi = 0
  269. let i = start
  270. const pushback = [] // local and auto-cleared
  271. // First, dump everything until EOF
  272. // Same as the full loop, but without EOF handling
  273. while (i < end || pushback.length > 0) {
  274. const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
  275. if (c !== undefined) o16[oi++] = c // 16-bit
  276. }
  277. // Then, dump EOF. This needs the same loop as the characters can be pushed back
  278. if (!stream) {
  279. while (i <= end || pushback.length > 0) {
  280. if (i < end || pushback.length > 0) {
  281. const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
  282. if (c !== undefined) o16[oi++] = c // 16-bit
  283. } else {
  284. const c = eof(pushback)
  285. if (c === null) break // clean exit
  286. o16[oi++] = c
  287. }
  288. }
  289. }
  290. // Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
  291. // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
  292. // > Set this’s do not flush to options["stream"]
  293. if (!stream) {
  294. dState = oState = 1
  295. lead = 0
  296. out = false
  297. }
  298. return decodeUCS2(o16, oi)
  299. }
  300. return { decode, isAscii: () => false }
  301. },
  302. // https://encoding.spec.whatwg.org/#shift_jis-decoder
  303. shift_jis: (err) => {
  304. const jis0208 = getTable('jis0208')
  305. let lead = 0
  306. let oi = 0
  307. let o16
  308. const decodeLead = (b) => {
  309. const l = lead
  310. lead = 0
  311. if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
  312. const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
  313. if (p >= 8836 && p <= 10_715) {
  314. o16[oi++] = 0xe0_00 - 8836 + p
  315. return
  316. }
  317. const cp = jis0208[p]
  318. if (cp) {
  319. o16[oi++] = cp
  320. return
  321. }
  322. }
  323. o16[oi++] = err()
  324. if (b < 128) o16[oi++] = b
  325. }
  326. const decode = (arr, start, end, stream) => {
  327. o16 = new Uint16Array(end - start + (lead ? 1 : 0))
  328. oi = 0
  329. let i = start
  330. // Fast path
  331. if (!lead) {
  332. for (const last1 = end - 1; i < last1; ) {
  333. const l = arr[i]
  334. if (l <= 0x80) {
  335. o16[oi++] = l
  336. i++
  337. } else if (l >= 0xa1 && l <= 0xdf) {
  338. o16[oi++] = 0xfe_c0 + l
  339. i++
  340. } else {
  341. if (l === 0xa0 || l > 0xfc) break
  342. const b = arr[i + 1]
  343. if (b < 0x40 || b > 0xfc || b === 0x7f) break
  344. const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
  345. if (p >= 8836 && p <= 10_715) {
  346. o16[oi++] = 0xe0_00 - 8836 + p
  347. i += 2
  348. } else {
  349. const cp = jis0208[p]
  350. if (!cp) break
  351. o16[oi++] = cp
  352. i += 2
  353. }
  354. }
  355. }
  356. }
  357. if (lead && i < end) decodeLead(arr[i++])
  358. while (i < end) {
  359. const b = arr[i++]
  360. if (b <= 0x80) {
  361. o16[oi++] = b // 0x80 is allowed
  362. } else if (b >= 0xa1 && b <= 0xdf) {
  363. o16[oi++] = 0xfe_c0 + b
  364. } else if (b === 0xa0 || b > 0xfc) {
  365. o16[oi++] = err()
  366. } else {
  367. lead = b
  368. if (i < end) decodeLead(arr[i++])
  369. }
  370. }
  371. if (lead && !stream) {
  372. lead = 0
  373. o16[oi++] = err()
  374. }
  375. const res = decodeUCS2(o16, oi)
  376. o16 = null
  377. return res
  378. }
  379. return { decode, isAscii: () => lead === 0 }
  380. },
  381. // https://encoding.spec.whatwg.org/#gbk-decoder
  382. gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
  383. // https://encoding.spec.whatwg.org/#gb18030-decoder
  384. gb18030: (err) => {
  385. const gb18030 = getTable('gb18030')
  386. const gb18030r = getTable('gb18030-ranges')
  387. let g1 = 0, g2 = 0, g3 = 0 // prettier-ignore
  388. const index = (p) => {
  389. if ((p > 39_419 && p < 189_000) || p > 1_237_575) return
  390. if (p === 7457) return 0xe7_c7
  391. let a = 0, b = 0 // prettier-ignore
  392. for (const [c, d] of gb18030r) {
  393. if (c > p) break
  394. a = c
  395. b = d
  396. }
  397. return b + p - a
  398. }
  399. // g1 is 0 or 0x81-0xfe
  400. // g2 is 0 or 0x30-0x39
  401. // g3 is 0 or 0x81-0xfe
  402. const decode = (arr, start, end, stream) => {
  403. const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
  404. let oi = 0
  405. let i = start
  406. const pushback = [] // local and auto-cleared
  407. // Fast path for 2-byte only
  408. // pushback is always empty ad start, and g1 = 0 means g2 = g3 = 0
  409. if (g1 === 0) {
  410. for (const last1 = end - 1; i < last1; ) {
  411. const b = arr[i]
  412. if (b < 128) {
  413. o16[oi++] = b
  414. i++
  415. } else if (b === 0x80) {
  416. o16[oi++] = 0x20_ac
  417. i++
  418. } else {
  419. if (b === 0xff) break
  420. const n = arr[i + 1]
  421. let cp
  422. if (n < 0x7f) {
  423. if (n < 0x40) break
  424. cp = gb18030[(b - 0x81) * 190 + n - 0x40]
  425. } else {
  426. if (n === 0xff || n === 0x7f) break
  427. cp = gb18030[(b - 0x81) * 190 + n - 0x41]
  428. }
  429. if (!cp) break
  430. o16[oi++] = cp // 16-bit
  431. i += 2
  432. }
  433. }
  434. }
  435. // First, dump everything until EOF
  436. // Same as the full loop, but without EOF handling
  437. while (i < end || pushback.length > 0) {
  438. const b = pushback.length > 0 ? pushback.pop() : arr[i++]
  439. if (g1) {
  440. // g2 can be set only when g1 is set, g3 can be set only when g2 is set
  441. // hence, 3 checks for g3 is faster than 3 checks for g1
  442. if (g2) {
  443. if (g3) {
  444. if (b <= 0x39 && b >= 0x30) {
  445. const p = index(
  446. (g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
  447. )
  448. g1 = g2 = g3 = 0
  449. if (p === undefined) {
  450. o16[oi++] = err()
  451. } else if (p <= 0xff_ff) {
  452. o16[oi++] = p // Can validly return replacement
  453. } else {
  454. const d = p - 0x1_00_00
  455. o16[oi++] = 0xd8_00 | (d >> 10)
  456. o16[oi++] = 0xdc_00 | (d & 0x3_ff)
  457. }
  458. } else {
  459. pushback.push(b, g3, g2)
  460. g1 = g2 = g3 = 0
  461. o16[oi++] = err()
  462. }
  463. } else if (b >= 0x81 && b <= 0xfe) {
  464. g3 = b
  465. } else {
  466. pushback.push(b, g2)
  467. g1 = g2 = 0
  468. o16[oi++] = err()
  469. }
  470. } else if (b <= 0x39 && b >= 0x30) {
  471. g2 = b
  472. } else {
  473. let cp
  474. if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
  475. cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
  476. }
  477. g1 = 0
  478. if (cp) {
  479. o16[oi++] = cp // 16-bit
  480. } else {
  481. o16[oi++] = err()
  482. if (b < 128) o16[oi++] = b // can be processed immediately
  483. }
  484. }
  485. } else if (b < 128) {
  486. o16[oi++] = b
  487. } else if (b === 0x80) {
  488. o16[oi++] = 0x20_ac
  489. } else if (b === 0xff) {
  490. o16[oi++] = err()
  491. } else {
  492. g1 = b
  493. }
  494. }
  495. // if g1 = 0 then g2 = g3 = 0
  496. if (g1 && !stream) {
  497. g1 = g2 = g3 = 0
  498. o16[oi++] = err()
  499. }
  500. return decodeUCS2(o16, oi)
  501. }
  502. return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
  503. },
  504. // https://encoding.spec.whatwg.org/#big5
  505. big5: (err) => {
  506. // The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
  507. // We store that as strings
  508. const big5 = getTable('big5')
  509. let lead = 0
  510. let oi = 0
  511. let o16
  512. const decodeLead = (b) => {
  513. if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) {
  514. lead = 0
  515. o16[oi++] = err()
  516. if (b < 128) o16[oi++] = b
  517. } else {
  518. const p = big5[(lead - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
  519. lead = 0
  520. if (p > 0x1_00_00) {
  521. o16[oi++] = p >> 16
  522. o16[oi++] = p & 0xff_ff
  523. } else if (p) {
  524. o16[oi++] = p
  525. } else {
  526. o16[oi++] = err()
  527. if (b < 128) o16[oi++] = b
  528. }
  529. }
  530. }
  531. // eslint-disable-next-line sonarjs/no-identical-functions
  532. const decode = (arr, start, end, stream) => {
  533. let i = start
  534. o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
  535. oi = 0
  536. // Fast path
  537. if (!lead) {
  538. for (const last1 = end - 1; i < last1; ) {
  539. const l = arr[i]
  540. if (l < 128) {
  541. o16[oi++] = l
  542. i++
  543. } else {
  544. if (l === 0x80 || l === 0xff) break
  545. const b = arr[i + 1]
  546. if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) break
  547. const p = big5[(l - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
  548. if (p > 0x1_00_00) {
  549. o16[oi++] = p >> 16
  550. o16[oi++] = p & 0xff_ff
  551. } else {
  552. if (!p) break
  553. o16[oi++] = p
  554. }
  555. i += 2
  556. }
  557. }
  558. }
  559. if (lead && i < end) decodeLead(arr[i++])
  560. while (i < end) {
  561. const b = arr[i++]
  562. if (b < 128) {
  563. o16[oi++] = b
  564. } else if (b === 0x80 || b === 0xff) {
  565. o16[oi++] = err()
  566. } else {
  567. lead = b
  568. if (i < end) decodeLead(arr[i++])
  569. }
  570. }
  571. if (lead && !stream) {
  572. lead = 0
  573. o16[oi++] = err()
  574. }
  575. const res = decodeUCS2(o16, oi)
  576. o16 = null
  577. return res
  578. }
  579. return { decode, isAscii: () => lead === 0 }
  580. },
  581. }
  /**
   * Tells whether an encoding can use the ASCII fast path.
   * iso-2022-jp is the only one that is not treated as an ASCII superset.
   * @param {string} enc encoding name
   * @returns {boolean}
   */
  export const isAsciiSuperset = (enc) => {
    return !(enc === 'iso-2022-jp')
  }
  /**
   * Creates a decoder function for one of the supported multi-byte encodings.
   * @param {string} enc encoding name, must be a key of mappers
   * @param {boolean} [loose=false] when true, malformed input yields U+FFFD instead of throwing
   * @returns {(arr: Uint8Array, stream?: boolean) => string} decoder; pass stream = true to keep state between calls
   * @throws {TypeError} when loose is not a boolean
   * @throws {RangeError} when the encoding is not supported
   */
  export function multibyteDecoder(enc, loose = false) {
    if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
    if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
    // Input is assumed to be typechecked already

    const canSkipAscii = isAsciiSuperset(enc)
    let mapper // created lazily on first use, dropped on a fatal error outside of streaming
    let streaming // read by the strict error handler, which is cached inside mapper

    const replace = () => 0xff_fd
    const fail = () => {
      // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
      // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
      // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
      if (!streaming) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
      throw new TypeError(E_STRICT)
    }

    const onErr = loose ? replace : fail

    return (arr, stream = false) => {
      let head = ''
      // The ASCII prefix can be decoded directly only when no multi-byte sequence is pending
      if (canSkipAscii && (!mapper || mapper.isAscii?.())) {
        const asciiLength = asciiPrefix(arr)
        if (asciiLength === arr.length) return decodeAscii(arr) // ascii
        head = decodeLatin1(arr, 0, asciiLength) // TODO: check if decodeAscii with subarray is faster for small prefixes too
      }

      streaming = stream // affects onErr
      if (!mapper) mapper = mappers[enc](onErr)
      const tail = mapper.decode(arr, head.length, arr.length, stream)
      return head + tail
    }
  }
  611. /* Encoders */
  // Cache of encoder maps, keyed by encoding id
  const maps = new Map()
  // [low byte of code point, low byte of encoded value] pairs, used for the gb18030 / gbk encoder maps
  const e7 = [[148, 236], [149, 237], [150, 243]] // prettier-ignore
  const e8 = [[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]] // prettier-ignore
  // Per-encoding functions turning a table pointer into a 16-bit value: lead byte in the high half, trail byte in the low half
  const preencoders = {
    __proto__: null,
    big5: (p) => {
      const row = (p / 157) | 0
      const col = p % 157
      return ((row + 0x81) << 8) | ((col < 0x3f ? 0x40 : 0x62) + col)
    },
    shift_jis: (p) => {
      const row = (p / 188) | 0
      const col = p % 188
      const leadByte = row + (row < 0x1f ? 0x81 : 0xc1)
      const trailByte = (col < 0x3f ? 0x40 : 0x41) + col
      return (leadByte << 8) | trailByte
    },
    'iso-2022-jp': (p) => {
      const row = (p / 94) | 0
      return ((row + 0x21) << 8) | ((p % 94) + 0x21)
    },
    'euc-jp': (p) => {
      const row = (p / 94) | 0
      return ((row + 0xa1) << 8) | ((p % 94) + 0xa1)
    },
    'euc-kr': (p) => {
      const row = (p / 190) | 0
      return ((row + 0x81) << 8) | ((p % 190) + 0x41)
    },
    gb18030: (p) => {
      const row = (p / 190) | 0
      const col = p % 190
      return ((row + 0x81) << 8) | ((col < 0x3f ? 0x40 : 0x41) + col)
    },
  }
  // gbk shares the two-byte layout of gb18030
  preencoders.gbk = preencoders.gb18030
  629. // We accept that encoders use non-trivial amount of mem, for perf
  // most are 128 KiB of memory, big5 is 380 KiB; each map is lazy-loaded at first use
  /**
   * Returns the encoder map for an encoding, building and caching it on first use.
   * The map is indexed by code point and holds the encoded value (0 means "no mapping").
   * @param {string} id encoding name
   * @param {number} size number of entries to allocate
   * @param {boolean} ascii whether code points below 0x80 map to themselves
   * @returns {Uint16Array}
   */
  function getMap(id, size, ascii) {
    const existing = maps.get(id)
    if (existing) return existing

    const isSjis = id === 'shift_jis'
    const isEucJp = id === 'euc-jp'
    const isBig5 = id === 'big5'

    // Several encodings share a decoding table
    let tableName = id
    if (id === 'iso-2022-jp' || isEucJp || isSjis) {
      tableName = 'jis0208'
    } else if (id === 'gbk') {
      tableName = 'gb18030'
    }

    const table = getTable(tableName)
    const map = new Uint16Array(size)
    const toBytes = preencoders[id] || ((p) => p + 1)
    // In big5 the first pointer wins for duplicated code points, except for these where the last one does
    const big5LastWins = new Set([0x25_50, 0x25_5e, 0x25_61, 0x25_6a, 0x53_41, 0x53_45])

    for (let pointer = 0; pointer < table.length; pointer++) {
      const value = table[pointer]
      if (!value) continue

      if (isBig5) {
        if (pointer < 5024) continue // this also skips multi-codepoint strings
        if (map[value] && !big5LastWins.has(value)) continue
      } else {
        if (isSjis && pointer >= 8272 && pointer <= 8835) continue
        if (map[value]) continue // first pointer wins
      }

      if (value > 0xff_ff) {
        // Two packed UTF-16 code units; always a single codepoint here
        const pair = String.fromCharCode(value >> 16, value & 0xff_ff)
        map[pair.codePointAt(0)] = toBytes(pointer)
      } else {
        map[value] = toBytes(pointer)
      }
    }

    if (ascii) {
      for (let cp = 0; cp < 0x80; cp++) map[cp] = cp
    }

    if (isSjis || isEucJp) {
      if (isSjis) map[0x80] = 0x80
      // Code points 0xff61-0xff9f are encoded arithmetically, with a different offset per encoding
      const shift = isSjis ? 0xfe_c0 : 0x70_c0
      for (let cp = 0xff_61; cp <= 0xff_9f; cp++) map[cp] = cp - shift
      map[0x22_12] = map[0xff_0d]
      map[0xa5] = 0x5c
      map[0x20_3e] = 0x7e
    } else if (tableName === 'gb18030') {
      if (id === 'gbk') map[0x20_ac] = 0x80
      for (let cp = 0xe7_8d; cp <= 0xe7_93; cp++) map[cp] = cp - 0x40_b4
      for (const [low, trail] of e7) map[0xe7_00 | low] = 0xa6_00 | trail
      for (const [low, trail] of e8) map[0xe8_00 | low] = 0xfe_00 | trail
    }

    maps.set(id, map)
    return map
  }
  // Matches any UTF-16 code unit above 0xFF
  const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
  // Tables shared by all encoder instances, filled in by multibyteEncoder on first need
  let gb18030r
  let katakana
  /**
   * Creates an encoder function for one of the supported multi-byte encodings.
   * @param {string} enc encoding name, must be a key of mappers
   * @param {Function} [onError] called as onError(code, u8, i) for every unencodable code; its return value
   *   is added to the write offset. Without it, a TypeError is thrown instead.
   * @returns {(str: string) => Uint8Array} encoder
   * @throws {RangeError} when the encoding is not supported
   */
  export function multibyteEncoder(enc, onError) {
    if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
    const isBig5 = enc === 'big5'
    const isIso2022jp = enc === 'iso-2022-jp'
    const isGb18030 = enc === 'gb18030'
    const size = isBig5 ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1
    const ascii = isAsciiSuperset(enc)
    // Output bytes reserved per input code unit, plus room for a final escape sequence in iso-2022-jp
    let width = 2
    if (isIso2022jp) {
      width = 5
    } else if (isGb18030) {
      width = 4
    }

    const tailsize = isIso2022jp ? 3 : 0
    const map = getMap(enc, size, ascii)
    if (isGb18030 && !gb18030r) gb18030r = getTable('gb18030-ranges')
    if (isIso2022jp && !katakana) katakana = getTable('iso-2022-jp-katakana')

    return (str) => {
      if (typeof str !== 'string') throw new TypeError(E_STRING)
      // Strings without code units above 0xFF are tried as pure ASCII first
      // A failure there is not an error yet: the generic path below takes over
      if (ascii && !NON_LATIN.test(str)) {
        try {
          return encodeAscii(str, E_STRICT)
        } catch {}
      }

      const length = str.length
      const u8 = new Uint8Array(length * width + tailsize)
      let i = 0 // write offset in u8

      // Copy the ASCII prefix as is; the byte offset equals the string offset here
      if (ascii) {
        for (; i < length; i++) {
          const unit = str.charCodeAt(i)
          if (unit >= 128) break
          u8[i] = unit
        }
      }

      // Reports an unencodable code; returns how far to advance the write offset
      // eslint-disable-next-line unicorn/consistent-function-scoping
      const err = (code) => {
        if (!onError) throw new TypeError(E_STRICT)
        return onError(code, u8, i)
      }

      if (!map || map.length < size) /* c8 ignore next */ throw new Error('Unreachable') // Important for perf

      if (isIso2022jp) {
        let state = 0 // 0 = ASCII, 1 = Roman, 2 = jis0208
        // Writes ESC + two bytes and records the new state
        const shift = (next, b1, b2) => {
          state = next
          u8[i++] = 0x1b
          u8[i++] = b1
          u8[i++] = b2
        }

        const restore = () => shift(0, 0x28, 0x42)

        for (let j = 0; j < length; j++) {
          let x = str.charCodeAt(j)

          // Surrogates: nothing above the BMP is encodable, pairs are reported as one code point
          if (x >= 0xd8_00 && x < 0xe0_00) {
            if (state === 2) restore()
            const low = x < 0xdc_00 && j + 1 !== length ? str.charCodeAt(j + 1) : 0
            if (low >= 0xdc_00 && low < 0xe0_00) {
              j++ // consume low
              i += err(0x1_00_00 + ((low & 0x3_ff) | ((x & 0x3_ff) << 10)))
            } else {
              i += err(x) // lone
            }

            continue
          }

          if (x < 0x80) {
            if (state === 2 || (state === 1 && (x === 0x5c || x === 0x7e))) restore()
            if (x === 0xe || x === 0xf || x === 0x1b) {
              i += err(0xff_fd) // 12.2.2. step 3: This returns U+FFFD rather than codePoint to prevent attacks
            } else {
              u8[i++] = x
            }

            continue
          }

          if (x === 0xa5 || x === 0x20_3e) {
            if (state !== 1) shift(1, 0x28, 0x4a)
            u8[i++] = x === 0xa5 ? 0x5c : 0x7e
            continue
          }

          if (x === 0x22_12) x = 0xff_0d
          if (x >= 0xff_61 && x <= 0xff_9f) x = katakana[x - 0xff_61]
          const e = map[x]
          if (e) {
            if (state !== 2) shift(2, 0x24, 0x42)
            u8[i++] = e >> 8
            u8[i++] = e & 0xff
          } else {
            if (state === 2) restore()
            i += err(x)
          }
        }

        if (state) restore()
      } else if (isGb18030) {
        // Deduping this branch hurts other encoders perf
        // Writes the four-byte form of a code point that has no entry in the map
        const encodeRange = (cp) => {
          let base = 0
          let first = 0
          for (const [pointer, codePoint] of gb18030r) {
            if (codePoint > cp) break
            base = pointer
            first = codePoint
          }

          let rest = cp === 0xe7_c7 ? 7457 : base + cp - first
          u8[i++] = 0x81 + ((rest / 12_600) | 0)
          rest %= 12_600
          u8[i++] = 0x30 + ((rest / 1260) | 0)
          rest %= 1260
          u8[i++] = 0x81 + ((rest / 10) | 0)
          u8[i++] = 0x30 + (rest % 10)
        }

        for (let j = i; j < length; j++) {
          const x = str.charCodeAt(j)

          if (x >= 0xd8_00 && x < 0xe0_00) {
            const low = x < 0xdc_00 && j + 1 !== length ? str.charCodeAt(j + 1) : 0
            if (low >= 0xdc_00 && low < 0xe0_00) {
              j++ // consume low
              encodeRange(0x1_00_00 + ((low & 0x3_ff) | ((x & 0x3_ff) << 10)))
            } else {
              i += err(x) // lone
            }

            continue
          }

          const e = map[x]
          if (e & 0xff_00) {
            u8[i++] = e >> 8
            u8[i++] = e & 0xff
          } else if (e || x === 0) {
            u8[i++] = e
          } else if (x === 0xe5_e5) {
            i += err(x)
          } else {
            encodeRange(x)
          }
        }
      } else {
        // Only big5 consults the map for code points above U+FFFF, the rest report them as errors
        const encodeAstral = isBig5
          ? (cp) => {
              const e = map[cp]
              if (e & 0xff_00) {
                u8[i++] = e >> 8
                u8[i++] = e & 0xff
              } else if (e || cp === 0) {
                u8[i++] = e
              } else {
                i += err(cp)
              }
            }
          : (cp) => {
              i += err(cp)
            }

        for (let j = i; j < length; j++) {
          const x = str.charCodeAt(j)

          if (x >= 0xd8_00 && x < 0xe0_00) {
            const low = x < 0xdc_00 && j + 1 !== length ? str.charCodeAt(j + 1) : 0
            if (low >= 0xdc_00 && low < 0xe0_00) {
              j++ // consume low
              encodeAstral(0x1_00_00 + ((low & 0x3_ff) | ((x & 0x3_ff) << 10)))
            } else {
              i += err(x) // lone
            }

            continue
          }

          const e = map[x]
          if (e & 0xff_00) {
            u8[i++] = e >> 8
            u8[i++] = e & 0xff
          } else if (e || x === 0) {
            u8[i++] = e
          } else {
            i += err(x)
          }
        }
      }

      // Trim the buffer to the bytes actually written
      return i === u8.length ? u8 : u8.subarray(0, i)
    }
  }