multi-byte.table.js 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. import { fromBase64url } from '@exodus/bytes/base64.js'
  2. import { utf16toString } from '@exodus/bytes/utf16.js'
  3. import loadEncodings from './multi-byte.encodings.cjs'
  4. export const sizes = {
  5. jis0208: 11_104,
  6. jis0212: 7211,
  7. 'euc-kr': 23_750,
  8. gb18030: 23_940,
  9. big5: 19_782,
  10. }
// This is huge. It's _much_ smaller than https://npmjs.com/text-encoding though
// Exactly as mapped by the index table
// 0,x - hole of x empty elements
// n,c - continuous run [c, ...] of length n (c is added to the running code value)
// $.. - references to common chunks
// -{x} - same as 1,{x}
// See tests/multi-byte.test.js to verify that this data decodes exactly into the encoding spec tables
  18. let indices
  19. const tables = new Map()
  20. /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
  21. function loadBase64(str) {
  22. const x = fromBase64url(str)
  23. const len = x.length
  24. const len2 = len >> 1
  25. const y = new Uint8Array(len)
  26. let a = -1, b = 0 // prettier-ignore
  27. for (let i = 0, j = 0; i < len; i += 2, j++) {
  28. a = (a + x[j] + 1) & 0xff
  29. b = (b + x[len2 + j]) & 0xff
  30. y[i] = a
  31. y[i + 1] = b
  32. }
  33. return y
  34. }
  35. function unwrap(res, t, pos) {
  36. let code = 0
  37. for (let i = 0; i < t.length; i++) {
  38. let x = t[i]
  39. if (typeof x === 'number') {
  40. if (x === 0) {
  41. pos += t[++i]
  42. } else {
  43. if (x < 0) {
  44. code -= x
  45. x = 1
  46. } else {
  47. code += t[++i]
  48. }
  49. for (let k = 0; k < x; k++, pos++, code++) {
  50. if (code <= 0xff_ff) {
  51. res[pos] = code
  52. } else {
  53. const c = String.fromCodePoint(code)
  54. res[pos] = (c.charCodeAt(0) << 16) | c.charCodeAt(1)
  55. }
  56. }
  57. }
  58. } else if (x[0] === '$' && Object.hasOwn(indices, x)) {
  59. pos = unwrap(res, indices[x], pos) // self-reference using shared chunks
  60. } else {
  61. let last
  62. // splits by codepoints
  63. for (const c of utf16toString(loadBase64(x), 'uint8-le')) {
  64. last = c
  65. res[pos++] = c.length === 1 ? c.charCodeAt(0) : (c.charCodeAt(0) << 16) | c.charCodeAt(1)
  66. }
  67. code = last.codePointAt(0) + 1
  68. }
  69. }
  70. return pos
  71. }
  72. export function getTable(id) {
  73. const cached = tables.get(id)
  74. if (cached) return cached
  75. if (!indices) indices = loadEncodings() // lazy-load
  76. if (!Object.hasOwn(indices, id)) throw new Error('Unknown encoding')
  77. if (!indices[id]) throw new Error('Table already used (likely incorrect bundler dedupe)')
  78. let res
  79. if (id.endsWith('-ranges')) {
  80. res = []
  81. let a = 0, b = 0 // prettier-ignore
  82. const idx = indices[id]
  83. while (idx.length > 0) res.push([(a += idx.shift()), (b += idx.shift())]) // destroying, we remove it later anyway
  84. } else if (id.endsWith('-katakana')) {
  85. let a = -1
  86. res = new Uint16Array(indices[id].map((x) => (a += x + 1)))
  87. } else if (id === 'big5') {
  88. res = new Uint32Array(sizes[id]) // single or double charcodes
  89. unwrap(res, indices[id], 0)
  90. // Pointer code updates are embedded into the table
  91. // These are skipped in encoder as encoder uses only pointers >= (0xA1 - 0x81) * 157
  92. res[1133] = 0xca_03_04
  93. res[1135] = 0xca_03_0c
  94. res[1164] = 0xea_03_04
  95. res[1166] = 0xea_03_0c
  96. } else {
  97. if (!Object.hasOwn(sizes, id)) throw new Error('Unknown encoding')
  98. res = new Uint16Array(sizes[id])
  99. unwrap(res, indices[id], 0)
  100. }
  101. indices[id] = null // gc
  102. tables.set(id, res)
  103. return res
  104. }