// encoding.util.js

  /**
   * Get the number of last bytes in `u`, ending at offset `len`, that don't form
   * a complete codepoint yet but can still become part of a single codepoint once
   * more data arrives.
   *
   * @param {Uint8Array} u - Bytes to inspect.
   * @param {number} len - End offset (exclusive) of the inspected data inside `u`.
   * @param {string} enc - One of 'utf-8', 'utf-16le', 'utf-16be'.
   * @returns {number} Count of unfinished trailing bytes, 0-3.
   * @throws {Error} 'Unsupported encoding' for any other `enc`.
   */
  export function unfinishedBytes(u, len, enc) {
    switch (enc) {
      case 'utf-8': {
        // Result is 0-3
        let p = 0
        // Go back over 0-2 trailing continuation bytes (10xxxxxx)
        while (p < 2 && p < len && (u[len - p - 1] & 0xc0) === 0x80) p++
        if (p === len) return 0 // everything inspected is a continuation byte, no space for a lead
        const l = u[len - p - 1]
        if (l < 0xc2 || l > 0xf4) return 0 // not a lead byte
        if (p === 0) return 1 // only a lead, nothing to recheck. A 2-byte lead must return here
        // 2-byte lead that already has a trailing byte, or 3-byte (or shorter) lead with 2 trailing bytes
        if (l < 0xe0 || (l < 0xf0 && p >= 2)) return 0
        // The first continuation byte has a narrowed range after some leads:
        // E0 and F0 raise the lower bound (overlong forms), ED lowers the upper
        // bound (surrogates), F4 lowers it too (beyond U+10FFFF)
        const lower = l === 0xf0 ? 0x90 : l === 0xe0 ? 0xa0 : 0x80
        const upper = l === 0xf4 ? 0x8f : l === 0xed ? 0x9f : 0xbf
        const n = u[len - p] // first continuation byte, right after the lead
        return n >= lower && n <= upper ? p + 1 : 0
      }

      case 'utf-16le':
      case 'utf-16be': {
        // Result is 0-3
        const p = len % 2 // uneven byte length adds 1
        if (len < 2) return p
        const l = len - p - 1 // index of the second byte of the last complete code unit
        const last = enc === 'utf-16le' ? (u[l] << 8) ^ u[l - 1] : (u[l - 1] << 8) ^ u[l]
        return last >= 0xd8_00 && last < 0xdc_00 ? p + 2 : p // lone lead surrogate adds 2
      }
    }

    throw new Error('Unsupported encoding')
  }
  /**
   * Merge the prefix `chunk` with `u` and return the new combined prefix.
   *
   * For `u.length < 3`, `u` is consumed completely and the result may still end
   * with unfinished data; otherwise the returned prefix has no unfinished bytes.
   *
   * @param {Uint8Array} u - New data following `chunk`.
   * @param {Uint8Array} chunk - Pending prefix bytes (1-3 bytes, per the sizing comment below).
   * @param {string} enc - Encoding name, passed through to `unfinishedBytes`.
   * @returns {Uint8Array} `chunk` itself when nothing is added, a new array holding
   *   `chunk` followed by all of `u` when `u.length < 3`, or a subarray of a temporary
   *   buffer holding `chunk` plus the first 1-3 bytes of `u`.
   */
  export function mergePrefix(u, chunk, enc) {
    if (u.length === 0) return chunk
    if (u.length < 3) {
      // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
      const a = new Uint8Array(u.length + chunk.length)
      a.set(chunk)
      a.set(u, chunk.length)
      return a
    }

    // Slice off a small portion of u into the prefix so they can be decoded separately
    // without extending the array to the full size of u
    const t = new Uint8Array(chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
    t.set(chunk)
    t.set(u.subarray(0, 3), chunk.length)

    // Stop at the first offset where unfinished bytes reaches 0 or fits into u
    // If that can't happen (u too short), chunk and u are concatenated completely (above)
    for (let i = 1; i <= 3; i++) {
      const unfinished = unfinishedBytes(t, chunk.length + i, enc) // 0-3
      if (unfinished <= i) {
        // Always reachable at i === 3, but the 'unfinished' value is still needed for it
        const add = i - unfinished // 0-3 bytes of u that complete the prefix
        return add > 0 ? t.subarray(0, chunk.length + add) : chunk
      }
    }

    // Unreachable
  }