ClawLab
/
RobotDaily


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
							"use strict";
const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js");

// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => {
  if (defaultEncoding === undefined) {
    defaultEncoding = xml ? "UTF-8" : "windows-1252";
  }

  let encoding = labelToName(getBOMEncoding(uint8Array));

  if (encoding === null && transportLayerEncodingLabel !== undefined) {
    encoding = labelToName(transportLayerEncodingLabel);
  }

  if (encoding === null && !xml) {
    encoding = prescanMetaCharset(uint8Array);
  }

  if (encoding === null) {
    encoding = defaultEncoding;
  }

  return encoding;
};

// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
/**
 * Prescans at most the first 1024 bytes of `uint8Array` for a `<meta>` element
 * that declares an encoding.
 *
 * Recognized constructs, each starting at a "<" byte:
 *   - `<!--` comments: skipped up to the first ">" preceded by two "-" bytes;
 *   - `<meta` followed by whitespace or "/": its attributes are read with
 *     `getAttribute` and inspected for `http-equiv`, `content` and `charset`;
 *   - any other start tag or end tag whose name begins with an ASCII letter:
 *     its attributes are consumed with `getAttribute`, so a ">" inside a
 *     quoted attribute value does not end the tag early;
 *   - `<!`, `</` (not followed by a letter) and `<?`: skipped up to the next ">".
 *
 * @param {Uint8Array} uint8Array - Bytes to prescan; `byteLength` and numeric
 *   indexing are used.
 * @returns {string|null} The declared encoding name ("UTF-16LE"/"UTF-16BE" are
 *   reported as "UTF-8" and "x-user-defined" as "windows-1252"), or `null`
 *   when no usable declaration is found within the scanned bytes.
 */
function prescanMetaCharset(uint8Array) {
  const isASCIIAlpha = (byte) => (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A);

  const l = Math.min(uint8Array.byteLength, 1024);
  for (let i = 0; i < l; i++) {
    let c = uint8Array[i];
    if (c !== 0x3C) {
      // Only "<" can start a construct the prescan cares about.
      continue;
    }

    const c1 = uint8Array[i + 1];
    const c2 = uint8Array[i + 2];
    const c3 = uint8Array[i + 3];
    const c4 = uint8Array[i + 4];
    const c5 = uint8Array[i + 5];

    if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
      // "<!--": starting right after it, find the ">" that closes "-->".
      // The two "-" bytes may be the ones from "<!--" itself ("<!-->").
      i += 4;
      for (; i < l; i++) {
        c = uint8Array[i];
        const cMinus1 = uint8Array[i - 1];
        const cMinus2 = uint8Array[i - 2];
        if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) {
          break;
        }
      }
    } else if ((c1 === 0x4D || c1 === 0x6D) &&
       (c2 === 0x45 || c2 === 0x65) &&
       (c3 === 0x54 || c3 === 0x74) &&
       (c4 === 0x41 || c4 === 0x61) &&
       (isSpaceCharacter(c5) || c5 === 0x2F)) {
      // "<meta" (any case) followed by whitespace or "/".
      i += 6;
      const attributeList = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      let attrRes;
      do {
        attrRes = getAttribute(uint8Array, i, l);
        // Only the first occurrence of each attribute name is considered.
        if (attrRes.attr && !attributeList.has(attrRes.attr.name)) {
          attributeList.add(attrRes.attr.name);
          if (attrRes.attr.name === "http-equiv") {
            gotPragma = attrRes.attr.value === "content-type";
          } else if (attrRes.attr.name === "content" && !charset) {
            charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
            if (charset !== null) {
              needPragma = true;
            }
          } else if (attrRes.attr.name === "charset") {
            charset = labelToName(attrRes.attr.value);
            needPragma = false;
          }
        }
        i = attrRes.i;
      } while (attrRes.attr);

      // Neither `charset` nor a usable `content` attribute was seen.
      if (needPragma === null) {
        continue;
      }
      // A `content`-derived charset only counts with http-equiv="content-type".
      if (needPragma === true && gotPragma === false) {
        continue;
      }
      if (charset === null) {
        continue;
      }

      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        charset = "UTF-8";
      }
      if (charset === "x-user-defined") {
        charset = "windows-1252";
      }

      return charset;
    } else if (isASCIIAlpha(c1) || (c1 === 0x2F && isASCIIAlpha(c2))) {
      // Start tag ("<a") or end tag ("</a"): skip the tag name, then consume
      // its attributes so that ">" inside a quoted value is not mistaken for
      // the end of the tag. For end tags the scan starts on the name's first
      // letter, which can never match the whitespace / ">" stop condition.
      for (i += 2; i < l; i++) {
        c = uint8Array[i];
        // space or >
        if (isSpaceCharacter(c) || c === 0x3E) {
          break;
        }
      }
      let attrRes;
      do {
        attrRes = getAttribute(uint8Array, i, l);
        i = attrRes.i;
      } while (attrRes.attr);
    } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
      // "<!", "</" (not followed by a letter) or "<?": skip to the next ">".
      for (i += 2; i < l; i++) {
        c = uint8Array[i];
        if (c === 0x3E) {
          break;
        }
      }
    }
  }
  return null;
}

// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
/**
 * Reads the next attribute from `uint8Array`, starting at index `i`.
 *
 * Leading ASCII whitespace and "/" bytes are skipped. If a ">" is met before
 * an attribute name starts, or the scan index reaches `l`, no attribute is
 * reported. While collecting the name and the value, bytes in the range A-Z
 * (0x41-0x5A) are lowercased by adding 0x20; every other byte is appended
 * unchanged via `String.fromCharCode`.
 *
 * @param {Uint8Array} uint8Array - Bytes to scan (accessed by numeric index;
 *   presumably a Uint8Array, per the parameter name).
 * @param {number} i - Index at which to start scanning.
 * @param {number} l - Exclusive upper bound used by every loop condition here.
 * @returns {{attr?: {name: string, value: string}, i: number}} `attr` is only
 *   present when an attribute was read. `i` is where scanning stopped: on the
 *   "/" or ">" that ended a value-less name, on the first non-"=" byte after
 *   whitespace that followed the name, one past the closing quote of a quoted
 *   value, or on the whitespace / ">" that ended an unquoted value.
 */
function getAttribute(uint8Array, i, l) {
  // In practice this outer loop only skips the leading whitespace / "/" bytes:
  // once a name starts, the body either returns or runs `i` up to `l`.
  for (; i < l; i++) {
    let c = uint8Array[i];
    // space or /
    if (isSpaceCharacter(c) || c === 0x2F) {
      continue;
    }
    // ">"
    if (c === 0x3E) {
      break;
    }
    let name = "";
    let value = "";
    // Collect the attribute name.
    nameLoop:for (; i < l; i++) {
      c = uint8Array[i];
      // "=" ends the name, unless the name is still empty, in which case the
      // "=" falls through and becomes the first character of the name.
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }
      // space
      if (isSpaceCharacter(c)) {
        // Whitespace after the name: look ahead for an "=".
        for (i++; i < l; i++) {
          c = uint8Array[i];
          // space
          if (isSpaceCharacter(c)) {
            continue;
          }
          // not "=": the attribute has no value
          if (c !== 0x3D) {
            return { attr: { name, value }, i };
          }

          // "=" found: step past it and go on to read the value.
          i++;
          break nameLoop;
        }
        // Reached only when the look-ahead ran out of input (i >= l).
        break;
      }
      // / or >: the attribute has a name but no value
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      // A-Z
      if (c >= 0x41 && c <= 0x5A) {
        name += String.fromCharCode(c + 0x20); // lowercase
      } else {
        name += String.fromCharCode(c);
      }
    }
    // First byte of the value. `i` may equal `l` here if the name loop ran out
    // of input; this read is not checked against `l`.
    c = uint8Array[i];
    // space: skip whitespace between "=" and the value
    if (isSpaceCharacter(c)) {
      for (i++; i < l; i++) {
        c = uint8Array[i];
        // space
        if (isSpaceCharacter(c)) {
          continue;
        } else {
          break;
        }
      }
    }
    // " or ': quoted value, read up to the matching quote
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = uint8Array[i];

        if (c === quote) {
          // Step past the closing quote before returning.
          i++;
          return { attr: { name, value }, i };
        }

        // A-Z
        if (c >= 0x41 && c <= 0x5A) {
          value += String.fromCharCode(c + 0x20); // lowercase
        } else {
          value += String.fromCharCode(c);
        }
      }
      // No closing quote before `l`: execution falls through to the code below
      // with `c` still holding the last byte read.
    }

    // >: the value is finished (empty when the ">" directly follows the "=")
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }

    // Unquoted value: `c` is its first character.
    // A-Z
    if (c >= 0x41 && c <= 0x5A) {
      value += String.fromCharCode(c + 0x20); // lowercase
    } else {
      value += String.fromCharCode(c);
    }

    // Remaining characters of the unquoted value.
    for (i++; i < l; i++) {
      c = uint8Array[i];

      // space or >
      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }

      // A-Z
      if (c >= 0x41 && c <= 0x5A) {
        value += String.fromCharCode(c + 0x20); // lowercase
      } else {
        value += String.fromCharCode(c);
      }
    }
  }
  // No attribute found: only the stop position is reported.
  return { i };
}

/**
 * Extracts an encoding from the value of a `<meta content="...">` attribute,
 * e.g. "text/html; charset=utf-8".
 *
 * The string is searched (case-insensitively) for "charset" followed by
 * optional ASCII whitespace, "=", and optional ASCII whitespace. What follows
 * is either a quoted label (matching quote required) or an unquoted label that
 * runs up to the next ASCII whitespace or ";". The label is passed to
 * `labelToName`.
 *
 * @param {string} string - The attribute value to parse.
 * @returns {*} The result of `labelToName` for the extracted label, or `null`
 *   when there is no "charset=" part, nothing follows the "=", a quote is
 *   unmatched, or only a single unquoted character remains.
 */
function extractCharacterEncodingFromMeta(string) {
  // Returns the first index at or after `index` that is not ASCII whitespace.
  // Bounded by the string length so a trailing "charset" or "charset=" cannot
  // trigger a read past the end of the string.
  const skipSpaces = (index) => {
    while (index < string.length && isSpaceCharacter(string.charCodeAt(index))) {
      ++index;
    }
    return index;
  };

  let position = 0;

  while (true) {
    const indexOfCharset = string.substring(position).search(/charset/ui);

    if (indexOfCharset === -1) {
      return null;
    }

    const subPosition = skipSpaces(position + indexOfCharset + "charset".length);

    if (string[subPosition] !== "=") {
      // Not a "charset=" pair; resume the search just before this point.
      position = subPosition - 1;
      continue;
    }

    position = skipSpaces(subPosition + 1);
    break;
  }

  // Nothing follows the "=" (e.g. "charset=" or "charset=  ").
  if (position >= string.length) {
    return null;
  }

  if (string[position] === "\"" || string[position] === "'") {
    const nextIndex = string.indexOf(string[position], position + 1);

    if (nextIndex !== -1) {
      return labelToName(string.substring(position + 1, nextIndex));
    }

    // It is an unmatched quotation mark
    return null;
  }

  if (string.length === position + 1) {
    return null;
  }

  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
    string.length :
    position + indexOfASCIIWhitespaceOrSemicolon + 1;

  return labelToName(string.substring(position, end));
}

/**
 * Tells whether a byte value is one of the five whitespace codes checked by
 * this file: TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) or SPACE (0x20).
 *
 * @param {*} c - Value to test; compared with strict equality, so anything
 *   that is not exactly one of those numbers yields `false`.
 * @returns {boolean} `true` for the five codes above, `false` otherwise.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09: // TAB
    case 0x0A: // LF
    case 0x0C: // FF
    case 0x0D: // CR
    case 0x20: // SPACE
      return true;
    default:
      return false;
  }
}