// char-code-definitions.js (6.4 KB)

  1. const EOF = 0;
// https://drafts.csswg.org/css-syntax-3/
// § 4.2. Definitions
  4. // digit
  5. // A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9).
  6. export function isDigit(code) {
  7. return code >= 0x0030 && code <= 0x0039;
  8. }
  9. // hex digit
  10. // A digit, or a code point between U+0041 LATIN CAPITAL LETTER A (A) and U+0046 LATIN CAPITAL LETTER F (F),
  11. // or a code point between U+0061 LATIN SMALL LETTER A (a) and U+0066 LATIN SMALL LETTER F (f).
  12. export function isHexDigit(code) {
  13. return (
  14. isDigit(code) || // 0 .. 9
  15. (code >= 0x0041 && code <= 0x0046) || // A .. F
  16. (code >= 0x0061 && code <= 0x0066) // a .. f
  17. );
  18. }
  19. // uppercase letter
  20. // A code point between U+0041 LATIN CAPITAL LETTER A (A) and U+005A LATIN CAPITAL LETTER Z (Z).
  21. export function isUppercaseLetter(code) {
  22. return code >= 0x0041 && code <= 0x005A;
  23. }
  24. // lowercase letter
  25. // A code point between U+0061 LATIN SMALL LETTER A (a) and U+007A LATIN SMALL LETTER Z (z).
  26. export function isLowercaseLetter(code) {
  27. return code >= 0x0061 && code <= 0x007A;
  28. }
  29. // letter
  30. // An uppercase letter or a lowercase letter.
  31. export function isLetter(code) {
  32. return isUppercaseLetter(code) || isLowercaseLetter(code);
  33. }
  34. // non-ASCII code point
  35. // A code point with a value equal to or greater than U+0080 <control>.
  36. //
  37. // 2024-09-02: The latest spec narrows the range for non-ASCII characters (see https://github.com/csstree/csstree/issues/188).
  38. // However, all modern browsers support a wider range, and strictly following the latest spec could result
  39. // in some CSS being parsed incorrectly, even though it works in the browser. Therefore, this function adheres
  40. // to the previous, broader definition of non-ASCII characters.
  41. export function isNonAscii(code) {
  42. return code >= 0x0080;
  43. }
  44. // name-start code point
  45. // A letter, a non-ASCII code point, or U+005F LOW LINE (_).
  46. export function isNameStart(code) {
  47. return isLetter(code) || isNonAscii(code) || code === 0x005F;
  48. }
  49. // name code point
  50. // A name-start code point, a digit, or U+002D HYPHEN-MINUS (-).
  51. export function isName(code) {
  52. return isNameStart(code) || isDigit(code) || code === 0x002D;
  53. }
  54. // non-printable code point
  55. // A code point between U+0000 NULL and U+0008 BACKSPACE, or U+000B LINE TABULATION,
  56. // or a code point between U+000E SHIFT OUT and U+001F INFORMATION SEPARATOR ONE, or U+007F DELETE.
  57. export function isNonPrintable(code) {
  58. return (
  59. (code >= 0x0000 && code <= 0x0008) ||
  60. (code === 0x000B) ||
  61. (code >= 0x000E && code <= 0x001F) ||
  62. (code === 0x007F)
  63. );
  64. }
  65. // newline
  66. // U+000A LINE FEED. Note that U+000D CARRIAGE RETURN and U+000C FORM FEED are not included in this definition,
  67. // as they are converted to U+000A LINE FEED during preprocessing.
  68. // TODO: we doesn't do a preprocessing, so check a code point for U+000D CARRIAGE RETURN and U+000C FORM FEED
  69. export function isNewline(code) {
  70. return code === 0x000A || code === 0x000D || code === 0x000C;
  71. }
  72. // whitespace
  73. // A newline, U+0009 CHARACTER TABULATION, or U+0020 SPACE.
  74. export function isWhiteSpace(code) {
  75. return isNewline(code) || code === 0x0020 || code === 0x0009;
  76. }
  77. // § 4.3.8. Check if two code points are a valid escape
  78. export function isValidEscape(first, second) {
  79. // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
  80. if (first !== 0x005C) {
  81. return false;
  82. }
  83. // Otherwise, if the second code point is a newline or EOF, return false.
  84. if (isNewline(second) || second === EOF) {
  85. return false;
  86. }
  87. // Otherwise, return true.
  88. return true;
  89. }
  90. // § 4.3.9. Check if three code points would start an identifier
  91. export function isIdentifierStart(first, second, third) {
  92. // Look at the first code point:
  93. // U+002D HYPHEN-MINUS
  94. if (first === 0x002D) {
  95. // If the second code point is a name-start code point or a U+002D HYPHEN-MINUS,
  96. // or the second and third code points are a valid escape, return true. Otherwise, return false.
  97. return (
  98. isNameStart(second) ||
  99. second === 0x002D ||
  100. isValidEscape(second, third)
  101. );
  102. }
  103. // name-start code point
  104. if (isNameStart(first)) {
  105. // Return true.
  106. return true;
  107. }
  108. // U+005C REVERSE SOLIDUS (\)
  109. if (first === 0x005C) {
  110. // If the first and second code points are a valid escape, return true. Otherwise, return false.
  111. return isValidEscape(first, second);
  112. }
  113. // anything else
  114. // Return false.
  115. return false;
  116. }
  117. // § 4.3.10. Check if three code points would start a number
  118. export function isNumberStart(first, second, third) {
  119. // Look at the first code point:
  120. // U+002B PLUS SIGN (+)
  121. // U+002D HYPHEN-MINUS (-)
  122. if (first === 0x002B || first === 0x002D) {
  123. // If the second code point is a digit, return true.
  124. if (isDigit(second)) {
  125. return 2;
  126. }
  127. // Otherwise, if the second code point is a U+002E FULL STOP (.)
  128. // and the third code point is a digit, return true.
  129. // Otherwise, return false.
  130. return second === 0x002E && isDigit(third) ? 3 : 0;
  131. }
  132. // U+002E FULL STOP (.)
  133. if (first === 0x002E) {
  134. // If the second code point is a digit, return true. Otherwise, return false.
  135. return isDigit(second) ? 2 : 0;
  136. }
  137. // digit
  138. if (isDigit(first)) {
  139. // Return true.
  140. return 1;
  141. }
  142. // anything else
  143. // Return false.
  144. return 0;
  145. }
//
// Misc
//
  149. // detect BOM (https://en.wikipedia.org/wiki/Byte_order_mark)
  150. export function isBOM(code) {
  151. // UTF-16BE
  152. if (code === 0xFEFF) {
  153. return 1;
  154. }
  155. // UTF-16LE
  156. if (code === 0xFFFE) {
  157. return 1;
  158. }
  159. return 0;
  160. }
  161. // Fast code category
  162. // Only ASCII code points has a special meaning, that's why we define a maps for 0..127 codes only
  163. const CATEGORY = new Array(0x80);
  164. export const EofCategory = 0x80;
  165. export const WhiteSpaceCategory = 0x82;
  166. export const DigitCategory = 0x83;
  167. export const NameStartCategory = 0x84;
  168. export const NonPrintableCategory = 0x85;
  169. for (let i = 0; i < CATEGORY.length; i++) {
  170. CATEGORY[i] =
  171. isWhiteSpace(i) && WhiteSpaceCategory ||
  172. isDigit(i) && DigitCategory ||
  173. isNameStart(i) && NameStartCategory ||
  174. isNonPrintable(i) && NonPrintableCategory ||
  175. i || EofCategory;
  176. }
  177. export function charCodeCategory(code) {
  178. return code < 0x80 ? CATEGORY[code] : NameStartCategory;
  179. }