// index.js (20 KB)
  1. import * as TYPE from './types.js';
  2. import {
  3. isNewline,
  4. isName,
  5. isValidEscape,
  6. isNumberStart,
  7. isIdentifierStart,
  8. isBOM,
  9. charCodeCategory,
  10. WhiteSpaceCategory,
  11. DigitCategory,
  12. NameStartCategory,
  13. NonPrintableCategory
  14. } from './char-code-definitions.js';
  15. import {
  16. cmpStr,
  17. getNewlineLength,
  18. findWhiteSpaceEnd,
  19. consumeEscaped,
  20. consumeName,
  21. consumeNumber,
  22. consumeBadUrlRemnants
  23. } from './utils.js';
  /**
   * Split CSS text into tokens, following the "Consume a token" algorithm of
   * CSS Syntax Module Level 3 (§ 4.3.1) and its sub-algorithms.
   *
   * No token objects are created. Each token is reported by calling
   * `onToken(type, start, end)`, where `type` is one of the `TYPE.*` constants and
   * `[start, end)` is the token's range of UTF-16 code unit offsets in the source.
   * Tokens are contiguous: every token starts where the previous one ended.
   * Comments are reported as `TYPE.Comment` tokens, and no EOF token is emitted.
   *
   * @param {*} source CSS text. Falsy values (`null`, `undefined`, `''`, `0`, ...) are
   *     treated as an empty source; any other value is converted with `String()`.
   * @param {function(*, number, number): void} onToken Invoked once per token, in
   *     source order.
   * @returns {void}
   */
  export function tokenize(source, onToken) {
    // Ensure source is a string. Note that `||` maps every falsy value to ''.
    source = String(source || '');

    const sourceLength = source.length;
    // The result of isBOM() is used as the initial offset
    // (presumably 1 to skip a leading BOM and 0 otherwise).
    let start = isBOM(getCharCode(0));
    // Current scan position; the helpers below advance it past the token they consume.
    let offset = start;
    // Type of the token being consumed; set by the helpers below and by the main loop.
    let type;

    // Char code at `offset`, or 0 (which stands for EOF) when `offset` is past the end.
    function getCharCode(offset) {
      return offset < sourceLength ? source.charCodeAt(offset) : 0;
    }

    // § 4.3.3. Consume a numeric token
    function consumeNumericToken() {
      // Consume a number and let number be the result.
      offset = consumeNumber(source, offset);

      // If the next 3 input code points would start an identifier, then:
      if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) {
        // Create a <dimension-token> with the same value and type flag as number, and a unit set initially to the empty string.
        // Consume a name. Set the <dimension-token>’s unit to the returned value.
        // Return the <dimension-token>.
        type = TYPE.Dimension;
        offset = consumeName(source, offset);
        return;
      }

      // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
      if (getCharCode(offset) === 0x0025) {
        // Create a <percentage-token> with the same value as number, and return it.
        type = TYPE.Percentage;
        offset++;
        return;
      }

      // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
      type = TYPE.Number;
    }

    // § 4.3.4. Consume an ident-like token
    function consumeIdentLikeToken() {
      const nameStartOffset = offset;

      // Consume a name, and let string be the result.
      offset = consumeName(source, offset);

      // If string’s value is an ASCII case-insensitive match for "url",
      // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
      if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) {
        // While the next two input code points are whitespace, consume the next input code point.
        offset = findWhiteSpaceEnd(source, offset + 1);

        // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
        // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('),
        // then create a <function-token> with its value set to string and return it.
        if (getCharCode(offset) === 0x0022 || getCharCode(offset) === 0x0027) {
          type = TYPE.Function;
          // Rewind to just after "url(" so that any whitespace and the quoted
          // string are emitted as their own tokens.
          offset = nameStartOffset + 4;
          return;
        }

        // Otherwise, consume a url token, and return it.
        consumeUrlToken();
        return;
      }

      // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
      // Create a <function-token> with its value set to string and return it.
      if (getCharCode(offset) === 0x0028) {
        type = TYPE.Function;
        offset++;
        return;
      }

      // Otherwise, create an <ident-token> with its value set to string and return it.
      type = TYPE.Ident;
    }

    // § 4.3.5. Consume a string token
    function consumeStringToken(endingCodePoint) {
      // This algorithm may be called with an ending code point, which denotes the code point
      // that ends the string. If an ending code point is not specified,
      // the current input code point is used.
      if (!endingCodePoint) {
        endingCodePoint = getCharCode(offset++);
      }

      // Initially create a <string-token> with its value set to the empty string.
      type = TYPE.String;

      // Repeatedly consume the next input code point from the stream.
      // Reaching EOF (the end of the loop) is a parse error; the <string-token> is returned as is.
      for (; offset < sourceLength; offset++) {
        const code = source.charCodeAt(offset);

        switch (charCodeCategory(code)) {
          // ending code point
          case endingCodePoint:
            // Return the <string-token>.
            offset++;
            return;

          // newline
          case WhiteSpaceCategory:
            if (isNewline(code)) {
              // This is a parse error: create a <bad-string-token> and return it.
              // The newline itself is included in the token's range.
              offset += getNewlineLength(source, offset, code);
              type = TYPE.BadString;
              return;
            }
            break;

          // U+005C REVERSE SOLIDUS (\)
          case 0x005C: {
            // If the next input code point is EOF, do nothing.
            if (offset === sourceLength - 1) {
              break;
            }

            const nextCode = getCharCode(offset + 1);

            // Otherwise, if the next input code point is a newline, consume it.
            if (isNewline(nextCode)) {
              offset += getNewlineLength(source, offset + 1, nextCode);
            } else if (isValidEscape(code, nextCode)) {
              // Otherwise, (the stream starts with a valid escape) consume
              // an escaped code point and append the returned code point to
              // the <string-token>’s value.
              // The `- 1` compensates for the loop's own `offset++`.
              offset = consumeEscaped(source, offset) - 1;
            }
            break;
          }

          // anything else
          // Append the current input code point to the <string-token>’s value.
        }
      }
    }

    // § 4.3.6. Consume a url token
    // Note: This algorithm assumes that the initial "url(" has already been consumed.
    // This algorithm also assumes that it’s being called to consume an "unquoted" value, like url(foo).
    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
    // automatically handles this distinction; this algorithm shouldn’t be called directly otherwise.
    function consumeUrlToken() {
      // Initially create a <url-token> with its value set to the empty string.
      type = TYPE.Url;

      // Consume as much whitespace as possible.
      offset = findWhiteSpaceEnd(source, offset);

      // Repeatedly consume the next input code point from the stream.
      // Reaching EOF (the end of the loop) is a parse error; the <url-token> is returned as is.
      for (; offset < sourceLength; offset++) {
        const code = source.charCodeAt(offset);

        switch (charCodeCategory(code)) {
          // U+0029 RIGHT PARENTHESIS ())
          case 0x0029:
            // Return the <url-token>.
            offset++;
            return;

          // whitespace
          case WhiteSpaceCategory:
            // Consume as much whitespace as possible.
            offset = findWhiteSpaceEnd(source, offset);

            // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
            // consume it and return the <url-token>
            // (if EOF was encountered, this is a parse error);
            if (getCharCode(offset) === 0x0029 || offset >= sourceLength) {
              if (offset < sourceLength) {
                offset++;
              }
              return;
            }

            // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
            // and return it.
            offset = consumeBadUrlRemnants(source, offset);
            type = TYPE.BadUrl;
            return;

          // U+0022 QUOTATION MARK (")
          // U+0027 APOSTROPHE (')
          // U+0028 LEFT PARENTHESIS (()
          // non-printable code point
          case 0x0022:
          case 0x0027:
          case 0x0028:
          case NonPrintableCategory:
            // This is a parse error. Consume the remnants of a bad url,
            // create a <bad-url-token>, and return it.
            offset = consumeBadUrlRemnants(source, offset);
            type = TYPE.BadUrl;
            return;

          // U+005C REVERSE SOLIDUS (\)
          case 0x005C:
            // If the stream starts with a valid escape, consume an escaped code point and
            // append the returned code point to the <url-token>’s value.
            if (isValidEscape(code, getCharCode(offset + 1))) {
              // The `- 1` compensates for the loop's own `offset++`.
              offset = consumeEscaped(source, offset) - 1;
              break;
            }

            // Otherwise, this is a parse error. Consume the remnants of a bad url,
            // create a <bad-url-token>, and return it.
            offset = consumeBadUrlRemnants(source, offset);
            type = TYPE.BadUrl;
            return;

          // anything else
          // Append the current input code point to the <url-token>’s value.
        }
      }
    }

    // https://drafts.csswg.org/css-syntax-3/#consume-token
    // § 4.3.1. Consume a token
    while (offset < sourceLength) {
      const code = source.charCodeAt(offset);

      switch (charCodeCategory(code)) {
        // whitespace
        case WhiteSpaceCategory:
          // Consume as much whitespace as possible. Return a <whitespace-token>.
          type = TYPE.WhiteSpace;
          offset = findWhiteSpaceEnd(source, offset + 1);
          break;

        // U+0022 QUOTATION MARK (")
        case 0x0022:
          // Consume a string token and return it.
          consumeStringToken();
          break;

        // U+0023 NUMBER SIGN (#)
        case 0x0023:
          // If the next input code point is a name code point or the next two input code points are a valid escape, then:
          if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
            // Create a <hash-token>.
            // (The spec's "id" type flag is not tracked here.)
            type = TYPE.Hash;

            // Consume a name, and set the <hash-token>’s value to the returned string.
            offset = consumeName(source, offset + 1);

            // Return the <hash-token>.
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+0027 APOSTROPHE (')
        case 0x0027:
          // Consume a string token and return it.
          consumeStringToken();
          break;

        // U+0028 LEFT PARENTHESIS (()
        case 0x0028:
          // Return a <(-token>.
          type = TYPE.LeftParenthesis;
          offset++;
          break;

        // U+0029 RIGHT PARENTHESIS ())
        case 0x0029:
          // Return a <)-token>.
          type = TYPE.RightParenthesis;
          offset++;
          break;

        // U+002B PLUS SIGN (+)
        case 0x002B:
          // If the input stream starts with a number, ...
          if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
            // ... reconsume the current input code point, consume a numeric token, and return it.
            consumeNumericToken();
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+002C COMMA (,)
        case 0x002C:
          // Return a <comma-token>.
          type = TYPE.Comma;
          offset++;
          break;

        // U+002D HYPHEN-MINUS (-)
        case 0x002D:
          if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
            // If the input stream starts with a number, reconsume the current input code point,
            // consume a numeric token, and return it.
            consumeNumericToken();
          } else if (getCharCode(offset + 1) === 0x002D && getCharCode(offset + 2) === 0x003E) {
            // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS
            // U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
            type = TYPE.CDC;
            offset += 3;
          } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
            // Otherwise, if the input stream starts with an identifier, reconsume the current
            // input code point, consume an ident-like token, and return it.
            consumeIdentLikeToken();
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+002E FULL STOP (.)
        case 0x002E:
          // If the input stream starts with a number, ...
          if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
            // ... reconsume the current input code point, consume a numeric token, and return it.
            consumeNumericToken();
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+002F SOLIDUS (/)
        case 0x002F:
          // If the current U+002F SOLIDUS (/) is followed by a U+002A ASTERISK (*),
          if (getCharCode(offset + 1) === 0x002A) {
            // ... consume them and all following code points up to and including the first U+002A ASTERISK (*)
            // followed by a U+002F SOLIDUS (/), or up to an EOF code point.
            type = TYPE.Comment;
            offset = source.indexOf('*/', offset + 2);
            offset = offset === -1 ? sourceLength : offset + 2;
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+003A COLON (:)
        case 0x003A:
          // Return a <colon-token>.
          type = TYPE.Colon;
          offset++;
          break;

        // U+003B SEMICOLON (;)
        case 0x003B:
          // Return a <semicolon-token>.
          type = TYPE.Semicolon;
          offset++;
          break;

        // U+003C LESS-THAN SIGN (<)
        case 0x003C:
          // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
          if (
            getCharCode(offset + 1) === 0x0021 &&
            getCharCode(offset + 2) === 0x002D &&
            getCharCode(offset + 3) === 0x002D
          ) {
            // ... consume them and return a <CDO-token>.
            type = TYPE.CDO;
            offset += 4;
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+0040 COMMERCIAL AT (@)
        case 0x0040:
          // If the next 3 input code points would start an identifier, ...
          if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
            // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.
            type = TYPE.AtKeyword;
            offset = consumeName(source, offset + 1);
          } else {
            // Otherwise, return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+005B LEFT SQUARE BRACKET ([)
        case 0x005B:
          // Return a <[-token>.
          type = TYPE.LeftSquareBracket;
          offset++;
          break;

        // U+005C REVERSE SOLIDUS (\)
        case 0x005C:
          // If the input stream starts with a valid escape, ...
          if (isValidEscape(code, getCharCode(offset + 1))) {
            // ... reconsume the current input code point, consume an ident-like token, and return it.
            consumeIdentLikeToken();
          } else {
            // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
            type = TYPE.Delim;
            offset++;
          }
          break;

        // U+005D RIGHT SQUARE BRACKET (])
        case 0x005D:
          // Return a <]-token>.
          type = TYPE.RightSquareBracket;
          offset++;
          break;

        // U+007B LEFT CURLY BRACKET ({)
        case 0x007B:
          // Return a <{-token>.
          type = TYPE.LeftCurlyBracket;
          offset++;
          break;

        // U+007D RIGHT CURLY BRACKET (})
        case 0x007D:
          // Return a <}-token>.
          type = TYPE.RightCurlyBracket;
          offset++;
          break;

        // digit
        case DigitCategory:
          // Reconsume the current input code point, consume a numeric token, and return it.
          consumeNumericToken();
          break;

        // name-start code point
        case NameStartCategory:
          // Reconsume the current input code point, consume an ident-like token, and return it.
          consumeIdentLikeToken();
          break;

        // EOF never reaches this switch: the loop condition stops at the end of
        // the source, so no <EOF-token> is produced.

        // anything else
        default:
          // Return a <delim-token> with its value set to the current input code point.
          type = TYPE.Delim;
          offset++;
      }

      // Put the token to the stream; the next token starts where this one ended.
      onToken(type, start, offset);
      start = offset;
    }
  }
  437. export * from './types.js';
  438. export * as tokenTypes from './types.js';
  439. export { default as tokenNames } from './names.js';
  440. export * from './char-code-definitions.js';
  441. export * from './utils.js';
  442. export * from './OffsetToLocation.js';
  443. export * from './TokenStream.js';