index.cjs 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. 'use strict';
  2. const types = require('./types.cjs');
  3. const charCodeDefinitions = require('./char-code-definitions.cjs');
  4. const utils = require('./utils.cjs');
  5. const names = require('./names.cjs');
  6. const OffsetToLocation = require('./OffsetToLocation.cjs');
  7. const TokenStream = require('./TokenStream.cjs');
  /**
   * Tokenizes CSS source text following the CSS Syntax Module Level 3
   * "consume a token" algorithm (https://drafts.csswg.org/css-syntax-3/#consume-token).
   *
   * Tokens are not materialized as objects. Each one is reported to `onToken`
   * as a token type plus the start and end offsets it spans in the source.
   * No EOF token is reported: the function returns once every code unit of
   * the input has been consumed.
   *
   * @param {*} source - CSS text. Falsy values are treated as an empty string;
   *   anything else is converted with String().
   * @param {Function} onToken - Called as onToken(type, start, end) for every
   *   token, in source order; the `end` of one token is the `start` of the next.
   * @returns {void}
   */
  function tokenize(source, onToken) {
    // Normalize the input; the nested helpers below close over these bindings.
    source = String(source || '');
    const sourceLength = source.length;

    // NOTE(review): the value returned by isBOM() is used directly as the first
    // token's start offset, so it is presumably the number of code units to skip
    // (0 when there is no BOM) - confirm against char-code-definitions.
    let start = charCodeDefinitions.isBOM(getCharCode(0));
    let offset = start;
    let type;

    // Char code at `index`, or 0 when `index` is at or beyond the end of input.
    function getCharCode(index) {
      if (index < sourceLength) {
        return source.charCodeAt(index);
      }
      return 0;
    }

    // Marks the current token as `tokenType` and consumes exactly one code unit.
    function consumeSingle(tokenType) {
      type = tokenType;
      offset++;
    }

    // Consumes the remnants of a bad url and turns the current token into a <bad-url-token>.
    function markBadUrl() {
      offset = utils.consumeBadUrlRemnants(source, offset);
      type = types.BadUrl;
    }

    // 4.3.3. Consume a numeric token
    function consumeNumericToken() {
      // Consume the number itself first.
      offset = utils.consumeNumber(source, offset);

      const unitFollows = charCodeDefinitions.isIdentifierStart(
        getCharCode(offset),
        getCharCode(offset + 1),
        getCharCode(offset + 2)
      );

      if (unitFollows) {
        // Number followed by an identifier: <dimension-token>, the name is its unit.
        type = types.Dimension;
        offset = utils.consumeName(source, offset);
      } else if (getCharCode(offset) === 0x0025) {
        // Number followed by U+0025 PERCENTAGE SIGN (%): <percentage-token>.
        type = types.Percentage;
        offset++;
      } else {
        // Plain <number-token>.
        type = types.Number;
      }
    }

    // 4.3.4. Consume an ident-like token
    function consumeIdentLikeToken() {
      const nameStart = offset;

      // Consume a name.
      offset = utils.consumeName(source, offset);

      // Anything that is not "url" (ASCII case-insensitive) immediately followed
      // by U+0028 LEFT PARENTHESIS is either a function or a plain identifier.
      if (!utils.cmpStr(source, nameStart, offset, 'url') || getCharCode(offset) !== 0x0028) {
        if (getCharCode(offset) === 0x0028) {
          // name + "(": <function-token>, the parenthesis belongs to the token.
          type = types.Function;
          offset++;
        } else {
          type = types.Ident;
        }
        return;
      }

      // "url(" case: look past the parenthesis and any whitespace after it.
      offset = utils.findWhiteSpaceEnd(source, offset + 1);

      const afterWhiteSpace = getCharCode(offset);

      if (afterWhiteSpace === 0x0022 || afterWhiteSpace === 0x0027) {
        // Quoted argument, e.g. url("foo"): emit only "url(" as a <function-token>
        // and rewind so the whitespace and the string are tokenized separately.
        type = types.Function;
        offset = nameStart + 4;
        return;
      }

      // Unquoted argument: consume a url token.
      consumeUrlToken();
    }

    // 4.3.5. Consume a string token
    // `endingCodePoint` is optional; when it is omitted the code unit at the
    // current offset is consumed and used as the terminator.
    function consumeStringToken(endingCodePoint) {
      if (!endingCodePoint) {
        endingCodePoint = getCharCode(offset++);
      }

      // Start out as a <string-token>; may be downgraded to <bad-string-token> below.
      type = types.String;

      for (; offset < sourceLength; offset++) {
        const current = source.charCodeAt(offset);
        const category = charCodeDefinitions.charCodeCategory(current);

        if (category === endingCodePoint) {
          // Closing quote: include it and return the <string-token>.
          offset++;
          return;
        }

        if (category === charCodeDefinitions.WhiteSpaceCategory) {
          if (charCodeDefinitions.isNewline(current)) {
            // Parse error: unescaped newline inside a string.
            // The newline is included in the resulting <bad-string-token>.
            offset += utils.getNewlineLength(source, offset, current);
            type = types.BadString;
            return;
          }
          continue;
        }

        if (category !== 0x005C) {
          // Anything else simply belongs to the string value.
          continue;
        }

        // U+005C REVERSE SOLIDUS (\)
        if (offset === sourceLength - 1) {
          // Backslash is the last code unit of the input: nothing more to do.
          continue;
        }

        const following = getCharCode(offset + 1);

        if (charCodeDefinitions.isNewline(following)) {
          // Escaped newline: consume it.
          offset += utils.getNewlineLength(source, offset + 1, following);
        } else if (charCodeDefinitions.isValidEscape(current, following)) {
          // Valid escape: consume the escaped code point.
          // The -1 compensates for the loop's own offset++.
          offset = utils.consumeEscaped(source, offset) - 1;
        }
      }
    }

    // 4.3.6. Consume a url token
    // Note: this assumes the initial "url(" has already been consumed and that
    // the value is unquoted, like url(foo). A quoted value, like url("foo"),
    // is handled as a <function-token> by consumeIdentLikeToken().
    function consumeUrlToken() {
      // Start out as a <url-token>; may be downgraded to <bad-url-token> below.
      type = types.Url;

      // Consume as much whitespace as possible.
      offset = utils.findWhiteSpaceEnd(source, offset);

      for (; offset < sourceLength; offset++) {
        const current = source.charCodeAt(offset);

        switch (charCodeDefinitions.charCodeCategory(current)) {
          // U+0029 RIGHT PARENTHESIS ())
          case 0x0029:
            offset++;
            return;

          // whitespace
          case charCodeDefinitions.WhiteSpaceCategory:
            // Consume as much whitespace as possible.
            offset = utils.findWhiteSpaceEnd(source, offset);

            if (offset >= sourceLength) {
              // EOF after whitespace (parse error): return the <url-token> as is.
              return;
            }

            if (getCharCode(offset) === 0x0029) {
              // Whitespace followed by ")": consume it and return the <url-token>.
              offset++;
              return;
            }

            // Anything else after whitespace makes the url bad.
            markBadUrl();
            return;

          // U+0022 QUOTATION MARK (")
          // U+0027 APOSTROPHE (')
          // U+0028 LEFT PARENTHESIS (()
          // non-printable code point
          case 0x0022:
          case 0x0027:
          case 0x0028:
          case charCodeDefinitions.NonPrintableCategory:
            // Parse error.
            markBadUrl();
            return;

          // U+005C REVERSE SOLIDUS (\)
          case 0x005C:
            if (!charCodeDefinitions.isValidEscape(current, getCharCode(offset + 1))) {
              // Parse error: a backslash that does not start a valid escape.
              markBadUrl();
              return;
            }

            // Valid escape: consume it. The -1 compensates for the loop's own offset++.
            offset = utils.consumeEscaped(source, offset) - 1;
            break;

          // anything else belongs to the url value
        }
      }
    }

    // 4.3.1. Consume a token
    while (offset < sourceLength) {
      const code = source.charCodeAt(offset);

      switch (charCodeDefinitions.charCodeCategory(code)) {
        // whitespace
        case charCodeDefinitions.WhiteSpaceCategory:
          // Consume as much whitespace as possible.
          type = types.WhiteSpace;
          offset = utils.findWhiteSpaceEnd(source, offset + 1);
          break;

        // U+0022 QUOTATION MARK (")
        // U+0027 APOSTROPHE (')
        case 0x0022:
        case 0x0027:
          consumeStringToken();
          break;

        // U+0023 NUMBER SIGN (#)
        case 0x0023: {
          const next = getCharCode(offset + 1);

          if (charCodeDefinitions.isName(next) ||
              charCodeDefinitions.isValidEscape(next, getCharCode(offset + 2))) {
            // <hash-token>: "#" followed by a name.
            // TODO: the spec's "id" type flag of the <hash-token> is not tracked.
            type = types.Hash;
            offset = utils.consumeName(source, offset + 1);
          } else {
            consumeSingle(types.Delim);
          }
          break;
        }

        // U+0028 LEFT PARENTHESIS (()
        case 0x0028:
          consumeSingle(types.LeftParenthesis);
          break;

        // U+0029 RIGHT PARENTHESIS ())
        case 0x0029:
          consumeSingle(types.RightParenthesis);
          break;

        // U+002B PLUS SIGN (+)
        // U+002E FULL STOP (.)
        case 0x002B:
        case 0x002E:
          // Either the start of a number or a lone delimiter.
          if (charCodeDefinitions.isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
            consumeNumericToken();
          } else {
            consumeSingle(types.Delim);
          }
          break;

        // U+002C COMMA (,)
        case 0x002C:
          consumeSingle(types.Comma);
          break;

        // U+002D HYPHEN-MINUS (-)
        case 0x002D: {
          const next1 = getCharCode(offset + 1);
          const next2 = getCharCode(offset + 2);

          if (charCodeDefinitions.isNumberStart(code, next1, next2)) {
            // Negative number.
            consumeNumericToken();
          } else if (next1 === 0x002D && next2 === 0x003E) {
            // "-->": <CDC-token>.
            type = types.CDC;
            offset += 3;
          } else if (charCodeDefinitions.isIdentifierStart(code, next1, next2)) {
            // Identifier starting with a hyphen.
            consumeIdentLikeToken();
          } else {
            consumeSingle(types.Delim);
          }
          break;
        }

        // U+002F SOLIDUS (/)
        case 0x002F:
          if (getCharCode(offset + 1) === 0x002A) {
            // Comment: runs up to and including the first "*/", or up to EOF.
            type = types.Comment;
            const closingAt = source.indexOf('*/', offset + 2);
            offset = closingAt === -1 ? sourceLength : closingAt + 2;
          } else {
            consumeSingle(types.Delim);
          }
          break;

        // U+003A COLON (:)
        case 0x003A:
          consumeSingle(types.Colon);
          break;

        // U+003B SEMICOLON (;)
        case 0x003B:
          consumeSingle(types.Semicolon);
          break;

        // U+003C LESS-THAN SIGN (<)
        case 0x003C:
          if (getCharCode(offset + 1) === 0x0021 &&
              getCharCode(offset + 2) === 0x002D &&
              getCharCode(offset + 3) === 0x002D) {
            // "<!--": <CDO-token>.
            type = types.CDO;
            offset += 4;
          } else {
            consumeSingle(types.Delim);
          }
          break;

        // U+0040 COMMERCIAL AT (@)
        case 0x0040:
          if (charCodeDefinitions.isIdentifierStart(
            getCharCode(offset + 1),
            getCharCode(offset + 2),
            getCharCode(offset + 3)
          )) {
            // "@" followed by an identifier: <at-keyword-token>.
            type = types.AtKeyword;
            offset = utils.consumeName(source, offset + 1);
          } else {
            consumeSingle(types.Delim);
          }
          break;

        // U+005B LEFT SQUARE BRACKET ([)
        case 0x005B:
          consumeSingle(types.LeftSquareBracket);
          break;

        // U+005C REVERSE SOLIDUS (\)
        case 0x005C:
          if (charCodeDefinitions.isValidEscape(code, getCharCode(offset + 1))) {
            // A valid escape starts an ident-like token.
            consumeIdentLikeToken();
          } else {
            // Parse error: a lone backslash becomes a <delim-token>.
            consumeSingle(types.Delim);
          }
          break;

        // U+005D RIGHT SQUARE BRACKET (])
        case 0x005D:
          consumeSingle(types.RightSquareBracket);
          break;

        // U+007B LEFT CURLY BRACKET ({)
        case 0x007B:
          consumeSingle(types.LeftCurlyBracket);
          break;

        // U+007D RIGHT CURLY BRACKET (})
        case 0x007D:
          consumeSingle(types.RightCurlyBracket);
          break;

        // digit
        case charCodeDefinitions.DigitCategory:
          consumeNumericToken();
          break;

        // name-start code point
        case charCodeDefinitions.NameStartCategory:
          consumeIdentLikeToken();
          break;

        // anything else
        default:
          consumeSingle(types.Delim);
      }

      // Report the token; the next token starts where this one ended.
      const tokenStart = start;
      start = offset;
      onToken(type, tokenStart, offset);
    }
  }
  // ---------------------------------------------------------------------------
  // Public API of the tokenizer module.
  // Everything below re-exports values from the sibling modules required at the
  // top of this file, plus the tokenize() function defined above.
  // ---------------------------------------------------------------------------

  // Token type identifiers, re-exported one by one from ./types.cjs ...
  exports.AtKeyword = types.AtKeyword;
  exports.BadString = types.BadString;
  exports.BadUrl = types.BadUrl;
  exports.CDC = types.CDC;
  exports.CDO = types.CDO;
  exports.Colon = types.Colon;
  exports.Comma = types.Comma;
  exports.Comment = types.Comment;
  exports.Delim = types.Delim;
  exports.Dimension = types.Dimension;
  exports.EOF = types.EOF;
  exports.Function = types.Function;
  exports.Hash = types.Hash;
  exports.Ident = types.Ident;
  exports.LeftCurlyBracket = types.LeftCurlyBracket;
  exports.LeftParenthesis = types.LeftParenthesis;
  exports.LeftSquareBracket = types.LeftSquareBracket;
  exports.Number = types.Number;
  exports.Percentage = types.Percentage;
  exports.RightCurlyBracket = types.RightCurlyBracket;
  exports.RightParenthesis = types.RightParenthesis;
  exports.RightSquareBracket = types.RightSquareBracket;
  exports.Semicolon = types.Semicolon;
  exports.String = types.String;
  exports.Url = types.Url;
  exports.WhiteSpace = types.WhiteSpace;
  // ... and the whole ./types.cjs module under a single name.
  exports.tokenTypes = types;

  // Char code categories and the category lookup from ./char-code-definitions.cjs.
  exports.DigitCategory = charCodeDefinitions.DigitCategory;
  exports.EofCategory = charCodeDefinitions.EofCategory;
  exports.NameStartCategory = charCodeDefinitions.NameStartCategory;
  exports.NonPrintableCategory = charCodeDefinitions.NonPrintableCategory;
  exports.WhiteSpaceCategory = charCodeDefinitions.WhiteSpaceCategory;
  exports.charCodeCategory = charCodeDefinitions.charCodeCategory;

  // Char code predicates from ./char-code-definitions.cjs.
  exports.isBOM = charCodeDefinitions.isBOM;
  exports.isDigit = charCodeDefinitions.isDigit;
  exports.isHexDigit = charCodeDefinitions.isHexDigit;
  exports.isIdentifierStart = charCodeDefinitions.isIdentifierStart;
  exports.isLetter = charCodeDefinitions.isLetter;
  exports.isLowercaseLetter = charCodeDefinitions.isLowercaseLetter;
  exports.isName = charCodeDefinitions.isName;
  exports.isNameStart = charCodeDefinitions.isNameStart;
  exports.isNewline = charCodeDefinitions.isNewline;
  exports.isNonAscii = charCodeDefinitions.isNonAscii;
  exports.isNonPrintable = charCodeDefinitions.isNonPrintable;
  exports.isNumberStart = charCodeDefinitions.isNumberStart;
  exports.isUppercaseLetter = charCodeDefinitions.isUppercaseLetter;
  exports.isValidEscape = charCodeDefinitions.isValidEscape;
  exports.isWhiteSpace = charCodeDefinitions.isWhiteSpace;

  // Scanning and comparison helpers from ./utils.cjs.
  exports.cmpChar = utils.cmpChar;
  exports.cmpStr = utils.cmpStr;
  exports.consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
  exports.consumeEscaped = utils.consumeEscaped;
  exports.consumeName = utils.consumeName;
  exports.consumeNumber = utils.consumeNumber;
  exports.decodeEscaped = utils.decodeEscaped;
  exports.findDecimalNumberEnd = utils.findDecimalNumberEnd;
  exports.findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
  exports.findWhiteSpaceStart = utils.findWhiteSpaceStart;
  exports.getNewlineLength = utils.getNewlineLength;

  // The ./names.cjs module as a whole.
  exports.tokenNames = names;

  // Supporting classes from their own modules.
  exports.OffsetToLocation = OffsetToLocation.OffsetToLocation;
  exports.TokenStream = TokenStream.TokenStream;

  // The tokenizer entry point defined in this file.
  exports.tokenize = tokenize;