TokenStream.js 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. import { adoptBuffer } from './adopt-buffer.js';
  2. import { cmpStr } from './utils.js';
  3. import tokenNames from './names.js';
  4. import {
  5. WhiteSpace,
  6. Comment,
  7. Delim,
  8. EOF,
  9. Function as FunctionToken,
  10. LeftParenthesis,
  11. RightParenthesis,
  12. LeftSquareBracket,
  13. RightSquareBracket,
  14. LeftCurlyBracket,
  15. RightCurlyBracket
  16. } from './types.js';
  17. const OFFSET_MASK = 0x00FFFFFF;
  18. const TYPE_SHIFT = 24;
  19. const balancePair = new Uint8Array(32); // 32b of memory ought to be enough for anyone (any number of tokens)
  20. balancePair[FunctionToken] = RightParenthesis;
  21. balancePair[LeftParenthesis] = RightParenthesis;
  22. balancePair[LeftSquareBracket] = RightSquareBracket;
  23. balancePair[LeftCurlyBracket] = RightCurlyBracket;
  24. function isBlockOpenerToken(tokenType) {
  25. return balancePair[tokenType] !== 0;
  26. }
  27. export class TokenStream {
  28. constructor(source, tokenize) {
  29. this.setSource(source, tokenize);
  30. }
  31. reset() {
  32. this.eof = false;
  33. this.tokenIndex = -1;
  34. this.tokenType = 0;
  35. this.tokenStart = this.firstCharOffset;
  36. this.tokenEnd = this.firstCharOffset;
  37. }
  38. setSource(source = '', tokenize = () => {}) {
  39. source = String(source || '');
  40. const sourceLength = source.length;
  41. const offsetAndType = adoptBuffer(this.offsetAndType, source.length + 1); // +1 because of eof-token
  42. const balance = adoptBuffer(this.balance, source.length + 1);
  43. let tokenCount = 0;
  44. let firstCharOffset = -1;
  45. let balanceCloseType = 0;
  46. let balanceStart = source.length;
  47. // capture buffers
  48. this.offsetAndType = null;
  49. this.balance = null;
  50. balance.fill(0);
  51. tokenize(source, (type, start, end) => {
  52. const index = tokenCount++;
  53. // type & offset
  54. offsetAndType[index] = (type << TYPE_SHIFT) | end;
  55. if (firstCharOffset === -1) {
  56. firstCharOffset = start;
  57. }
  58. // balance
  59. balance[index] = balanceStart;
  60. if (type === balanceCloseType) {
  61. const prevBalanceStart = balance[balanceStart];
  62. // set reference to balance end for a block opener
  63. balance[balanceStart] = index;
  64. // pop state
  65. balanceStart = prevBalanceStart;
  66. balanceCloseType = balancePair[offsetAndType[prevBalanceStart] >> TYPE_SHIFT];
  67. } else if (isBlockOpenerToken(type)) { // check for FunctionToken, <(-token>, <[-token> and <{-token>
  68. // push state
  69. balanceStart = index;
  70. balanceCloseType = balancePair[type];
  71. }
  72. });
  73. // finalize buffers
  74. offsetAndType[tokenCount] = (EOF << TYPE_SHIFT) | sourceLength; // <EOF-token>
  75. balance[tokenCount] = tokenCount; // prevents false positive balance match with any token
  76. // reverse references from balance start to end
  77. // tokens
  78. // token: a ( [ b c ] d e ) {
  79. // index: 0 1 2 3 4 5 6 7 8 9
  80. // before
  81. // balance: 0 8 5 2 2 2 1 1 1 0
  82. // - > > < < < < < < -
  83. // after
  84. // balance: 9 8 5 5 5 2 8 8 1 9
  85. // > > > > > < > > < >
  86. for (let i = 0; i < tokenCount; i++) {
  87. const balanceStart = balance[i];
  88. if (balanceStart <= i) {
  89. const balanceEnd = balance[balanceStart];
  90. if (balanceEnd !== i) {
  91. balance[i] = balanceEnd;
  92. }
  93. } else if (balanceStart > tokenCount) {
  94. balance[i] = tokenCount;
  95. }
  96. }
  97. // balance[0] = tokenCount;
  98. this.source = source;
  99. this.firstCharOffset = firstCharOffset === -1 ? 0 : firstCharOffset;
  100. this.tokenCount = tokenCount;
  101. this.offsetAndType = offsetAndType;
  102. this.balance = balance;
  103. this.reset();
  104. this.next();
  105. }
  106. lookupType(offset) {
  107. offset += this.tokenIndex;
  108. if (offset < this.tokenCount) {
  109. return this.offsetAndType[offset] >> TYPE_SHIFT;
  110. }
  111. return EOF;
  112. }
  113. lookupTypeNonSC(idx) {
  114. for (let offset = this.tokenIndex; offset < this.tokenCount; offset++) {
  115. const tokenType = this.offsetAndType[offset] >> TYPE_SHIFT;
  116. if (tokenType !== WhiteSpace && tokenType !== Comment) {
  117. if (idx-- === 0) {
  118. return tokenType;
  119. }
  120. }
  121. }
  122. return EOF;
  123. }
  124. lookupOffset(offset) {
  125. offset += this.tokenIndex;
  126. if (offset < this.tokenCount) {
  127. return this.offsetAndType[offset - 1] & OFFSET_MASK;
  128. }
  129. return this.source.length;
  130. }
  131. lookupOffsetNonSC(idx) {
  132. for (let offset = this.tokenIndex; offset < this.tokenCount; offset++) {
  133. const tokenType = this.offsetAndType[offset] >> TYPE_SHIFT;
  134. if (tokenType !== WhiteSpace && tokenType !== Comment) {
  135. if (idx-- === 0) {
  136. return offset - this.tokenIndex;
  137. }
  138. }
  139. }
  140. return EOF;
  141. }
  142. lookupValue(offset, referenceStr) {
  143. offset += this.tokenIndex;
  144. if (offset < this.tokenCount) {
  145. return cmpStr(
  146. this.source,
  147. this.offsetAndType[offset - 1] & OFFSET_MASK,
  148. this.offsetAndType[offset] & OFFSET_MASK,
  149. referenceStr
  150. );
  151. }
  152. return false;
  153. }
  154. getTokenStart(tokenIndex) {
  155. if (tokenIndex === this.tokenIndex) {
  156. return this.tokenStart;
  157. }
  158. if (tokenIndex > 0) {
  159. return tokenIndex < this.tokenCount
  160. ? this.offsetAndType[tokenIndex - 1] & OFFSET_MASK
  161. : this.offsetAndType[this.tokenCount] & OFFSET_MASK;
  162. }
  163. return this.firstCharOffset;
  164. }
  165. substrToCursor(start) {
  166. return this.source.substring(start, this.tokenStart);
  167. }
  168. isBalanceEdge(pos) {
  169. return this.balance[this.tokenIndex] < pos;
  170. // return this.balance[this.balance[pos]] !== this.tokenIndex;
  171. }
  172. isDelim(code, offset) {
  173. if (offset) {
  174. return (
  175. this.lookupType(offset) === Delim &&
  176. this.source.charCodeAt(this.lookupOffset(offset)) === code
  177. );
  178. }
  179. return (
  180. this.tokenType === Delim &&
  181. this.source.charCodeAt(this.tokenStart) === code
  182. );
  183. }
  184. skip(tokenCount) {
  185. let next = this.tokenIndex + tokenCount;
  186. if (next < this.tokenCount) {
  187. this.tokenIndex = next;
  188. this.tokenStart = this.offsetAndType[next - 1] & OFFSET_MASK;
  189. next = this.offsetAndType[next];
  190. this.tokenType = next >> TYPE_SHIFT;
  191. this.tokenEnd = next & OFFSET_MASK;
  192. } else {
  193. this.tokenIndex = this.tokenCount;
  194. this.next();
  195. }
  196. }
  197. next() {
  198. let next = this.tokenIndex + 1;
  199. if (next < this.tokenCount) {
  200. this.tokenIndex = next;
  201. this.tokenStart = this.tokenEnd;
  202. next = this.offsetAndType[next];
  203. this.tokenType = next >> TYPE_SHIFT;
  204. this.tokenEnd = next & OFFSET_MASK;
  205. } else {
  206. this.eof = true;
  207. this.tokenIndex = this.tokenCount;
  208. this.tokenType = EOF;
  209. this.tokenStart = this.tokenEnd = this.source.length;
  210. }
  211. }
  212. skipSC() {
  213. while (this.tokenType === WhiteSpace || this.tokenType === Comment) {
  214. this.next();
  215. }
  216. }
  217. skipUntilBalanced(startToken, stopConsume) {
  218. let cursor = startToken;
  219. let balanceEnd = 0;
  220. let offset = 0;
  221. loop:
  222. for (; cursor < this.tokenCount; cursor++) {
  223. balanceEnd = this.balance[cursor];
  224. // stop scanning on balance edge that points to offset before start token
  225. if (balanceEnd < startToken) {
  226. break loop;
  227. }
  228. offset = cursor > 0 ? this.offsetAndType[cursor - 1] & OFFSET_MASK : this.firstCharOffset;
  229. // check stop condition
  230. switch (stopConsume(this.source.charCodeAt(offset))) {
  231. case 1: // just stop
  232. break loop;
  233. case 2: // stop & included
  234. cursor++;
  235. break loop;
  236. default:
  237. // fast forward to the end of balanced block for an open block tokens
  238. if (isBlockOpenerToken(this.offsetAndType[cursor] >> TYPE_SHIFT)) {
  239. cursor = balanceEnd;
  240. }
  241. }
  242. }
  243. this.skip(cursor - this.tokenIndex);
  244. }
  245. forEachToken(fn) {
  246. for (let i = 0, offset = this.firstCharOffset; i < this.tokenCount; i++) {
  247. const start = offset;
  248. const item = this.offsetAndType[i];
  249. const end = item & OFFSET_MASK;
  250. const type = item >> TYPE_SHIFT;
  251. offset = end;
  252. fn(type, start, end, i);
  253. }
  254. }
  255. dump() {
  256. const tokens = new Array(this.tokenCount);
  257. this.forEachToken((type, start, end, index) => {
  258. tokens[index] = {
  259. idx: index,
  260. type: tokenNames[type],
  261. chunk: this.source.substring(start, end),
  262. balance: this.balance[index]
  263. };
  264. });
  265. return tokens;
  266. }
  267. };