index.js 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. 'use strict';
  /**
   * Tell whether `vhost` matches the end of `hostname` on a label boundary.
   *
   * `vhost` has to be a suffix of `hostname`, and that suffix must either cover
   * the whole hostname or begin right after a '.', so that a partial label is
   * never accepted:
   *
   * * hostname = 'not.evil.com' and vhost = 'vil.com'      => false
   * * hostname = 'not.evil.com' and vhost = 'evil.com'     => true
   * * hostname = 'not.evil.com' and vhost = 'not.evil.com' => true
   *
   * @param hostname - hostname being inspected.
   * @param vhost - candidate suffix.
   * @returns `true` when `vhost` is a label-aligned suffix of `hostname`.
   */
  function shareSameDomainSuffix(hostname, vhost) {
    if (!hostname.endsWith(vhost)) {
      return false;
    }
    // Index in `hostname` at which the `vhost` suffix begins.
    const boundary = hostname.length - vhost.length;
    return boundary === 0 || hostname[boundary - 1] === '.';
  }
  /**
   * Given a hostname and its public suffix, extract the general domain, i.e.
   * the public suffix plus the single label that precedes it.
   *
   * The search looks for the last '.' located before the label that sits
   * directly in front of the public suffix:
   *
   *   not.evil.co.uk  (suffix 'co.uk') => 'evil.co.uk'
   *   example.co.uk   (suffix 'co.uk') => 'example.co.uk' (no earlier dot)
   *
   * @param hostname - full hostname.
   * @param publicSuffix - public suffix of `hostname`.
   * @returns the general domain, or `hostname` itself when it has no sub-domain.
   */
  function extractDomainWithSuffix(hostname, publicSuffix) {
    // Skip the suffix and the '.' in front of it, then search backwards.
    const searchFrom = hostname.length - publicSuffix.length - 2;
    const separatorIndex = hostname.lastIndexOf('.', searchFrom);
    // No separator: `hostname` already is the general domain.
    return separatorIndex === -1
      ? hostname
      : hostname.slice(separatorIndex + 1);
  }
  /**
   * Determine the general domain of `hostname` given its public suffix.
   *
   * @param suffix - public suffix of `hostname` (expected to be a suffix of it,
   *   which is why only lengths are compared below).
   * @param hostname - hostname to extract the domain from.
   * @param options - options object; only `validHosts` is read here. It must be
   *   either `null` or an iterable of host strings.
   * @returns the matching valid host, the general domain, or `null` when
   *   `hostname` (ignoring leading dots) is itself the public suffix.
   */
  function getDomain(suffix, hostname, options) {
    // A configured valid host wins over the public-suffix based extraction.
    const { validHosts } = options;
    if (validHosts !== null) {
      for (const vhost of validHosts) {
        if (/*@__INLINE__*/ shareSameDomainSuffix(hostname, vhost)) {
          return vhost;
        }
      }
    }

    // Count the '.' characters found at the very beginning of `hostname`.
    let leadingDots = 0;
    while (leadingDots < hostname.length && hostname[leadingDots] === '.') {
      leadingDots += 1;
    }

    // When nothing but the public suffix remains, there is no domain to return.
    // Comparing sizes is enough given `suffix` is a suffix of `hostname`.
    if (suffix.length === hostname.length - leadingDots) {
      return null;
    }

    // The general domain is the public suffix extended by one label
    // (e.g. 'not.evil.co.uk' with suffix 'co.uk' gives 'evil.co.uk').
    return /*@__INLINE__*/ extractDomainWithSuffix(hostname, suffix);
  }
  /**
   * Strip the public suffix (and the '.' preceding it) from a domain.
   *
   * Example: for domain 'foo.com' and suffix 'com', the result is 'foo'.
   *
   * @param domain - general domain.
   * @param suffix - public suffix of `domain`; callers are expected to pass a
   *   suffix strictly shorter than `domain`.
   * @returns `domain` without its trailing '.suffix' part.
   */
  function getDomainWithoutSuffix(domain, suffix) {
    // Number of trailing characters to drop: the suffix plus its leading dot.
    const trailingLength = suffix.length + 1;
    return domain.slice(0, -trailingLength);
  }
  /**
   * Extract the hostname part of `url`.
   *
   * When `urlIsValidHostname` is falsy the input is parsed as a URL: whitespace
   * is trimmed, the scheme (or a leading '//') is skipped, credentials and port
   * are removed and the host is cut at the first '/', '?' or '#'. Whatever the
   * hint, trailing dots are trimmed from the result.
   *
   * @param url - URL we want to extract a hostname from.
   * @param urlIsValidHostname - hint from caller; true if `url` is already a valid hostname.
   * @returns the hostname (lower-cased when upper-case ASCII letters were seen
   *   while scanning), the content of a bracketed IPv6 literal, or `null` for
   *   `data:` URLs, invalid schemes and unterminated IPv6 literals.
   */
  function extractHostname(url, urlIsValidHostname) {
    let start = 0;
    let end = url.length;
    let sawUpperCase = false;

    // Full URL parsing is only needed when the caller could not vouch for `url`.
    if (!urlIsValidHostname) {
      // data: URLs have no hostname.
      if (url.startsWith('data:')) {
        return null;
      }

      // Skip leading, then trailing, characters with a code <= 32 (spaces and
      // control characters).
      while (start < url.length && url.charCodeAt(start) <= 32) {
        start += 1;
      }
      while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
        end -= 1;
      }

      if (
        url.charCodeAt(start) === 47 /* '/' */ &&
        url.charCodeAt(start + 1) === 47 /* '/' */
      ) {
        // Scheme-less URL starting with '//'.
        start += 2;
      } else {
        const schemeEnd = url.indexOf(':/', start);
        if (schemeEnd !== -1) {
          // Fast path: the most common schemes are accepted without running
          // the per-character validation below.
          const schemeLength = schemeEnd - start;
          const isCommonScheme =
            (schemeLength === 5 && url.startsWith('https', start)) ||
            (schemeLength === 4 && url.startsWith('http', start)) ||
            (schemeLength === 3 && url.startsWith('wss', start)) ||
            (schemeLength === 2 && url.startsWith('ws', start));

          if (!isCommonScheme) {
            // Every scheme character (after OR-ing with 32, which lower-cases
            // ASCII letters) must be a letter, a digit, '.', '-' or '+'.
            for (let i = start; i < schemeEnd; i += 1) {
              const lowered = url.charCodeAt(i) | 32;
              const isLetter = lowered >= 97 && lowered <= 122;
              const isDigit = lowered >= 48 && lowered <= 57;
              const isPunctuation =
                lowered === 46 /* '.' */ ||
                lowered === 45 /* '-' */ ||
                lowered === 43; /* '+' */
              if (!isLetter && !isDigit && !isPunctuation) {
                return null;
              }
            }
          }

          // Move past ':/' and any number of additional '/'.
          start = schemeEnd + 2;
          while (url.charCodeAt(start) === 47 /* '/' */) {
            start += 1;
          }
        }
      }

      // Single pass over the authority: stop at the first '/', '?' or '#' and
      // remember the last '@', ']' and ':' for the credentials, IPv6 and port
      // handling that follows.
      let atIndex = -1;
      let closingBracketIndex = -1;
      let colonIndex = -1;
      scan: for (let i = start; i < end; i += 1) {
        const code = url.charCodeAt(i);
        switch (code) {
          case 35: // '#'
          case 47: // '/'
          case 63: // '?'
            end = i;
            break scan;
          case 64: // '@'
            atIndex = i;
            break;
          case 93: // ']'
            closingBracketIndex = i;
            break;
          case 58: // ':'
            colonIndex = i;
            break;
          default:
            if (code >= 65 && code <= 90) {
              // 'A'-'Z'
              sawUpperCase = true;
            }
        }
      }

      // Drop the 'user:password@' part when present.
      if (atIndex > start && atIndex < end) {
        start = atIndex + 1;
      }

      if (url.charCodeAt(start) === 91 /* '[' */) {
        // Bracketed IPv6 literal: return what lies between '[' and the last ']'.
        return closingBracketIndex === -1
          ? null
          : url.slice(start + 1, closingBracketIndex).toLowerCase();
      }

      // Drop the ':port' part when present.
      if (colonIndex > start && colonIndex < end) {
        end = colonIndex;
      }
    }

    // Trim trailing dots (always keeping at least one character).
    while (end > start + 1 && url.charCodeAt(end - 1) === 46 /* '.' */) {
      end -= 1;
    }

    // Only slice when the boundaries actually moved.
    let hostname = url;
    if (start !== 0 || end !== url.length) {
      hostname = url.slice(start, end);
    }
    return sawUpperCase ? hostname.toLowerCase() : hostname;
  }
  /**
   * Check if a hostname looks like an IPv4 address. This is a shallow check: it
   * only verifies the length, that the string is made of digits and exactly
   * three dots, and that it neither starts nor ends with a dot. It is meant to
   * be used on strings already known to be valid hostnames.
   *
   * @param hostname - hostname to test.
   * @returns `true` if `hostname` has the shape of an IPv4 address.
   */
  function isProbablyIpv4(hostname) {
    const size = hostname.length;
    // Shortest form is '1.1.1.1' (7), longest is '255.255.255.255' (15).
    if (size < 7 || size > 15) {
      return false;
    }
    // A dot is never allowed at either end.
    if (
      hostname.charCodeAt(0) === 46 /* '.' */ ||
      hostname.charCodeAt(size - 1) === 46 /* '.' */
    ) {
      return false;
    }
    let dotCount = 0;
    for (let i = 0; i < size; i += 1) {
      const code = hostname.charCodeAt(i);
      if (code === 46 /* '.' */) {
        dotCount += 1;
        continue;
      }
      const isDigit = code >= 48 /* '0' */ && code <= 57; /* '9' */
      if (!isDigit) {
        return false;
      }
    }
    return dotCount === 3;
  }
  /**
   * Check if a hostname looks like an IPv6 address. Like `isProbablyIpv4`, this
   * is a shallow check: the string (optionally wrapped in '[' and ']') must be
   * made only of hexadecimal digits and ':' and contain at least one ':'.
   *
   * Inputs shorter than 3 characters are rejected, as are inputs longer than a
   * plain IPv6 address (39 characters between the brackets).
   *
   * @param hostname - hostname to test, with or without surrounding brackets.
   * @returns `true` if `hostname` has the shape of an IPv6 address.
   */
  function isProbablyIpv6(hostname) {
    if (hostname.length < 3) {
      return false;
    }
    // Ignore the optional enclosing brackets.
    let start = hostname.startsWith('[') ? 1 : 0;
    let end = hostname.length;
    if (hostname[end - 1] === ']') {
      end -= 1;
    }
    // We only consider the maximum size of a normal IPV6. Note that this will
    // fail on so-called "IPv4 mapped IPv6 addresses" but this is a corner-case
    // and a proper validation library should be used for these.
    if (end - start > 39) {
      return false;
    }
    let hasColon = false;
    for (; start < end; start += 1) {
      const code = hostname.charCodeAt(start);
      if (code === 58 /* ':' */) {
        hasColon = true;
      } else if (
        !(
          (code >= 48 && code <= 57) || // 0-9
          (code >= 97 && code <= 102) || // a-f
          (code >= 65 && code <= 70) // A-F (only hexadecimal letters, not A-Z)
        )
      ) {
        return false;
      }
    }
    return hasColon;
  }
  /**
   * Check if `hostname` is *probably* an IP address, trying the IPv6 shape
   * first and the IPv4 shape second. Both checks are shallow, so this is only
   * meaningful when `hostname` is already a valid hostname; it *will not* give
   * reliable answers on arbitrary strings.
   *
   * @param hostname - hostname to test.
   * @returns `true` if `hostname` looks like an IPv6 or IPv4 address.
   */
  function isIp(hostname) {
    if (isProbablyIpv6(hostname)) {
      return true;
    }
    return isProbablyIpv4(hostname);
  }
  /**
   * Character-level helper of the fast, shallow hostname verification. That
   * verification does not perform a strict check on the content of labels
   * (classes of Unicode characters, etc.) but instead checks that the structure
   * is valid (number of labels, length of labels, etc.). If you need stricter
   * validation, consider using an external library.
   *
   * Despite its name, this accepts any code above 127 in addition to the ASCII
   * lower-case letters and digits; upper-case letters are rejected.
   *
   * @param code - character code to test.
   * @returns `true` for 'a'-'z', '0'-'9' and any code greater than 127.
   */
  function isValidAscii(code) {
    // Anything outside of the ASCII range is accepted as-is.
    if (code > 127) {
      return true;
    }
    const isLowerCaseLetter = code >= 97 && code <= 122;
    const isDigit = code >= 48 && code <= 57;
    return isLowerCaseLetter || isDigit;
  }
  /**
   * Check if a hostname string is structurally valid. It's usually a
   * preliminary check before trying to use getDomain or anything else.
   *
   * The check covers: total length (1 to 255), label length (at most 63),
   * allowed characters ('a'-'z', '0'-'9', codes above 127, '-', '_' and '.'),
   * no empty label between two dots, no label ending with '-' or '_' right
   * before a dot, and no trailing '-'.
   *
   * Beware: it does not check if the TLD exists.
   *
   * @param hostname - hostname to validate.
   * @returns `true` if `hostname` passes all the structural checks.
   */
  function isValidHostname(hostname) {
    const len = hostname.length;
    if (len === 0 || len > 255) {
      return false;
    }

    // First character: a regular hostname character, '.' or '_'.
    const firstCode = hostname.charCodeAt(0);
    if (
      firstCode !== 46 /* '.' */ &&
      firstCode !== 95 /* '_' */ &&
      !(/*@__INLINE__*/ isValidAscii(firstCode))
    ) {
      return false;
    }

    let previousDotIndex = -1;
    let previousCode = -1;
    for (let i = 0; i < len; i += 1) {
      const code = hostname.charCodeAt(i);
      if (code === 46 /* '.' */) {
        // The label that just ended must be at most 63 characters long
        // (64 = 63 + '.').
        const labelTooLong = i - previousDotIndex > 64;
        // It must be non-empty and must not end with '-' or '_'.
        const badLabelEnd =
          previousCode === 46 /* '.' */ ||
          previousCode === 45 /* '-' */ ||
          previousCode === 95; /* '_' */
        if (labelTooLong || badLabelEnd) {
          return false;
        }
        previousDotIndex = i;
      } else {
        const isAllowed =
          /*@__INLINE__*/ isValidAscii(code) ||
          code === 45 /* '-' */ ||
          code === 95; /* '_' */
        if (!isAllowed) {
          // Forbidden character in the label.
          return false;
        }
      }
      previousCode = code;
    }

    // The last label must be at most 63 characters long.
    if (len - previousDotIndex - 1 > 63) {
      return false;
    }
    // Every character was already validated above, so the only trailing
    // character left to reject is '-'.
    return previousCode !== 45; /* '-' */
  }
  /**
   * Build a complete options object, filling in the default value of every
   * option that is missing (or `undefined`) in the given partial options.
   *
   * @param options - partial options; must be an object.
   * @returns a new object holding a value for every supported option.
   */
  function setDefaultsImpl({
    allowIcannDomains = true,
    allowPrivateDomains = false,
    detectIp = true,
    extractHostname = true,
    mixedInputs = true,
    validHosts = null,
    validateHostname = true,
  }) {
    return {
      allowIcannDomains,
      allowPrivateDomains,
      detectIp,
      extractHostname,
      mixedInputs,
      validHosts,
      validateHostname,
    };
  }
  // Options used whenever the caller does not provide any; computed once and
  // shared between calls.
  const DEFAULT_OPTIONS = /*@__INLINE__*/ setDefaultsImpl({});
  /**
   * Resolve user-provided options into a complete options object.
   *
   * @param options - partial options, or `undefined`/`null` to use the defaults.
   *   `null` is accepted because the library may be called from untyped code;
   *   it previously caused a destructuring TypeError.
   * @returns the shared default options object when no options are given,
   *   otherwise a new object combining `options` with the defaults.
   */
  function setDefaults(options) {
    if (options === undefined || options === null) {
      return DEFAULT_OPTIONS;
    }
    return /*@__INLINE__*/ setDefaultsImpl(options);
  }
  /**
   * Return the sub-domain part of a hostname, i.e. what precedes `domain` and
   * the '.' in front of it.
   *
   * @param hostname - full hostname.
   * @param domain - general domain of `hostname`.
   * @returns the sub-domain, or an empty string when `hostname` and `domain`
   *   have the same length (no sub-domain).
   */
  function getSubdomain(hostname, domain) {
    const hasSubdomain = domain.length !== hostname.length;
    // Drop the domain together with the dot separating it from the sub-domain.
    return hasSubdomain ? hostname.slice(0, -domain.length - 1) : '';
  }
  // Note: this part of the file implements a factory allowing to plug different
  // implementations of suffix lookup (e.g.: using a trie or the packed hashes
  // datastructures). It is used and exposed in the `tldts.ts` and
  // `tldts-experimental.ts` bundle entrypoints.

  /**
   * Create a fresh result object in which every field is set to `null`.
   *
   * @returns a new result object (a distinct object on every call).
   */
  function getEmptyResult() {
    const emptyResult = {
      domain: null,
      domainWithoutSuffix: null,
      hostname: null,
      isIcann: null,
      isIp: null,
      isPrivate: null,
      publicSuffix: null,
      subdomain: null,
    };
    return emptyResult;
  }
  // Fields of a result object, in the order in which they are reset.
  const RESULT_FIELDS = [
    'domain',
    'domainWithoutSuffix',
    'hostname',
    'isIcann',
    'isIp',
    'isPrivate',
    'publicSuffix',
    'subdomain',
  ];
  /**
   * Reset every field of an existing result object to `null`, in place, so
   * that the object can be reused. Properties other than the result fields are
   * left untouched.
   *
   * @param result - result object to clear (mutated).
   */
  function resetResult(result) {
    for (const field of RESULT_FIELDS) {
      result[field] = null;
    }
  }
  /**
   * Parse `url` and fill `result` step by step, stopping as soon as the
   * requested `step` has been computed or a step yields nothing.
   *
   * @param url - URL or hostname to parse. A non-string value leaves `result`
   *   untouched.
   * @param step - last piece of information the caller needs: 0 (hostname),
   *   2 (public suffix), 3 (domain), 4 (sub-domain); any other value runs every
   *   step, including `domainWithoutSuffix`.
   * @param suffixLookup - function called as `suffixLookup(hostname, options,
   *   result)`; expected to set `result.publicSuffix` (and related fields).
   * @param partialOptions - user options, completed by `setDefaults`.
   * @param result - result object to fill (mutated).
   * @returns the same `result` object.
   */
  function parseImpl(url, step, suffixLookup, partialOptions, result) {
    // Values of `step` at which parsing stops.
    const STEP_HOSTNAME = 0;
    const STEP_PUBLIC_SUFFIX = 2;
    const STEP_DOMAIN = 3;
    const STEP_SUB_DOMAIN = 4;

    const options = /*@__INLINE__*/ setDefaults(partialOptions);

    // Very fast approximate check to make sure `url` is a string: the library
    // may be used without types and be handed arbitrary values.
    if (typeof url !== 'string') {
      return result;
    }

    // Hostname extraction can be disabled with `options.extractHostname` when
    // inputs are known to be hostnames already. `options.mixedInputs` tells
    // whether inputs can be a mix of URLs and hostnames; when it is set, `url`
    // is first tested as a hostname and the outcome is given to
    // `extractHostname` as a hint.
    if (options.extractHostname) {
      const isAlreadyHostname = options.mixedInputs
        ? isValidHostname(url)
        : false;
      result.hostname = extractHostname(url, isAlreadyHostname);
    } else {
      result.hostname = url;
    }

    // IP addresses have no public suffix or domain: stop right away.
    if (options.detectIp && result.hostname !== null) {
      result.isIp = isIp(result.hostname);
      if (result.isIp) {
        return result;
      }
    }

    // Optional hostname validation. It runs after the IP short-circuit above
    // but before the `step` check below, so hostname-only requests are
    // validated too. An invalid hostname is reported as `null`.
    const mustValidate =
      options.validateHostname &&
      options.extractHostname &&
      result.hostname !== null;
    if (mustValidate && !isValidHostname(result.hostname)) {
      result.hostname = null;
      return result;
    }

    if (step === STEP_HOSTNAME || result.hostname === null) {
      return result;
    }

    // Public suffix
    suffixLookup(result.hostname, options, result);
    if (step === STEP_PUBLIC_SUFFIX || result.publicSuffix === null) {
      return result;
    }

    // Domain
    result.domain = getDomain(result.publicSuffix, result.hostname, options);
    if (step === STEP_DOMAIN || result.domain === null) {
      return result;
    }

    // Sub-domain
    result.subdomain = getSubdomain(result.hostname, result.domain);
    if (step === STEP_SUB_DOMAIN) {
      return result;
    }

    // Domain without suffix
    result.domainWithoutSuffix = getDomainWithoutSuffix(
      result.domain,
      result.publicSuffix,
    );
    return result;
  }
  /**
   * Fast path for very popular suffixes; it allows by-passing the suffix lookup
   * completely. Only used when private domains are not allowed and `hostname`
   * is longer than 3 characters.
   *
   * Recognized endings: '.com', '.org', '.edu', '.gov', '.net' and '.de'.
   *
   * @param hostname - hostname to inspect.
   * @param options - options object; only `allowPrivateDomains` is read.
   * @param out - result object; on a match `isIcann` is set to `true`,
   *   `isPrivate` to `false` and `publicSuffix` to the matched suffix.
   * @returns `true` when a suffix was recognized and `out` was filled,
   *   `false` otherwise (`out` is then left untouched).
   */
  function fastPath(hostname, options, out) {
    if (options.allowPrivateDomains || hostname.length <= 3) {
      return false;
    }

    let suffix;
    if (hostname.endsWith('.com')) {
      suffix = 'com';
    } else if (hostname.endsWith('.org')) {
      suffix = 'org';
    } else if (hostname.endsWith('.edu')) {
      suffix = 'edu';
    } else if (hostname.endsWith('.gov')) {
      suffix = 'gov';
    } else if (hostname.endsWith('.net')) {
      suffix = 'net';
    } else if (hostname.endsWith('.de')) {
      suffix = 'de';
    } else {
      return false;
    }

    out.isIcann = true;
    out.isPrivate = false;
    out.publicSuffix = suffix;
    return true;
  }
  549. exports.fastPathLookup = fastPath;
  550. exports.getEmptyResult = getEmptyResult;
  551. exports.parseImpl = parseImpl;
  552. exports.resetResult = resetResult;
  553. exports.setDefaults = setDefaults;
  554. //# sourceMappingURL=index.js.map