formdata-parser.js 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. 'use strict'
  2. const { bufferToLowerCasedHeaderName } = require('../../core/util')
  3. const { HTTP_TOKEN_CODEPOINTS } = require('./data-url')
  4. const { makeEntry } = require('./formdata')
  5. const { webidl } = require('../webidl')
  6. const assert = require('node:assert')
  7. const { isomorphicDecode } = require('../infra')
  8. const { utf8DecodeBytes } = require('../../encoding')
  9. const dd = Buffer.from('--')
  10. const decoder = new TextDecoder()
  /**
   * Reports whether a string consists solely of ASCII code units.
   * @param {string} chars the string to inspect
   * @returns {boolean} `true` when no UTF-16 code unit is above 0x7F
   *   (vacuously `true` for the empty string), `false` otherwise
   */
  function isAsciiString (chars) {
    // Walk the string from the end; the scan order does not affect the result.
    let index = chars.length
    while (index-- > 0) {
      if (chars.charCodeAt(index) > 0x7f) {
        return false
      }
    }
    return true
  }
  /**
   * Checks that a multipart boundary has an acceptable length and alphabet.
   *
   * A boundary is valid when it is 27 to 70 characters long (inclusive) and
   * every character is an ASCII alphanumeric, an apostrophe ('), a hyphen (-)
   * or an underscore (_).
   *
   * @see https://andreubotella.github.io/multipart-form-data/#multipart-form-data-boundary
   * @param {string} boundary
   * @returns {boolean} `true` when the boundary satisfies both rules
   */
  function validateBoundary (boundary) {
    const { length } = boundary

    // Length must be within [27, 70].
    if (length < 27 || length > 70) {
      return false
    }

    for (let index = 0; index < length; index += 1) {
      const code = boundary.charCodeAt(index)

      const isDigit = code >= 0x30 && code <= 0x39
      const isUpper = code >= 0x41 && code <= 0x5a
      const isLower = code >= 0x61 && code <= 0x7a
      // 0x27 ('), 0x2D (-) and 0x5F (_) are the only permitted punctuation.
      const isPunctuation = code === 0x27 || code === 0x2d || code === 0x5f

      if (!isDigit && !isUpper && !isLower && !isPunctuation) {
        return false
      }
    }

    return true
  }
  /**
   * Parses a multipart/form-data payload into a list of form entries.
   *
   * Bytes before the first boundary (preamble) and after the closing boundary
   * (epilogue) are ignored, as described by RFC 2046 section 5.1.1.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#multipart-form-data-parser
   * @param {Buffer} input the raw body bytes
   * @param {ReturnType<import('./data-url')['parseMIMEType']>} mimeType the parsed
   *   Content-Type; must not be 'failure' and must have the essence "multipart/form-data"
   * @returns {Array} the entries created with makeEntry, in the order they occur in input
   * @throws {TypeError} when the boundary parameter is missing or the body is malformed
   */
  function multipartFormDataParser (input, mimeType) {
    // 1. Assert: mimeType’s essence is "multipart/form-data".
    assert(mimeType !== 'failure' && mimeType.essence === 'multipart/form-data')

    const boundaryString = mimeType.parameters.get('boundary')

    // 2. If mimeType’s parameters["boundary"] does not exist, return failure.
    //    Otherwise, let boundary be the result of UTF-8 decoding mimeType’s
    //    parameters["boundary"].
    if (boundaryString === undefined) {
      throw parsingError('missing boundary in content-type header')
    }

    // Note: boundary already carries the leading `--`.
    const boundary = Buffer.from(`--${boundaryString}`, 'utf8')

    // The delimiter that terminates a part body is CRLF followed by `--` and
    // the boundary (RFC 2046 section 5.1.1). Searching for the full delimiter
    // keeps a body that merely mentions the boundary text from being cut short.
    const bodyDelimiter = Buffer.concat([Buffer.from('\r\n'), boundary])

    // 3. Let entry list be an empty entry list.
    const entryList = []

    // 4. Let position be a pointer to a byte in input, initially pointing at
    //    the first byte.
    const position = { position: 0 }

    // Note: Per RFC 2046 Section 5.1.1, we must ignore anything before the
    // first boundary delimiter line (preamble). Search for the first boundary.
    const firstBoundaryIndex = input.indexOf(boundary)

    if (firstBoundaryIndex === -1) {
      throw parsingError('no boundary found in multipart body')
    }

    // Start parsing from the first boundary, ignoring any preamble
    position.position = firstBoundaryIndex

    // 5. While true:
    while (true) {
      // 5.1. If position points to a sequence of bytes starting with 0x2D 0x2D
      //      (`--`) followed by boundary, advance position by 2 + the length of
      //      boundary. Otherwise, return failure.
      // Note: boundary is padded with 2 dashes already, no need to add 2.
      if (input.subarray(position.position, position.position + boundary.length).equals(boundary)) {
        position.position += boundary.length
      } else {
        throw parsingError('expected a value starting with -- and the boundary')
      }

      // 5.2. If position points to the sequence of bytes 0x2D 0x2D 0x0D 0x0A
      //      (`--` followed by CR LF) followed by the end of input, return entry list.
      // Note: Per RFC 2046 Section 5.1.1, we must ignore anything after the
      // final boundary delimiter (epilogue). Check for -- and return
      // regardless of what follows.
      if (bufferStartsWith(input, dd, position)) {
        // Found closing boundary delimiter (--), ignore any epilogue
        return entryList
      }

      // 5.3. If position does not point to a sequence of bytes starting with 0x0D
      //      0x0A (CR LF), return failure.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        throw parsingError('expected CRLF')
      }

      // 5.4. Advance position by 2. (This skips past the newline.)
      position.position += 2

      // 5.5. Let name, filename and contentType be the result of parsing
      //      multipart/form-data headers on input and position, if the result
      //      is not failure. Otherwise, return failure.
      const result = parseMultipartFormDataHeaders(input, position)

      let { name, filename, contentType, encoding } = result

      // 5.6. Advance position by 2. (This skips past the empty line that marks
      //      the end of the headers.)
      position.position += 2

      // 5.7. Let body be the empty byte sequence.
      let body

      // 5.8. Body loop: locate the delimiter that ends this part's body.
      {
        // A boundary sitting directly after the header-terminating blank line
        // has no CRLF of its own left to consume; reject it here instead of
        // letting the search below swallow the following part as body bytes.
        if (bufferStartsWith(input, boundary, position)) {
          throw parsingError('expected CRLF')
        }

        const delimiterIndex = input.indexOf(bodyDelimiter, position.position)

        if (delimiterIndex === -1) {
          throw parsingError('expected boundary after body')
        }

        // The body is everything up to (not including) the delimiter's CRLF.
        body = input.subarray(position.position, delimiterIndex)

        position.position += body.length

        // Note: position must be advanced by the body's length before being
        // decoded, otherwise the parsing will fail.
        if (encoding === 'base64') {
          body = Buffer.from(body.toString(), 'base64')
        }
      }

      // 5.9. If position does not point to a sequence of bytes starting with
      //      0x0D 0x0A (CR LF), return failure. Otherwise, advance position by 2.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        throw parsingError('expected CRLF')
      } else {
        position.position += 2
      }

      // 5.10. If filename is not null:
      let value

      if (filename !== null) {
        // 5.10.1. If contentType is null, set contentType to "text/plain".
        contentType ??= 'text/plain'

        // 5.10.2. If contentType is not an ASCII string, set contentType to the empty string.
        // Note: `buffer.isAscii` can be used at zero-cost, but converting a string to a buffer is a high overhead.
        // Content-Type is a relatively small string, so it is faster to use `String#charCodeAt`.
        if (!isAsciiString(contentType)) {
          contentType = ''
        }

        // 5.10.3. Let value be a new File object with name filename, type contentType, and body body.
        value = new File([body], filename, { type: contentType })
      } else {
        // 5.11. Otherwise:
        // 5.11.1. Let value be the UTF-8 decoding without BOM of body.
        value = utf8DecodeBytes(Buffer.from(body))
      }

      // 5.12. Assert: name is a scalar value string and value is either a scalar value string or a File object.
      assert(webidl.is.USVString(name))
      assert((typeof value === 'string' && webidl.is.USVString(value)) || webidl.is.File(value))

      // 5.13. Create an entry with name and value, and append it to entry list.
      entryList.push(makeEntry(name, value, filename))
    }
  }
  /**
   * Parses a single Content-Disposition attribute starting at `position`, in
   * one of three forms: a quoted string (`name="value"`), a bare token
   * (`name=value`) or the extended notation (`filename*=utf-8''encoded`,
   * optionally with a language tag between the two single quotes).
   *
   * `position` is advanced past whatever was consumed, including on the
   * `null` return paths.
   *
   * @param {Buffer} input
   * @param {{ position: number }} position
   * @returns {{ name: string, value: string } | null} the lower-cased attribute
   *   name and its decoded value, or `null` when no attribute name is present
   *   or the name is not followed by `=`
   * @throws {TypeError} when an extended value is not `utf-8'<language>'...`,
   *   when its percent-encoding is malformed, or when a quoted string is not closed
   */
  function parseContentDispositionAttribute (input, position) {
    // Skip the separating semicolon, if any
    if (input[position.position] === 0x3b /* ; */) {
      position.position++
    }

    // Skip whitespace
    collectASequenceOfBytes(
      (char) => char === 0x20 || char === 0x09,
      input,
      position
    )

    // Collect attribute name (token characters)
    const attributeName = collectASequenceOfBytes(
      (char) => isToken(char) && char !== 0x3d && char !== 0x2a, // not = or *
      input,
      position
    )

    if (attributeName.length === 0) {
      return null
    }

    const attrNameStr = attributeName.toString('ascii').toLowerCase()

    // Check for extended notation (attribute*)
    const isExtended = input[position.position] === 0x2a /* * */
    if (isExtended) {
      position.position++ // skip *
    }

    // Expect = sign
    if (input[position.position] !== 0x3d /* = */) {
      return null
    }
    position.position++ // skip =

    // Skip whitespace
    collectASequenceOfBytes(
      (char) => char === 0x20 || char === 0x09,
      input,
      position
    )

    let value

    if (isExtended) {
      // Extended attribute format: charset'language'encoded-value
      const headerValue = collectASequenceOfBytes(
        (char) => char !== 0x20 && char !== 0x0d && char !== 0x0a && char !== 0x3b, // not space, CRLF, or ;
        input,
        position
      )

      // The charset must be utf-8 (case insensitive)...
      const hasUtf8Charset =
        (headerValue[0] === 0x75 || headerValue[0] === 0x55) && // u or U
        (headerValue[1] === 0x74 || headerValue[1] === 0x54) && // t or T
        (headerValue[2] === 0x66 || headerValue[2] === 0x46) && // f or F
        headerValue[3] === 0x2d && // -
        headerValue[4] === 0x38 // 8

      // ...immediately followed by a single quote, an optional language tag,
      // and a second single quote. The encoded value starts after that quote.
      const languageEnd = hasUtf8Charset && headerValue[5] === 0x27 /* ' */
        ? headerValue.indexOf(0x27, 6)
        : -1

      if (languageEnd === -1) {
        throw parsingError('unknown encoding, expected utf-8\'\'')
      }

      // decodeURIComponent throws a URIError on malformed percent-encoding;
      // report it like every other parse failure in this module.
      try {
        value = decodeURIComponent(decoder.decode(headerValue.subarray(languageEnd + 1)))
      } catch {
        throw parsingError('invalid percent-encoding in extended attribute value')
      }
    } else if (input[position.position] === 0x22 /* " */) {
      // Quoted string
      position.position++ // skip opening quote

      const quotedValue = collectASequenceOfBytes(
        (char) => char !== 0x0a && char !== 0x0d && char !== 0x22, // not LF, CR, or "
        input,
        position
      )

      if (input[position.position] !== 0x22) {
        throw parsingError('Closing quote not found')
      }
      position.position++ // skip closing quote

      // Undo the percent-escapes used for LF, CR and the double quote.
      value = decoder.decode(quotedValue)
        .replace(/%0A/ig, '\n')
        .replace(/%0D/ig, '\r')
        .replace(/%22/g, '"')
    } else {
      // Token value (no quotes)
      const tokenValue = collectASequenceOfBytes(
        (char) => isToken(char) && char !== 0x3b, // not ;
        input,
        position
      )

      value = decoder.decode(tokenValue)
    }

    return { name: attrNameStr, value }
  }
  /**
   * Parses the header block of one multipart/form-data part, starting at
   * `position` and stopping at (without consuming) the empty line that ends
   * the headers.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#parse-multipart-form-data-headers
   * @param {Buffer} input
   * @param {{ position: number }} position advanced in place as bytes are consumed
   * @returns {{ name: string, filename: string | null, contentType: string | null, encoding: string | null }}
   *   `name` and `filename` come from Content-Disposition, `contentType` from
   *   Content-Type and `encoding` from Content-Transfer-Encoding
   * @throws {TypeError} when a header line is malformed, a line does not end
   *   with CRLF, or no `name` attribute was supplied
   */
  function parseMultipartFormDataHeaders (input, position) {
    // 1. Let name, filename and contentType be null.
    let name = null
    let filename = null
    let contentType = null
    let encoding = null

    // 2. While true:
    while (true) {
      // 2.1. If position points to a sequence of bytes starting with 0x0D 0x0A (CR LF):
      if (input[position.position] === 0x0d && input[position.position + 1] === 0x0a) {
        // 2.1.1. If name is null, return failure.
        if (name === null) {
          throw parsingError('header name is null')
        }

        // 2.1.2. Return name, filename and contentType.
        return { name, filename, contentType, encoding }
      }

      // 2.2. Let header name be the result of collecting a sequence of bytes that are
      //      not 0x0A (LF), 0x0D (CR) or 0x3A (:), given position.
      let headerName = collectASequenceOfBytes(
        (char) => char !== 0x0a && char !== 0x0d && char !== 0x3a,
        input,
        position
      )

      // 2.3. Remove any HTTP tab or space bytes from the start or end of header name.
      headerName = removeChars(headerName, true, true, (char) => char === 0x9 || char === 0x20)

      // 2.4. If header name does not match the field-name token production, return failure.
      if (!HTTP_TOKEN_CODEPOINTS.test(headerName.toString())) {
        throw parsingError('header name does not match the field-name token production')
      }

      // 2.5. If the byte at position is not 0x3A (:), return failure.
      if (input[position.position] !== 0x3a) {
        throw parsingError('expected :')
      }

      // 2.6. Advance position by 1.
      position.position++

      // 2.7. Collect a sequence of bytes that are HTTP tab or space bytes given position.
      //      (Do nothing with those bytes.)
      collectASequenceOfBytes(
        (char) => char === 0x20 || char === 0x09,
        input,
        position
      )

      // 2.8. Byte-lowercase header name and switch on the result:
      switch (bufferToLowerCasedHeaderName(headerName)) {
        case 'content-disposition': {
          // A later Content-Disposition header replaces an earlier one.
          name = filename = null

          // Collect the disposition type (should be "form-data")
          const dispositionType = collectASequenceOfBytes(
            (char) => isToken(char),
            input,
            position
          )

          if (dispositionType.toString('ascii').toLowerCase() !== 'form-data') {
            throw parsingError('expected form-data for content-disposition header')
          }

          // Parse attributes one after another until position points at CRLF.
          // The line only ends when BOTH bytes match, hence the `||`.
          while (
            position.position < input.length &&
            (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a)
          ) {
            const attribute = parseContentDispositionAttribute(input, position)

            // No further attribute could be parsed; step 2.9 validates what is left.
            if (!attribute) {
              break
            }

            if (attribute.name === 'name') {
              name = attribute.value
            } else if (attribute.name === 'filename') {
              filename = attribute.value
            }
          }

          if (name === null) {
            throw parsingError('name attribute is required in content-disposition header')
          }

          break
        }
        case 'content-type': {
          // 1. Let header value be the result of collecting a sequence of bytes that are
          //    not 0x0A (LF) or 0x0D (CR), given position.
          let headerValue = collectASequenceOfBytes(
            (char) => char !== 0x0a && char !== 0x0d,
            input,
            position
          )

          // 2. Remove any HTTP tab or space bytes from the end of header value.
          headerValue = removeChars(headerValue, false, true, (char) => char === 0x9 || char === 0x20)

          // 3. Set contentType to the isomorphic decoding of header value.
          contentType = isomorphicDecode(headerValue)

          break
        }
        case 'content-transfer-encoding': {
          let headerValue = collectASequenceOfBytes(
            (char) => char !== 0x0a && char !== 0x0d,
            input,
            position
          )

          headerValue = removeChars(headerValue, false, true, (char) => char === 0x9 || char === 0x20)

          encoding = isomorphicDecode(headerValue)

          break
        }
        default: {
          // Collect a sequence of bytes that are not 0x0A (LF) or 0x0D (CR), given position.
          // (Do nothing with those bytes.)
          collectASequenceOfBytes(
            (char) => char !== 0x0a && char !== 0x0d,
            input,
            position
          )
        }
      }

      // 2.9. If position does not point to a sequence of bytes starting with 0x0D 0x0A
      //      (CR LF), return failure. Otherwise, advance position by 2 (past the newline).
      // Either byte being wrong is a failure, so the two checks are joined with `||`.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        throw parsingError('expected CRLF')
      } else {
        position.position += 2
      }
    }
  }
  /**
   * Consumes the longest run of bytes, starting at `position`, for which
   * `condition` returns true, and moves `position` to the first byte after it.
   *
   * @param {(char: number) => boolean} condition predicate applied to each byte
   * @param {Buffer} input
   * @param {{ position: number }} position updated in place
   * @returns {Buffer} a view (not a copy) over the consumed bytes; empty when
   *   the first byte does not match or position is at the end of input
   */
  function collectASequenceOfBytes (condition, input, position) {
    const begin = position.position
    let end = begin

    for (; end < input.length; end++) {
      if (!condition(input[end])) {
        break
      }
    }

    position.position = end
    return input.subarray(begin, end)
  }
  /**
   * Strips bytes matching `predicate` from the start and/or the end of a buffer.
   *
   * @param {Buffer} buf
   * @param {boolean} leading whether to strip matching bytes from the start
   * @param {boolean} trailing whether to strip matching bytes from the end
   * @param {(charCode: number) => boolean} predicate returns true for bytes to strip
   * @returns {Buffer} `buf` itself when nothing was stripped, otherwise a view
   *   (not a copy) over the remaining bytes, which may be empty
   */
  function removeChars (buf, leading, trailing, predicate) {
    let lead = 0
    let trail = buf.length - 1

    if (leading) {
      while (lead < buf.length && predicate(buf[lead])) lead++
    }

    if (trailing) {
      // Stop at `lead` rather than at index 0 so that the first byte is also
      // eligible for trimming (e.g. a buffer holding a single space) while
      // never scanning back over bytes already removed from the front.
      while (trail >= lead && predicate(buf[trail])) trail--
    }

    return lead === 0 && trail === buf.length - 1 ? buf : buf.subarray(lead, trail + 1)
  }
  /**
   * Tests whether the bytes of `buffer` beginning at `position.position` match
   * `start` byte for byte. `position` is only read, never modified.
   *
   * @param {Buffer} buffer the buffer to look into
   * @param {Buffer} start the expected byte sequence
   * @param {{ position: number }} position offset in `buffer` where the comparison begins
   * @returns {boolean}
   */
  function bufferStartsWith (buffer, start, position) {
    // Cheap rejection: `start` cannot fit in a shorter buffer at any offset.
    if (start.length > buffer.length) {
      return false
    }

    const offset = position.position
    // Reading past the end of `buffer` yields undefined, which never equals a byte.
    return start.every((byte, index) => buffer[offset + index] === byte)
  }
  /**
   * Builds (but does not throw) the error used for every FormData parse failure.
   *
   * @param {string} cause human-readable description of what went wrong
   * @returns {TypeError} a TypeError with a fixed message whose `cause` is a
   *   second TypeError carrying the description
   */
  function parsingError (cause) {
    const reason = new TypeError(cause)
    return new TypeError('Failed to parse body as FormData.', { cause: reason })
  }
  /**
   * Tests whether a byte is a US-ASCII control character.
   *
   * CTL = <any US-ASCII control character
   *       (octets 0 - 31) and DEL (127)>
   * @param {number} char
   * @returns {boolean}
   */
  function isCTL (char) {
    if (char === 0x7f) {
      return true
    }
    return char <= 0x1f
  }
  /**
   * Tests whether a byte is one of the MIME `tspecials`.
   *
   * tspecials :=  "(" / ")" / "<" / ">" / "@" /
   *               "," / ";" / ":" / "\" / <">
   *               "/" / "[" / "]" / "?" / "="
   *               ; Must be in quoted-string,
   *               ; to use within parameter values
   * @param {number} char
   * @returns {boolean}
   */
  function isTSpecial (char) {
    switch (char) {
      case 0x22: // "
      case 0x28: // (
      case 0x29: // )
      case 0x2c: // ,
      case 0x2f: // /
      case 0x3a: // :
      case 0x3b: // ;
      case 0x3c: // <
      case 0x3d: // =
      case 0x3e: // >
      case 0x3f: // ?
      case 0x40: // @
      case 0x5b: // [
      case 0x5c: // \
      case 0x5d: // ]
        return true
      default:
        return false
    }
  }
  /**
   * Tests whether a byte may appear in a MIME token.
   *
   * token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
   *             or tspecials>
   * @param {number} char
   * @returns {boolean}
   */
  function isToken (char) {
    // Must be within the US-ASCII range.
    if (!(char <= 0x7f)) {
      return false
    }

    // Neither a space nor a horizontal tab.
    if (char === 0x20 || char === 0x09) {
      return false
    }

    return !isCTL(char) && !isTSpecial(char)
  }
  477. module.exports = {
  478. multipartFormDataParser,
  479. validateBoundary
  480. }