feat: implement stream based processing of the files (#12574)

Fixes: FRMW-2960

This PR adds support for processing large CSV files by breaking them into chunks and processing one chunk at a time. Here is how it works in a nutshell.

- The CSV file is read as a stream and each chunk of the stream is one CSV row.
- We read up to 1000 rows (plus a few more to ensure the variants of a product are not split across multiple chunks).
- Each chunk is then normalized using the `CSVNormalizer` and validated using zod schemas. If there is an error, the entire process will be aborted and the existing chunks will be deleted.
- Each chunk is written to a JSON file, so that we can process them later (after user confirms) without re-processing or validating the CSV file.
- The confirmation process will start consuming one chunk at a time and create/update products using the `batchProducts` workflow.

## Resume or not to resume processing of chunks

Let's imagine during processing of chunks, we find that chunk 3 leads to a database error. However, till this time we have processed the first two chunks already. How do we deal with this situation? Options are:

- We store at which chunk we failed and then during the re-upload we ignore chunks before the failed one. In my conversation with @olivermrbl we discovered that resuming will have to work with certain assumptions if we decide to implement it.
   - What if a user updates the CSV rows which are part of the already processed chunks? These changes will be ignored and they will never notice it.
- Resuming works only if the file name is still the same. What if they made changes and saved the file with "Save as - New name"? In that case we will process the entire file anyway.
   - We will have to fetch the old workflow from the workflow engine using some `ilike` search, so that we can see at which chunk the last run failed for the given file.

Co-authored-by: Carlos R. L. Rodrigues <37986729+carlos-r-l-rodrigues@users.noreply.github.com>
This commit is contained in:
Harminder Virk
2025-05-29 11:12:16 +05:30
committed by GitHub
parent 40e73c6ea2
commit cf0297f74a
12 changed files with 360 additions and 141 deletions

View File

@@ -88,3 +88,4 @@ export * from "./upper-case-first"
export * from "./validate-handle"
export * from "./validate-module-name"
export * from "./wrap-handler"
export * from "./normalize-csv-value"

View File

@@ -0,0 +1,10 @@
/**
 * Normalizes a CSV value.
 *
 * String values get a trailing carriage return ("\r", left over from
 * CRLF line endings) stripped and surrounding whitespace trimmed.
 * Non-string values are returned unchanged.
 *
 * @param value - The raw CSV cell value.
 * @returns The normalized value, with the same type as the input.
 */
export function normalizeCSVValue<T>(value: T): T {
  if (typeof value === "string") {
    // Note: the pattern is /\r$/ (a real carriage return at the end of
    // the string), not /\\r$/ which would match a literal backslash
    // followed by "r" and corrupt values ending in that text.
    return value.replace(/\r$/, "").trim() as T
  }
  return value
}

View File

@@ -10,16 +10,16 @@ async function loadFixtureFile(fileName: string) {
describe("CSV processor", () => {
it("should error when both Product Id and Handle are missing", async () => {
const processor = new CSVNormalizer([{}])
expect(() => processor.proccess()).toThrow(
"Row 1: Missing product id and handle. One of them are required to process the row"
expect(() => CSVNormalizer.preProcess({}, 1)).toThrow(
"Row 1: Missing product id and handle. One of these columns are required to process the row"
)
})
it("should process a CSV row", async () => {
const csvData = await loadFixtureFile("single-row-create.json")
const processor = new CSVNormalizer(csvData)
const csvData: any[] = await loadFixtureFile("single-row-create.json")
const processor = new CSVNormalizer(
csvData.map((row, index) => CSVNormalizer.preProcess(row, index + 1))
)
const products = processor.proccess()
expect(products).toMatchInlineSnapshot(`
@@ -87,8 +87,12 @@ describe("CSV processor", () => {
})
it("should process multiple CSV rows for the same product", async () => {
const csvData = await loadFixtureFile("same-product-multiple-rows.json")
const processor = new CSVNormalizer(csvData)
const csvData: any[] = await loadFixtureFile(
"same-product-multiple-rows.json"
)
const processor = new CSVNormalizer(
csvData.map((row, index) => CSVNormalizer.preProcess(row, index + 1))
)
const products = processor.proccess()
expect(products).toMatchInlineSnapshot(`
@@ -200,10 +204,12 @@ describe("CSV processor", () => {
})
it("should process multiple CSV rows where each variant uses different options", async () => {
const csvData = await loadFixtureFile(
const csvData: any[] = await loadFixtureFile(
"same-product-multiple-variant-options.json"
)
const processor = new CSVNormalizer(csvData)
const processor = new CSVNormalizer(
csvData.map((row, index) => CSVNormalizer.preProcess(row, index + 1))
)
const products = processor.proccess()
expect(products).toMatchInlineSnapshot(`
@@ -325,10 +331,12 @@ describe("CSV processor", () => {
})
it("should process multiple CSV rows with multiple products and variants", async () => {
const csvData = await loadFixtureFile(
const csvData: any[] = await loadFixtureFile(
"multiple-products-multiple-variants.json"
)
const processor = new CSVNormalizer(csvData)
const processor = new CSVNormalizer(
csvData.map((row, index) => CSVNormalizer.preProcess(row, index + 1))
)
const products = processor.proccess()
expect(products).toMatchInlineSnapshot(`

View File

@@ -3,6 +3,7 @@ import {
tryConvertToNumber,
tryConvertToBoolean,
MedusaError,
normalizeCSVValue,
} from "../common"
import { AdminCreateProduct, AdminCreateProductVariant } from "@medusajs/types"
@@ -17,6 +18,20 @@ type ColumnProcessor<Output> = (
output: Output
) => void
type NormalizedRow =
| (Record<string, string | number | boolean> & {
"product id": string
"product handle": string
})
| {
"product id"?: string
"product handle": string
}
| {
"product id": string
"product handle"?: string
}
/**
* Creates an error with the CSV row number
*/
@@ -27,23 +42,12 @@ function createError(rowNumber: number, message: string) {
)
}
/**
* Normalizes a CSV value by removing the leading "\r" from the
* value.
*/
function normalizeValue<T>(value: T): T {
if (typeof value === "string") {
return value.replace(/\\r$/, "").trim() as T
}
return value
}
/**
* Parses different patterns to extract variant price iso
* and the region name. The iso is converted to lowercase
*/
function parseVariantPriceColumn(columnName: string, rowNumber: number) {
const normalizedValue = normalizeValue(columnName)
const normalizedValue = columnName
const potentialRegion = /\[(.*)\]/g.exec(normalizedValue)?.[1]
const iso = normalizedValue.split(" ").pop()
@@ -68,7 +72,7 @@ function processAsString<Output>(
outputKey: keyof Output
): ColumnProcessor<Output> {
return (csvRow, _, __, output) => {
const value = normalizeValue(csvRow[inputKey])
const value = csvRow[inputKey]
if (isPresent(value)) {
output[outputKey as any] = value
}
@@ -83,7 +87,7 @@ function processAsBoolean<Output>(
outputKey: keyof Output
): ColumnProcessor<Output> {
return (csvRow, _, __, output) => {
const value = normalizeValue(csvRow[inputKey])
const value = csvRow[inputKey]
if (isPresent(value)) {
output[outputKey as any] = tryConvertToBoolean(value, value)
}
@@ -99,7 +103,7 @@ function processAsNumber<Output>(
options?: { asNumericString: boolean }
): ColumnProcessor<Output> {
return (csvRow, _, rowNumber, output) => {
const value = normalizeValue(csvRow[inputKey])
const value = csvRow[inputKey]
if (isPresent(value)) {
const numericValue = tryConvertToNumber(value)
if (numericValue === undefined) {
@@ -135,7 +139,7 @@ function processAsCounterValue<Output extends Record<string, any[]>>(
rowColumns
.filter((rowKey) => inputMatcher.test(rowKey))
.forEach((rowKey) => {
const value = normalizeValue(csvRow[rowKey])
const value = csvRow[rowKey]
if (!existingIds.includes(value) && isPresent(value)) {
output[outputKey].push({ [arrayItemKey]: value })
}
@@ -243,7 +247,7 @@ const variantStaticColumns: {
"variant origin country",
"origin_country"
),
"variant variant rank": processAsString(
"variant variant rank": processAsNumber(
"variant variant rank",
"variant_rank"
),
@@ -268,7 +272,7 @@ const variantWildcardColumns: {
pricesColumns.forEach((columnName) => {
const { iso } = parseVariantPriceColumn(columnName, rowNumber)
const value = normalizeValue(csvRow[columnName])
const value = csvRow[columnName]
const numericValue = tryConvertToNumber(value)
if (numericValue === undefined) {
@@ -298,13 +302,13 @@ const optionColumns: {
"variant option": (csvRow, rowColumns, rowNumber, output) => {
const matcher = /variant option \d+ name/
const optionNameColumns = rowColumns.filter((rowKey) => {
return matcher.test(rowKey) && isPresent(normalizeValue(csvRow[rowKey]))
return matcher.test(rowKey) && isPresent(csvRow[rowKey])
})
output["options"] = optionNameColumns.map((columnName) => {
const [, , counter] = columnName.split(" ")
const key = normalizeValue(csvRow[columnName])
const value = normalizeValue(csvRow[`variant option ${counter} value`])
const key = csvRow[columnName]
const value = csvRow[`variant option ${counter} value`]
if (!isPresent(value)) {
throw createError(rowNumber, `Missing option value for "${columnName}"`)
@@ -336,6 +340,52 @@ const knownWildcardColumns = Object.keys(productWildcardColumns)
* the required fields in the normalized output.
*/
export class CSVNormalizer {
/**
* Normalizes a row by converting all keys to lowercase and removing
* the leading "\r" from the keys and the values.
*
* Also, it validates that the row does not contain unknown columns and
* that it contains the "product id" or "product handle" column.
*/
static preProcess(
row: Record<string, string | boolean | number>,
rowNumber: number
): NormalizedRow {
const unknownColumns: string[] = []
const normalized = Object.keys(row).reduce((result, key) => {
const lowerCaseKey = normalizeCSVValue(key).toLowerCase()
if (
!knownStaticColumns.includes(lowerCaseKey) &&
!knownWildcardColumns.some((column) => lowerCaseKey.startsWith(column))
) {
unknownColumns.push(key)
}
result[lowerCaseKey] = normalizeCSVValue(row[key])
return result
}, {})
if (unknownColumns.length) {
throw new MedusaError(
MedusaError.Types.INVALID_DATA,
`Invalid column name(s) "${unknownColumns.join('","')}"`
)
}
const productId = normalized["product id"]
const productHandle = normalized["product handle"]
if (!isPresent(productId) && !isPresent(productHandle)) {
throw createError(
rowNumber,
"Missing product id and handle. One of these columns are required to process the row"
)
}
return normalized as NormalizedRow
}
#rows: Record<string, string | boolean | number>[]
#products: {
@@ -354,30 +404,10 @@ export class CSVNormalizer {
toUpdate: {},
}
constructor(rows: Record<string, string | boolean | number>[]) {
constructor(rows: NormalizedRow[]) {
this.#rows = rows
}
/**
* Ensures atleast one of the product id or the handle is provided. Otherwise
* we cannot process the row
*/
#ensureRowHasProductIdentifier(
row: Record<string, string | boolean | number>,
rowNumber: number
) {
const productId = row["product id"]
const productHandle = row["product handle"]
if (!isPresent(productId) && !isPresent(productHandle)) {
throw createError(
rowNumber,
"Missing product id and handle. One of them are required to process the row"
)
}
return { productId, productHandle }
}
/**
* Initializes a product object or returns an existing one
* by its id. The products with ids are treated as updates
@@ -400,37 +430,6 @@ export class CSVNormalizer {
return this.#products.toCreate[handle]!
}
/**
* Normalizes a row by converting all keys to lowercase and creating a
* new object
*/
#normalizeRow(row: Record<string, any>) {
const unknownColumns: string[] = []
const normalized = Object.keys(row).reduce((result, key) => {
const lowerCaseKey = key.toLowerCase()
result[lowerCaseKey] = row[key]
if (
!knownStaticColumns.includes(lowerCaseKey) &&
!knownWildcardColumns.some((column) => lowerCaseKey.startsWith(column))
) {
unknownColumns.push(key)
}
return result
}, {})
if (unknownColumns.length) {
throw new MedusaError(
MedusaError.Types.INVALID_DATA,
`Invalid column name(s) "${unknownColumns.join('","')}"`
)
}
return normalized
}
/**
* Processes a given CSV row
*/
@@ -439,10 +438,8 @@ export class CSVNormalizer {
rowNumber: number
) {
const rowColumns = Object.keys(row)
const { productHandle, productId } = this.#ensureRowHasProductIdentifier(
row,
rowNumber
)
const productId = row["product id"]
const productHandle = row["product handle"]
/**
* Create representation of a product by its id or handle and process
@@ -508,10 +505,11 @@ export class CSVNormalizer {
/**
* Process CSV rows. The return value is a tree of products
*/
proccess() {
proccess(resumingFromIndex: number = 0) {
this.#rows.forEach((row, index) =>
this.#processRow(this.#normalizeRow(row), index + 1)
this.#processRow(row, resumingFromIndex + index + 1)
)
this.#rows = []
return this.#products
}
}