feat(medusa): Parsing CSV files (#1572)

* add: csv parser

* fix: tests

* fix: linting + comment

* add: missing dependency

* fix: generic transformers

* fix: address comments

* fix: make parser options generic

* add: support regex columns + fix missing columns bug

* improve test case

* add: support for reducer + tests

* fix: add comments to csv parser
This commit is contained in:
Zakaria El Asri
2022-06-19 16:10:15 +01:00
committed by GitHub
parent 46a6e1a4d3
commit 9e686a8e47
6 changed files with 730 additions and 4 deletions

View File

@@ -79,6 +79,7 @@
"morgan": "^1.9.1",
"multer": "^1.4.2",
"node-schedule": "^2.1.0",
"papaparse": "^5.3.2",
"passport": "^0.4.0",
"passport-http-bearer": "^1.0.1",
"passport-jwt": "^4.0.0",

View File

@@ -0,0 +1,42 @@
/**
 * Generic parsing interface. All different parsing implementations (csv, json, etc.) should implement this interface
 */
export interface IParser<TResult, TParseOptions> {
  /**
   * Parses the given stream into a list of records.
   *
   * @param readableStream readable stream to parse
   * @param options options used for parsing by underlying parser implementation
   * @returns promise resolving to the parsed records
   */
  parse(
    readableStream: NodeJS.ReadableStream,
    options?: TParseOptions
  ): Promise<TResult[]>
}
/**
 * Abstract class implementation of the IParser interface. All different parsing implementations should extend this class
 */
export abstract class AbstractParser<
  TSchema,
  TParserResult,
  TParseOptions,
  TOutputResult
> implements IParser<TParserResult, TParseOptions>
{
  // Schema describing the expected shape of the parsed content; set once at
  // construction and read-only afterwards.
  protected readonly $$schema: TSchema

  protected constructor(schema: TSchema) {
    this.$$schema = schema
  }

  /**
   * Parses the given readable stream into raw records.
   *
   * @param readableStream readable stream to parse
   * @param options options used for parsing by underlying parser implementation
   */
  public abstract parse(
    readableStream: NodeJS.ReadableStream,
    options?: TParseOptions
  ): Promise<TParserResult[]>

  /**
   * Builds the final output records from the raw parsed data.
   *
   * @param data data to be built after parsing. Includes validation according to schema, transformation of values, etc.
   */
  public abstract buildData(data: TParserResult[]): Promise<TOutputResult[]>
}

View File

@@ -0,0 +1,73 @@
import { AwilixContainer } from "awilix"
/**
 * Generic validation interface used to run validation logic on every line or record.
 * All different validation objects should implement this interface
 */
export interface ICsvValidator<TCsvLine, TBuiltLine> {
  /**
   * Validates a built line: resolves to true on success and is expected to
   * throw (reject) on failure.
   *
   * @param value value of column or property
   * @param context includes contextual information such as line number, line, etc.
   */
  validate: (
    value: TBuiltLine,
    context: CsvParserContext<TCsvLine>
  ) => Promise<boolean | never>
}
/**
 * Context handed to validators/transformers/reducers: the line context plus
 * the name of the schema column currently being processed.
 */
export type CsvParserContext<TLine> = LineContext<TLine> & {
  column: string
}

/**
 * Positional information about the csv line being processed.
 */
export type LineContext<TLine> = {
  // 1-based line number within the csv file
  lineNumber: number
  // the raw parsed line
  line: TLine
}
/**
 * Abstract class implementation of the IValidator interface.
 * All validation objects part of the schema should extend this class.
 */
export abstract class AbstractCsvValidator<TCsvLine, TBuiltLine>
  implements ICsvValidator<TCsvLine, TBuiltLine>
{
  // The container gives concrete validators access to application services
  // (repositories, etc.) resolved at runtime.
  constructor(protected readonly container: AwilixContainer) {}

  /**
   * @param builtLine the line after mapping/transformation
   * @param context contextual information (line number, column, raw line)
   */
  abstract validate(
    builtLine: TBuiltLine,
    context: CsvParserContext<TCsvLine>
  ): Promise<boolean | never>
}
/**
 * Definition of a single schema column. A column is resolved either by its
 * exact `name` or — for the variant carrying `match` — by testing csv keys
 * against a regex. `mapTo` renames the key in the built line, `transform`
 * converts the raw value, and `reducer` folds several matched keys into one
 * property of the built line.
 */
export type CsvSchemaColumn<TCsvLine, TBuiltLine> = {
  name: string
  required?: boolean
  validator?: AbstractCsvValidator<TCsvLine, TBuiltLine>
} & (
  | {
      mapTo?: string
      transform?: ColumnTransformer<TCsvLine>
    }
  | {
      match?: RegExp
      reducer?: ColumnReducer<TCsvLine, TBuiltLine>
      transform?: ColumnTransformer<TCsvLine>
    }
)

/**
 * Converts the raw string value of a csv cell into the value stored on the
 * built line.
 */
export type ColumnTransformer<TCsvLine> = (
  value: string,
  context: CsvParserContext<TCsvLine>
) => unknown

/**
 * Folds one matched csv entry (key/value pair) into the line built so far
 * and returns the updated line.
 */
export type ColumnReducer<TCsvLine = unknown, TBuiltLine = unknown> = (
  builtLine: TBuiltLine,
  key: string,
  value: string,
  context: CsvParserContext<TCsvLine>
) => TBuiltLine

/**
 * Schema consumed by the csv parser: the list of expected/allowed columns.
 */
export type CsvSchema<TCsvLine = unknown, TBuiltLine = unknown> = {
  columns: CsvSchemaColumn<TCsvLine, TBuiltLine>[]
}

View File

@@ -0,0 +1,403 @@
import { createContainer } from "awilix"
import { Readable } from "stream"
import { AbstractCsvValidator } from "../../interfaces/csv-parser"
import CsvParser from "../csv-parser"
import { currencies } from "../../utils/currencies"
describe("CsvParser", () => {
describe("parse", () => {
  // Parser with an empty schema: parse() only reads the raw csv stream;
  // the schema is consumed later by buildData().
  const csvParser = new CsvParser(createContainer(), {
    columns: [],
  })

  // Csv fixture with a header row: papaparse's `header` option maps each
  // subsequent line to an object keyed by the header columns.
  let csvContent =
    'title,subtitle\n"T-shirt","summer tee"\n"Sunglasses","Red sunglasses"'
  let expectedProducts = [
    {
      title: "T-shirt",
      subtitle: "summer tee",
    },
    {
      title: "Sunglasses",
      subtitle: "Red sunglasses",
    },
  ]

  afterEach(() => {
    jest.clearAllMocks()
  })

  it("given a readable stream, can parse the stream content", async () => {
    const stream = Readable.from(csvContent)
    const content = await csvParser.parse(stream)
    expect(content).toEqual(expectedProducts)
  })
})
describe("buildData", () => {
describe("schema validation", () => {
  // Validator rejecting any title that contains a digit.
  // NOTE(review): AbstractCsvValidator is generic but extended here without
  // type arguments — presumably this file compiles under loose settings;
  // confirm it still compiles under "strict".
  class TitleValidator extends AbstractCsvValidator {
    async validate(builtLine) {
      if (/\d/.test(builtLine["title"])) {
        throw new Error("title should not contain a number")
      }
      return true
    }
  }

  // NOTE(review): none of these columns set `required: true`, which matters
  // for the empty-value test at the bottom of this describe.
  const schema = {
    columns: [
      {
        name: "title",
        validator: new TitleValidator(createContainer()),
      },
      {
        name: "size",
      },
      {
        name: "height",
      },
    ],
  }
  const csvParser = new CsvParser(createContainer(), schema)

  // NOTE(review): the try/catch pattern used below passes vacuously when no
  // error is thrown (the catch body never runs and nothing is asserted).
  // Consider `expect.assertions(1)` or `await expect(...).rejects.toThrow(...)`.
  it("given a line containing a column which is not defined in the schema, then validation should fail", async () => {
    try {
      await csvParser.buildData([
        {
          title: "sunglasses",
          size: "M",
          height: "100",
          first_name: "lebron",
        },
      ])
    } catch (err) {
      expect(err.message).toEqual(
        "Unable to treat column first_name from the csv file. No target column found in the provided schema"
      )
    }
  })

  it("given a line containing a column which does not pass a validation constraint, then validation should fail", async () => {
    try {
      await csvParser.buildData([
        { title: "contains a number 1", size: "M", height: "100" },
      ])
    } catch (err) {
      expect(err.message).toEqual("title should not contain a number")
    }
  })

  it("given a line which passes all validation constraints, then should returned validated content", async () => {
    const content = await csvParser.buildData([
      { title: "great product", size: "M", height: "100" },
    ])
    expect(content).toEqual([
      {
        title: "great product",
        size: "M",
        height: "100",
      },
    ])
  })

  it("given a line which does not provide a value for a required column, then should throw an error", async () => {
    try {
      await csvParser.buildData([{ size: "S", height: "100" }])
    } catch (err) {
      expect(err.message).toEqual(
        `Missing column(s) "title" from the given csv file`
      )
    }
  })

  it("given a line which does not provide a value for multiple required columns, then should throw an error", async () => {
    try {
      await csvParser.buildData([{ size: "S" }])
    } catch (err) {
      expect(err.message).toEqual(
        `Missing column(s) "title", "height" from the given csv file`
      )
    }
  })

  // NOTE(review): no column in this schema is marked `required`, so per the
  // parser's empty-value check this buildData call may not throw at all and
  // the assertion inside catch may never run — verify this test actually
  // exercises the error path.
  it("given a line which does not provide a value for a required column, then should throw an error", async () => {
    try {
      await csvParser.buildData([
        { title: "t-shirt", height: "100", size: "" },
      ])
    } catch (err) {
      expect(err.message).toEqual(
        `No value found for target column "size" in line 1 of the given csv file`
      )
    }
  })
})
describe("mapTo", () => {
const csvParser = new CsvParser(createContainer(), {
columns: [
{
name: "title",
mapTo: "product_title",
},
],
})
it("given a mapTo field for a column, when building data including that column, should rename the column name to what mapTo refers to", async () => {
const content = await csvParser.buildData([{ title: "a product" }])
expect(content).toEqual([
{
product_title: "a product",
},
])
})
})
describe("transformer", () => {
const csvParser = new CsvParser(createContainer(), {
columns: [
{
name: "title",
},
{
name: "price usd",
transform: (value) => Math.round(Number(value) * 100),
},
],
})
it("given a transformer function for a column, when building data, should transform that column's value according to the transformation function", async () => {
const content = await csvParser.buildData([
{ title: "medusa t-shirt", "price usd": "19.99" },
])
expect(content).toEqual([
{
title: "medusa t-shirt",
"price usd": 1999,
},
])
})
})
describe("match", () => {
describe("regex", () => {
  const csvParser = new CsvParser(createContainer(), {
    columns: [
      {
        name: "title",
      },
      {
        // every csv key matching the regex resolves to this single column;
        // the transform is applied to each matched entry individually
        name: "prices",
        match: /.*Variant Price.*/i,
        // decimal price string -> integer amount in cents
        transform: (value) => Math.round(Number(value) * 100),
      },
    ],
  })

  it("given a column with the match property as regex and a transformer, when building data, should resolve that column for all entries in the line that match the regex", async () => {
    const content = await csvParser.buildData([
      {
        title: "medusa t-shirt",
        "variant price usd": "19.99",
        "variant price cad": "26.79",
        "variant price dkk": "1389",
      },
      {
        title: "medusa sunglasses",
        "variant price usd": "9.99",
        "variant price cad": "16.79",
        "variant price dkk": "389",
      },
    ])
    expect(content).toEqual([
      {
        title: "medusa t-shirt",
        "variant price usd": 1999,
        "variant price cad": 2679,
        "variant price dkk": 138900,
      },
      {
        title: "medusa sunglasses",
        "variant price usd": 999,
        "variant price cad": 1679,
        "variant price dkk": 38900,
      },
    ])
  })
})
describe("reducer", () => {
  const schema = {
    columns: [
      {
        name: "title",
      },
      {
        name: "prices",
        // the capture group extracts the currency code from the csv header
        match: /.*Variant Price ([a-z]+).*/i,
        // Folds every matched "variant price <code>" entry into a single
        // `prices` array on the built line.
        reducer: (builtLine, key, value) => {
          // NOTE(review): key.match(...) can return null in principle; the
          // reducer only runs for keys that already matched the column regex,
          // so the destructuring is safe as long as the two regexes stay in
          // sync — confirm.
          const [, currency_code] = key.match(
            /.*Variant Price ([a-z]+).*/i
          )
          const existingPrices = builtLine.prices ?? []
          const price = {
            amount: Math.round(Number(value) * 100),
            currency_code,
          }
          return {
            ...builtLine,
            prices: [...existingPrices, price],
          }
        },
        // Plain object standing in for an AbstractCsvValidator — the parser
        // only calls `.validate`, so a structural implementation suffices.
        validator: {
          validate: (builtLine) => {
            const unexistingCurrency = builtLine.prices?.find(
              (price) => !currencies[price.currency_code.toUpperCase()]
            )
            if (unexistingCurrency) {
              throw new Error(
                `wrong currency: ${unexistingCurrency.currency_code}`
              )
            }
            return true
          },
        },
      },
    ],
  }
  const csvParser = new CsvParser(createContainer(), schema)

  it("given a column with match and reducer properties, when building data, should return the result of the reducer function", async () => {
    const content = await csvParser.buildData([
      {
        title: "medusa t-shirt",
        "variant price usd": "19.99",
        "variant price cad": "26.79",
        "variant price dkk": "1389",
      },
      {
        title: "medusa sunglasses",
        "variant price usd": "9.99",
        "variant price cad": "16.79",
        "variant price dkk": "389",
      },
    ])
    expect(content).toEqual([
      {
        title: "medusa t-shirt",
        prices: [
          {
            currency_code: "usd",
            amount: 1999,
          },
          {
            currency_code: "cad",
            amount: 2679,
          },
          {
            currency_code: "dkk",
            amount: 138900,
          },
        ],
      },
      {
        title: "medusa sunglasses",
        prices: [
          {
            currency_code: "usd",
            amount: 999,
          },
          {
            currency_code: "cad",
            amount: 1679,
          },
          {
            currency_code: "dkk",
            amount: 38900,
          },
        ],
      },
    ])
  })

  // NOTE(review): this test passes vacuously if the validator does not throw
  // (nothing is asserted outside the catch) — consider expect.assertions(1)
  // or await expect(...).rejects.toThrow(...).
  it("given a column with match and reducer properties, when building data, should run validation on the built data", async () => {
    try {
      await csvParser.buildData([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price grp": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price grp": "389",
        },
      ])
    } catch (err) {
      expect(err.message).toEqual("wrong currency: grp")
    }
  })

  describe("invalid column properties", () => {
    // mapTo combined with match: per the parser, mapTo is honoured only when
    // match is absent, so the original keys are expected to be preserved.
    const schema = {
      columns: [
        {
          name: "title",
        },
        {
          name: "variants",
          match: /.*Variant Price ([a-z]+).*/i,
          mapTo: "prices",
        },
      ],
    }
    const csvParser = new CsvParser(createContainer(), schema)

    it("given a column with match and mapTo property, when building data, then the mapTo property should be ignored", async () => {
      const content = await csvParser.buildData([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price dkk": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price dkk": "389",
        },
      ])
      expect(content).toEqual([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price dkk": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price dkk": "389",
        },
      ])
    })
  })
})
})
})
})

View File

@@ -0,0 +1,202 @@
import { AwilixContainer } from "awilix"
import { difference } from "lodash"
import Papa, { ParseConfig } from "papaparse"
import { AbstractParser } from "../interfaces/abstract-parser"
import { CsvParserContext, CsvSchema } from "../interfaces/csv-parser"
// Default papaparse configuration: infer numeric/boolean types from the cell
// content and treat the first row as the header (keys of each parsed record).
const DEFAULT_PARSE_OPTIONS = {
  dynamicTyping: true,
  header: true,
}
class CsvParser<
  TSchema extends CsvSchema<TParserResult, TOutputResult> = CsvSchema,
  TParserResult = unknown,
  TOutputResult = unknown
> extends AbstractParser<TSchema, TParserResult, ParseConfig, TOutputResult> {
  // NOTE(review): the delimiter is stored but never forwarded to papaparse
  // (which auto-detects the delimiter by default) — confirm whether it should
  // be passed through in the parse options.
  protected readonly $$delimiter: string = ";"

  constructor(
    protected readonly container: AwilixContainer,
    schema: TSchema,
    delimiter?: string
  ) {
    super(schema)
    if (delimiter) {
      this.$$delimiter = delimiter
    }
  }

  /**
   * Parses the readable stream with papaparse and collects every emitted
   * record into an array.
   *
   * @param readableStream readable stream containing the csv content
   * @param options papaparse configuration; defaults to header-based parsing
   *   with dynamic typing
   * @returns the parsed records, one per csv line
   */
  public async parse(
    readableStream: NodeJS.ReadableStream,
    options: ParseConfig = DEFAULT_PARSE_OPTIONS
  ): Promise<TParserResult[]> {
    const csvStream = Papa.parse(Papa.NODE_STREAM_INPUT, options)

    const parsedContent: TParserResult[] = []
    readableStream.pipe(csvStream)
    // papaparse's node stream emits one record per csv line
    for await (const chunk of csvStream) {
      parsedContent.push(chunk)
    }

    return parsedContent
  }

  /**
   * Builds and validates every parsed line according to the schema.
   *
   * @param data records returned by `parse`
   * @returns the built records after column mapping, transformation,
   *   reduction and validation
   * @throws if a line contains a column unknown to the schema, misses a
   *   schema column, or fails a column validator
   */
  async buildData(data: TParserResult[]): Promise<TOutputResult[]> {
    const validatedData = [] as TOutputResult[]
    for (let i = 0; i < data.length; i++) {
      // line numbers reported in error messages are 1-based
      const builtLine = await this._buildLine(data[i], i + 1)
      validatedData.push(builtLine)
    }

    return validatedData
  }

  /**
   * Builds a single line: resolves each csv key against the schema, applies
   * mapTo/transform/reducer, checks for columns missing from the line and
   * finally runs the per-column validators on the built result.
   *
   * @param line parsed csv line
   * @param lineNumber 1-based line number, used in error messages
   */
  private async _buildLine(
    line: TParserResult,
    lineNumber: number
  ): Promise<TOutputResult> {
    let outputTuple = {} as TOutputResult

    const columnMap = this.buildColumnMap_(this.$$schema.columns)
    const tupleKeys = Object.keys(line)

    /**
     * map which keeps track of the columns processed
     * used to detect any missing columns which are present in the schema but not in the line
     */
    const processedColumns = {}

    for (const tupleKey of tupleKeys) {
      const column = this.resolveColumn_(tupleKey, columnMap)

      // the tupleKey does not correspond to any column defined in the schema
      if (!column) {
        throw new Error(
          `Unable to treat column ${tupleKey} from the csv file. No target column found in the provided schema`
        )
      }

      processedColumns[column.name] = true

      // A required column must carry a value. Only null/undefined/"" count
      // as missing — with `dynamicTyping` enabled a legitimate value such as
      // 0 or false would otherwise be rejected by a plain falsiness check.
      const value = line[tupleKey]
      if (
        (value === undefined || value === null || value === "") &&
        column.required
      ) {
        throw new Error(
          `No value found for target column "${column.name}" in line ${lineNumber} of the given csv file`
        )
      }

      const context = {
        line,
        lineNumber,
        column: column.name,
        tupleKey,
      }

      outputTuple = this.resolveTuple_(outputTuple, column, context)
    }

    // missing columns = columns defined in the schema - columns present in the line
    const missingColumns = difference(
      Object.keys(columnMap),
      Object.keys(processedColumns)
    )

    if (missingColumns.length > 0) {
      throw new Error(
        `Missing column(s) ${formatMissingColumns(
          missingColumns
        )} from the given csv file`
      )
    }

    // Runs the validation defined in the schema columns on the fully built line
    for (const column of this.$$schema.columns) {
      const context = {
        line,
        lineNumber,
        column: column.name,
      }
      if (column.validator) {
        await column.validator.validate(outputTuple, context)
      }
    }

    return outputTuple
  }

  /**
   * Indexes the schema columns by their `name` so a csv key can be resolved
   * by exact lookup. Regex (`match`) columns are resolved separately in
   * `resolveColumn_`.
   */
  private buildColumnMap_(
    columns: TSchema["columns"]
  ): Record<string, TSchema["columns"][number]> {
    return columns.reduce((map, column) => {
      // defensive guard for untyped (plain js) callers
      if (typeof column.name === "string") {
        map[column.name] = column
      }
      return map
    }, {})
  }

  /**
   * Resolves the schema column a csv key belongs to: first by exact name,
   * then by testing the key against each column's `match` regex.
   *
   * @returns the matching column, or undefined when the key is unknown
   */
  private resolveColumn_(
    tupleKey: string,
    columnMap: Record<string, TSchema["columns"][number]>
  ): TSchema["columns"][number] | undefined {
    const exactMatch = columnMap[tupleKey]
    if (exactMatch) {
      return exactMatch
    }

    return this.$$schema.columns.find(
      (column) =>
        "match" in column &&
        column.match instanceof RegExp &&
        column.match.test(tupleKey)
    )
  }

  /**
   * Applies a single csv entry to the tuple built so far.
   *
   * - a column carrying `match` and a `reducer` delegates entirely to the
   *   reducer and returns its result
   * - otherwise `mapTo` (honoured only without `match`) renames the key and
   *   `transform`, when present, converts the raw value
   *
   * @returns a new tuple including the resolved entry
   */
  private resolveTuple_(
    tuple: TOutputResult,
    column: TSchema["columns"][number],
    context: CsvParserContext<TParserResult> & { tupleKey: string }
  ): TOutputResult {
    const outputTuple = { ...tuple }
    const { tupleKey, ...csvContext } = context
    const { line } = csvContext

    let resolvedKey = tupleKey

    /**
     * if match is provided, then we should call the reducer if it's defined
     * otherwise, before using the mapTo property, we should make sure match was not provided
     */
    if ("match" in column && column.reducer) {
      return column.reducer(outputTuple, tupleKey, line[tupleKey], csvContext)
    } else if (!("match" in column) && "mapTo" in column && column.mapTo) {
      resolvedKey = column.mapTo
    }

    const resolvedValue = column.transform
      ? column.transform(line[tupleKey], csvContext)
      : line[tupleKey]

    outputTuple[resolvedKey] = resolvedValue
    return outputTuple
  }
}
/**
 * Joins column names into a human readable list for error messages,
 * e.g. ["title", "height"] -> `"title", "height"`.
 *
 * @param list column names to format
 * @returns each name wrapped in double quotes, separated by ", "
 */
const formatMissingColumns = (list: string[]): string =>
  list.map((column) => `"${column}"`).join(", ")
export default CsvParser

View File

@@ -22201,6 +22201,11 @@ pako@~1.0.5:
resolved "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz"
integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==
papaparse@^5.3.2:
version "5.3.2"
resolved "https://registry.yarnpkg.com/papaparse/-/papaparse-5.3.2.tgz#d1abed498a0ee299f103130a6109720404fbd467"
integrity sha512-6dNZu0Ki+gyV0eBsFKJhYr+MdQYAzFUGlBMNj3GNrmHxmz1lfRa24CjFObPXtjcetlOv5Ad299MhIK0znp3afw==
parallel-transform@^1.1.0:
version "1.2.0"
resolved "https://registry.npmjs.org/parallel-transform/-/parallel-transform-1.2.0.tgz"
@@ -27668,10 +27673,10 @@ typeorm@^0.2.29, typeorm@^0.2.31:
yargs "^17.0.1"
zen-observable-ts "^1.0.0"
typescript@^3.7.3, typescript@^4.5.0:
version "4.7.2"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.7.2.tgz#1f9aa2ceb9af87cca227813b4310fff0b51593c4"
integrity sha512-Mamb1iX2FDUpcTRzltPxgWMKy3fhg0TN378ylbktPGPK/99KbDtMQ4W1hwgsbPAsG3a0xKa1vmw4VKZQbkvz5A==
typescript@^3.7.3:
version "3.9.10"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-3.9.10.tgz#70f3910ac7a51ed6bef79da7800690b19bf778b8"
integrity sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==
typescript@^4.1.3:
version "4.4.2"