From 9e686a8e47c567ffdb57bb43af796dd38049294f Mon Sep 17 00:00:00 2001 From: Zakaria El Asri <33696020+zakariaelas@users.noreply.github.com> Date: Sun, 19 Jun 2022 16:10:15 +0100 Subject: [PATCH] feat(medusa): Parsing CSV files (#1572) * add: csv parser * fix: tests * fix: linting + comment * add: missing dependency * fix: generic transformers * fix: address comments * fix: make parser options generic * add: support regex columns + fix missing columns bug * improve test case * add: support for reducer + tests * fix: add comments to csv parser --- packages/medusa/package.json | 1 + .../medusa/src/interfaces/abstract-parser.ts | 42 ++ packages/medusa/src/interfaces/csv-parser.ts | 73 ++++ .../src/services/__tests__/csv-parser.js | 403 ++++++++++++++++++ packages/medusa/src/services/csv-parser.ts | 202 +++++++++ yarn.lock | 13 +- 6 files changed, 730 insertions(+), 4 deletions(-) create mode 100644 packages/medusa/src/interfaces/abstract-parser.ts create mode 100644 packages/medusa/src/interfaces/csv-parser.ts create mode 100644 packages/medusa/src/services/__tests__/csv-parser.js create mode 100644 packages/medusa/src/services/csv-parser.ts diff --git a/packages/medusa/package.json b/packages/medusa/package.json index 9bdf94304a..94604fdb61 100644 --- a/packages/medusa/package.json +++ b/packages/medusa/package.json @@ -79,6 +79,7 @@ "morgan": "^1.9.1", "multer": "^1.4.2", "node-schedule": "^2.1.0", + "papaparse": "^5.3.2", "passport": "^0.4.0", "passport-http-bearer": "^1.0.1", "passport-jwt": "^4.0.0", diff --git a/packages/medusa/src/interfaces/abstract-parser.ts b/packages/medusa/src/interfaces/abstract-parser.ts new file mode 100644 index 0000000000..6368c29195 --- /dev/null +++ b/packages/medusa/src/interfaces/abstract-parser.ts @@ -0,0 +1,42 @@ +/** + * Generic parsing interface. All different parsing implementations (csv, json, etc.) should implement this interface + */ +export interface IParser { + /** + * + * @param readableStream readable stream to parse + * @param options options used for parsing by underlying parser implementation + */ + parse( + readableStream: NodeJS.ReadableStream, + options?: TParseOptions + ): Promise +} + +/** + * Abstract class implementation of the IParser interface. All different parsing implementations should extend this class + */ +export abstract class AbstractParser< + TSchema, + TParserResult, + TParseOptions, + TOutputResult +> implements IParser +{ + protected readonly $$schema: TSchema + + protected constructor(schema: TSchema) { + this.$$schema = schema + } + + public abstract parse( + readableStream: NodeJS.ReadableStream, + options?: TParseOptions + ): Promise + + /** + * + * @param data data to be built after parsing. Includes validation according to schema, transformation of values, etc. + */ + public abstract buildData(data: TParserResult[]): Promise +} diff --git a/packages/medusa/src/interfaces/csv-parser.ts b/packages/medusa/src/interfaces/csv-parser.ts new file mode 100644 index 0000000000..25c05a9990 --- /dev/null +++ b/packages/medusa/src/interfaces/csv-parser.ts @@ -0,0 +1,73 @@ +import { AwilixContainer } from "awilix" + +/** + * Generic validation interface used to run validation logic on every line or record. + * All different validation objects should implement this interface + */ +export interface ICsvValidator { + /** + * + * @param value value of column or property + * @param context includes contextual information such as line number, line, etc. + */ + validate: ( + value: TBuiltLine, + context: CsvParserContext + ) => Promise +} + +export type CsvParserContext = LineContext & { + column: string +} + +export type LineContext = { + lineNumber: number + line: TLine +} + +/** + * Abstract class implementation of the IValidator interface. + * All validation objects part of the schema should extend this class. + */ +export abstract class AbstractCsvValidator + implements ICsvValidator +{ + constructor(protected readonly container: AwilixContainer) {} + + abstract validate( + builtLine: TBuiltLine, + context: CsvParserContext + ): Promise +} + +export type CsvSchemaColumn = { + name: string + required?: boolean + validator?: AbstractCsvValidator +} & ( + | { + mapTo?: string + transform?: ColumnTransformer + } + | { + match?: RegExp + reducer?: ColumnReducer + transform?: ColumnTransformer + } +) + +export type ColumnTransformer = ( + value: string, + context: CsvParserContext +) => unknown + +export type ColumnReducer = ( + builtLine: TBuiltLine, + key: string, + value: string, + context: CsvParserContext +) => TBuiltLine + +export type CsvSchema = { + columns: CsvSchemaColumn[] +} diff --git a/packages/medusa/src/services/__tests__/csv-parser.js b/packages/medusa/src/services/__tests__/csv-parser.js new file mode 100644 index 0000000000..6c72ea26b4 --- /dev/null +++ b/packages/medusa/src/services/__tests__/csv-parser.js @@ -0,0 +1,403 @@ +import { createContainer } from "awilix" +import { Readable } from "stream" +import { AbstractCsvValidator } from "../../interfaces/csv-parser" +import CsvParser from "../csv-parser" +import { currencies } from "../../utils/currencies" + +describe("CsvParser", () => { + describe("parse", () => { + const csvParser = new CsvParser(createContainer(), { + columns: [], + }) + + let csvContent = + 'title,subtitle\n"T-shirt","summer tee"\n"Sunglasses","Red sunglasses"' + + let expectedProducts = [ + { + title: "T-shirt", + subtitle: "summer tee", + }, + { + title: "Sunglasses", + subtitle: "Red sunglasses", + }, + ] + + afterEach(() => { + jest.clearAllMocks() + }) + + it("given a readable stream, can parse the stream content", async () => { + const stream = Readable.from(csvContent) + const content = await csvParser.parse(stream) + + expect(content).toEqual(expectedProducts) + }) + }) + + describe("buildData", () => { + describe("schema validation", () => { + class TitleValidator extends AbstractCsvValidator { + async validate(builtLine) { + if (/\d/.test(builtLine["title"])) { + throw new Error("title should not contain a number") + } + return true + } + } + + const schema = { + columns: [ + { + name: "title", + validator: new TitleValidator(createContainer()), + }, + { + name: "size", + }, + { + name: "height", + }, + ], + } + + const csvParser = new CsvParser(createContainer(), schema) + + it("given a line containing a column which is not defined in the schema, then validation should fail", async () => { + try { + await csvParser.buildData([ + { + title: "sunglasses", + size: "M", + height: "100", + first_name: "lebron", + }, + ]) + } catch (err) { + expect(err.message).toEqual( + "Unable to treat column first_name from the csv file. No target column found in the provided schema" + ) + } + }) + + it("given a line containing a column which does not pass a validation constraint, then validation should fail", async () => { + try { + await csvParser.buildData([ + { title: "contains a number 1", size: "M", height: "100" }, + ]) + } catch (err) { + expect(err.message).toEqual("title should not contain a number") + } + }) + + it("given a line which passes all validation constraints, then should returned validated content", async () => { + const content = await csvParser.buildData([ + { title: "great product", size: "M", height: "100" }, + ]) + + expect(content).toEqual([ + { + title: "great product", + size: "M", + height: "100", + }, + ]) + }) + + it("given a line which does not provide a value for a required column, then should throw an error", async () => { + try { + await csvParser.buildData([{ size: "S", height: "100" }]) + } catch (err) { + expect(err.message).toEqual( + `Missing column(s) "title" from the given csv file` + ) + } + }) + + it("given a line which does not provide a value for multiple required columns, then should throw an error", async () => { + try { + await csvParser.buildData([{ size: "S" }]) + } catch (err) { + expect(err.message).toEqual( + `Missing column(s) "title", "height" from the given csv file` + ) + } + }) + + it("given a line which does not provide a value for a required column, then should throw an error", async () => { + try { + await csvParser.buildData([ + { title: "t-shirt", height: "100", size: "" }, + ]) + } catch (err) { + expect(err.message).toEqual( + `No value found for target column "size" in line 1 of the given csv file` + ) + } + }) + }) + + describe("mapTo", () => { + const csvParser = new CsvParser(createContainer(), { + columns: [ + { + name: "title", + mapTo: "product_title", + }, + ], + }) + + it("given a mapTo field for a column, when building data including that column, should rename the column name to what mapTo refers to", async () => { + const content = await csvParser.buildData([{ title: "a product" }]) + + expect(content).toEqual([ + { + product_title: "a product", + }, + ]) + }) + }) + + describe("transformer", () => { + const csvParser = new CsvParser(createContainer(), { + columns: [ + { + name: "title", + }, + { + name: "price usd", + transform: (value) => Math.round(Number(value) * 100), + }, + ], + }) + + it("given a transformer function for a column, when building data, should transform that column's value according to the transformation function", async () => { + const content = await csvParser.buildData([ + { title: "medusa t-shirt", "price usd": "19.99" }, + ]) + + expect(content).toEqual([ + { + title: "medusa t-shirt", + "price usd": 1999, + }, + ]) + }) + }) + + describe("match", () => { + describe("regex", () => { + const csvParser = new CsvParser(createContainer(), { + columns: [ + { + name: "title", + }, + { + name: "prices", + match: /.*Variant Price.*/i, + transform: (value) => Math.round(Number(value) * 100), + }, + ], + }) + + it("given a column with the match property as regex and a transformer, when building data, should resolve that column for all entries in the line that match the regex", async () => { + const content = await csvParser.buildData([ + { + title: "medusa t-shirt", + "variant price usd": "19.99", + "variant price cad": "26.79", + "variant price dkk": "1389", + }, + { + title: "medusa sunglasses", + "variant price usd": "9.99", + "variant price cad": "16.79", + "variant price dkk": "389", + }, + ]) + + expect(content).toEqual([ + { + title: "medusa t-shirt", + "variant price usd": 1999, + "variant price cad": 2679, + "variant price dkk": 138900, + }, + { + title: "medusa sunglasses", + "variant price usd": 999, + "variant price cad": 1679, + "variant price dkk": 38900, + }, + ]) + }) + }) + + describe("reducer", () => { + const schema = { + columns: [ + { + name: "title", + }, + { + name: "prices", + match: /.*Variant Price ([a-z]+).*/i, + reducer: (builtLine, key, value) => { + const [, currency_code] = key.match( + /.*Variant Price ([a-z]+).*/i + ) + const existingPrices = builtLine.prices ?? [] + const price = { + amount: Math.round(Number(value) * 100), + currency_code, + } + return { + ...builtLine, + prices: [...existingPrices, price], + } + }, + validator: { + validate: (builtLine) => { + const unexistingCurrency = builtLine.prices?.find( + (price) => !currencies[price.currency_code.toUpperCase()] + ) + if (unexistingCurrency) { + throw new Error( + `wrong currency: ${unexistingCurrency.currency_code}` + ) + } + return true + }, + }, + }, + ], + } + const csvParser = new CsvParser(createContainer(), schema) + + it("given a column with match and reducer properties, when building data, should return the result of the reducer function", async () => { + const content = await csvParser.buildData([ + { + title: "medusa t-shirt", + "variant price usd": "19.99", + "variant price cad": "26.79", + "variant price dkk": "1389", + }, + { + title: "medusa sunglasses", + "variant price usd": "9.99", + "variant price cad": "16.79", + "variant price dkk": "389", + }, + ]) + + expect(content).toEqual([ + { + title: "medusa t-shirt", + prices: [ + { + currency_code: "usd", + amount: 1999, + }, + { + currency_code: "cad", + amount: 2679, + }, + { + currency_code: "dkk", + amount: 138900, + }, + ], + }, + { + title: "medusa sunglasses", + prices: [ + { + currency_code: "usd", + amount: 999, + }, + { + currency_code: "cad", + amount: 1679, + }, + { + currency_code: "dkk", + amount: 38900, + }, + ], + }, + ]) + }) + + it("given a column with match and reducer properties, when building data, should run validation on the built data", async () => { + try { + await csvParser.buildData([ + { + title: "medusa t-shirt", + "variant price usd": "19.99", + "variant price cad": "26.79", + "variant price grp": "1389", + }, + { + title: "medusa sunglasses", + "variant price usd": "9.99", + "variant price cad": "16.79", + "variant price grp": "389", + }, + ]) + } catch (err) { + expect(err.message).toEqual("wrong currency: grp") + } + }) + + describe("invalid column properties", () => { + const schema = { + columns: [ + { + name: "title", + }, + { + name: "variants", + match: /.*Variant Price ([a-z]+).*/i, + mapTo: "prices", + }, + ], + } + const csvParser = new CsvParser(createContainer(), schema) + + it("given a column with match and mapTo property, when building data, then the mapTo property should be ignored", async () => { + const content = await csvParser.buildData([ + { + title: "medusa t-shirt", + "variant price usd": "19.99", + "variant price cad": "26.79", + "variant price dkk": "1389", + }, + { + title: "medusa sunglasses", + "variant price usd": "9.99", + "variant price cad": "16.79", + "variant price dkk": "389", + }, + ]) + + expect(content).toEqual([ + { + title: "medusa t-shirt", + "variant price usd": "19.99", + "variant price cad": "26.79", + "variant price dkk": "1389", + }, + { + title: "medusa sunglasses", + "variant price usd": "9.99", + "variant price cad": "16.79", + "variant price dkk": "389", + }, + ]) + }) + }) + }) + }) + }) +}) diff --git a/packages/medusa/src/services/csv-parser.ts b/packages/medusa/src/services/csv-parser.ts new file mode 100644 index 0000000000..239390b7ba --- /dev/null +++ b/packages/medusa/src/services/csv-parser.ts @@ -0,0 +1,202 @@ +import { AwilixContainer } from "awilix" +import { difference } from "lodash" +import Papa, { ParseConfig } from "papaparse" +import { AbstractParser } from "../interfaces/abstract-parser" +import { CsvParserContext, CsvSchema } from "../interfaces/csv-parser" + +const DEFAULT_PARSE_OPTIONS = { + dynamicTyping: true, + header: true, +} + +class CsvParser< + TSchema extends CsvSchema = CsvSchema, + TParserResult = unknown, + TOutputResult = unknown +> extends AbstractParser { + protected readonly $$delimiter: string = ";" + + constructor( + protected readonly container: AwilixContainer, + schema: TSchema, + delimiter?: string + ) { + super(schema) + if (delimiter) { + this.$$delimiter = delimiter + } + } + + public async parse( + readableStream: NodeJS.ReadableStream, + options: ParseConfig = DEFAULT_PARSE_OPTIONS + ): Promise { + const csvStream = Papa.parse(Papa.NODE_STREAM_INPUT, options) + + const parsedContent: TParserResult[] = [] + readableStream.pipe(csvStream) + for await (const chunk of csvStream) { + parsedContent.push(chunk) + } + + return parsedContent + } + + async buildData(data: TParserResult[]): Promise { + const validatedData = [] as TOutputResult[] + for (let i = 0; i < data.length; i++) { + const builtLine = await this._buildLine(data[i], i + 1) + validatedData.push(builtLine) + } + return validatedData + } + + private async _buildLine( + line: TParserResult, + lineNumber: number + ): Promise { + let outputTuple = {} as TOutputResult + const columnMap = this.buildColumnMap_(this.$$schema.columns) + + const tupleKeys = Object.keys(line) + + /** + * map which keeps track of the columns processed + * used to detect any missing columns which are present in the schema but not in the line + */ + const processedColumns = {} + for (const tupleKey of tupleKeys) { + const column = this.resolveColumn_(tupleKey, columnMap) + + /** + * if the tupleKey does not correspond to any column defined in the schema + */ + if (!column) { + throw new Error( + `Unable to treat column ${tupleKey} from the csv file. No target column found in the provided schema` + ) + } + + processedColumns[column.name] = true + + /** + * if the value corresponding to the tupleKey is empty and the column is required in the schema + */ + if (!line[tupleKey] && column.required) { + throw new Error( + `No value found for target column "${column.name}" in line ${lineNumber} of the given csv file` + ) + } + + const context = { + line, + lineNumber, + column: column.name, + tupleKey, + } + + outputTuple = this.resolveTuple_(outputTuple, column, context) + } + + /** + * missing columns = columns defined in the schema - columns present in the line + */ + const missingColumns = difference( + Object.keys(columnMap), + Object.keys(processedColumns) + ) + + if (missingColumns.length > 0) { + throw new Error( + `Missing column(s) ${formatMissingColumns( + missingColumns + )} from the given csv file` + ) + } + + /** + * Runs the validation defined in the schema columns + */ + for (const column of this.$$schema.columns) { + const context = { + line, + lineNumber, + column: column.name, + } + + if (column.validator) { + await column.validator.validate(outputTuple, context) + } + } + + return outputTuple + } + + private buildColumnMap_( + columns: TSchema["columns"] + ): Record { + return columns.reduce((map, column) => { + if (typeof column.name === "string") { + map[column.name] = column + } + return map + }, {}) + } + + private resolveColumn_( + tupleKey: string, + columnMap: Record + ): TSchema["columns"][number] | undefined { + if (columnMap[tupleKey]) { + return columnMap[tupleKey] + } + + const matchedColumn = this.$$schema.columns.find((column) => + "match" in column && + typeof column.match === "object" && + column.match instanceof RegExp + ? column.match.test(tupleKey) + : false + ) + + return matchedColumn + } + + private resolveTuple_( + tuple: TOutputResult, + column: TSchema["columns"][number], + context: CsvParserContext & { tupleKey: string } + ): TOutputResult { + const outputTuple = { ...tuple } + const { tupleKey, ...csvContext } = context + const { line } = csvContext + + let resolvedKey = tupleKey + /** + * if match is provided, then we should call the reducer if it's defined + * otherwise, before using the mapTo property, we should make sure match was not provided + */ + if ("match" in column && column.reducer) { + return column.reducer(outputTuple, tupleKey, line[tupleKey], csvContext) + } else if (!("match" in column) && "mapTo" in column && column.mapTo) { + resolvedKey = column.mapTo + } + + const resolvedValue = column.transform + ? column.transform(line[tupleKey], csvContext) + : line[tupleKey] + + outputTuple[resolvedKey] = resolvedValue + + return outputTuple + } +} + +const formatMissingColumns = (list: string[]): string => + list.reduce( + (text, curr, i, array) => + text + (i < array.length - 1 ? `"${curr}", ` : `"${curr}"`), + "" + ) + +export default CsvParser diff --git a/yarn.lock b/yarn.lock index 1735653a80..f65b8590fa 100644 --- a/yarn.lock +++ b/yarn.lock @@ -22201,6 +22201,11 @@ pako@~1.0.5: resolved "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz" integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw== +papaparse@^5.3.2: + version "5.3.2" + resolved "https://registry.yarnpkg.com/papaparse/-/papaparse-5.3.2.tgz#d1abed498a0ee299f103130a6109720404fbd467" + integrity sha512-6dNZu0Ki+gyV0eBsFKJhYr+MdQYAzFUGlBMNj3GNrmHxmz1lfRa24CjFObPXtjcetlOv5Ad299MhIK0znp3afw== + parallel-transform@^1.1.0: version "1.2.0" resolved "https://registry.npmjs.org/parallel-transform/-/parallel-transform-1.2.0.tgz" @@ -27668,10 +27673,10 @@ typeorm@^0.2.29, typeorm@^0.2.31: yargs "^17.0.1" zen-observable-ts "^1.0.0" -typescript@^3.7.3, typescript@^4.5.0: - version "4.7.2" - resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.7.2.tgz#1f9aa2ceb9af87cca227813b4310fff0b51593c4" - integrity sha512-Mamb1iX2FDUpcTRzltPxgWMKy3fhg0TN378ylbktPGPK/99KbDtMQ4W1hwgsbPAsG3a0xKa1vmw4VKZQbkvz5A== +typescript@^3.7.3: + version "3.9.10" + resolved "https://registry.yarnpkg.com/typescript/-/typescript-3.9.10.tgz#70f3910ac7a51ed6bef79da7800690b19bf778b8" + integrity sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q== typescript@^4.1.3: version "4.4.2"