feat(medusa): Parsing CSV files (#1572)

* add: csv parser

* fix: tests

* fix: linting + comment

* add: missing dependency

* fix: generic transformers

* fix: address comments

* fix: make parser options generic

* add: support regex columns + fix missing columns bug

* improve test case

* add: support for reducer + tests

* fix: add comments to csv parser
This commit is contained in:
Zakaria El Asri
2022-06-19 16:10:15 +01:00
committed by GitHub
parent 46a6e1a4d3
commit 9e686a8e47
6 changed files with 730 additions and 4 deletions

View File

@@ -79,6 +79,7 @@
"morgan": "^1.9.1",
"multer": "^1.4.2",
"node-schedule": "^2.1.0",
"papaparse": "^5.3.2",
"passport": "^0.4.0",
"passport-http-bearer": "^1.0.1",
"passport-jwt": "^4.0.0",

View File

@@ -0,0 +1,42 @@
/**
 * Generic parsing interface. All different parsing implementations (csv, json, etc.) should implement this interface
 */
export interface IParser<TResult, TParseOptions> {
  /**
   * Parses the given stream into a list of records.
   *
   * @param readableStream readable stream to parse
   * @param options options used for parsing by underlying parser implementation
   * @returns promise resolving to the parsed records
   */
  parse(
    readableStream: NodeJS.ReadableStream,
    options?: TParseOptions
  ): Promise<TResult[]>
}
/**
 * Abstract class implementation of the IParser interface. All different parsing implementations should extend this class
 */
export abstract class AbstractParser<
  TSchema,
  TParserResult,
  TParseOptions,
  TOutputResult
> implements IParser<TParserResult, TParseOptions>
{
  // Schema describing the expected shape of the parsed content; set once at
  // construction and read-only afterwards.
  protected readonly $$schema: TSchema

  protected constructor(schema: TSchema) {
    this.$$schema = schema
  }

  /**
   * Parses the given readable stream into raw records.
   *
   * @param readableStream readable stream to parse
   * @param options options used for parsing by underlying parser implementation
   */
  public abstract parse(
    readableStream: NodeJS.ReadableStream,
    options?: TParseOptions
  ): Promise<TParserResult[]>

  /**
   * Builds the final output records from the raw parsed data.
   *
   * @param data data to be built after parsing. Includes validation according to schema, transformation of values, etc.
   */
  public abstract buildData(data: TParserResult[]): Promise<TOutputResult[]>
}

View File

@@ -0,0 +1,73 @@
import { AwilixContainer } from "awilix"
/**
 * Generic validation interface used to run validation logic on every line or record.
 * All different validation objects should implement this interface
 */
export interface ICsvValidator<TCsvLine, TBuiltLine> {
  /**
   * Validates a built line: resolves to true on success and is expected to
   * throw (reject) on failure.
   *
   * @param value value of column or property
   * @param context includes contextual information such as line number, line, etc.
   */
  validate: (
    value: TBuiltLine,
    context: CsvParserContext<TCsvLine>
  ) => Promise<boolean | never>
}
/**
 * Context handed to validators/transformers/reducers: the line context plus
 * the name of the schema column currently being processed.
 */
export type CsvParserContext<TLine> = LineContext<TLine> & {
  column: string
}

/**
 * Positional information about the csv line being processed.
 */
export type LineContext<TLine> = {
  // 1-based line number within the csv file
  lineNumber: number
  // the raw parsed line
  line: TLine
}
/**
 * Abstract class implementation of the IValidator interface.
 * All validation objects part of the schema should extend this class.
 */
export abstract class AbstractCsvValidator<TCsvLine, TBuiltLine>
  implements ICsvValidator<TCsvLine, TBuiltLine>
{
  // The container gives concrete validators access to application services
  // (repositories, etc.) resolved at runtime.
  constructor(protected readonly container: AwilixContainer) {}

  /**
   * @param builtLine the line after mapping/transformation
   * @param context contextual information (line number, column, raw line)
   */
  abstract validate(
    builtLine: TBuiltLine,
    context: CsvParserContext<TCsvLine>
  ): Promise<boolean | never>
}
/**
 * Definition of a single schema column. A column is resolved either by its
 * exact `name` or — for the variant carrying `match` — by testing csv keys
 * against a regex. `mapTo` renames the key in the built line, `transform`
 * converts the raw value, and `reducer` folds several matched keys into one
 * property of the built line.
 */
export type CsvSchemaColumn<TCsvLine, TBuiltLine> = {
  name: string
  required?: boolean
  validator?: AbstractCsvValidator<TCsvLine, TBuiltLine>
} & (
  | {
      mapTo?: string
      transform?: ColumnTransformer<TCsvLine>
    }
  | {
      match?: RegExp
      reducer?: ColumnReducer<TCsvLine, TBuiltLine>
      transform?: ColumnTransformer<TCsvLine>
    }
)

/**
 * Converts the raw string value of a csv cell into the value stored on the
 * built line.
 */
export type ColumnTransformer<TCsvLine> = (
  value: string,
  context: CsvParserContext<TCsvLine>
) => unknown

/**
 * Folds one matched csv entry (key/value pair) into the line built so far
 * and returns the updated line.
 */
export type ColumnReducer<TCsvLine = unknown, TBuiltLine = unknown> = (
  builtLine: TBuiltLine,
  key: string,
  value: string,
  context: CsvParserContext<TCsvLine>
) => TBuiltLine

/**
 * Schema consumed by the csv parser: the list of expected/allowed columns.
 */
export type CsvSchema<TCsvLine = unknown, TBuiltLine = unknown> = {
  columns: CsvSchemaColumn<TCsvLine, TBuiltLine>[]
}

View File

@@ -0,0 +1,403 @@
import { createContainer } from "awilix"
import { Readable } from "stream"
import { AbstractCsvValidator } from "../../interfaces/csv-parser"
import CsvParser from "../csv-parser"
import { currencies } from "../../utils/currencies"
describe("CsvParser", () => {
describe("parse", () => {
  // Parser with an empty schema: parse() only reads the raw csv stream;
  // the schema is consumed later by buildData().
  const csvParser = new CsvParser(createContainer(), {
    columns: [],
  })

  // Csv fixture with a header row: papaparse's `header` option maps each
  // subsequent line to an object keyed by the header columns.
  let csvContent =
    'title,subtitle\n"T-shirt","summer tee"\n"Sunglasses","Red sunglasses"'
  let expectedProducts = [
    {
      title: "T-shirt",
      subtitle: "summer tee",
    },
    {
      title: "Sunglasses",
      subtitle: "Red sunglasses",
    },
  ]

  afterEach(() => {
    jest.clearAllMocks()
  })

  it("given a readable stream, can parse the stream content", async () => {
    const stream = Readable.from(csvContent)
    const content = await csvParser.parse(stream)
    expect(content).toEqual(expectedProducts)
  })
})
describe("buildData", () => {
describe("schema validation", () => {
  // Validator rejecting any title that contains a digit.
  // NOTE(review): AbstractCsvValidator is generic but extended here without
  // type arguments — presumably this file compiles under loose settings;
  // confirm it still compiles under "strict".
  class TitleValidator extends AbstractCsvValidator {
    async validate(builtLine) {
      if (/\d/.test(builtLine["title"])) {
        throw new Error("title should not contain a number")
      }
      return true
    }
  }

  // NOTE(review): none of these columns set `required: true`, which matters
  // for the empty-value test at the bottom of this describe.
  const schema = {
    columns: [
      {
        name: "title",
        validator: new TitleValidator(createContainer()),
      },
      {
        name: "size",
      },
      {
        name: "height",
      },
    ],
  }
  const csvParser = new CsvParser(createContainer(), schema)

  // NOTE(review): the try/catch pattern used below passes vacuously when no
  // error is thrown (the catch body never runs and nothing is asserted).
  // Consider `expect.assertions(1)` or `await expect(...).rejects.toThrow(...)`.
  it("given a line containing a column which is not defined in the schema, then validation should fail", async () => {
    try {
      await csvParser.buildData([
        {
          title: "sunglasses",
          size: "M",
          height: "100",
          first_name: "lebron",
        },
      ])
    } catch (err) {
      expect(err.message).toEqual(
        "Unable to treat column first_name from the csv file. No target column found in the provided schema"
      )
    }
  })

  it("given a line containing a column which does not pass a validation constraint, then validation should fail", async () => {
    try {
      await csvParser.buildData([
        { title: "contains a number 1", size: "M", height: "100" },
      ])
    } catch (err) {
      expect(err.message).toEqual("title should not contain a number")
    }
  })

  it("given a line which passes all validation constraints, then should returned validated content", async () => {
    const content = await csvParser.buildData([
      { title: "great product", size: "M", height: "100" },
    ])
    expect(content).toEqual([
      {
        title: "great product",
        size: "M",
        height: "100",
      },
    ])
  })

  it("given a line which does not provide a value for a required column, then should throw an error", async () => {
    try {
      await csvParser.buildData([{ size: "S", height: "100" }])
    } catch (err) {
      expect(err.message).toEqual(
        `Missing column(s) "title" from the given csv file`
      )
    }
  })

  it("given a line which does not provide a value for multiple required columns, then should throw an error", async () => {
    try {
      await csvParser.buildData([{ size: "S" }])
    } catch (err) {
      expect(err.message).toEqual(
        `Missing column(s) "title", "height" from the given csv file`
      )
    }
  })

  // NOTE(review): no column in this schema is marked `required`, so per the
  // parser's empty-value check this buildData call may not throw at all and
  // the assertion inside catch may never run — verify this test actually
  // exercises the error path.
  it("given a line which does not provide a value for a required column, then should throw an error", async () => {
    try {
      await csvParser.buildData([
        { title: "t-shirt", height: "100", size: "" },
      ])
    } catch (err) {
      expect(err.message).toEqual(
        `No value found for target column "size" in line 1 of the given csv file`
      )
    }
  })
})
describe("mapTo", () => {
const csvParser = new CsvParser(createContainer(), {
columns: [
{
name: "title",
mapTo: "product_title",
},
],
})
it("given a mapTo field for a column, when building data including that column, should rename the column name to what mapTo refers to", async () => {
const content = await csvParser.buildData([{ title: "a product" }])
expect(content).toEqual([
{
product_title: "a product",
},
])
})
})
describe("transformer", () => {
const csvParser = new CsvParser(createContainer(), {
columns: [
{
name: "title",
},
{
name: "price usd",
transform: (value) => Math.round(Number(value) * 100),
},
],
})
it("given a transformer function for a column, when building data, should transform that column's value according to the transformation function", async () => {
const content = await csvParser.buildData([
{ title: "medusa t-shirt", "price usd": "19.99" },
])
expect(content).toEqual([
{
title: "medusa t-shirt",
"price usd": 1999,
},
])
})
})
describe("match", () => {
describe("regex", () => {
  const csvParser = new CsvParser(createContainer(), {
    columns: [
      {
        name: "title",
      },
      {
        // every csv key matching the regex resolves to this single column;
        // the transform is applied to each matched entry individually
        name: "prices",
        match: /.*Variant Price.*/i,
        // decimal price string -> integer amount in cents
        transform: (value) => Math.round(Number(value) * 100),
      },
    ],
  })

  it("given a column with the match property as regex and a transformer, when building data, should resolve that column for all entries in the line that match the regex", async () => {
    const content = await csvParser.buildData([
      {
        title: "medusa t-shirt",
        "variant price usd": "19.99",
        "variant price cad": "26.79",
        "variant price dkk": "1389",
      },
      {
        title: "medusa sunglasses",
        "variant price usd": "9.99",
        "variant price cad": "16.79",
        "variant price dkk": "389",
      },
    ])
    expect(content).toEqual([
      {
        title: "medusa t-shirt",
        "variant price usd": 1999,
        "variant price cad": 2679,
        "variant price dkk": 138900,
      },
      {
        title: "medusa sunglasses",
        "variant price usd": 999,
        "variant price cad": 1679,
        "variant price dkk": 38900,
      },
    ])
  })
})
describe("reducer", () => {
  const schema = {
    columns: [
      {
        name: "title",
      },
      {
        name: "prices",
        // the capture group extracts the currency code from the csv header
        match: /.*Variant Price ([a-z]+).*/i,
        // Folds every matched "variant price <code>" entry into a single
        // `prices` array on the built line.
        reducer: (builtLine, key, value) => {
          // NOTE(review): key.match(...) can return null in principle; the
          // reducer only runs for keys that already matched the column regex,
          // so the destructuring is safe as long as the two regexes stay in
          // sync — confirm.
          const [, currency_code] = key.match(
            /.*Variant Price ([a-z]+).*/i
          )
          const existingPrices = builtLine.prices ?? []
          const price = {
            amount: Math.round(Number(value) * 100),
            currency_code,
          }
          return {
            ...builtLine,
            prices: [...existingPrices, price],
          }
        },
        // Plain object standing in for an AbstractCsvValidator — the parser
        // only calls `.validate`, so a structural implementation suffices.
        validator: {
          validate: (builtLine) => {
            const unexistingCurrency = builtLine.prices?.find(
              (price) => !currencies[price.currency_code.toUpperCase()]
            )
            if (unexistingCurrency) {
              throw new Error(
                `wrong currency: ${unexistingCurrency.currency_code}`
              )
            }
            return true
          },
        },
      },
    ],
  }
  const csvParser = new CsvParser(createContainer(), schema)

  it("given a column with match and reducer properties, when building data, should return the result of the reducer function", async () => {
    const content = await csvParser.buildData([
      {
        title: "medusa t-shirt",
        "variant price usd": "19.99",
        "variant price cad": "26.79",
        "variant price dkk": "1389",
      },
      {
        title: "medusa sunglasses",
        "variant price usd": "9.99",
        "variant price cad": "16.79",
        "variant price dkk": "389",
      },
    ])
    expect(content).toEqual([
      {
        title: "medusa t-shirt",
        prices: [
          {
            currency_code: "usd",
            amount: 1999,
          },
          {
            currency_code: "cad",
            amount: 2679,
          },
          {
            currency_code: "dkk",
            amount: 138900,
          },
        ],
      },
      {
        title: "medusa sunglasses",
        prices: [
          {
            currency_code: "usd",
            amount: 999,
          },
          {
            currency_code: "cad",
            amount: 1679,
          },
          {
            currency_code: "dkk",
            amount: 38900,
          },
        ],
      },
    ])
  })

  // NOTE(review): this test passes vacuously if the validator does not throw
  // (nothing is asserted outside the catch) — consider expect.assertions(1)
  // or await expect(...).rejects.toThrow(...).
  it("given a column with match and reducer properties, when building data, should run validation on the built data", async () => {
    try {
      await csvParser.buildData([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price grp": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price grp": "389",
        },
      ])
    } catch (err) {
      expect(err.message).toEqual("wrong currency: grp")
    }
  })

  describe("invalid column properties", () => {
    // mapTo combined with match: per the parser, mapTo is honoured only when
    // match is absent, so the original keys are expected to be preserved.
    const schema = {
      columns: [
        {
          name: "title",
        },
        {
          name: "variants",
          match: /.*Variant Price ([a-z]+).*/i,
          mapTo: "prices",
        },
      ],
    }
    const csvParser = new CsvParser(createContainer(), schema)

    it("given a column with match and mapTo property, when building data, then the mapTo property should be ignored", async () => {
      const content = await csvParser.buildData([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price dkk": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price dkk": "389",
        },
      ])
      expect(content).toEqual([
        {
          title: "medusa t-shirt",
          "variant price usd": "19.99",
          "variant price cad": "26.79",
          "variant price dkk": "1389",
        },
        {
          title: "medusa sunglasses",
          "variant price usd": "9.99",
          "variant price cad": "16.79",
          "variant price dkk": "389",
        },
      ])
    })
  })
})
})
})
})

View File

@@ -0,0 +1,202 @@
import { AwilixContainer } from "awilix"
import { difference } from "lodash"
import Papa, { ParseConfig } from "papaparse"
import { AbstractParser } from "../interfaces/abstract-parser"
import { CsvParserContext, CsvSchema } from "../interfaces/csv-parser"
// Default papaparse configuration: infer numeric/boolean types from the cell
// content and treat the first row as the header (keys of each parsed record).
const DEFAULT_PARSE_OPTIONS = {
  dynamicTyping: true,
  header: true,
}
class CsvParser<
  TSchema extends CsvSchema<TParserResult, TOutputResult> = CsvSchema,
  TParserResult = unknown,
  TOutputResult = unknown
> extends AbstractParser<TSchema, TParserResult, ParseConfig, TOutputResult> {
  // NOTE(review): the delimiter is stored but never forwarded to papaparse
  // (which auto-detects the delimiter by default) — confirm whether it should
  // be passed through in the parse options.
  protected readonly $$delimiter: string = ";"

  constructor(
    protected readonly container: AwilixContainer,
    schema: TSchema,
    delimiter?: string
  ) {
    super(schema)
    if (delimiter) {
      this.$$delimiter = delimiter
    }
  }

  /**
   * Parses the readable stream with papaparse and collects every emitted
   * record into an array.
   *
   * @param readableStream readable stream containing the csv content
   * @param options papaparse configuration; defaults to header-based parsing
   *   with dynamic typing
   * @returns the parsed records, one per csv line
   */
  public async parse(
    readableStream: NodeJS.ReadableStream,
    options: ParseConfig = DEFAULT_PARSE_OPTIONS
  ): Promise<TParserResult[]> {
    const csvStream = Papa.parse(Papa.NODE_STREAM_INPUT, options)

    const parsedContent: TParserResult[] = []
    readableStream.pipe(csvStream)
    // papaparse's node stream emits one record per csv line
    for await (const chunk of csvStream) {
      parsedContent.push(chunk)
    }

    return parsedContent
  }

  /**
   * Builds and validates every parsed line according to the schema.
   *
   * @param data records returned by `parse`
   * @returns the built records after column mapping, transformation,
   *   reduction and validation
   * @throws if a line contains a column unknown to the schema, misses a
   *   schema column, or fails a column validator
   */
  async buildData(data: TParserResult[]): Promise<TOutputResult[]> {
    const validatedData = [] as TOutputResult[]
    for (let i = 0; i < data.length; i++) {
      // line numbers reported in error messages are 1-based
      const builtLine = await this._buildLine(data[i], i + 1)
      validatedData.push(builtLine)
    }

    return validatedData
  }

  /**
   * Builds a single line: resolves each csv key against the schema, applies
   * mapTo/transform/reducer, checks for columns missing from the line and
   * finally runs the per-column validators on the built result.
   *
   * @param line parsed csv line
   * @param lineNumber 1-based line number, used in error messages
   */
  private async _buildLine(
    line: TParserResult,
    lineNumber: number
  ): Promise<TOutputResult> {
    let outputTuple = {} as TOutputResult

    const columnMap = this.buildColumnMap_(this.$$schema.columns)
    const tupleKeys = Object.keys(line)

    /**
     * map which keeps track of the columns processed
     * used to detect any missing columns which are present in the schema but not in the line
     */
    const processedColumns = {}

    for (const tupleKey of tupleKeys) {
      const column = this.resolveColumn_(tupleKey, columnMap)

      // the tupleKey does not correspond to any column defined in the schema
      if (!column) {
        throw new Error(
          `Unable to treat column ${tupleKey} from the csv file. No target column found in the provided schema`
        )
      }

      processedColumns[column.name] = true

      // A required column must carry a value. Only null/undefined/"" count
      // as missing — with `dynamicTyping` enabled a legitimate value such as
      // 0 or false would otherwise be rejected by a plain falsiness check.
      const value = line[tupleKey]
      if (
        (value === undefined || value === null || value === "") &&
        column.required
      ) {
        throw new Error(
          `No value found for target column "${column.name}" in line ${lineNumber} of the given csv file`
        )
      }

      const context = {
        line,
        lineNumber,
        column: column.name,
        tupleKey,
      }

      outputTuple = this.resolveTuple_(outputTuple, column, context)
    }

    // missing columns = columns defined in the schema - columns present in the line
    const missingColumns = difference(
      Object.keys(columnMap),
      Object.keys(processedColumns)
    )

    if (missingColumns.length > 0) {
      throw new Error(
        `Missing column(s) ${formatMissingColumns(
          missingColumns
        )} from the given csv file`
      )
    }

    // Runs the validation defined in the schema columns on the fully built line
    for (const column of this.$$schema.columns) {
      const context = {
        line,
        lineNumber,
        column: column.name,
      }
      if (column.validator) {
        await column.validator.validate(outputTuple, context)
      }
    }

    return outputTuple
  }

  /**
   * Indexes the schema columns by their `name` so a csv key can be resolved
   * by exact lookup. Regex (`match`) columns are resolved separately in
   * `resolveColumn_`.
   */
  private buildColumnMap_(
    columns: TSchema["columns"]
  ): Record<string, TSchema["columns"][number]> {
    return columns.reduce((map, column) => {
      // defensive guard for untyped (plain js) callers
      if (typeof column.name === "string") {
        map[column.name] = column
      }
      return map
    }, {})
  }

  /**
   * Resolves the schema column a csv key belongs to: first by exact name,
   * then by testing the key against each column's `match` regex.
   *
   * @returns the matching column, or undefined when the key is unknown
   */
  private resolveColumn_(
    tupleKey: string,
    columnMap: Record<string, TSchema["columns"][number]>
  ): TSchema["columns"][number] | undefined {
    const exactMatch = columnMap[tupleKey]
    if (exactMatch) {
      return exactMatch
    }

    return this.$$schema.columns.find(
      (column) =>
        "match" in column &&
        column.match instanceof RegExp &&
        column.match.test(tupleKey)
    )
  }

  /**
   * Applies a single csv entry to the tuple built so far.
   *
   * - a column carrying `match` and a `reducer` delegates entirely to the
   *   reducer and returns its result
   * - otherwise `mapTo` (honoured only without `match`) renames the key and
   *   `transform`, when present, converts the raw value
   *
   * @returns a new tuple including the resolved entry
   */
  private resolveTuple_(
    tuple: TOutputResult,
    column: TSchema["columns"][number],
    context: CsvParserContext<TParserResult> & { tupleKey: string }
  ): TOutputResult {
    const outputTuple = { ...tuple }
    const { tupleKey, ...csvContext } = context
    const { line } = csvContext

    let resolvedKey = tupleKey

    /**
     * if match is provided, then we should call the reducer if it's defined
     * otherwise, before using the mapTo property, we should make sure match was not provided
     */
    if ("match" in column && column.reducer) {
      return column.reducer(outputTuple, tupleKey, line[tupleKey], csvContext)
    } else if (!("match" in column) && "mapTo" in column && column.mapTo) {
      resolvedKey = column.mapTo
    }

    const resolvedValue = column.transform
      ? column.transform(line[tupleKey], csvContext)
      : line[tupleKey]

    outputTuple[resolvedKey] = resolvedValue
    return outputTuple
  }
}
/**
 * Joins column names into a human readable list for error messages,
 * e.g. ["title", "height"] -> `"title", "height"`.
 *
 * @param list column names to format
 * @returns each name wrapped in double quotes, separated by ", "
 */
const formatMissingColumns = (list: string[]): string =>
  list.map((column) => `"${column}"`).join(", ")
export default CsvParser

View File

@@ -22201,6 +22201,11 @@ pako@~1.0.5:
resolved "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz"
integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==
papaparse@^5.3.2:
version "5.3.2"
resolved "https://registry.yarnpkg.com/papaparse/-/papaparse-5.3.2.tgz#d1abed498a0ee299f103130a6109720404fbd467"
integrity sha512-6dNZu0Ki+gyV0eBsFKJhYr+MdQYAzFUGlBMNj3GNrmHxmz1lfRa24CjFObPXtjcetlOv5Ad299MhIK0znp3afw==
parallel-transform@^1.1.0:
version "1.2.0"
resolved "https://registry.npmjs.org/parallel-transform/-/parallel-transform-1.2.0.tgz"
@@ -27668,10 +27673,10 @@ typeorm@^0.2.29, typeorm@^0.2.31:
yargs "^17.0.1"
zen-observable-ts "^1.0.0"
typescript@^3.7.3, typescript@^4.5.0:
version "4.7.2"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.7.2.tgz#1f9aa2ceb9af87cca227813b4310fff0b51593c4"
integrity sha512-Mamb1iX2FDUpcTRzltPxgWMKy3fhg0TN378ylbktPGPK/99KbDtMQ4W1hwgsbPAsG3a0xKa1vmw4VKZQbkvz5A==
typescript@^3.7.3:
version "3.9.10"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-3.9.10.tgz#70f3910ac7a51ed6bef79da7800690b19bf778b8"
integrity sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==
typescript@^4.1.3:
version "4.4.2"