chore(orchestration): add support for autoRetry, maxAwaitingRetries, retryStep (#13391)

RESOLVES CORE-1163
RESOLVES CORE-1164

**What**

### Add support for non auto retryable steps.

When marking a step with `maxRetries`, when it will fail it will be marked as temporary failure and then retry itself automatically. Thats the default behaviour, if you now add `autoRetry: false`, when the step will fail it will be marked as temporary failure but not retry automatically. you can now call the workflow engine run to resume the workflow from the failing step to be retried.

### Add support for `maxAwaitingRetries`

When setting `retyIntervalAwaiting` a step that is awaiting will be retried after the specified interval without maximun retry. Now you can set `maxAwaitingRetries` to force a maximum awaiting retry number

### Add support to manually retry an awaiting step

In some scenario, either a machine dies while a step is executing or a step is taking longer than expected, you can now call `retryStep` on the workflow engine to force a retry of the step that is supposedly stucked
This commit is contained in:
Adrien de Peretti
2025-09-08 14:46:30 +02:00
committed by GitHub
parent ac5e23b96c
commit d7692100e7
28 changed files with 1366 additions and 64 deletions

View File

@@ -0,0 +1,56 @@
import {
createStep,
createWorkflow,
StepResponse,
} from "@medusajs/framework/workflows-sdk"
const step1InvokeMock = jest.fn((input) => {
input.test = "test"
return new StepResponse(input, { compensate: 123 })
})
const step1CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
const step2InvokeMock = jest.fn((input) => {
throw new Error("Temporary failure")
})
const step2CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
export {
step1CompensateMock,
step1InvokeMock,
step2CompensateMock,
step2InvokeMock,
}
const step_1 = createStep("step_1", step1InvokeMock, step1CompensateMock)
const step_2 = createStep("step_2", step2InvokeMock, step2CompensateMock)
createWorkflow("workflow_1_auto_retries", function (input) {
step_1(input)
const ret2 = step_2({ hey: "oh" }).config({
async: true,
maxRetries: 2,
})
return ret2
})

View File

@@ -0,0 +1,57 @@
import {
createStep,
createWorkflow,
StepResponse,
} from "@medusajs/framework/workflows-sdk"
const step1InvokeMock = jest.fn((input) => {
input.test = "test"
return new StepResponse(input, { compensate: 123 })
})
const step1CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
const step2InvokeMock = jest.fn((input) => {
throw new Error("Temporary failure")
})
const step2CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
export {
step1CompensateMock,
step1InvokeMock,
step2CompensateMock,
step2InvokeMock,
}
const step_1 = createStep("step_1", step1InvokeMock, step1CompensateMock)
const step_2 = createStep("step_2", step2InvokeMock, step2CompensateMock)
createWorkflow("workflow_1_auto_retries_false", function (input) {
step_1(input)
const ret2 = step_2({ hey: "oh" }).config({
async: true,
maxRetries: 2,
autoRetry: false,
})
return ret2
})

View File

@@ -0,0 +1,60 @@
import {
createStep,
createWorkflow,
StepResponse,
} from "@medusajs/framework/workflows-sdk"
const step1InvokeMock = jest.fn((input) => {
input.test = "test"
return new StepResponse(input, { compensate: 123 })
})
const step1CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
const step2InvokeMock = jest.fn(async (input, context) => {
if (context.metadata.attempt === 1) {
await new Promise((resolve) => setTimeout(resolve, 100000))
return new StepResponse("success after 100 seconds")
}
return new StepResponse("success")
})
const step2CompensateMock = jest.fn((compensateInput) => {
if (!compensateInput) {
return
}
return new StepResponse({
reverted: true,
})
})
export {
step1CompensateMock,
step1InvokeMock,
step2CompensateMock,
step2InvokeMock,
}
const step_1 = createStep("step_1", step1InvokeMock, step1CompensateMock)
const step_2 = createStep("step_2", step2InvokeMock, step2CompensateMock)
createWorkflow("workflow_1_manual_retry_step", function (input) {
step_1(input)
const ret2 = step_2({ hey: "oh" }).config({
async: true,
})
return ret2
})

View File

@@ -38,19 +38,38 @@ import {
workflowNotIdempotentWithRetentionStep3Invoke,
} from "../__fixtures__"
import { createScheduled } from "../__fixtures__/workflow_scheduled"
import {
step1InvokeMock as step1InvokeMockAutoRetries,
step2InvokeMock as step2InvokeMockAutoRetries,
step1CompensateMock as step1CompensateMockAutoRetries,
step2CompensateMock as step2CompensateMockAutoRetries,
} from "../__fixtures__/workflow_1_auto_retries"
import {
step1InvokeMock as step1InvokeMockAutoRetriesFalse,
step2InvokeMock as step2InvokeMockAutoRetriesFalse,
step1CompensateMock as step1CompensateMockAutoRetriesFalse,
step2CompensateMock as step2CompensateMockAutoRetriesFalse,
} from "../__fixtures__/workflow_1_auto_retries_false"
import {
step1InvokeMock as step1InvokeMockManualRetry,
step2InvokeMock as step2InvokeMockManualRetry,
step1CompensateMock as step1CompensateMockManualRetry,
step2CompensateMock as step2CompensateMockManualRetry,
} from "../__fixtures__/workflow_1_manual_retry_step"
import { TestDatabase } from "../utils"
import { Redis } from "ioredis"
jest.setTimeout(300000)
const failTrap = (done, name) => {
setTimeoutSync(() => {
const failTrap = (done, name, timeout = 5000) => {
return setTimeoutSync(() => {
// REF:https://stackoverflow.com/questions/78028715/jest-async-test-with-event-emitter-isnt-ending
console.warn(
`Jest is breaking the event emit with its debouncer. This allows to continue the test by managing the timeout of the test manually. ${name}`
)
done()
}, 5000)
}, timeout)
}
function times(num) {
@@ -221,12 +240,13 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
TransactionState.REVERTED
)
done()
clearTimeout(timeout)
}
},
})
})
failTrap(
const timeout = failTrap(
done,
"should cancel an ongoing execution with async unfinished yet step"
)
@@ -393,6 +413,137 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
})
})
it("should manually retry a step that is taking too long to finish", (done) => {
const transactionId = "transaction-manual-retry" + ulid()
const workflowId = "workflow_1_manual_retry_step"
void workflowOrcModule
.run(workflowId, {
input: {},
transactionId,
})
.then(() => {
expect(step1InvokeMockManualRetry).toHaveBeenCalledTimes(1)
expect(step2InvokeMockManualRetry).toHaveBeenCalledTimes(1)
void workflowOrcModule.retryStep({
idempotencyKey: {
workflowId,
transactionId,
stepId: "step_2",
action: "invoke",
},
})
})
workflowOrcModule.subscribe({
workflowId,
transactionId,
subscriber: async (event) => {
if (event.eventType === "onFinish") {
expect(step1InvokeMockManualRetry).toHaveBeenCalledTimes(1)
expect(step2InvokeMockManualRetry).toHaveBeenCalledTimes(2)
done()
clearTimeout(timeout)
}
},
})
const timeout = failTrap(
done,
"should manually retry a step that is taking too long to finish"
)
})
it("should retry steps X times automatically when maxRetries is set", (done) => {
const transactionId = "transaction-auto-retries" + ulid()
const workflowId = "workflow_1_auto_retries"
void workflowOrcModule.run(workflowId, {
input: {},
transactionId,
})
workflowOrcModule.subscribe({
workflowId,
transactionId,
subscriber: async (event) => {
if (event.eventType === "onFinish") {
expect(step1InvokeMockAutoRetries).toHaveBeenCalledTimes(1)
expect(step2InvokeMockAutoRetries).toHaveBeenCalledTimes(3)
expect(step1CompensateMockAutoRetries).toHaveBeenCalledTimes(1)
expect(step2CompensateMockAutoRetries).toHaveBeenCalledTimes(1)
done()
clearTimeout(timeout)
}
},
})
const timeout = failTrap(
done,
"should retry steps X times automatically when maxRetries is set"
)
})
it("should not retry steps X times automatically when maxRetries is set and autoRetry is false", async () => {
const transactionId = "transaction-auto-retries" + ulid()
const workflowId = "workflow_1_auto_retries_false"
await workflowOrcModule.run(workflowId, {
input: {},
transactionId,
throwOnError: false,
})
let lastExepectHaveBeenCalledTimes = 0
workflowOrcModule.subscribe({
workflowId,
transactionId,
subscriber: async (event) => {
if (event.eventType === "onFinish") {
lastExepectHaveBeenCalledTimes = 1
expect(step1InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(1)
expect(step2InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(3)
expect(
step1CompensateMockAutoRetriesFalse
).toHaveBeenCalledTimes(1)
expect(
step2CompensateMockAutoRetriesFalse
).toHaveBeenCalledTimes(1)
}
},
})
expect(step1InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(1)
expect(step2InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(1)
expect(step1CompensateMockAutoRetriesFalse).toHaveBeenCalledTimes(0)
expect(step2CompensateMockAutoRetriesFalse).toHaveBeenCalledTimes(0)
await workflowOrcModule.run(workflowId, {
input: {},
transactionId,
throwOnError: false,
})
await setTimeout(1000)
expect(step1InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(1)
expect(step2InvokeMockAutoRetriesFalse).toHaveBeenCalledTimes(2)
expect(step1CompensateMockAutoRetriesFalse).toHaveBeenCalledTimes(0)
expect(step2CompensateMockAutoRetriesFalse).toHaveBeenCalledTimes(0)
await workflowOrcModule.run(workflowId, {
input: {},
transactionId,
throwOnError: false,
})
await setTimeout(1000)
expect(lastExepectHaveBeenCalledTimes).toEqual(1)
})
it("should prevent executing twice the same workflow in perfect concurrency with the same transactionId and non idempotent and not async but retention time is set", async () => {
const transactionId = "concurrency_transaction_id" + ulid()
const workflowId = "concurrency_workflow_id" + ulid()
@@ -813,11 +964,12 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
subscriber: (event) => {
if (event.eventType === "onFinish") {
done()
clearTimeout(timeout)
}
},
})
failTrap(done, "workflow_async_background")
const timeout = failTrap(done, "workflow_async_background")
})
it("should subscribe to a async workflow and receive the response when it finishes", (done) => {
@@ -840,13 +992,14 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
if (event.eventType === "onFinish") {
onFinish()
done()
clearTimeout(timeout)
}
},
})
expect(onFinish).toHaveBeenCalledTimes(0)
failTrap(done, "workflow_async_background")
const timeout = failTrap(done, "workflow_async_background")
})
it("should not skip step if condition is true", function (done) {
@@ -866,11 +1019,12 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
subscriber: (event) => {
if (event.eventType === "onFinish") {
done()
clearTimeout(timeout)
}
},
})
failTrap(done, "wf-when")
const timeout = failTrap(done, "wf-when")
})
it("should cancel an async sub workflow when compensating", (done) => {
@@ -904,11 +1058,12 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
})
done()
clearTimeout(timeout)
}
},
})
failTrap(done, "workflow_async_background_fail")
const timeout = failTrap(done, "workflow_async_background_fail")
})
it("should cancel and revert a completed workflow", async () => {

View File

@@ -15,14 +15,14 @@ import "../__fixtures__"
jest.setTimeout(300000)
const failTrap = (done, name) => {
setTimeoutSync(() => {
const failTrap = (done, name, timeout = 5000) => {
return setTimeoutSync(() => {
// REF:https://stackoverflow.com/questions/78028715/jest-async-test-with-event-emitter-isnt-ending
console.warn(
`Jest is breaking the event emit with its debouncer. This allows to continue the test by managing the timeout of the test manually. ${name}`
)
done()
}, 5000)
}, timeout)
}
// REF:https://stackoverflow.com/questions/78028715/jest-async-test-with-event-emitter-isnt-ending
@@ -107,6 +107,8 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
done()
} catch (e) {
return done(e)
} finally {
clearTimeout(timeout)
}
}
},
@@ -118,7 +120,7 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
expect(result).toBe("result from step 0")
})
failTrap(
const timeout = failTrap(
done,
"should prevent race continuation of the workflow during retryIntervalAwaiting in background execution"
)
@@ -206,6 +208,8 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
done()
} catch (e) {
return done(e)
} finally {
clearTimeout(timeout)
}
}
},
@@ -217,7 +221,7 @@ moduleIntegrationTestRunner<IWorkflowEngineService>({
expect(result).toBe("result from step 0")
})
failTrap(
const timeout = failTrap(
done,
"should prevent race continuation of the workflow compensation during retryIntervalAwaiting in background execution"
)