chore(orchestration): add support for autoRetry, maxAwaitingRetries, retryStep (#13391)

RESOLVES CORE-1163
RESOLVES CORE-1164

**What**

### Add support for non-auto-retryable steps

When you mark a step with `maxRetries`, a failure marks the step as a temporary failure and it retries itself automatically; that is the default behaviour. If you also set `autoRetry: false`, a failing step is still marked as a temporary failure but is not retried automatically. You can then call the workflow engine's run to resume the workflow from the failing step and have it retried, as in the sketch below.
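A minimal sketch mirroring the test added in this PR (the import path and the `handler` wiring are assumptions; see the test further down for a full handler):

```ts
import {
  TransactionOrchestrator,
  TransactionStepsDefinition,
} from "@medusajs/orchestration" // assumed import path

// With autoRetry: false, a failure marks the step as TEMPORARY_FAILURE
// but it is NOT retried until the transaction is resumed explicitly.
const flow: TransactionStepsDefinition = {
  next: {
    action: "firstMethod",
    maxRetries: 3, // still bounds the total number of retries
    autoRetry: false, // each retry must now be triggered manually
  },
}

const orchestrator = new TransactionOrchestrator({
  id: "transaction-name",
  definition: flow,
})

const transaction = await orchestrator.beginTransaction({
  transactionId: "transaction_id_123",
  handler, // step handler, assumed defined as in the test below
})

await orchestrator.resume(transaction) // first attempt
await orchestrator.resume(transaction) // triggers one manual retry
```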

### Add support for `maxAwaitingRetries`

When `retryIntervalAwaiting` is set, a step that is awaiting is retried after the specified interval with no maximum number of retries. You can now set `maxAwaitingRetries` to enforce a maximum number of awaiting retries, as shown below.
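A sketch of the new option on an async step (imports as in the previous sketch; the step name and values are illustrative):

```ts
const flow: TransactionStepsDefinition = {
  next: {
    action: "waitForExternalSystem", // illustrative async step
    async: true,
    retryIntervalAwaiting: 30, // retry the awaiting step every 30 seconds
    maxAwaitingRetries: 5, // new: give up after 5 awaiting retries
  },
}
```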

### Add support to manually retry an awaiting step

In some scenarios, a machine dies while a step is executing or a step takes longer than expected. You can now call `retryStep` on the workflow engine to force a retry of a step that appears to be stuck, as in the sketch below.
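A sketch of the new call on a `LocalWorkflow` instance (the idempotency key format shown is illustrative):

```ts
// retryStep only succeeds while the step status is WAITING;
// otherwise a NOT_ALLOWED MedusaError is thrown.
const transaction = await workflow.retryStep(
  "transaction-name:transaction_id_123:stepUid:invoke" // illustrative key
)
```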
Author: Adrien de Peretti
Date: 2025-09-08 14:46:30 +02:00
Committed by: GitHub
Parent: ac5e23b96c
Commit: d7692100e7
28 changed files with 1366 additions and 64 deletions


@@ -648,6 +648,116 @@ describe("Transaction Orchestrator", () => {
)
})
it("Should not retry steps X times automatically when flag 'autoRetry' is set to false and then compensate steps afterward", async () => {
const mocks = {
one: jest.fn().mockImplementation((payload) => {
return payload
}),
compensateOne: jest.fn().mockImplementation((payload) => {
return payload
}),
two: jest.fn().mockImplementation((payload) => {
throw new Error()
}),
compensateTwo: jest.fn().mockImplementation((payload) => {
return payload
}),
}
async function handler(
actionId: string,
functionHandlerType: TransactionHandlerType,
payload: TransactionPayload
) {
const command = {
firstMethod: {
[TransactionHandlerType.INVOKE]: () => {
mocks.one(payload)
},
[TransactionHandlerType.COMPENSATE]: () => {
mocks.compensateOne(payload)
},
},
secondMethod: {
[TransactionHandlerType.INVOKE]: () => {
mocks.two(payload)
},
[TransactionHandlerType.COMPENSATE]: () => {
mocks.compensateTwo(payload)
},
},
}
return command[actionId][functionHandlerType](payload)
}
const flow: TransactionStepsDefinition = {
next: {
action: "firstMethod",
maxRetries: 3,
autoRetry: false,
next: {
action: "secondMethod",
maxRetries: 3,
autoRetry: false,
},
},
}
const strategy = new TransactionOrchestrator({
id: "transaction-name",
definition: flow,
})
const transaction = await strategy.beginTransaction({
transactionId: "transaction_id_123",
handler,
})
await strategy.resume(transaction)
expect(transaction.transactionId).toBe("transaction_id_123")
expect(mocks.one).toHaveBeenCalledTimes(1)
expect(mocks.two).toHaveBeenCalledTimes(1)
await strategy.resume(transaction)
expect(mocks.one).toHaveBeenCalledTimes(1)
expect(mocks.two).toHaveBeenCalledTimes(2)
await strategy.resume(transaction)
expect(mocks.one).toHaveBeenCalledTimes(1)
expect(mocks.two).toHaveBeenCalledTimes(3)
await strategy.resume(transaction)
expect(mocks.one).toHaveBeenCalledTimes(1)
expect(mocks.two).toHaveBeenCalledTimes(4)
expect(transaction.getState()).toBe(TransactionState.REVERTED)
expect(mocks.compensateOne).toHaveBeenCalledTimes(1)
expect(mocks.two).nthCalledWith(
1,
expect.objectContaining({
metadata: expect.objectContaining({
attempt: 1,
}),
})
)
expect(mocks.two).nthCalledWith(
4,
expect.objectContaining({
metadata: expect.objectContaining({
attempt: 4,
}),
})
)
})
it("Should fail a transaction if any step fails after retrying X time to compensate it", async () => {
const mocks = {
one: jest.fn().mockImplementation((payload) => {


@@ -99,6 +99,22 @@ export class SkipExecutionError extends Error {
}
}
export class SkipStepAlreadyFinishedError extends Error {
static isSkipStepAlreadyFinishedError(
error: Error
): error is SkipStepAlreadyFinishedError {
return (
error instanceof SkipStepAlreadyFinishedError ||
error?.name === "SkipStepAlreadyFinishedError"
)
}
constructor(message?: string) {
super(message)
this.name = "SkipStepAlreadyFinishedError"
}
}
export class SkipCancelledExecutionError extends Error {
static isSkipCancelledExecutionError(
error: Error


@@ -33,6 +33,7 @@ import {
PermanentStepFailureError,
SkipCancelledExecutionError,
SkipExecutionError,
SkipStepAlreadyFinishedError,
SkipStepResponse,
TransactionStepTimeoutError,
TransactionTimeoutError,
@@ -117,7 +118,8 @@ export class TransactionOrchestrator extends EventEmitter {
private static isExpectedError(error: Error): boolean {
return (
SkipCancelledExecutionError.isSkipCancelledExecutionError(error) ||
SkipExecutionError.isSkipExecutionError(error)
SkipExecutionError.isSkipExecutionError(error) ||
SkipStepAlreadyFinishedError.isSkipStepAlreadyFinishedError(error)
)
}
@@ -415,10 +417,24 @@ export class TransactionOrchestrator extends EventEmitter {
stepDef.definition.retryIntervalAwaiting!
)
}
} else if (stepDef.retryRescheduledAt) {
// The step is not configured for awaiting retry but was manually forced to retry
stepDef.retryRescheduledAt = null
nextSteps.push(stepDef)
}
continue
} else if (curState.status === TransactionStepStatus.TEMPORARY_FAILURE) {
if (
!stepDef.temporaryFailedAt &&
stepDef.definition.autoRetry === false
) {
stepDef.temporaryFailedAt = Date.now()
continue
}
stepDef.temporaryFailedAt = null
currentSteps.push(stepDef)
if (!stepDef.canRetry()) {
@@ -565,6 +581,27 @@ export class TransactionOrchestrator extends EventEmitter {
}
}
private static async retryStep(
transaction: DistributedTransactionType,
step: TransactionStep
): Promise<void> {
if (!step.retryRescheduledAt) {
step.hasScheduledRetry = true
step.retryRescheduledAt = Date.now()
}
transaction.getFlow().hasWaitingSteps = true
try {
await transaction.saveCheckpoint()
await transaction.scheduleRetry(step, 0)
} catch (error) {
if (!TransactionOrchestrator.isExpectedError(error)) {
throw error
}
}
}
private static async skipStep({
transaction,
step,
@@ -677,10 +714,13 @@ export class TransactionOrchestrator extends EventEmitter {
stopExecution: boolean
transactionIsCancelling?: boolean
}> {
const result = {
stopExecution: false,
transactionIsCancelling: false,
}
if (SkipExecutionError.isSkipExecutionError(error)) {
return {
stopExecution: false,
}
return result
}
step.failures++
@@ -773,10 +813,21 @@ export class TransactionOrchestrator extends EventEmitter {
if (step.hasTimeout()) {
cleaningUp.push(transaction.clearStepTimeout(step))
}
} else {
const isAsync = step.isCompensating()
? step.definition.compensateAsync
: step.definition.async
if (
step.getStates().status === TransactionStepStatus.TEMPORARY_FAILURE &&
step.definition.autoRetry === false &&
isAsync
) {
step.temporaryFailedAt = Date.now()
result.stopExecution = true
}
}
let transactionIsCancelling = false
let shouldEmit = true
try {
await transaction.saveCheckpoint()
} catch (error) {
@@ -784,11 +835,11 @@ export class TransactionOrchestrator extends EventEmitter {
throw error
}
transactionIsCancelling =
result.transactionIsCancelling =
SkipCancelledExecutionError.isSkipCancelledExecutionError(error)
if (SkipExecutionError.isSkipExecutionError(error)) {
shouldEmit = false
result.stopExecution = true
}
}
@@ -798,7 +849,7 @@ export class TransactionOrchestrator extends EventEmitter {
await promiseAll(cleaningUp)
if (shouldEmit) {
if (!result.stopExecution) {
const eventName = step.isCompensating()
? DistributedTransactionEvent.COMPENSATE_STEP_FAILURE
: DistributedTransactionEvent.STEP_FAILURE
@@ -806,8 +857,8 @@ export class TransactionOrchestrator extends EventEmitter {
}
return {
stopExecution: !shouldEmit,
transactionIsCancelling,
stopExecution: result.stopExecution,
transactionIsCancelling: result.transactionIsCancelling,
}
}
@@ -1732,6 +1783,50 @@ export class TransactionOrchestrator extends EventEmitter {
return curTransaction
}
/**
* Manually force a step to retry even if it is still in awaiting status
* @param responseIdempotencyKey - The idempotency key for the step
* @param handler - The handler function to execute the step
* @param transaction - The current transaction. If not provided it will be loaded based on the responseIdempotencyKey
*/
public async retryStep({
responseIdempotencyKey,
handler,
transaction,
onLoad,
}: {
responseIdempotencyKey: string
handler?: TransactionStepHandler
transaction?: DistributedTransactionType
onLoad?: (transaction: DistributedTransactionType) => Promise<void> | void
}): Promise<DistributedTransactionType> {
const [curTransaction, step] =
await TransactionOrchestrator.getTransactionAndStepFromIdempotencyKey(
responseIdempotencyKey,
handler,
transaction
)
if (onLoad) {
await onLoad(curTransaction)
}
if (step.getStates().status === TransactionStepStatus.WAITING) {
this.emit(DistributedTransactionEvent.RESUME, {
transaction: curTransaction,
})
await TransactionOrchestrator.retryStep(curTransaction, step)
} else {
throw new MedusaError(
MedusaError.Types.NOT_ALLOWED,
`Cannot retry step when status is ${step.getStates().status}`
)
}
return curTransaction
}
/** Register a step success for a specific transaction and step
* @param responseIdempotencyKey - The idempotency key for the step
* @param handler - The handler function to execute the step


@@ -54,6 +54,7 @@ export class TransactionStep {
}
attempts: number
failures: number
temporaryFailedAt: number | null
lastAttempt: number | null
retryRescheduledAt: number | null
hasScheduledRetry: boolean
@@ -189,7 +190,9 @@ export class TransactionStep {
this.hasAwaitingRetry() &&
this.lastAttempt &&
Date.now() - this.lastAttempt >
this.definition.retryIntervalAwaiting! * 1e3
this.definition.retryIntervalAwaiting! * 1e3 &&
(!("maxAwaitingRetries" in this.definition) ||
this.attempts < this.definition.maxAwaitingRetries!)
)
}
@@ -199,7 +202,8 @@ export class TransactionStep {
(!this.isCompensating() &&
state === TransactionStepState.NOT_STARTED &&
flowState === TransactionState.INVOKING) ||
status === TransactionStepStatus.TEMPORARY_FAILURE
(status === TransactionStepStatus.TEMPORARY_FAILURE &&
!this.temporaryFailedAt)
)
}


@@ -47,6 +47,12 @@ export type TransactionStepsDefinition = {
*/
maxRetries?: number
/**
* If true, the step will be retried automatically in case of a temporary failure.
* The default is true.
*/
autoRetry?: boolean
/**
* The interval (in seconds) between retry attempts after a temporary failure.
* The default is to retry immediately.
@@ -58,6 +64,11 @@ export type TransactionStepsDefinition = {
*/
retryIntervalAwaiting?: number
/**
* The maximum number of times to retry a step while its status is "TransactionStepStatus.WAITING".
*/
maxAwaitingRetries?: number
/**
* The maximum amount of time (in seconds) to wait for this step to complete.
* This is NOT an execution timeout, the step will always be executed and wait for its response.
@@ -279,6 +290,7 @@ export type TransactionFlow = {
timedOutAt: number | null
startedAt?: number
cancelledAt?: number
temporaryFailedAt?: number | null
state: TransactionState
steps: {
[key: string]: TransactionStep


@@ -435,6 +435,33 @@ export class LocalWorkflow {
}
}
async retryStep(
idempotencyKey: string,
context?: Context,
subscribe?: DistributedTransactionEvents
): Promise<DistributedTransactionType> {
this.medusaContext = context
const { handler, orchestrator } = this.workflow
const { cleanUpEventListeners } = this.registerEventCallbacks({
orchestrator,
idempotencyKey,
subscribe,
})
const transaction = await orchestrator.retryStep({
responseIdempotencyKey: idempotencyKey,
handler: handler(this.container_, context),
onLoad: this.onLoad.bind(this),
})
try {
return transaction
} finally {
cleanUpEventListeners()
}
}
async registerStepSuccess(
idempotencyKey: string,
response?: unknown,