From c172c816e7998fb5d7811cfadb7e0839b241196e Mon Sep 17 00:00:00 2001 From: peterstone2017 <12449837+YunchuWang@users.noreply.github.com> Date: Thu, 26 Feb 2026 20:42:36 -0800 Subject: [PATCH] Fix compute_next_delay returning None when max_retry_interval is not set The compute_next_delay() method in RetryableTask had a bug where the return statement for the computed delay was inside the 'if max_retry_interval is not None' block. When max_retry_interval was not set (the default), the method would fall through and return None, causing retries to silently not happen. This fix moves the return statement outside the max_retry_interval check so the delay is always returned after optional capping. Added two new tests: - test_activity_retry_without_max_retry_interval: verifies retry with exponential backoff works when max_retry_interval is not set - test_activity_retry_with_default_backoff: verifies retry with default backoff_coefficient (1.0) produces constant delays --- durabletask/task.py | 3 +- .../test_orchestration_executor.py | 154 ++++++++++++++++++ 2 files changed, 156 insertions(+), 1 deletion(-) diff --git a/durabletask/task.py b/durabletask/task.py index d02fe0f..0ef03da 100644 --- a/durabletask/task.py +++ b/durabletask/task.py @@ -468,7 +468,8 @@ def compute_next_delay(self) -> Optional[timedelta]: if self._retry_policy.max_retry_interval is not None: next_delay_f = min(next_delay_f, self._retry_policy.max_retry_interval.total_seconds()) - return timedelta(seconds=next_delay_f) + + return timedelta(seconds=next_delay_f) return None diff --git a/tests/durabletask/test_orchestration_executor.py b/tests/durabletask/test_orchestration_executor.py index 8c72812..14d5e14 100644 --- a/tests/durabletask/test_orchestration_executor.py +++ b/tests/durabletask/test_orchestration_executor.py @@ -410,6 +410,160 @@ def orchestrator(ctx: task.OrchestrationContext, orchestrator_input): assert actions[-1].id == 7 +def test_activity_retry_without_max_retry_interval(): + """Tests that retry logic works correctly when max_retry_interval is not set. + + This is a regression test for a bug where compute_next_delay() returned None + instead of the computed delay when max_retry_interval was not specified, + causing retries to silently fail. + """ + + def dummy_activity(ctx, _): + raise ValueError("Kah-BOOOOM!!!") + + def orchestrator(ctx: task.OrchestrationContext, orchestrator_input): + result = yield ctx.call_activity( + dummy_activity, + retry_policy=task.RetryPolicy( + first_retry_interval=timedelta(seconds=1), + max_number_of_attempts=3, + backoff_coefficient=2), + input=orchestrator_input) + return result + + registry = worker._Registry() + name = registry.add_orchestrator(orchestrator) + + current_timestamp = datetime.utcnow() + + # Simulate the task failing for the first time — retry timer should be created at 1 second + old_events = [ + helpers.new_orchestrator_started_event(timestamp=current_timestamp), + helpers.new_execution_started_event(name, TEST_INSTANCE_ID, encoded_input=None), + helpers.new_task_scheduled_event(1, task.get_name(dummy_activity))] + expected_fire_at = current_timestamp + timedelta(seconds=1) + + new_events = [ + helpers.new_orchestrator_started_event(timestamp=current_timestamp), + helpers.new_task_failed_event(1, ValueError("Kah-BOOOOM!!!"))] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 1 + assert actions[0].HasField("createTimer") + assert actions[0].createTimer.fireAt.ToDatetime() == expected_fire_at + assert actions[0].id == 2 + + # Simulate the timer firing and a second failure — retry timer should be at 2 seconds (backoff) + current_timestamp = expected_fire_at + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_timer_fired_event(2, current_timestamp)] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 2 + assert actions[1].HasField("scheduleTask") + assert actions[1].id == 1 + + expected_fire_at = current_timestamp + timedelta(seconds=2) + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_task_failed_event(1, ValueError("Kah-BOOOOM!!!"))] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 3 + assert actions[2].HasField("createTimer") + assert actions[2].createTimer.fireAt.ToDatetime() == expected_fire_at + assert actions[2].id == 3 + + # Simulate the timer firing and a third failure — should now fail (max_number_of_attempts=3) + current_timestamp = expected_fire_at + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_timer_fired_event(3, current_timestamp)] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 3 + assert actions[1].HasField("scheduleTask") + + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_task_failed_event(1, ValueError("Kah-BOOOOM!!!"))] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 4 + assert actions[-1].completeOrchestration.failureDetails.errorMessage.__contains__("Activity task #1 failed: Kah-BOOOOM!!!") + + +def test_activity_retry_with_default_backoff(): + """Tests retry with default backoff_coefficient (1.0) and no max_retry_interval. + + Verifies that retry delays remain constant when backoff_coefficient defaults to 1.0. + """ + + def dummy_activity(ctx, _): + raise ValueError("Fail!") + + def orchestrator(ctx: task.OrchestrationContext, _): + result = yield ctx.call_activity( + dummy_activity, + retry_policy=task.RetryPolicy( + first_retry_interval=timedelta(seconds=5), + max_number_of_attempts=3)) + return result + + registry = worker._Registry() + name = registry.add_orchestrator(orchestrator) + + current_timestamp = datetime.utcnow() + + # First failure — retry timer at 5 seconds (default backoff=1.0, so 5 * 1^0 = 5) + old_events = [ + helpers.new_orchestrator_started_event(timestamp=current_timestamp), + helpers.new_execution_started_event(name, TEST_INSTANCE_ID, encoded_input=None), + helpers.new_task_scheduled_event(1, task.get_name(dummy_activity))] + expected_fire_at = current_timestamp + timedelta(seconds=5) + + new_events = [ + helpers.new_orchestrator_started_event(timestamp=current_timestamp), + helpers.new_task_failed_event(1, ValueError("Fail!"))] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 1 + assert actions[0].HasField("createTimer") + assert actions[0].createTimer.fireAt.ToDatetime() == expected_fire_at + + # Second failure — retry timer still at 5 seconds (5 * 1^1 = 5, no backoff growth) + current_timestamp = expected_fire_at + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_timer_fired_event(2, current_timestamp)] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + + expected_fire_at = current_timestamp + timedelta(seconds=5) + old_events = old_events + new_events + new_events = [ + helpers.new_orchestrator_started_event(current_timestamp), + helpers.new_task_failed_event(1, ValueError("Fail!"))] + executor = worker._OrchestrationExecutor(registry, TEST_LOGGER) + result = executor.execute(TEST_INSTANCE_ID, old_events, new_events) + actions = result.actions + assert len(actions) == 3 + assert actions[2].HasField("createTimer") + assert actions[2].createTimer.fireAt.ToDatetime() == expected_fire_at + + def test_nondeterminism_expected_timer(): """Tests the non-determinism detection logic when call_timer is expected but some other method (call_activity) is called instead""" def dummy_activity(ctx, _):