fix: improve hatchet workflow reliability (#900)

* Increase max connections

* Classify hard and transient hatchet errors

* Fan out partial success

* Force reprocessing of error transcripts

* Stop retrying on 402 payment required

* Avoid httpx/hatchet timeout race

* Add retry wrapper to get_response for transient errors

* Add retry backoff

* Return falsy results so get_response won't retry on empty string

* Skip error status in on_workflow_failure when transcript already ended

* Fix precommit issues

* Fail step on first fan-out failure instead of skipping
This commit is contained in:
Sergey Mankovsky
2026-03-06 17:07:26 +01:00
committed by GitHub
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions

View File

@@ -1,6 +1,6 @@
"""Tests for LLM structured output with astructured_predict + reflection retry"""
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from pydantic import BaseModel, Field, ValidationError
@@ -252,6 +252,63 @@ class TestNetworkErrorRetries:
assert mock_settings.llm.astructured_predict.call_count == 3
class TestGetResponseRetries:
    """Test that get_response() uses the same retry() wrapper for transient errors."""

    @pytest.mark.asyncio
    async def test_get_response_retries_on_connection_error(self, test_settings):
        """Test that get_response retries on ConnectionError and returns on success."""
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # First attempt raises a transient ConnectionError; second succeeds.
        # The padded " Summary text " also verifies get_response strips whitespace.
        mock_instance.aget_response = AsyncMock(
            side_effect=[
                ConnectionError("Connection refused"),
                " Summary text ",
            ]
        )
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            result = await llm.get_response("Prompt", ["text"])
        assert result == "Summary text"
        # Exactly one retry after the initial failure.
        assert mock_instance.aget_response.call_count == 2

    @pytest.mark.asyncio
    async def test_get_response_exhausts_retries(self, test_settings):
        """Test that get_response raises RetryException after retry attempts exceeded."""
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # Every attempt fails with a transient error, forcing retry exhaustion.
        mock_instance.aget_response = AsyncMock(
            side_effect=ConnectionError("Connection refused")
        )
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            with pytest.raises(RetryException, match="Retry attempts exceeded"):
                await llm.get_response("Prompt", ["text"])
        # Retry budget appears to be 3 total attempts (initial + 2 retries) —
        # presumably the retry() wrapper's default; confirm against reflector.llm.
        assert mock_instance.aget_response.call_count == 3

    @pytest.mark.asyncio
    async def test_get_response_returns_empty_string_without_retry(self, test_settings):
        """Empty or whitespace-only LLM response must return '' and not raise RetryException.
        retry() must return falsy results (e.g. '' from get_response) instead of
        treating them as 'no result' and retrying until RetryException.
        """
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # Whitespace-only response strips to "" — a falsy but valid result.
        mock_instance.aget_response = AsyncMock(return_value=" \n ")  # strip() -> ""
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            result = await llm.get_response("Prompt", ["text"])
        assert result == ""
        # Single call: falsy-but-successful results must not trigger a retry.
        assert mock_instance.aget_response.call_count == 1
class TestTextsInclusion:
"""Test that texts parameter is included in the prompt sent to astructured_predict"""