fix: improve hatchet workflow reliability (#900)

* Increase max connections

* Classify hard and transient hatchet errors

* Fan out partial success

* Force reprocessing of error transcripts

* Stop retrying on 402 payment required

* Avoid httpx/hatchet timeout race

* Add retry wrapper to get_response for transient errors

* Add retry backoff

* Return falsy results so get_response won't retry on empty string

* Skip error status in on_workflow_failure when transcript already ended

* Fix precommit issues

* Fail step on first fan-out failure instead of skipping
This commit is contained in:
Sergey Mankovsky
2026-03-06 17:07:26 +01:00
committed by GitHub
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions

View File

@@ -1,6 +1,6 @@
"""Tests for LLM structured output with astructured_predict + reflection retry"""
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from pydantic import BaseModel, Field, ValidationError
@@ -252,6 +252,63 @@ class TestNetworkErrorRetries:
assert mock_settings.llm.astructured_predict.call_count == 3
class TestGetResponseRetries:
    """Test that get_response() uses the same retry() wrapper for transient errors."""

    @pytest.mark.asyncio
    async def test_get_response_retries_on_connection_error(self, test_settings):
        """Test that get_response retries on ConnectionError and returns on success."""
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # First attempt raises a transient ConnectionError; second succeeds.
        # The padded " Summary text " also verifies get_response strips whitespace.
        mock_instance.aget_response = AsyncMock(
            side_effect=[
                ConnectionError("Connection refused"),
                " Summary text ",
            ]
        )
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            result = await llm.get_response("Prompt", ["text"])
        assert result == "Summary text"
        # Exactly one retry after the initial failure.
        assert mock_instance.aget_response.call_count == 2

    @pytest.mark.asyncio
    async def test_get_response_exhausts_retries(self, test_settings):
        """Test that get_response raises RetryException after retry attempts exceeded."""
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # Every attempt fails with a transient error, forcing retry exhaustion.
        mock_instance.aget_response = AsyncMock(
            side_effect=ConnectionError("Connection refused")
        )
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            with pytest.raises(RetryException, match="Retry attempts exceeded"):
                await llm.get_response("Prompt", ["text"])
        # Retry budget appears to be 3 total attempts (initial + 2 retries) —
        # presumably the retry() wrapper's default; confirm against reflector.llm.
        assert mock_instance.aget_response.call_count == 3

    @pytest.mark.asyncio
    async def test_get_response_returns_empty_string_without_retry(self, test_settings):
        """Empty or whitespace-only LLM response must return '' and not raise RetryException.
        retry() must return falsy results (e.g. '' from get_response) instead of
        treating them as 'no result' and retrying until RetryException.
        """
        llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
        mock_instance = MagicMock()
        # Whitespace-only response strips to "" — a falsy but valid result.
        mock_instance.aget_response = AsyncMock(return_value=" \n ")  # strip() -> ""
        with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
            result = await llm.get_response("Prompt", ["text"])
        assert result == ""
        # Single call: falsy-but-successful results must not trigger a retry.
        assert mock_instance.aget_response.call_count == 1
class TestTextsInclusion:
"""Test that texts parameter is included in the prompt sent to astructured_predict"""