fix: improve hatchet workflow reliability (#900)

* Increase max connections

* Classify hard and transient hatchet errors

* Fan out partial success

* Force reprocessing of error transcripts

* Stop retrying on 402 payment required

* Avoid httpx/hatchet timeout race

* Add retry wrapper to get_response for transient errors

* Add retry backoff

* Return falsy results so get_response won't retry on empty string

* Skip error status in on_workflow_failure when transcript already ended

* Fix precommit issues

* Fail step on first fan-out failure instead of skipping
This commit is contained in:
Sergey Mankovsky
2026-03-06 17:07:26 +01:00
committed by GitHub
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions

View File

@@ -65,10 +65,25 @@ class LLM:
async def get_response(
self, prompt: str, texts: list[str], tone_name: str | None = None
) -> str:
"""Get a text response using TreeSummarize for non-function-calling models"""
summarizer = TreeSummarize(verbose=False)
response = await summarizer.aget_response(prompt, texts, tone_name=tone_name)
return str(response).strip()
"""Get a text response using TreeSummarize for non-function-calling models.
Uses the same retry() wrapper as get_structured_response for transient
network errors (connection, timeout, OSError) with exponential backoff.
"""
async def _call():
summarizer = TreeSummarize(verbose=False)
response = await summarizer.aget_response(
prompt, texts, tone_name=tone_name
)
return str(response).strip()
return await retry(_call)(
retry_attempts=3,
retry_backoff_interval=1.0,
retry_backoff_max=30.0,
retry_ignore_exc_types=(ConnectionError, TimeoutError, OSError),
)
async def get_structured_response(
self,