feat: local LLM via Ollama + structured output response_format

- Add setup script (scripts/setup-local-llm.sh) for one-command Ollama setup
  Mac: native Metal GPU, Linux: containerized via docker-compose profiles
- Add ollama-gpu and ollama-cpu docker-compose profiles for Linux
- Add extra_hosts to server/hatchet-worker-llm for host.docker.internal
- Pass response_format JSON schema in StructuredOutputWorkflow.extract()
  enabling grammar-based constrained decoding on Ollama/llama.cpp/vLLM/OpenAI
- Update .env.example with Ollama as default LLM option
- Add Ollama PRD and local dev setup docs
This commit is contained in:
Igor Loskutov
2026-02-10 15:55:21 -05:00
parent cd2255cfbc
commit 663345ece6
7 changed files with 653 additions and 7 deletions

View File

@@ -286,6 +286,92 @@ class TestStructuredOutputWorkflow:
assert mock_settings.llm.acomplete.call_count == 2
class TestResponseFormat:
    """Verify that acomplete() receives a response_format built from the Pydantic JSON schema."""

    @pytest.mark.asyncio
    async def test_acomplete_called_with_response_format(self):
        """acomplete() should receive response_format containing Pydantic JSON schema"""
        workflow = StructuredOutputWorkflow(
            output_cls=TestResponse,
            max_retries=3,
            timeout=30,
        )
        with (
            patch("reflector.llm.TreeSummarize") as summarize_cls,
            patch("reflector.llm.Settings") as settings_mock,
        ):
            summarizer = MagicMock()
            summarize_cls.return_value = summarizer
            summarizer.aget_response = AsyncMock(return_value="Some analysis")
            settings_mock.llm.acomplete = AsyncMock(
                return_value=make_completion_response(
                    '{"title": "Test", "summary": "Summary", "confidence": 0.95}'
                )
            )

            outcome = await workflow.run(
                prompt="Extract data",
                texts=["Some text"],
                tone_name=None,
            )

            assert "success" in outcome

            # The acomplete() call must carry a json_schema response_format
            # derived from the output model's schema.
            last_call = settings_mock.llm.acomplete.call_args
            assert "response_format" in last_call.kwargs
            response_format = last_call.kwargs["response_format"]
            assert response_format["type"] == "json_schema"
            assert response_format["json_schema"]["name"] == "TestResponse"
            assert (
                response_format["json_schema"]["schema"]
                == TestResponse.model_json_schema()
            )

    @pytest.mark.asyncio
    async def test_response_format_present_on_retry(self):
        """response_format should be passed on retry attempts too"""
        workflow = StructuredOutputWorkflow(
            output_cls=TestResponse,
            max_retries=3,
            timeout=30,
        )
        with (
            patch("reflector.llm.TreeSummarize") as summarize_cls,
            patch("reflector.llm.Settings") as settings_mock,
        ):
            summarizer = MagicMock()
            summarize_cls.return_value = summarizer
            summarizer.aget_response = AsyncMock(return_value="Some analysis")

            attempts = []

            async def fail_once_then_succeed(*args, **kwargs):
                # First attempt returns incomplete JSON to force a retry;
                # the second returns a payload satisfying the full schema.
                attempts.append(1)
                if len(attempts) == 1:
                    return make_completion_response('{"title": "Only title"}')
                return make_completion_response(
                    '{"title": "Test", "summary": "Summary", "confidence": 0.9}'
                )

            settings_mock.llm.acomplete = AsyncMock(
                side_effect=fail_once_then_succeed
            )

            outcome = await workflow.run(
                prompt="Extract data",
                texts=["Some text"],
                tone_name=None,
            )

            assert "success" in outcome
            assert len(attempts) == 2

            # Every attempt — including the retry — must carry response_format.
            for attempt in settings_mock.llm.acomplete.call_args_list:
                assert "response_format" in attempt.kwargs
                assert attempt.kwargs["response_format"]["type"] == "json_schema"
class TestNetworkErrorRetries:
"""Test that network error retries are handled by OpenAILike, not Workflow"""