mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-04-14 09:16:54 +00:00
feat: zulip dag monitor for failed runs (#928)
* feat: zulip dag monitor for failed runs * fix: add collapsible tags to big information
This commit is contained in:
committed by
GitHub
parent
7b8d190c52
commit
1f98790e7b
290
server/tests/test_failed_runs_monitor.py
Normal file
290
server/tests/test_failed_runs_monitor.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Tests for FailedRunsMonitor Hatchet cron workflow.
|
||||
|
||||
Tests cover:
|
||||
- No Zulip message sent when no failures found
|
||||
- Messages sent for failed main pipeline runs
|
||||
- Child workflow failures filtered out
|
||||
- Errors in the monitor itself are caught and logged
|
||||
"""
|
||||
|
||||
from datetime import timezone
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from hatchet_sdk.clients.rest.models import V1TaskStatus
|
||||
|
||||
|
||||
def _make_task_summary(
    workflow_name: str,
    workflow_run_external_id: str = "run-123",
    status: V1TaskStatus = V1TaskStatus.FAILED,
):
    """Build a MagicMock standing in for a Hatchet ``V1TaskSummary`` row.

    Only the three attributes the monitor reads are configured; everything
    else stays a permissive MagicMock.
    """
    summary = MagicMock()
    summary.configure_mock(
        workflow_name=workflow_name,
        workflow_run_external_id=workflow_run_external_id,
        status=status,
    )
    return summary
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestCheckFailedRuns:
    """Unit tests for ``_check_failed_runs`` with all Hatchet/Zulip I/O mocked.

    The monitor module path and the patch/mock scaffolding are factored into
    ``MOD``, ``_monitor_patches`` and ``_make_client`` so each test only
    spells out what is specific to it.
    """

    # Single source of truth for the patched module path.
    MOD = "reflector.hatchet.workflows.failed_runs_monitor"

    def _monitor_patches(self, mock_client, rendered="**rendered**"):
        """Return (client, render, send, settings) patchers for the monitor.

        Callers enter them in a single ``with`` statement; the ``send`` and
        ``settings`` patchers yield the mocks to assert against / configure.
        """
        return (
            patch(
                f"{self.MOD}.HatchetClientManager.get_client",
                return_value=mock_client,
            ),
            patch(f"{self.MOD}.render_run_detail", return_value=rendered),
            patch(
                f"{self.MOD}.send_message_to_zulip",
                new_callable=AsyncMock,
                return_value={"id": 1},
            ),
            patch(f"{self.MOD}.settings"),
        )

    @staticmethod
    def _make_client(rows):
        """Mock Hatchet client whose ``runs.aio_list`` returns *rows*."""
        result = MagicMock()
        result.rows = rows
        client = MagicMock()
        client.runs.aio_list = AsyncMock(return_value=result)
        client.runs.aio_get = AsyncMock(return_value=MagicMock())
        return client

    async def test_no_failures_sends_no_message(self):
        """An empty failed-run list must not trigger any Zulip message."""
        mock_client = self._make_client([])

        with (
            patch(
                f"{self.MOD}.HatchetClientManager.get_client",
                return_value=mock_client,
            ),
            patch(
                f"{self.MOD}.send_message_to_zulip",
                new_callable=AsyncMock,
            ) as mock_send,
        ):
            # Import inside the patch context so module attributes are patched
            # before the function body runs.
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 0
        assert result["reported"] == 0
        mock_send.assert_not_called()

    async def test_reports_failed_main_pipeline_runs(self):
        """Each failed main-pipeline run produces one rendered Zulip message."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(
            mock_client, rendered="**rendered DAG**"
        )

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 2
        assert result["reported"] == 2
        assert mock_send.call_count == 2
        mock_send.assert_any_call("dag-stream", "dag-topic", "**rendered DAG**")

    async def test_filters_out_child_workflows(self):
        """Child workflow failures are counted but never reported to Zulip."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("TrackProcessing", "run-2"),
                _make_task_summary("TopicChunkProcessing", "run-3"),
                _make_task_summary("SubjectProcessing", "run-4"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        # Only DiarizationPipeline should be reported.
        assert result["checked"] == 4
        assert result["reported"] == 1
        assert mock_send.call_count == 1

    async def test_all_three_pipelines_reported(self):
        """All three main pipeline workflow names are reportable."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
                _make_task_summary("LivePostProcessingPipeline", "run-3"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["reported"] == 3
        assert mock_send.call_count == 3

    async def test_continues_on_individual_run_failure(self):
        """If one run fails to report, the others should still be reported."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        # First detail fetch raises; the monitor must still handle the second.
        mock_client.runs.aio_get = AsyncMock(
            side_effect=[Exception("Hatchet API error"), MagicMock()]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        # First run failed to report, second succeeded.
        assert result["reported"] == 1
        assert mock_send.call_count == 1

    async def test_handles_list_api_failure(self):
        """If aio_list fails, should return error and not crash."""
        mock_client = MagicMock()
        mock_client.runs.aio_list = AsyncMock(
            side_effect=Exception("Connection refused")
        )

        with patch(
            f"{self.MOD}.HatchetClientManager.get_client",
            return_value=mock_client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 0
        assert result["reported"] == 0
        assert "error" in result

    async def test_uses_correct_time_window(self):
        """Verify the correct since/until parameters are passed to aio_list."""
        mock_client = self._make_client([])

        with patch(
            f"{self.MOD}.HatchetClientManager.get_client",
            return_value=mock_client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            await _check_failed_runs()

        call_args = mock_client.runs.aio_list.call_args
        assert call_args.kwargs["statuses"] == [V1TaskStatus.FAILED]
        since = call_args.kwargs["since"]
        until = call_args.kwargs["until"]
        assert since.tzinfo == timezone.utc
        assert until.tzinfo == timezone.utc
        # Window should be ~1 hour (small slack for clock movement mid-test).
        delta = until - since
        assert 3590 < delta.total_seconds() < 3610
|
||||
Reference in New Issue
Block a user