mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-04-14 09:16:54 +00:00
feat: zulip dag monitor for failed runs (#928)
* feat: zulip dag monitor for failed runs * fix: add collapsible tags to big information
This commit is contained in:
committed by
GitHub
parent
7b8d190c52
commit
1f98790e7b
290
server/tests/test_failed_runs_monitor.py
Normal file
290
server/tests/test_failed_runs_monitor.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Tests for FailedRunsMonitor Hatchet cron workflow.
|
||||
|
||||
Tests cover:
|
||||
- No Zulip message sent when no failures found
|
||||
- Messages sent for failed main pipeline runs
|
||||
- Child workflow failures filtered out
|
||||
- Errors in the monitor itself are caught and logged
|
||||
"""
|
||||
|
||||
from datetime import timezone
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from hatchet_sdk.clients.rest.models import V1TaskStatus
|
||||
|
||||
|
||||
def _make_task_summary(
    workflow_name: str,
    workflow_run_external_id: str = "run-123",
    status: V1TaskStatus = V1TaskStatus.FAILED,
):
    """Build a MagicMock standing in for a Hatchet ``V1TaskSummary`` row.

    Only the three attributes the monitor reads are configured; everything
    else stays a permissive MagicMock.
    """
    summary = MagicMock()
    summary.configure_mock(
        workflow_name=workflow_name,
        workflow_run_external_id=workflow_run_external_id,
        status=status,
    )
    return summary
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestCheckFailedRuns:
    """Unit tests for ``_check_failed_runs`` with all Hatchet/Zulip I/O mocked.

    The monitor module path and the patch/mock scaffolding are factored into
    ``MOD``, ``_monitor_patches`` and ``_make_client`` so each test only
    spells out what is specific to it.
    """

    # Single source of truth for the patched module path.
    MOD = "reflector.hatchet.workflows.failed_runs_monitor"

    def _monitor_patches(self, mock_client, rendered="**rendered**"):
        """Return (client, render, send, settings) patchers for the monitor.

        Callers enter them in a single ``with`` statement; the ``send`` and
        ``settings`` patchers yield the mocks to assert against / configure.
        """
        return (
            patch(
                f"{self.MOD}.HatchetClientManager.get_client",
                return_value=mock_client,
            ),
            patch(f"{self.MOD}.render_run_detail", return_value=rendered),
            patch(
                f"{self.MOD}.send_message_to_zulip",
                new_callable=AsyncMock,
                return_value={"id": 1},
            ),
            patch(f"{self.MOD}.settings"),
        )

    @staticmethod
    def _make_client(rows):
        """Mock Hatchet client whose ``runs.aio_list`` returns *rows*."""
        result = MagicMock()
        result.rows = rows
        client = MagicMock()
        client.runs.aio_list = AsyncMock(return_value=result)
        client.runs.aio_get = AsyncMock(return_value=MagicMock())
        return client

    async def test_no_failures_sends_no_message(self):
        """An empty failed-run list must not trigger any Zulip message."""
        mock_client = self._make_client([])

        with (
            patch(
                f"{self.MOD}.HatchetClientManager.get_client",
                return_value=mock_client,
            ),
            patch(
                f"{self.MOD}.send_message_to_zulip",
                new_callable=AsyncMock,
            ) as mock_send,
        ):
            # Import inside the patch context so module attributes are patched
            # before the function body runs.
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 0
        assert result["reported"] == 0
        mock_send.assert_not_called()

    async def test_reports_failed_main_pipeline_runs(self):
        """Each failed main-pipeline run produces one rendered Zulip message."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(
            mock_client, rendered="**rendered DAG**"
        )

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 2
        assert result["reported"] == 2
        assert mock_send.call_count == 2
        mock_send.assert_any_call("dag-stream", "dag-topic", "**rendered DAG**")

    async def test_filters_out_child_workflows(self):
        """Child workflow failures are counted but never reported to Zulip."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("TrackProcessing", "run-2"),
                _make_task_summary("TopicChunkProcessing", "run-3"),
                _make_task_summary("SubjectProcessing", "run-4"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        # Only DiarizationPipeline should be reported.
        assert result["checked"] == 4
        assert result["reported"] == 1
        assert mock_send.call_count == 1

    async def test_all_three_pipelines_reported(self):
        """All three main pipeline workflow names are reportable."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
                _make_task_summary("LivePostProcessingPipeline", "run-3"),
            ]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["reported"] == 3
        assert mock_send.call_count == 3

    async def test_continues_on_individual_run_failure(self):
        """If one run fails to report, the others should still be reported."""
        mock_client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        # First detail fetch raises; the monitor must still handle the second.
        mock_client.runs.aio_get = AsyncMock(
            side_effect=[Exception("Hatchet API error"), MagicMock()]
        )
        p_client, p_render, p_send, p_settings = self._monitor_patches(mock_client)

        with p_client, p_render, p_send as mock_send, p_settings as mock_settings:
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"

            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        # First run failed to report, second succeeded.
        assert result["reported"] == 1
        assert mock_send.call_count == 1

    async def test_handles_list_api_failure(self):
        """If aio_list fails, should return error and not crash."""
        mock_client = MagicMock()
        mock_client.runs.aio_list = AsyncMock(
            side_effect=Exception("Connection refused")
        )

        with patch(
            f"{self.MOD}.HatchetClientManager.get_client",
            return_value=mock_client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()

        assert result["checked"] == 0
        assert result["reported"] == 0
        assert "error" in result

    async def test_uses_correct_time_window(self):
        """Verify the correct since/until parameters are passed to aio_list."""
        mock_client = self._make_client([])

        with patch(
            f"{self.MOD}.HatchetClientManager.get_client",
            return_value=mock_client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            await _check_failed_runs()

        call_args = mock_client.runs.aio_list.call_args
        assert call_args.kwargs["statuses"] == [V1TaskStatus.FAILED]
        since = call_args.kwargs["since"]
        until = call_args.kwargs["until"]
        assert since.tzinfo == timezone.utc
        assert until.tzinfo == timezone.utc
        # Window should be ~1 hour (small slack for clock movement mid-test).
        delta = until - since
        assert 3590 < delta.total_seconds() < 3610
|
||||
Reference in New Issue
Block a user