From b18ee3b564163b5ec06aeb2a6278e9486905b31d Mon Sep 17 00:00:00 2001 From: Mathieu Virbel Date: Tue, 10 Feb 2026 18:19:30 -0600 Subject: [PATCH] feat: first commit --- AGENTS.md | 143 +++++++++ README.md | 120 ++++++++ docs/company-context.md | 43 +++ docs/connectors-and-sources.md | 99 ++++++ docs/contactdb-api.md | 154 ++++++++++ docs/dataindex-api.md | 218 +++++++++++++ docs/notebook-patterns.md | 545 +++++++++++++++++++++++++++++++++ workflows/.empty | 0 8 files changed, 1322 insertions(+) create mode 100644 AGENTS.md create mode 100644 README.md create mode 100644 docs/company-context.md create mode 100644 docs/connectors-and-sources.md create mode 100644 docs/contactdb-api.md create mode 100644 docs/dataindex-api.md create mode 100644 docs/notebook-patterns.md create mode 100644 workflows/.empty diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..ecf6304 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,143 @@ +# Agent Documentation — InternalAI Platform + +The InternalAI platform aggregates company data from email, calendars, Zulip chat, meetings, and documents into two core APIs. These docs give LLM agents the context they need to build programmatic workflows — typically as marimo notebooks — that answer analytical questions about people and their interactions. + +## Routing Table + +| I need to... 
| Read | +|---------------------------------------------|-------------------------------| +| Understand the company and its tools | [company-context.md] | +| Look up people, contacts, relationships | [contactdb-api.md] | +| Query emails, meetings, chats, documents | [dataindex-api.md] | +| Know which connector provides what data | [connectors-and-sources.md] | +| Create a marimo analysis notebook | [notebook-patterns.md] | + +## API Base URLs + +| Service | Swagger UI | OpenAPI JSON | +|------------|---------------------------------------------------|----------------------------------------| +| ContactDB | `http://localhost:42000/contactdb-api/docs` | `/contactdb-api/openapi.json` | +| DataIndex | `http://localhost:42000/dataindex/docs` | `/dataindex/openapi.json` | + +Direct backend ports (without Caddy proxy): +- ContactDB: `http://localhost:42800` +- DataIndex: `http://localhost:42180` + +## Common Questions → API Calls + +Use this table to translate natural language questions into API calls. The base URLs below assume Caddy proxy (`http://localhost:42000`). + +| Question | API Call | Notes | +|----------|----------|-------| +| "Who am I?" | `GET /contactdb-api/api/contacts/me` | Returns your contact record: name, emails, bio, contact_id | +| "Find Alice" / "Who is Alice?" | `GET /contactdb-api/api/contacts?search=Alice` | Returns matching contacts with their IDs | +| "What's Alice's contact ID?" | `GET /contactdb-api/api/contacts?search=Alice` → use `contacts[0].id` | Needed for all DataIndex queries about a person | +| "Find contact by email" | `GET /contactdb-api/api/contacts/by-email/{email}` | Direct lookup | +| "My recent meetings" | `GET /dataindex/api/v1/query?entity_types=meeting&contact_ids={my_id}` | Get `my_id` from `/contacts/me` first | +| "Emails with Alice" | `GET /dataindex/api/v1/query?entity_types=email&contact_ids={alice_id}` | Matches sender, to, or cc | +| "What was discussed about X?" 
| `POST /dataindex/api/v1/search` with `{"search_text": "X"}` | Semantic search across all data | +| "Zulip threads about hiring" | `GET /dataindex/api/v1/query?entity_types=threaded_conversation&search=hiring` | Text filter on content | +| "My calendar this week" | `GET /dataindex/api/v1/query?entity_types=calendar_event&contact_ids={my_id}&date_from=...&date_to=...` | Set date range | +| "Who are the most active contacts?" | `GET /contactdb-api/api/contacts?sort_by=hotness&min_hotness=50` | Hotness = 0-100 interaction score | +| "What connectors are available?" | `GET /dataindex/api/v1/connectors/status` | Lists all data sources and sync status | + +**Key pattern:** Any question about "me" / "my" / "I" requires calling `GET /contactdb-api/api/contacts/me` first to get your `contact_id`, then using that ID in subsequent DataIndex queries. + +## Workflow + +### When to create a marimo notebook + +Any request that involves **analysis over a period of time** (e.g., "meetings this month", "emails since January", "interaction trends") is likely to return a **large volume of data** — too much to process inline. In these cases, **always produce a marimo notebook** (a `.py` file following the patterns in [notebook-patterns.md]). + +Also create a notebook when the user asks to "create a workflow", "write a workflow", or "build an analysis". + +If you're unsure whether a question is simple enough to answer directly or needs a notebook, **ask the user**. + +### File naming and location + +All notebooks go in the **`workflows/`** directory. Use a sequential number prefix so workflows stay ordered by creation: + +``` +workflows/{NNN}_{topic}_{scope}.py +``` + +- `{NNN}` — zero-padded sequence number (`001`, `002`, …). Look at existing files in `workflows/` to determine the next number. 
+- `{topic}` — what is being analyzed, in snake_case (e.g., `greyhaven_meetings`, `alice_emails`, `hiring_discussions`) +- `{scope}` — time range or qualifier (e.g., `january`, `q1_2026`, `last_30d`, `all_time`) + +**Examples:** + +``` +workflows/001_greyhaven_meetings_january.py +workflows/002_alice_emails_q1_2026.py +workflows/003_hiring_discussions_last_30d.py +workflows/004_team_interaction_timeline_all_time.py +``` + +**Before creating a new workflow**, list existing files in `workflows/` to find the highest number and increment it. + +### Plan before you implement + +Before writing any notebook, **always propose a plan first** and get the user's approval. The plan should describe: + +1. **Goal** — What question are we answering? +2. **Data sources** — Which entity types and API endpoints will be used? +3. **Algorithm / ETL steps** — Step-by-step description of the data pipeline: what gets fetched, how it's filtered, joined, or aggregated, and what the final output looks like. +4. **Output format** — Table columns, charts, or summary statistics the user will see. + +Only proceed to implementation after the user confirms the plan. + +### Steps + +1. **Identify people** — Use ContactDB to resolve names/emails to `contact_id` values. For "me"/"my" questions, always start with `GET /api/contacts/me`. +2. **Find data** — Use DataIndex `GET /query` (exhaustive, paginated) or `POST /search` (semantic, ranked) with `contact_ids`, `entity_types`, `date_from`/`date_to`, `connector_ids` filters. +3. **Analyze** — For simple answers, process the API response directly. For complex multi-step analysis, build a marimo notebook (see [notebook-patterns.md]). + +### Quick Example (Python) + +> "Find all emails involving Alice since January" + +```python +import httpx + +CONTACTDB = "http://localhost:42000/contactdb-api" +DATAINDEX = "http://localhost:42000/dataindex/api/v1" +client = httpx.Client(timeout=30) + +# 1. 
Resolve "Alice" to a contact_id +resp = client.get(f"{CONTACTDB}/api/contacts", params={"search": "Alice"}) +alice_id = resp.json()["contacts"][0]["id"] # e.g. 42 + +# 2. Fetch all emails involving Alice (with pagination) +emails = [] +offset = 0 +while True: + resp = client.get(f"{DATAINDEX}/query", params={ + "entity_types": "email", + "contact_ids": str(alice_id), + "date_from": "2025-01-01T00:00:00Z", + "limit": 50, + "offset": offset, + }) + data = resp.json() + emails.extend(data["items"]) + if offset + 50 >= data["total"]: + break + offset += 50 + +print(f"Found {len(emails)} emails involving Alice") +``` + +## File Index + +- [company-context.md] — Business context, team structure, vocabulary +- [contactdb-api.md] — ContactDB entities and REST endpoints +- [dataindex-api.md] — DataIndex entity types, query modes, REST endpoints +- [connectors-and-sources.md] — Connector-to-entity-type mapping +- [notebook-patterns.md] — Marimo notebook patterns and common API workflows + +[company-context.md]: ./docs/company-context.md +[contactdb-api.md]: ./docs/contactdb-api.md +[dataindex-api.md]: ./docs/dataindex-api.md +[connectors-and-sources.md]: ./docs/connectors-and-sources.md +[notebook-patterns.md]: ./docs/notebook-patterns.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..2b776c7 --- /dev/null +++ b/README.md @@ -0,0 +1,120 @@ +# InternalAI Agent + +A documentation and pattern library that gives LLM agents the context they need to build data analysis workflows against Monadical's internal systems — ContactDB (people directory) and DataIndex (unified data from email, calendar, Zulip, meetings, documents). + +The goal is to use [opencode](https://opencode.ai) (or any LLM-powered coding tool) to iteratively create [marimo](https://marimo.io) notebook workflows that query and analyze company data. 
+ +## Getting Started + +### Prerequisites + +- [opencode](https://opencode.ai) installed +- Access to the InternalAI platform (ContactDB + DataIndex running locally, accessible via http://localhost:42000) + +### Configuring opencode with LiteLLM + +To use models through LiteLLM, add the following to `~/.config/opencode/config.json`: + +```json +{ + "$schema": "https://opencode.ai/config.json", + "provider": { + "litellm": { + "npm": "@ai-sdk/openai-compatible", + "name": "Litellm", + "options": { + "baseURL": "https://litellm.app.monadical.io", + "apiKey": "xxxxx" + }, + "models": { + "Kimi-K2.5-dev": { + "name": "Kimi-K2.5-dev" + } + } + } + } +} +``` + +Replace `xxxxx` with your actual LiteLLM API key. + +### Running opencode + +From the project root: + +```bash +opencode +``` + +opencode will pick up `AGENTS.md` automatically and use it as the entry point to understand the project, the available APIs, and how to write workflows. + +## How AGENTS.md Works + +`AGENTS.md` is the routing guide for LLM agents. It is structured as follows: + +1. **Purpose statement** — Explains that the agent's job is to build marimo notebooks that analyze company data. + +2. **Documentation routing table** — Directs the agent to the right file depending on the topic: + + | Topic | File | + |-------|------| + | Company context, tools, connectors overview | `docs/company-context.md` | + | People, contacts, relationships | `docs/contactdb-api.md` | + | Querying emails, meetings, chats, docs | `docs/dataindex-api.md` | + | Connector-to-entity-type mappings | `docs/connectors-and-sources.md` | + | Notebook creation patterns and templates | `docs/notebook-patterns.md` | + +3. **API base URLs** — ContactDB and DataIndex endpoints (both via Caddy proxy and direct). + +4. **Common query translation table** — Maps natural-language questions (e.g. "Who am I?", "Recent meetings") to the corresponding API calls. + +5. **Workflow rules** — When to create a notebook vs. 
answer inline, naming conventions, and the requirement to propose a plan before implementing. + +## Workflow + +### How it works + +1. **Ask a question in opencode** — Describe what you want to analyze (e.g. "Show me all meetings about Greyhaven in January"). + +2. **Agent reads AGENTS.md** — opencode picks up the routing guide and navigates to the relevant docs to understand the APIs. + +3. **Agent proposes a plan** — Before writing code, the agent outlines: Goal, Data Sources, Algorithm, and Output Format. + +4. **Agent creates a marimo notebook** — A `.py` file is written to `workflows/` following the naming convention `{NNN}_{topic}_{scope}.py`. + +5. **Iterate** — Run the notebook with `marimo edit workflows/{name}.py`, review the output, and ask the agent to refine. + +### Workflow output format + +Workflows are [marimo notebooks](https://marimo.io) — plain Python files with `@app.cell` decorators. They typically follow this structure: + +- **params cell** — User-editable parameters (search terms, date ranges, contact names) +- **config cell** — API base URLs +- **setup cell** — Shared imports (`httpx`, `polars`, `marimo`) +- **data cells** — Fetch and transform data from ContactDB / DataIndex +- **output cells** — Tables, charts, or markdown summaries + +### Naming convention + +``` +workflows/{NNN}_{topic}_{scope}.py +``` + +Examples: +- `001_greyhaven_meetings_january.py` +- `002_email_activity_q1.py` + +## Project Structure + +``` +internalai-agent/ +├── AGENTS.md # LLM agent routing guide (entry point) +├── README.md +├── docs/ +│ ├── company-context.md # Monadical org, tools, key concepts +│ ├── contactdb-api.md # ContactDB REST API reference +│ ├── dataindex-api.md # DataIndex REST API reference +│ ├── connectors-and-sources.md # Connector → entity type mappings +│ └── notebook-patterns.md # Marimo notebook templates and patterns +└── workflows/ # Generated analysis notebooks go here +``` diff --git a/docs/company-context.md b/docs/company-context.md new file mode 100644 index 0000000..414022b 
--- /dev/null +++ b/docs/company-context.md @@ -0,0 +1,43 @@ +# Company Context + +## About Monadical + +Monadical is a software consultancy founded in 2016. The company operates across multiple locations: Montreal and Vancouver (Canada), and Medellin and Cali (Colombia). The team builds internal products alongside client work. + +### Internal Products + +- **Reflector** — Meeting recording and transcription tool (produces meeting entities in DataIndex) +- **GreyHaven / InternalAI platform** — A local-first platform that aggregates personal data and resolves contacts to enable automation and analysis + +## Communication Tools + +| Tool | Role | Data in DataIndex? | +|------------|-----------------------------|---------------------| +| Zulip | Primary internal chat | Yes (connector: `zulip`) | +| Fastmail/Email | External communication | Yes (connector: `mbsync_email`) | +| Calendar | Scheduling (ICS feeds) | Yes (connector: `ics_calendar`) | +| Reflector | Meeting recordings | Yes (connector: `reflector`) | +| HedgeDoc | Collaborative documents | Yes (connector: `hedgedoc`) | + +## How the company works + +We use Zulip as our main hub for communication. Zulip has channels (top level) and topics (low level). Depending on the channel, different behaviors have to be adopted. + +### Zulip channels + +Here is a list of Zulip stream prefixes, with context on how the company is organized: + +- InternalAI (zulip:stream:193) is about this specific platform. +- Leads (zulip:stream:78) is where we talk about our leads/clients. We usually create one topic per lead/client, so if you are searching for information about a client, always check whether a related topic exists that matches the client or company name. +- Checkins (zulip:stream:24) usually has one topic per employee. This is where an employee indicates what they did or will do during a period of time, or just posts a status update. Not everybody uses the system on a regular basis. 
+- Devcap (zulip:stream:156) is where we talk about our investments / due diligence before investing. One topic per company. +- General (zulip:stream:21) is where we talk about different topics on various subjects, company-wide or about services. +- Engineering (zulip:stream:25) is where we talk about engineering issues / services / new tools to try +- Learning (zulip:stream:31) is where we share links about new tools / ideas or stuff to learn about +- Reflector (zulip:stream:155) is a dedicated stream about Reflector development and usage +- GreyHaven is separated into multiple streams: branding (zulip:stream:206), leads specific to GreyHaven (zulip:stream:208) with one topic per lead, and marketing (zulip:stream:212) + +### Meeting and Calendar + +Some people in the company have a dedicated room for their meetings in Reflector. This can be seen in the `room_name` field of the `meeting` entity. +For people like Max, DataIndex has calendar information, and most of his calendar events have a related meeting in Reflector. However, there is no direct relation between calendar events and Reflector meetings. A correlation has to be done to figure out which meeting corresponds to a given event. diff --git a/docs/connectors-and-sources.md b/docs/connectors-and-sources.md new file mode 100644 index 0000000..029e604 --- /dev/null +++ b/docs/connectors-and-sources.md @@ -0,0 +1,99 @@ +# Connectors and Data Sources + +Each connector ingests data from an external source into DataIndex. Connectors run periodic background syncs to keep data fresh. + +Use `list_connectors()` at runtime to see which connectors are actually configured — not all connectors below may be active in every deployment. 
+ +## Connector → Entity Type Mapping + +| Connector ID | Entity Types Produced | Description | +|------------------|-----------------------------------------------------------------|----------------------------------| +| `reflector` | `meeting` | Meeting recordings + transcripts | +| `ics_calendar` | `calendar_event` | ICS calendar feed events | +| `mbsync_email` | `email` | Email via mbsync IMAP sync | +| `zulip` | `conversation`, `conversation_message`, `threaded_conversation` | Zulip chat streams and topics | +| `babelfish` | `conversation_message`, `threaded_conversation` | Chat translation bridge | +| `hedgedoc` | `document` | HedgeDoc collaborative documents | +| `contactdb` | `contact` | Synced from ContactDB (static) | +| `browser_history`| `webpage` | Browser extension page visits | +| `api_document` | `document` | API-ingested documents (static) | + +## Per-Connector Details + +### `reflector` — Meeting Recordings + +Ingests meetings from Reflector, Monadical's meeting recording tool. + +- **Entity type:** `meeting` +- **Key fields:** `transcript`, `summary`, `participants`, `start_time`, `end_time`, `room_name` +- **Use cases:** Find meetings someone attended, search meeting transcripts, get summaries +- **Tip:** Filter with `contact_ids` to find meetings involving specific people. The `transcript` field contains speaker-diarized text. + +### `ics_calendar` — Calendar Events + +Parses ICS calendar feeds (Google Calendar, Outlook, etc.). + +- **Entity type:** `calendar_event` +- **Key fields:** `start_time`, `end_time`, `attendees`, `location`, `description`, `calendar_name` +- **Use cases:** Check upcoming events, find events with specific attendees, review past schedule +- **Tip:** Multiple calendar feeds may be configured as separate connectors (e.g., `personal_calendar`, `work_calendar`). Use `list_connectors()` to discover them. + +### `mbsync_email` — Email + +Syncs email via mbsync (IMAP). 
+ +- **Entity type:** `email` +- **Key fields:** `text_content`, `from_contact_id`, `to_contact_ids`, `cc_contact_ids`, `thread_id`, `has_attachments` +- **Use cases:** Find emails from/to someone, search email content, track email threads +- **Tip:** Use `from_contact_id` and `to_contact_ids` with `contact_ids` filter. For thread grouping, use the `thread_id` field. + +### `zulip` — Chat + +Ingests Zulip streams, topics, and messages. + +- **Entity types:** + - `conversation` — A Zulip stream/channel with recent messages + - `conversation_message` — Individual chat messages + - `threaded_conversation` — A topic thread within a stream +- **Key fields:** `message`, `mentioned_contact_ids`, `recent_messages` +- **Use cases:** Find discussions about a topic, track who said what, find @-mentions +- **Tip:** Use `threaded_conversation` to find topic-level discussions. Use `conversation_message` with `mentioned_contact_ids` to find messages that mention specific people. + +### `babelfish` — Translation Bridge + +Ingests translated chat messages from the Babelfish service. + +- **Entity types:** `conversation_message`, `threaded_conversation` +- **Use cases:** Similar to Zulip but for translated cross-language conversations +- **Tip:** Query alongside `zulip` connector for complete conversation coverage. + +### `hedgedoc` — Collaborative Documents + +Syncs documents from HedgeDoc (collaborative markdown editor). + +- **Entity type:** `document` +- **Key fields:** `content`, `description`, `url`, `revision_id` +- **Use cases:** Find documents by content, track document revisions +- **Tip:** Use `search()` for semantic document search rather than `query_entities` text filter. + +### `contactdb` — Contact Sync (Static) + +Mirrors contacts from ContactDB into DataIndex for unified search. + +- **Entity type:** `contact` +- **Note:** This is a read-only mirror. Use ContactDB MCP tools directly for contact operations. 
+ +### `browser_history` — Browser Extension (Static) + +Captures visited webpages from a browser extension. + +- **Entity type:** `webpage` +- **Key fields:** `url`, `visit_time`, `text_content` +- **Use cases:** Find previously visited pages, search page content + +### `api_document` — API Documents (Static) + +Documents ingested via the REST API (e.g., uploaded PDFs, imported files). + +- **Entity type:** `document` +- **Note:** These are ingested via `POST /api/v1/ingest/documents`, not periodic sync. diff --git a/docs/contactdb-api.md b/docs/contactdb-api.md new file mode 100644 index 0000000..b8a3935 --- /dev/null +++ b/docs/contactdb-api.md @@ -0,0 +1,154 @@ +# ContactDB API Reference + +ContactDB is the people directory. It stores contacts, their platform identities, relationships, notes, and links. Every person across all data sources resolves to a single ContactDB `contact_id`. + +**Base URL:** `http://localhost:42000/contactdb-api` (via Caddy) or `http://localhost:42800` (direct) + +## Core Entities + +### Contact + +The central entity — represents a person. + +| Field | Type | Description | +|----------------------|---------------------|------------------------------------------------| +| `id` | int | Unique contact ID | +| `name` | string | Display name | +| `emails` | EmailField[] | `{type, value, preferred}` | +| `phones` | PhoneField[] | `{type, value, preferred}` | +| `bio` | string? | Short biography | +| `avatar_url` | string? | Profile image URL | +| `personal_info` | PersonalInfo | Birthday, partner, children, role, company, location, how_we_met | +| `interests` | string[] | Topics of interest | +| `values` | string[] | Personal values | +| `tags` | string[] | User-assigned tags | +| `profile_description`| string? 
| Extended description | +| `is_placeholder` | bool | Auto-created stub (not yet fully resolved) | +| `is_service_account` | bool | Non-human account (bot, no-reply) | +| `stats` | ContactStats | Interaction statistics (see below) | +| `enrichment_data` | dict | Data from enrichment providers | +| `platform_identities`| PlatformIdentity[] | Identities on various platforms | +| `created_at` | datetime | When created | +| `updated_at` | datetime | Last modified | +| `merged_into_id` | int? | If merged, target contact ID | +| `deleted_at` | datetime? | Soft-delete timestamp | + +### ContactStats + +| Field | Type | Description | +|--------------------------|---------------|--------------------------------------| +| `total_messages` | int | Total messages across platforms | +| `platforms_count` | int | Number of platforms active on | +| `last_interaction_at` | string? | ISO datetime of last interaction | +| `interaction_count_30d` | int | Interactions in last 30 days | +| `interaction_count_90d` | int | Interactions in last 90 days | +| `hotness` | HotnessScore? | Composite engagement score (0-100) | + +### PlatformIdentity + +Links a contact to a specific platform account. + +| Field | Type | Description | +|--------------------|-----------|------------------------------------------| +| `id` | int | Identity record ID | +| `contact_id` | int | Parent contact | +| `source` | string | Data provenance (e.g., `dataindex_zulip`)| +| `platform` | string | Platform name (e.g., `email`, `zulip`) | +| `platform_user_id` | string | User ID on that platform | +| `display_name` | string? | Name shown on that platform | +| `avatar_url` | string? | Platform-specific avatar | +| `bio` | string? | Platform-specific bio | +| `extra_data` | dict | Additional platform-specific data | +| `first_seen_at` | datetime | When first observed | +| `last_seen_at` | datetime | When last observed | + +### Relationship + +Tracks connections between contacts. 
+ +| Field | Type | Description | +|------------------------|-----------|--------------------------------------| +| `id` | int | Relationship ID | +| `from_contact_id` | int | Source contact | +| `to_contact_id` | int | Target contact | +| `relationship_type` | string | Type (e.g., "colleague", "client") | +| `since_date` | date? | When relationship started | +| `relationship_metadata`| dict | Additional metadata | + +### Note + +Free-text notes attached to a contact. + +| Field | Type | Description | +|--------------|----------|----------------------| +| `id` | int | Note ID | +| `contact_id` | int | Parent contact | +| `content` | string | Note text | +| `created_by` | string | Who wrote it | +| `created_at` | datetime | When created | + +### Link + +External URLs associated with a contact. + +| Field | Type | Description | +|--------------|----------|--------------------------| +| `id` | int | Link ID | +| `contact_id` | int | Parent contact | +| `type` | string | Link type (e.g., "github", "linkedin") | +| `label` | string | Display label | +| `url` | string | URL | + +## REST Endpoints + +### GET `/api/contacts` — List/search contacts + +Primary way to find contacts. Returns `{contacts: [...], total, limit, offset}`. + +**Query parameters:** + +| Parameter | Type | Description | +|------------------------|---------------|----------------------------------------------| +| `search` | string? | Search in name and bio | +| `is_placeholder` | bool? | Filter by placeholder status | +| `is_service_account` | bool? | Filter by service account status | +| `sort_by` | string? | `"hotness"`, `"name"`, or `"updated_at"` | +| `min_hotness` | float? | Minimum hotness score (0-100) | +| `max_hotness` | float? | Maximum hotness score (0-100) | +| `platforms` | string[]? | Contacts with ALL specified platforms (AND) | +| `last_interaction_from`| string? | ISO datetime lower bound | +| `last_interaction_to` | string? 
| ISO datetime upper bound | +| `limit` | int | Max results (1-100, default 50) | +| `offset` | int | Pagination offset (default 0) | + +### GET `/api/contacts/me` — Get self contact + +Returns the platform operator's own contact record. **Call this first** in most workflows to get your own `contact_id`. + +### GET `/api/contacts/{id}` — Get contact by ID + +Get full details for a single contact by numeric ID. + +### GET `/api/contacts/by-email/{email}` — Get contact by email + +Look up a contact by email address. + +### Other Endpoints + +| Method | Path | Description | +|--------|-----------------------------------------|----------------------------------| +| POST | `/api/contacts` | Create contact | +| PUT | `/api/contacts/{id}` | Update contact | +| DELETE | `/api/contacts/{id}` | Delete contact | +| POST | `/api/contacts/merge` | Merge two contacts | +| GET | `/api/contacts/{id}/relationships` | List relationships | +| GET | `/api/contacts/{id}/notes` | List notes | +| GET | `/api/contacts/{id}/links` | List links | +| GET | `/api/platform-identities/contacts/{id}`| List platform identities | + +## Usage Pattern + +1. **Start with `GET /api/contacts/me`** to get the operator's contact ID +2. **Search by name** with `GET /api/contacts?search=Alice` +3. **Use contact IDs** from results as filters in DataIndex queries (`contact_ids` parameter) +4. **Paginate** large result sets with `offset` increments diff --git a/docs/dataindex-api.md b/docs/dataindex-api.md new file mode 100644 index 0000000..534ccd9 --- /dev/null +++ b/docs/dataindex-api.md @@ -0,0 +1,218 @@ +# DataIndex API Reference + +DataIndex aggregates data from all connected sources (email, calendar, Zulip, meetings, documents) into a unified query interface. Every piece of data is an **entity** with a common base structure plus type-specific fields. 
+ +**Base URL:** `http://localhost:42000/dataindex/api/v1` (via Caddy) or `http://localhost:42180/api/v1` (direct) + +## Entity Types + +All entities share these base fields: + +| Field | Type | Description | +|----------------------|-------------|---------------------------------------------| +| `id` | string | Format: `connector_name:native_id` | +| `entity_type` | string | One of the types below | +| `timestamp` | datetime | When the entity occurred | +| `contact_ids` | string[] | ContactDB IDs of people involved | +| `connector_id` | string | Which connector produced this | +| `title` | string? | Display title | +| `parent_id` | string? | Parent entity (e.g., thread for a message) | +| `raw_data` | dict | Original source data (excluded by default) | + +### `calendar_event` + +From ICS calendar feeds. + +| Field | Type | Description | +|-----------------------|-------------|--------------------------------| +| `start_time` | datetime? | Event start | +| `end_time` | datetime? | Event end | +| `all_day` | bool | All-day event flag | +| `description` | string? | Event description | +| `location` | string? | Event location | +| `attendees` | dict[] | Attendee list | +| `organizer_contact_id`| string? | ContactDB ID of organizer | +| `status` | string? | Event status | +| `calendar_name` | string? | Source calendar name | +| `meeting_url` | string? | Video call link | + +### `meeting` + +From Reflector (recorded meetings with transcripts). + +| Field | Type | Description | +|--------------------|---------------------|-----------------------------------| +| `start_time` | datetime? | Meeting start | +| `end_time` | datetime? | Meeting end | +| `participants` | MeetingParticipant[]| People in the meeting | +| `meeting_platform` | string? | Platform (e.g., "jitsi") | +| `transcript` | string? | Full transcript text | +| `summary` | string? | AI-generated summary | +| `meeting_url` | string? | Meeting link | +| `recording_url` | string? 
| Recording link | +| `location` | string? | Physical location | +| `room_name` | string? | Virtual room name (also indicates meeting location — see below) | + +**MeetingParticipant** fields: `display_name`, `contact_id?`, `platform_user_id?`, `email?`, `speaker?` + +> **`room_name` as location indicator:** The `room_name` field often encodes where the meeting took place (e.g., a Jitsi room name like `standup-office-bogota`). Use it to infer the meeting location when `location` is not set. + +> **Participant and contact coverage is incomplete.** Meeting data comes from Reflector, which only tracks users who are logged into the Reflector platform. This means: +> +> - **`contact_ids`** only contains ContactDB IDs for Reflector-logged participants who were matched to a known contact. It will often be a **subset** of the actual attendees — do not assume it is the full list. +> - **`participants`** is more complete than `contact_ids` but still only includes people detected by Reflector. Not all participants have accounts or could be identified — some attendees may be entirely absent from this list. +> - **`contact_id` within a participant** may be `null` if the person was detected but couldn't be matched to a ContactDB entry. +> +> **Consequence for queries:** Filtering meetings by `contact_ids` will **miss meetings** where the person attended but wasn't logged into Reflector or wasn't resolved. To get better coverage, combine multiple strategies: +> +> 1. Filter by `contact_ids` for resolved participants +> 2. Search `participants[].display_name` client-side for name matches +> 3. Use `POST /search` with the person's name to search meeting transcripts and summaries + +### `email` + +From mbsync email sync. + +| Field | Type | Description | +|--------------------|-----------|--------------------------------------| +| `thread_id` | string? | Email thread grouping | +| `text_content` | string? | Plain text body | +| `html_content` | string? 
| HTML body | +| `snippet` | string? | Preview snippet | +| `from_contact_id` | string? | Sender's ContactDB ID | +| `to_contact_ids` | string[] | Recipient ContactDB IDs | +| `cc_contact_ids` | string[] | CC recipient ContactDB IDs | +| `has_attachments` | bool | Has attachments flag | +| `attachments` | dict[] | Attachment metadata | + +### `conversation` + +A Zulip stream/channel. + +| Field | Type | Description | +|--------------------|---------|----------------------------------------| +| `recent_messages` | dict[] | Recent messages in the conversation | + +### `conversation_message` + +A single message in a Zulip conversation. + +| Field | Type | Description | +|-------------------------|-----------|-----------------------------------| +| `message` | string? | Message text content | +| `mentioned_contact_ids` | string[] | ContactDB IDs of mentioned people | + +### `threaded_conversation` + +A Zulip topic thread (group of messages under a topic). + +| Field | Type | Description | +|--------------------|---------|----------------------------------------| +| `recent_messages` | dict[] | Recent messages in the thread | + +### `document` + +From HedgeDoc, API ingestion, or other document sources. + +| Field | Type | Description | +|----------------|-----------|------------------------------| +| `content` | string? | Document body text | +| `description` | string? | Document description | +| `mimetype` | string? | MIME type | +| `url` | string? | Source URL | +| `revision_id` | string? | Revision identifier | + +### `webpage` + +From browser history extension. + +| Field | Type | Description | +|----------------|-----------|------------------------------| +| `url` | string | Page URL | +| `visit_time` | datetime | When visited | +| `text_content` | string? | Page text content | + +## REST Endpoints + +### GET `/api/v1/query` — Exhaustive Filtered Enumeration + +Use when you need **all** entities matching specific criteria. Supports pagination. 
+ +**When to use:** "List all meetings since January", "Get all emails from Alice", "Count calendar events this week" + +**Query parameters:** + +| Parameter | Type | Description | +|------------------|---------------|------------------------------------------------| +| `entity_types` | string (repeat) | Filter by type — repeat param for multiple: `?entity_types=email&entity_types=meeting` | +| `contact_ids` | string | Comma-separated ContactDB IDs: `"1,42"` | +| `connector_ids` | string | Comma-separated connector IDs: `"zulip,reflector"` | +| `date_from` | string | ISO datetime lower bound (UTC if no timezone) | +| `date_to` | string | ISO datetime upper bound | +| `search` | string? | Text filter on content fields | +| `parent_id` | string? | Filter by parent entity | +| `thread_id` | string? | Filter emails by thread ID | +| `room_name` | string? | Filter meetings by room name | +| `limit` | int | Max results per page (default 50) | +| `offset` | int | Pagination offset (default 0) | +| `sort_by` | string | `"timestamp"` (default), `"title"`, `"contact_activity"`, etc. | +| `sort_order` | string | `"desc"` (default) or `"asc"` | +| `include_raw_data`| bool | Include raw_data field (default false) | + +**Response format:** + +```json +{ + "items": [...], + "total": 152, + "page": 1, + "size": 50, + "pages": 4 +} +``` + +**Pagination:** loop with offset increments until `offset >= total`. See [notebook-patterns.md] for a reusable helper. + +### POST `/api/v1/search` — Semantic Search + +Use when you need **relevant** results for a natural-language question. Returns ranked text chunks. No pagination — set a higher `limit` instead. 
+ +**When to use:** "What was discussed about the product roadmap?", "Find conversations about hiring" + +**Request body (JSON):** + +```json +{ + "search_text": "product roadmap decisions", + "entity_types": ["meeting", "threaded_conversation"], + "contact_ids": ["1", "42"], + "date_from": "2025-01-01T00:00:00Z", + "date_to": "2025-06-01T00:00:00Z", + "connector_ids": ["reflector", "zulip"], + "limit": 20 +} +``` + +**Response:** `{results: [...chunks], total_count}` — each chunk has `entity_ids`, `entity_type`, `connector_id`, `content`, `timestamp`. + +### GET `/api/v1/entities/{id}` — Get Entity by ID + +Retrieve full details of a single entity. The `entity_id` format is `connector_name:native_id`. + +### GET `/api/v1/connectors/status` — Connector Status + +Get sync status for all connectors (last sync time, entity count, health). + +## Common Query Recipes + +| Question | entity_type + connector_id | +|---------------------------------------|------------------------------------------| +| Meetings I attended | `meeting` + `reflector`, with your contact_id | +| Upcoming calendar events | `calendar_event` + `ics_calendar`, date_from=now | +| Emails from someone | `email` + `mbsync_email`, with their contact_id | +| Zulip threads about a topic | `threaded_conversation` + `zulip`, search="topic" | +| All documents | `document` + `hedgedoc` | +| Chat messages mentioning someone | `conversation_message` + `zulip`, with contact_id | +| What was discussed about X? | Use `POST /search` with `search_text` | + +[notebook-patterns.md]: ./notebook-patterns.md diff --git a/docs/notebook-patterns.md b/docs/notebook-patterns.md new file mode 100644 index 0000000..e224fb3 --- /dev/null +++ b/docs/notebook-patterns.md @@ -0,0 +1,545 @@ +# Marimo Notebook Patterns + +This guide covers how to create [marimo](https://marimo.io) notebooks for data analysis against the InternalAI platform APIs. 
Marimo notebooks are plain `.py` files with reactive cells — no `.ipynb` format, no Jupyter dependency. + +## Marimo Basics + +A marimo notebook is a Python file with `@app.cell` decorated functions. Each cell returns values as a tuple, and other cells receive them as function parameters — marimo builds a reactive DAG automatically. + +```python +import marimo +app = marimo.App() + +@app.cell +def cell_one(): + x = 42 + return (x,) + +@app.cell +def cell_two(x): + # Re-runs automatically when x changes + result = x * 2 + return (result,) +``` + +**Key rules:** +- Cells declare dependencies via function parameters +- Cells return values as tuples: `return (var1, var2,)` +- The **last expression** in a cell is displayed as rich output in the marimo UI (dataframes render as tables, dicts as collapsible trees) +- Use `mo.md("# heading")` for formatted markdown output (import `mo` once in setup — see below) +- No manual execution order; the DAG determines it +- **Variable names must be unique across cells.** Every variable assigned at the top level of a cell is tracked by marimo's DAG. If two cells both define `resp`, marimo raises `MultipleDefinitionError` and refuses to run. Prefix cell-local variables with `_` (e.g., `_resp`, `_rows`, `_data`) to make them **private** to that cell — marimo ignores `_`-prefixed names. +- **Import shared modules once** in a single setup cell and pass them as cell parameters. Do NOT `import marimo as mo` in multiple cells — that defines `mo` twice. Instead, import it once in `setup` and receive it via `def my_cell(mo):`. + +### Cell Variable Scoping — Example + +This is the **most common mistake**. Any variable assigned at the top level of a cell (not inside a `def` or comprehension) is tracked by marimo. If two cells assign the same name, the notebook refuses to run. 
+ +**BROKEN** — `resp` is defined at top level in both cells: + +```python +# Cell A +@app.cell +def search_meetings(client, DATAINDEX): + resp = client.post(f"{DATAINDEX}/search", json={...}) # defines 'resp' + resp.raise_for_status() + results = resp.json()["results"] + return (results,) + +# Cell B +@app.cell +def fetch_details(client, DATAINDEX, results): + resp = client.get(f"{DATAINDEX}/entities/{results[0]}") # also defines 'resp' → ERROR + meeting = resp.json() + return (meeting,) +``` + +> **Error:** `MultipleDefinitionError: variable 'resp' is defined in multiple cells` + +**FIXED** — prefix cell-local variables with `_`: + +```python +# Cell A +@app.cell +def search_meetings(client, DATAINDEX): + _resp = client.post(f"{DATAINDEX}/search", json={...}) # _resp is cell-private + _resp.raise_for_status() + results = _resp.json()["results"] + return (results,) + +# Cell B +@app.cell +def fetch_details(client, DATAINDEX, results): + _resp = client.get(f"{DATAINDEX}/entities/{results[0]}") # _resp is cell-private, no conflict + meeting = _resp.json() + return (meeting,) +``` + +**Rule of thumb:** if a variable is only used within the cell to compute a return value, prefix it with `_`. Only leave names unprefixed if another cell needs to receive them. + +> **Note:** Variables inside nested `def` functions are naturally local and don't need `_` prefixes — e.g., `resp` inside a `def fetch_all(...)` helper is fine because it's scoped to the function, not the cell. 
+ +### Inline Dependencies with PEP 723 + +Use PEP 723 `/// script` metadata so `uv run` auto-installs dependencies: + +```python +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "httpx", +# "polars", +# ] +# /// +``` + +### Running Notebooks + +```bash +uvx marimo edit notebook.py # Interactive editor (best for development) +uvx marimo run notebook.py # Read-only web app +uv run notebook.py # Script mode (terminal output) +``` + +### Inspecting Cell Outputs + +In `marimo edit`, every cell's last expression is displayed as rich output below the cell. This is the primary way to introspect API responses: + +- **Dicts/lists** render as collapsible JSON trees — click to expand nested fields +- **Polars/Pandas DataFrames** render as interactive sortable tables +- **Strings** render as plain text + +To inspect a raw API response, just make it the last expression: + +```python +@app.cell +def inspect_response(client, DATAINDEX): + _resp = client.get(f"{DATAINDEX}/query", params={ + "entity_types": "meeting", "limit": 2, + }) + _resp.json() # This gets displayed as a collapsible JSON tree +``` + +To inspect an intermediate value alongside other work, use `mo.accordion` or return it: + +```python +@app.cell +def debug_meetings(meetings, mo): + mo.md(f"**Count:** {len(meetings)}") + # Show first item structure for inspection + mo.accordion({"First meeting raw": mo.json(meetings[0])}) if meetings else None +``` + +## Notebook Skeleton + +Every notebook against InternalAI follows this structure: + +```python +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "httpx", +# "polars", +# ] +# /// + +import marimo +app = marimo.App() + +@app.cell +def params(): + """User parameters — edit these to change the workflow's behavior.""" + SEARCH_TERMS = ["greyhaven"] + DATE_FROM = "2026-01-01T00:00:00Z" + DATE_TO = "2026-02-01T00:00:00Z" + TARGET_PERSON = None # Set to a name like "Alice" to filter by person, or None for all +
return DATE_FROM, DATE_TO, SEARCH_TERMS, TARGET_PERSON + +@app.cell +def config(): + BASE = "http://localhost:42000" + CONTACTDB = f"{BASE}/contactdb-api" + DATAINDEX = f"{BASE}/dataindex/api/v1" + return (CONTACTDB, DATAINDEX,) + +@app.cell +def setup(): + import httpx + import marimo as mo + import polars as pl + client = httpx.Client(timeout=30) + return (client, mo, pl,) + +# --- your IN / ETL / OUT cells here --- + +if __name__ == "__main__": + app.run() +``` + +**The `params` cell must always be the first cell** after `app = marimo.App()`. It contains all user-configurable constants (search terms, date ranges, target names, etc.) as plain Python values. This way the user can tweak the workflow by editing a single cell at the top — no need to hunt through the code for hardcoded values. + +## Pagination Helper + +The DataIndex `GET /query` endpoint paginates with `limit` and `offset`. Always paginate — result sets can be large. + +```python +@app.cell +def helpers(client): + def fetch_all(url, params): + """Fetch all pages from a paginated DataIndex endpoint.""" + all_items = [] + limit = params.get("limit", 50) + params = {**params, "limit": limit, "offset": 0} + while True: + resp = client.get(url, params=params) + resp.raise_for_status() + data = resp.json() + all_items.extend(data["items"]) + if params["offset"] + limit >= data["total"]: + break + params["offset"] += limit + return all_items + + def resolve_contact(name, contactdb_url): + """Find a contact by name, return the first matching contact record (a dict; use its "id" field for queries).""" + resp = client.get(f"{contactdb_url}/api/contacts", params={"search": name}) + resp.raise_for_status() + contacts = resp.json()["contacts"] + if not contacts: + raise ValueError(f"No contact found for '{name}'") + return contacts[0] + + return (fetch_all, resolve_contact,) +``` + +## Pattern 1: Emails Involving a Specific Person + +Emails have `from_contact_id`, `to_contact_ids`, and `cc_contact_ids`.
The query API's `contact_ids` filter matches entities where the contact appears in **any** of these roles. + +```python +@app.cell +def find_person(resolve_contact, CONTACTDB): + target = resolve_contact("Alice", CONTACTDB) + target_id = target["id"] + target_name = target["name"] + return (target_id, target_name,) + +@app.cell +def fetch_emails(fetch_all, DATAINDEX, target_id): + emails = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "email", + "contact_ids": str(target_id), + "date_from": "2025-01-01T00:00:00Z", + "sort_order": "desc", + }) + return (emails,) + +@app.cell +def email_table(emails, target_id, target_name, pl): + email_df = pl.DataFrame([{ + "date": e["timestamp"][:10], + "subject": e.get("title", "(no subject)"), + "direction": ( + "sent" if str(target_id) == str(e.get("from_contact_id")) + else "received" + ), + "snippet": (e.get("snippet") or e.get("text_content") or "")[:100], + } for e in emails]) + return (email_df,) + +@app.cell +def show_emails(email_df, target_name, mo): + mo.md(f"## Emails involving {target_name} ({len(email_df)} total)") + +@app.cell +def display_email_table(email_df): + email_df # Renders as interactive table in marimo edit +``` + +## Pattern 2: Meetings with a Specific Participant + +Meetings have a `participants` list where each entry may or may not have a resolved `contact_id`. The query API's `contact_ids` filter only matches **resolved** participants. + +**Strategy:** Query by `contact_ids` to get meetings with resolved participants, then optionally do a client-side check on `participants[].display_name` or `transcript` for unresolved ones. 
+ +```python +@app.cell +def fetch_meetings(fetch_all, DATAINDEX, target_id, my_id): + # Get meetings where the target appears in contact_ids + resolved_meetings = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "meeting", + "contact_ids": str(target_id), + "date_from": "2025-01-01T00:00:00Z", + }) + return (resolved_meetings,) + +@app.cell +def meeting_table(resolved_meetings, target_name, pl): + _rows = [] + for _m in resolved_meetings: + _participants = _m.get("participants", []) + _names = [_p["display_name"] for _p in _participants] + _rows.append({ + "date": (_m.get("start_time") or _m["timestamp"])[:10], + "title": _m.get("title", _m.get("room_name", "Untitled")), + "participants": ", ".join(_names), + "has_transcript": _m.get("transcript") is not None, + "has_summary": _m.get("summary") is not None, + }) + meeting_df = pl.DataFrame(_rows) + return (meeting_df,) +``` + +To also find meetings where the person was present but **not resolved** (guest), search the transcript: + +```python +@app.cell +def search_unresolved(client, DATAINDEX, target_name): + # Semantic search for the person's name in meeting transcripts + _resp = client.post(f"{DATAINDEX}/search", json={ + "search_text": target_name, + "entity_types": ["meeting"], + "limit": 50, + }) + _resp.raise_for_status() + transcript_hits = _resp.json()["results"] + return (transcript_hits,) +``` + +## Pattern 3: Calendar Events → Meeting Correlation + +Calendar events and meetings are separate entities from different connectors. To find which calendar events had a corresponding recorded meeting, match by time overlap. 
+ +```python +@app.cell +def fetch_calendar_and_meetings(fetch_all, DATAINDEX, my_id): + events = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "calendar_event", + "contact_ids": str(my_id), + "date_from": "2025-01-01T00:00:00Z", + "sort_by": "timestamp", + "sort_order": "asc", + }) + meetings = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "meeting", + "contact_ids": str(my_id), + "date_from": "2025-01-01T00:00:00Z", + }) + return (events, meetings,) + +@app.cell +def correlate(events, meetings, pl): + from datetime import datetime, timedelta + + def _parse_dt(s): + if not s: + return None + return datetime.fromisoformat(s.replace("Z", "+00:00")) + + # Index meetings by start_time for matching + _meeting_by_time = {} + for _m in meetings: + _start = _parse_dt(_m.get("start_time")) + if _start: + _meeting_by_time[_start] = _m + + _rows = [] + for _ev in events: + _ev_start = _parse_dt(_ev.get("start_time")) + _ev_end = _parse_dt(_ev.get("end_time")) + if not _ev_start: + continue + + # Find meeting within 15-min window of calendar event start + _matched = None + for _m_start, _m in _meeting_by_time.items(): + if abs((_m_start - _ev_start).total_seconds()) < 900: + _matched = _m + break + + _rows.append({ + "date": _ev_start.strftime("%Y-%m-%d"), + "time": _ev_start.strftime("%H:%M"), + "event_title": _ev.get("title", "(untitled)"), + "has_recording": _matched is not None, + "meeting_title": _matched.get("title", "") if _matched else "", + "attendee_count": len(_ev.get("attendees", [])), + }) + + calendar_df = pl.DataFrame(_rows) + return (calendar_df,) +``` + +## Pattern 4: Full Interaction Timeline for a Person + +Combine emails, meetings, and Zulip messages into a single chronological view. 
+ +```python +@app.cell +def fetch_all_interactions(fetch_all, DATAINDEX, target_id): + all_entities = fetch_all(f"{DATAINDEX}/query", { + "contact_ids": str(target_id), + "date_from": "2025-01-01T00:00:00Z", + "sort_by": "timestamp", + "sort_order": "desc", + }) + return (all_entities,) + +@app.cell +def interaction_timeline(all_entities, target_name, pl): + _rows = [] + for _e in all_entities: + _etype = _e["entity_type"] + _summary = "" + if _etype == "email": + _summary = _e.get("snippet") or _e.get("title") or "" + elif _etype == "meeting": + _summary = _e.get("summary") or _e.get("title") or "" + elif _etype == "conversation_message": + _summary = (_e.get("message") or "")[:120] + elif _etype == "threaded_conversation": + _summary = _e.get("title") or "" + elif _etype == "calendar_event": + _summary = _e.get("title") or "" + else: + _summary = _e.get("title") or _e["entity_type"] + + _rows.append({ + "date": _e["timestamp"][:10], + "type": _etype, + "source": _e["connector_id"], + "summary": _summary[:120], + }) + + timeline_df = pl.DataFrame(_rows) + return (timeline_df,) + +@app.cell +def show_timeline(timeline_df, target_name, mo): + mo.md(f"## Interaction Timeline: {target_name} ({len(timeline_df)} events)") + +@app.cell +def display_timeline(timeline_df): + timeline_df +``` + +## Do / Don't — Quick Reference for LLM Agents + +When generating marimo notebooks, follow these rules strictly. Violations cause `MultipleDefinitionError` at runtime. + +### Do + +- **Prefix cell-local variables with `_`** — `_resp`, `_rows`, `_m`, `_data`, `_chunk`. Marimo ignores `_`-prefixed names so they won't clash across cells. +- **Import shared modules once in `setup`** and pass them as cell parameters: `def my_cell(client, mo, pl):`. +- **Give returned DataFrames unique names** — `email_df`, `meeting_df`, `timeline_df`. Never use a bare `df` that might collide with another cell. 
+- **Return only values other cells need** — everything else should be `_`-prefixed and stays private to the cell. +- **Use `from datetime import datetime` inside the one cell** that needs it. A top-level import in a cell defines the imported name (`datetime`) like any other variable, so importing the same name at the top level of a second cell raises `MultipleDefinitionError`. Imports placed inside a nested `def` helper are local to that function and never clash. +- **Every non-utility cell must show a preview** — see the "Cell Output Previews" section below. +- **Put all user parameters in a `params` cell as the first cell** — date ranges, search terms, target names, limits. Never hardcode these values deeper in the notebook. + +### Don't + +- **Don't define the same variable name in two cells** — even `resp = ...` in cell A and `resp = ...` in cell B is a fatal error. +- **Don't `import marimo as mo` in multiple cells** — this defines `mo` twice. Import it once in `setup`, then receive it via `def my_cell(mo):`. +- **Don't use generic top-level names** like `df`, `rows`, `resp`, `data`, `result` — either prefix with `_` or give them a unique descriptive name. +- **Don't return temporary variables** — if `_rows` is only used to build a DataFrame, keep it `_`-prefixed and only return the DataFrame. +- **Don't use `import X` at the top level of multiple cells** for the same module — the module variable name would be duplicated. Import once in `setup` or use `_`-prefixed local imports (`_json = __import__("json")`). + +## Cell Output Previews + +Every cell that fetches, transforms, or produces data **must display a preview** so the user can validate results at each step. The only exceptions are **utility cells** (config, setup, helpers) that only define constants or functions. + +Think from the user's perspective: when they open the notebook in `marimo edit`, each cell should tell them something useful — a count, a sample, a summary. Silent cells that do work but show nothing are hard to debug and validate.
+ +### What to show + +| Cell type | What to preview | +|-----------|----------------| +| API fetch (list of items) | `mo.md(f"**Fetched {len(items)} meetings**")` | +| DataFrame build | The DataFrame itself as last expression (renders as interactive table) | +| Scalar result | `mo.md(f"**Contact:** {name} (id={contact_id})")` | +| Search / filter | `mo.md(f"**{len(hits)} results** matching '{term}'")` | +| Final output | Full DataFrame or `mo.md()` summary as last expression | + +### Example: fetch cell with preview + +**Bad** — cell runs silently, user sees nothing: + +```python +@app.cell +def fetch_meetings(fetch_all, DATAINDEX, my_id): + meetings = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "meeting", + "contact_ids": str(my_id), + }) + return (meetings,) +``` + +**Good** — cell shows a count so the user knows it worked: + +```python +@app.cell +def fetch_meetings(fetch_all, DATAINDEX, my_id, mo): + meetings = fetch_all(f"{DATAINDEX}/query", { + "entity_types": "meeting", + "contact_ids": str(my_id), + }) + mo.md(f"**Fetched {len(meetings)} meetings**") + return (meetings,) +``` + +### Example: transform cell with table preview + +**Bad** — builds DataFrame but doesn't display it: + +```python +@app.cell +def build_table(meetings, pl): + _rows = [{"date": _m["timestamp"][:10], "title": _m.get("title", "")} for _m in meetings] + meeting_df = pl.DataFrame(_rows) + return (meeting_df,) +``` + +**Good** — DataFrame is the last expression, so marimo renders it as an interactive table: + +```python +@app.cell +def build_table(meetings, pl, mo): + _rows = [{"date": _m["timestamp"][:10], "title": _m.get("title", "")} for _m in meetings] + meeting_df = pl.DataFrame(_rows).sort("date") + mo.md(f"### Meetings ({len(meeting_df)} results)") + return (meeting_df,) + +@app.cell +def show_meeting_table(meeting_df): + meeting_df # Renders as interactive sortable table +``` + +### Utility cells (no preview needed) + +Config, setup, and helper cells that only define 
constants or functions don't need previews: + +```python +@app.cell +def config(): + BASE = "http://localhost:42000" + CONTACTDB = f"{BASE}/contactdb-api" + DATAINDEX = f"{BASE}/dataindex/api/v1" + return CONTACTDB, DATAINDEX + +@app.cell +def helpers(client): + def fetch_all(url, params): + ... + return (fetch_all,) +``` + +## Tips + +- Use `marimo edit` during development to see cell outputs interactively +- Make raw API responses the last expression in a cell to inspect their structure +- Use `polars` over `pandas` for better performance and type safety +- Set `timeout=30` on httpx clients — some queries over large date ranges are slow +- Name cells descriptively — function names appear in the marimo sidebar diff --git a/workflows/.empty b/workflows/.empty new file mode 100644 index 0000000..e69de29