diff --git a/docs/notebook-patterns.md b/docs/notebook-patterns.md index 4b0bb54..2f74464 100644 --- a/docs/notebook-patterns.md +++ b/docs/notebook-patterns.md @@ -137,13 +137,12 @@ If you write `await` in a non-async cell, marimo cannot parse the cell and saves If a cell defines Pydantic models (or any class) that other cells need, it **must** return them: ```python +# BaseModel and Field are imported in the setup cell and received as parameters @app.cell -def models(): - from pydantic import BaseModel - +def models(BaseModel, Field): class MeetingSentiment(BaseModel): overall_sentiment: str - sentiment_score: int + sentiment_score: int = Field(description="Score from -10 to +10") class FrustrationExtraction(BaseModel): has_frustrations: bool @@ -291,14 +290,15 @@ def config(): @app.cell def setup(): from dotenv import load_dotenv - load_dotenv() # Load .env from the project root + load_dotenv(".env") # Load .env from the project root import asyncio # All imports go here — never import inside other cells import httpx import marimo as mo import polars as pl + from pydantic import BaseModel, Field client = httpx.Client(timeout=30) - return (asyncio, client, mo, pl,) + return (asyncio, client, mo, pl, BaseModel, Field,) # --- your IN / ETL / OUT cells here --- @@ -306,7 +306,7 @@ if __name__ == "__main__": app.run() ``` -> **`load_dotenv()`** reads the `.env` file from the project root (walks up from the notebook's directory). This makes `LLM_API_KEY` and other env vars available to `os.getenv()` calls in `lib/llm.py` without requiring the shell to have them pre-set. Always include `python-dotenv` in PEP 723 dependencies and call `load_dotenv()` early in the setup cell. +> **`load_dotenv(".env")`** reads the `.env` file explicitly by name. This makes `LLM_API_KEY` and other env vars available to `os.getenv()` calls in `lib/llm.py` without requiring the shell to have them pre-set. 
Always include `python-dotenv` in PEP 723 dependencies and call `load_dotenv(".env")` early in the setup cell. Because `".env"` is a relative path, python-dotenv resolves it against the current working directory rather than searching parent directories, so launch the notebook from the project root. **The `params` cell must always be the first cell** after `app = marimo.App()`. It contains all user-configurable constants (search terms, date ranges, target names, etc.) as plain Python values. This way the user can tweak the workflow by editing a single cell at the top — no need to hunt through the code for hardcoded values. @@ -581,15 +581,16 @@ When you need to classify, score, or extract structured information from each en @app.cell def setup(): from dotenv import load_dotenv - load_dotenv() # Makes LLM_API_KEY available to lib/llm.py + load_dotenv(".env") # Makes LLM_API_KEY available to lib/llm.py import asyncio import httpx import marimo as mo import polars as pl + from pydantic import BaseModel, Field from lib.llm import llm_call client = httpx.Client(timeout=30) - return (asyncio, client, llm_call, mo, pl,) + return (asyncio, client, llm_call, mo, pl, BaseModel, Field,) ``` ### Define a response model @@ -598,8 +599,7 @@ Create a Pydantic model that describes the structured output you want from the L ```python @app.cell -def models(): - from pydantic import BaseModel +def models(BaseModel, Field): class RelevanceScore(BaseModel): relevant: bool @@ -653,11 +653,12 @@ When generating marimo notebooks, follow these rules strictly. Violations cause - **Return only values other cells need** — everything else should be `_`-prefixed and stays private to the cell. - **Import stdlib modules in `setup` too** — even `from datetime import datetime` creates a top-level name. If two cells both import `datetime`, marimo errors. Import it once in `setup` and receive it as a parameter, or use it inside a `_`-prefixed helper function where it's naturally scoped. - **Every non-utility cell must show a preview** — see the "Cell Output Previews" section below. 
+- **Use separate display cells for DataFrames** — the build cell returns the DataFrame and shows a `mo.md()` count/heading; a standalone display cell (e.g., `def show_table(df): df`) renders it as an interactive table the user can sort and filter. - **Keep cell output expressions at the top level** — if a cell conditionally displays a DataFrame, initialize `_output = None` before the `if`/`else`, assign inside the branches, then put `_output` as the last top-level expression. Expressions inside `if`/`else`/`for` blocks are silently ignored by marimo. - **Put all user parameters in a `params` cell as the first cell** — date ranges, search terms, target names, limits. Never hardcode these values deeper in the notebook. - **Declare cells as `async def` when using `await`** — `@app.cell` followed by `async def cell_name(...)`. This includes cells using `asyncio.gather`, `await llm_call(...)`, or any async API. - **Return classes/models from cells that define them** — if a cell defines `class MyModel(BaseModel)`, return it so other cells can use it as a parameter: `return (MyModel,)`. -- **Use `python-dotenv` to load `.env`** — add `python-dotenv` to PEP 723 dependencies and call `load_dotenv()` early in the setup cell (before importing `lib.llm`). This ensures `LLM_API_KEY` and other env vars are available without requiring them to be pre-set in the shell. +- **Use `python-dotenv` to load `.env`** — add `python-dotenv` to PEP 723 dependencies and call `load_dotenv(".env")` early in the setup cell (before importing `lib.llm`). This ensures `LLM_API_KEY` and other env vars are available without requiring them to be pre-set in the shell. 
### Don't @@ -724,7 +725,7 @@ def build_table(meetings, pl): return (meeting_df,) ``` -**Good** — DataFrame is the last expression, so marimo renders it as an interactive table: +**Good** — the build cell shows a `mo.md()` count, and a **separate display cell** renders the DataFrame as an interactive table: ```python @app.cell @@ -739,6 +740,27 @@ def show_meeting_table(meeting_df): meeting_df # Renders as interactive sortable table ``` +### Separate display cells for DataFrames + +When a cell builds a DataFrame, use **two cells**: one that builds and returns it (with a `mo.md()` summary), and a standalone display cell that renders it as a table. This keeps the build logic clean and gives the user an interactive table they can sort and filter in the marimo UI. + +```python +# Cell 1: build and return the DataFrame, show a count +@app.cell +def build_sentiment_table(analyzed_meetings, pl, mo): + _rows = [...] + sentiment_df = pl.DataFrame(_rows).sort("date", descending=True) + mo.md(f"### Sentiment Analysis ({len(sentiment_df)} meetings)") + return (sentiment_df,) + +# Cell 2: standalone display — just the DataFrame, nothing else +@app.cell +def show_sentiment_table(sentiment_df): + sentiment_df +``` + +This pattern makes every result inspectable. The build cell's `mo.md()` output gives a quick count/heading; the display cell lets the user explore the full data interactively. + ### Utility cells (no preview needed) Config, setup, and helper cells that only define constants or functions don't need previews: diff --git a/workflows/lib/llm.py b/workflows/lib/llm.py index c2e4786..d65c510 100644 --- a/workflows/lib/llm.py +++ b/workflows/lib/llm.py @@ -1,6 +1,7 @@ """Simple LLM helper for workbooks using Mirascope v2.""" import os +import re from typing import TypeVar from mirascope import llm @@ -23,6 +24,15 @@ llm.register_provider( ) +def _sanitize_json(text: str) -> str: + """Strip control characters (U+0000–U+001F) that break JSON parsing. 
+ + Some LLMs emit literal newlines/tabs inside JSON string values, + which is invalid per the JSON spec. Replace them with spaces. + """ + return re.sub(r"[\x00-\x1f]+", " ", text) + + async def llm_call( prompt: str, response_model: type[T], @@ -47,4 +57,8 @@ async def llm_call( return f"{system_prompt}\n\n{prompt}" response = await _call() - return response.parse() + try: + return response.parse() + except Exception: + # Fallback: sanitize control characters and parse manually + return response_model.model_validate_json(_sanitize_json(response.content))