diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..e0bb0fe --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,243 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + min_runs: + description: "Minimum benchmark runs" + required: false + default: "30" + quick: + description: "Quick mode (fewer runs)" + required: false + default: "false" + type: boolean + + # Run on PRs that potentially touch performance-sensitive code + # pull_request: + # branches: [main] + # paths: + # - "internal/sandbox/**" + # - "internal/proxy/**" + # - "cmd/fence/**" + +permissions: + contents: read + +jobs: + benchmark-linux: + name: Benchmark (Linux) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Download dependencies + run: go mod download + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + bubblewrap \ + socat \ + uidmap \ + curl \ + netcat-openbsd \ + ripgrep \ + hyperfine \ + jq \ + bc + # Configure subuid/subgid + echo "$(whoami):100000:65536" | sudo tee -a /etc/subuid + echo "$(whoami):100000:65536" | sudo tee -a /etc/subgid + sudo chmod u+s $(which bwrap) + + - name: Install benchstat + run: go install golang.org/x/perf/cmd/benchstat@latest + + - name: Build fence + run: make build-ci + + - name: Run Go microbenchmarks + run: | + mkdir -p benchmarks + go test -run=^$ -bench=. -benchmem -count=10 ./internal/sandbox/... 
| tee benchmarks/go-bench-linux.txt + + - name: Run CLI benchmarks + run: | + MIN_RUNS="${{ github.event.inputs.min_runs || '30' }}" + QUICK="${{ github.event.inputs.quick || 'false' }}" + + if [[ "$QUICK" == "true" ]]; then + ./scripts/benchmark.sh -q -o benchmarks + else + ./scripts/benchmark.sh -n "$MIN_RUNS" -o benchmarks + fi + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-linux + path: benchmarks/ + retention-days: 30 + + - name: Summary + run: | + echo "## Linux Benchmark Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + for f in benchmarks/*.md; do + [[ -f "$f" ]] && cat "$f" >> $GITHUB_STEP_SUMMARY + done + + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Go Microbenchmarks" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + grep -E '^Benchmark|^ok|^PASS' benchmarks/go-bench-linux.txt | head -50 >> $GITHUB_STEP_SUMMARY || true + echo '```' >> $GITHUB_STEP_SUMMARY + + benchmark-macos: + name: Benchmark (macOS) + runs-on: macos-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Download dependencies + run: go mod download + + - name: Install dependencies + run: | + brew install hyperfine ripgrep coreutils jq + + - name: Install benchstat + run: go install golang.org/x/perf/cmd/benchstat@latest + + - name: Build fence + run: make build-ci + + - name: Run Go microbenchmarks + run: | + mkdir -p benchmarks + go test -run=^$ -bench=. -benchmem -count=10 ./internal/sandbox/... 
| tee benchmarks/go-bench-macos.txt + + - name: Run CLI benchmarks + run: | + MIN_RUNS="${{ github.event.inputs.min_runs || '30' }}" + QUICK="${{ github.event.inputs.quick || 'false' }}" + + if [[ "$QUICK" == "true" ]]; then + ./scripts/benchmark.sh -q -o benchmarks + else + ./scripts/benchmark.sh -n "$MIN_RUNS" -o benchmarks + fi + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-macos + path: benchmarks/ + retention-days: 30 + + - name: Summary + run: | + echo "## macOS Benchmark Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + for f in benchmarks/*.md; do + [[ -f "$f" ]] && cat "$f" >> $GITHUB_STEP_SUMMARY + done + + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Go Microbenchmarks" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + grep -E '^Benchmark|^ok|^PASS' benchmarks/go-bench-macos.txt | head -50 >> $GITHUB_STEP_SUMMARY || true + echo '```' >> $GITHUB_STEP_SUMMARY + + compare: + name: Compare Results + needs: [benchmark-linux, benchmark-macos] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Install benchstat + run: go install golang.org/x/perf/cmd/benchstat@latest + + - name: Download Linux results + uses: actions/download-artifact@v4 + with: + name: benchmark-results-linux + path: linux-results/ + + - name: Download macOS results + uses: actions/download-artifact@v4 + with: + name: benchmark-results-macos + path: macos-results/ + + - name: Compare Go benchmarks + run: | + echo "## Cross-Platform Comparison" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [[ -f linux-results/go-bench-linux.txt && -f macos-results/go-bench-macos.txt ]]; then + echo "### Go Microbenchmark Comparison (Linux vs macOS)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + benchstat 
linux-results/go-bench-linux.txt macos-results/go-bench-macos.txt 2>&1 | head -100 >> $GITHUB_STEP_SUMMARY || echo "benchstat comparison failed" + echo '```' >> $GITHUB_STEP_SUMMARY + fi + + - name: Display results + run: | + echo "=== Linux Results ===" + ls -la linux-results/ + echo "" + echo "=== macOS Results ===" + ls -la macos-results/ + echo "" + if [[ -f linux-results/go-bench-linux.txt && -f macos-results/go-bench-macos.txt ]]; then + echo "=== Benchstat Comparison ===" + benchstat linux-results/go-bench-linux.txt macos-results/go-bench-macos.txt || true + fi diff --git a/.gitignore b/.gitignore index d84377c..c4cca2f 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,9 @@ coverage.out # GoReleaser /dist/ +# Benchmark results +/benchmarks/ +*.prof +cpu.out +mem.out + diff --git a/docs/README.md b/docs/README.md index c89b888..49c04f5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -23,6 +23,7 @@ Fence is a sandboxing tool that restricts network and filesystem access for arbi - [Security model](security-model.md) - Threat model, guarantees, and limitations - [Linux security features](linux-security-features.md) - Landlock, seccomp, eBPF details and fallback behavior - [Testing](testing.md) - How to run tests and write new ones +- [Benchmarking](benchmarking.md) - Performance overhead and profiling ## Examples diff --git a/docs/benchmarking.md b/docs/benchmarking.md new file mode 100644 index 0000000..337991a --- /dev/null +++ b/docs/benchmarking.md @@ -0,0 +1,329 @@ +# Benchmarking + +This document describes how to run, interpret, and compare sandbox performance benchmarks for Fence. + +## Quick Start + +```bash +# Install dependencies +brew install hyperfine # macOS +# apt install hyperfine # Linux + +go install golang.org/x/perf/cmd/benchstat@latest + +# Run CLI benchmarks +./scripts/benchmark.sh + +# Run Go microbenchmarks +go test -run=^$ -bench=. -benchmem ./internal/sandbox/... +``` + +## Goals + +1. 
Quantify sandbox overhead on each platform (`sandboxed / unsandboxed` ratio) +2. Compare macOS (Seatbelt) vs Linux (bwrap+Landlock) overhead fairly +3. Attribute overhead to specific components (proxy startup, bridge setup, wrap generation) +4. Track regressions over time + +## Benchmark Types + +### Layer 1: CLI Benchmarks (`scripts/benchmark.sh`) + +**What it measures**: Real-world agent cost - full `fence` invocation including proxy startup, socat bridges (Linux), and sandbox-exec/bwrap setup. + +This is the most realistic benchmark for understanding the cost of running agent commands through Fence. + +```bash +# Full benchmark suite +./scripts/benchmark.sh + +# Quick mode (fewer runs) +./scripts/benchmark.sh -q + +# Custom output directory +./scripts/benchmark.sh -o ./my-results + +# Include network benchmarks (requires local server) +./scripts/benchmark.sh --network +``` + +#### Options + +| Option | Description | +|--------|-------------| +| `-b, --binary PATH` | Path to fence binary (default: ./fence) | +| `-o, --output DIR` | Output directory (default: ./benchmarks) | +| `-n, --runs N` | Minimum runs per benchmark (default: 30) | +| `-q, --quick` | Quick mode: fewer runs, skip slow benchmarks | +| `--network` | Include network benchmarks | + +### Layer 2: Go Microbenchmarks (`internal/sandbox/benchmark_test.go`) + +**What it measures**: Component-level overhead - isolates Manager initialization, WrapCommand generation, and execution. + +```bash +# Run all benchmarks +go test -run=^$ -bench=. -benchmem ./internal/sandbox/... + +# Run specific benchmark +go test -run=^$ -bench=BenchmarkWarmSandbox -benchmem ./internal/sandbox/... + +# Multiple runs for statistical analysis +go test -run=^$ -bench=. -benchmem -count=10 ./internal/sandbox/... 
> bench.txt +benchstat bench.txt +``` + +#### Available Benchmarks + +| Benchmark | Description | +|-----------|-------------| +| `BenchmarkBaseline_*` | Unsandboxed command execution | +| `BenchmarkManagerInitialize` | Cold initialization (proxies + bridges) | +| `BenchmarkWrapCommand` | Command string construction only | +| `BenchmarkColdSandbox_*` | Full init + wrap + exec per iteration | +| `BenchmarkWarmSandbox_*` | Pre-initialized manager, just exec | +| `BenchmarkOverhead` | Grouped comparison of baseline vs sandbox | + +### Layer 3: OS-Level Profiling + +**What it measures**: Kernel/system overhead - context switches, syscalls, page faults. + +#### Linux + +```bash +# Quick syscall cost breakdown +strace -f -c ./fence -- true + +# Context switches, page faults +perf stat -- ./fence -- true + +# Full profiling (flamegraph-ready) +perf record -F 99 -g -- ./fence -- git status +perf report +``` + +#### macOS + +```bash +# Time Profiler via Instruments +xcrun xctrace record --template 'Time Profiler' --launch -- ./fence -- true + +# Quick call-stack snapshot +./fence -- sleep 5 & +sample $! 5 -file sample.txt +``` + +## Interpreting Results + +### Key Metric: Overhead Factor + +```text +Overhead Factor = time(sandboxed) / time(unsandboxed) +``` + +Compare overhead factors across platforms, not absolute times, because hardware differences swamp absolute timings. 
+ +### Example Output + +```text +Benchmark Unsandboxed Sandboxed Overhead +true 1.2 ms 45 ms 37.5x +git status 15 ms 62 ms 4.1x +python -c 'pass' 25 ms 73 ms 2.9x +``` + +### What to Expect + +| Workload | Linux Overhead | macOS Overhead | Notes | +|----------|----------------|----------------|-------| +| `true` | 180-360x | 8-10x | Dominated by cold start | +| `echo` | 150-300x | 6-8x | Similar to true | +| `python3 -c 'pass'` | 10-12x | 2-3x | Interpreter startup dominates | +| `git status` | 50-60x | 4-5x | Real I/O helps amortize | +| `rg` | 40-50x | 3-4x | Search I/O helps amortize | + +The overhead factor decreases as the actual workload increases (because sandbox setup is fixed cost). Linux overhead is significantly higher due to bwrap/socat setup. + +## Cross-Platform Comparison + +### Fair Comparison Approach + +1. Run benchmarks on each platform independently +2. Compare overhead factors, not absolute times +3. Use the same fence version and workloads + +```bash +# On macOS +go test -run=^$ -bench=. -count=10 ./internal/sandbox/... > bench_macos.txt + +# On Linux +go test -run=^$ -bench=. -count=10 ./internal/sandbox/... > bench_linux.txt + +# Compare +benchstat bench_macos.txt bench_linux.txt +``` + +### Caveats + +- macOS uses Seatbelt (sandbox-exec) - built-in, lightweight kernel sandbox +- Linux uses bwrap + Landlock, this creates socat bridges for network, incurring significant setup cost +- Linux cold start is ~10x slower than macOS due to bwrap/socat bridge setup +- Linux warm path is still ~5x slower than macOS - bwrap execution itself has overhead +- For long-running agents, this difference is negligible (one-time startup cost) + +> [!TIP] +> Running Linux benchmarks inside a VM (Colima, Docker Desktop, etc.) inflates overhead due to virtualization. Use native Linux (bare metal or CI) for fair cross-platform comparison. 
+ +## GitHub Actions + +Benchmarks can be run in CI via the workflow at `.github/workflows/benchmark.yml`: + +```bash +# Trigger manually from GitHub UI: Actions > Benchmarks > Run workflow + +# Or via gh CLI +gh workflow run benchmark.yml +``` + +Results are uploaded as artifacts and summarized in the workflow summary. + +## Tips + +### Reducing Variance + +- Run with `--min-runs 50` or higher +- Close other applications +- Pin CPU frequency if possible (Linux: `cpupower frequency-set --governor performance`) +- Run multiple times and use benchstat for statistical analysis + +### Profiling Hotspots + +```bash +# CPU profile +go test -run=^$ -bench=BenchmarkWarmSandbox -cpuprofile=cpu.out ./internal/sandbox/... +go tool pprof -http=:8080 cpu.out + +# Memory profile +go test -run=^$ -bench=BenchmarkWarmSandbox -memprofile=mem.out ./internal/sandbox/... +go tool pprof -http=:8080 mem.out +``` + +### Tracking Regressions + +1. Run benchmarks before and after changes +2. Save results to files +3. Compare with benchstat + +```bash +# Before +go test -run=^$ -bench=. -count=10 ./internal/sandbox/... > before.txt + +# Make changes... + +# After +go test -run=^$ -bench=. -count=10 ./internal/sandbox/... > after.txt + +# Compare +benchstat before.txt after.txt +``` + +## Workload Categories + +| Category | Commands | What it Stresses | +|----------|----------|------------------| +| **Spawn-only** | `true`, `echo` | Process spawn, wrapper overhead | +| **Interpreter** | `python3 -c`, `node -e` | Runtime startup under sandbox | +| **FS-heavy** | file creation, `rg` | Landlock/Seatbelt FS rules | +| **Network (local)** | `curl localhost` | Proxy forwarding overhead | +| **Real tools** | `git status` | Practical agent workloads | + +## Benchmark Findings (12/28/2025) + +Results from GitHub Actions CI runners (Linux: AMD EPYC 7763, macOS: Apple M1 Virtual). 
+ +### Manager Initialization + +| Platform | `Manager.Initialize()` | +|----------|------------------------| +| Linux | 101.9 ms | +| macOS | 27.5 µs | + +Linux initialization is ~3,700x slower because it must: + +- Start HTTP + SOCKS proxies +- Create Unix socket bridges for socat +- Set up bwrap namespace configuration + +macOS only generates a Seatbelt profile string (very cheap). + +### Cold Start Overhead (one `fence` invocation per command) + +| Workload | Linux | macOS | +|----------|-------|-------| +| `true` | 215 ms | 22 ms | +| Python | 124 ms | 33 ms | +| Git status | 114 ms | 25 ms | + +This is the realistic cost for scripts running `fence -c "command"` repeatedly. + +### Warm Path Overhead (pre-initialized manager) + +| Workload | Linux | macOS | +|----------|-------|-------| +| `true` | 112 ms | 20 ms | +| Python | 124 ms | 33 ms | +| Git status | 114 ms | 25 ms | + +Even with proxies already running, Linux bwrap execution adds ~110ms overhead per command. + +### Overhead Factors + +| Workload | Linux Overhead | macOS Overhead | +|----------|----------------|----------------| +| `true` (cold) | ~360x | ~10x | +| `true` (warm) | ~187x | ~8x | +| Python (warm) | ~11x | ~2x | +| Git status (warm) | ~54x | ~4x | + +Overhead decreases as the actual workload increases (sandbox setup is fixed cost). + +## Impact on Agent Usage + +### Long-Running Agents (`fence claude`, `fence codex`) + +For agents that run as a child process under fence: + +| Phase | Cost | +|-------|------| +| Startup (once) | Linux: ~215ms, macOS: ~22ms | +| Per tool call | Negligible (baseline fork+exec only) | + +Child processes inherit the sandbox - no re-initialization, no WrapCommand overhead. 
The per-command cost is just normal process spawning:
+
+| Command | Linux | macOS |
+|---------|-------|-------|
+| `true` | 0.6 ms | 2.3 ms |
+| `git status` | 2.1 ms | 5.9 ms |
+| Python script | 11 ms | 15 ms |
+
+**Bottom line**: For `fence <agent>` usage, sandbox overhead is a one-time startup cost. Tool calls inside the agent run at native speed.
+
+### Per-Command Invocation (`fence -c "command"`)
+
+For scripts or CI running fence per command:
+
+| Session | Linux Cost | macOS Cost |
+|---------|------------|------------|
+| 1 command | 215 ms | 22 ms |
+| 10 commands | 2.15 s | 220 ms |
+| 50 commands | 10.75 s | 1.1 s |
+
+Consider keeping the manager alive (daemon mode) or batching commands to reduce overhead.
+
+## Additional Notes
+
+- `Manager.Initialize()` starts HTTP + SOCKS proxies; on Linux also creates socat bridges
+- Cold start includes all initialization; hot path is just `WrapCommand + exec`
+- `-m` (monitor mode) spawns additional monitoring processes, so we'll have to benchmark separately
+- Keep workloads under the repo - avoid `/tmp` since Linux bwrap does `--tmpfs /tmp`
+- `debug` mode changes logging, so always benchmark with debug off
diff --git a/internal/sandbox/benchmark_test.go b/internal/sandbox/benchmark_test.go
new file mode 100644
index 0000000..b940a14
--- /dev/null
+++ b/internal/sandbox/benchmark_test.go
@@ -0,0 +1,369 @@
+package sandbox
+
+import (
+	"bytes"
+	"context"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/Use-Tusk/fence/internal/config"
+)
+
+// ============================================================================
+// Baseline Benchmarks (unsandboxed)
+// ============================================================================
+
+// BenchmarkBaseline_True measures the cost of spawning a minimal process.
+func BenchmarkBaseline_True(b *testing.B) { + for i := 0; i < b.N; i++ { + cmd := exec.Command("true") + _ = cmd.Run() + } +} + +// BenchmarkBaseline_Echo measures echo command without sandbox. +func BenchmarkBaseline_Echo(b *testing.B) { + for i := 0; i < b.N; i++ { + cmd := exec.Command("sh", "-c", "echo hello") + _ = cmd.Run() + } +} + +// BenchmarkBaseline_Python measures Python startup without sandbox. +func BenchmarkBaseline_Python(b *testing.B) { + if _, err := exec.LookPath("python3"); err != nil { + b.Skip("python3 not found") + } + for i := 0; i < b.N; i++ { + cmd := exec.Command("python3", "-c", "pass") + _ = cmd.Run() + } +} + +// BenchmarkBaseline_Node measures Node.js startup without sandbox. +func BenchmarkBaseline_Node(b *testing.B) { + if _, err := exec.LookPath("node"); err != nil { + b.Skip("node not found") + } + for i := 0; i < b.N; i++ { + cmd := exec.Command("node", "-e", "") + _ = cmd.Run() + } +} + +// BenchmarkBaseline_GitStatus measures git status without sandbox. +func BenchmarkBaseline_GitStatus(b *testing.B) { + if _, err := exec.LookPath("git"); err != nil { + b.Skip("git not found") + } + // Find a git repo to run in + repoDir := findGitRepo() + if repoDir == "" { + b.Skip("no git repo found") + } + + for i := 0; i < b.N; i++ { + cmd := exec.Command("git", "status", "--porcelain") + cmd.Dir = repoDir + cmd.Stdout = nil // discard + _ = cmd.Run() + } +} + +// ============================================================================ +// Component Benchmarks (isolate overhead sources) +// ============================================================================ + +// BenchmarkManagerInitialize measures cold initialization cost (proxies + bridges). 
+func BenchmarkManagerInitialize(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("failed to initialize: %v", err) + } + manager.Cleanup() + } +} + +// BenchmarkWrapCommand measures the cost of command wrapping (string construction only). +func BenchmarkWrapCommand(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("failed to initialize: %v", err) + } + defer manager.Cleanup() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := manager.WrapCommand("echo hello") + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + } +} + +// ============================================================================ +// Cold Sandbox Benchmarks (full init + wrap + exec each iteration) +// ============================================================================ + +// BenchmarkColdSandbox_True measures full cold-start sandbox cost. 
+func BenchmarkColdSandbox_True(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + + wrappedCmd, err := manager.WrapCommand("true") + if err != nil { + manager.Cleanup() + b.Fatalf("wrap failed: %v", err) + } + + execBenchCommand(b, wrappedCmd, workspace) + manager.Cleanup() + } +} + +// ============================================================================ +// Warm Sandbox Benchmarks (Manager.Initialize once, repeat WrapCommand + exec) +// ============================================================================ + +// BenchmarkWarmSandbox_True measures sandbox cost with pre-initialized manager. +func BenchmarkWarmSandbox_True(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + defer manager.Cleanup() + + wrappedCmd, err := manager.WrapCommand("true") + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + execBenchCommand(b, wrappedCmd, workspace) + } +} + +// BenchmarkWarmSandbox_Echo measures echo command with pre-initialized manager. 
+func BenchmarkWarmSandbox_Echo(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + defer manager.Cleanup() + + wrappedCmd, err := manager.WrapCommand("echo hello") + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + execBenchCommand(b, wrappedCmd, workspace) + } +} + +// BenchmarkWarmSandbox_Python measures Python startup with pre-initialized manager. +func BenchmarkWarmSandbox_Python(b *testing.B) { + skipBenchIfSandboxed(b) + if _, err := exec.LookPath("python3"); err != nil { + b.Skip("python3 not found") + } + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + defer manager.Cleanup() + + wrappedCmd, err := manager.WrapCommand("python3 -c 'pass'") + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + execBenchCommand(b, wrappedCmd, workspace) + } +} + +// BenchmarkWarmSandbox_FileWrite measures file write with pre-initialized manager. +func BenchmarkWarmSandbox_FileWrite(b *testing.B) { + skipBenchIfSandboxed(b) + + workspace := b.TempDir() + cfg := benchConfig(workspace) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + defer manager.Cleanup() + + testFile := filepath.Join(workspace, "bench.txt") + wrappedCmd, err := manager.WrapCommand("echo 'benchmark data' > " + testFile) + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + execBenchCommand(b, wrappedCmd, workspace) + _ = os.Remove(testFile) + } +} + +// BenchmarkWarmSandbox_GitStatus measures git status with pre-initialized manager. 
+func BenchmarkWarmSandbox_GitStatus(b *testing.B) { + skipBenchIfSandboxed(b) + if _, err := exec.LookPath("git"); err != nil { + b.Skip("git not found") + } + + repoDir := findGitRepo() + if repoDir == "" { + b.Skip("no git repo found") + } + + cfg := benchConfig(repoDir) + + manager := NewManager(cfg, false, false) + if err := manager.Initialize(); err != nil { + b.Fatalf("init failed: %v", err) + } + defer manager.Cleanup() + + wrappedCmd, err := manager.WrapCommand("git status --porcelain") + if err != nil { + b.Fatalf("wrap failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + execBenchCommand(b, wrappedCmd, repoDir) + } +} + +// ============================================================================ +// Comparison Sub-benchmarks +// ============================================================================ + +// BenchmarkOverhead runs baseline vs sandbox comparisons for easy diffing. +func BenchmarkOverhead(b *testing.B) { + b.Run("Baseline/True", BenchmarkBaseline_True) + b.Run("Baseline/Echo", BenchmarkBaseline_Echo) + b.Run("Baseline/Python", BenchmarkBaseline_Python) + + b.Run("Warm/True", BenchmarkWarmSandbox_True) + b.Run("Warm/Echo", BenchmarkWarmSandbox_Echo) + b.Run("Warm/Python", BenchmarkWarmSandbox_Python) + + b.Run("Cold/True", BenchmarkColdSandbox_True) +} + +// ============================================================================ +// Helpers +// ============================================================================ + +func skipBenchIfSandboxed(b *testing.B) { + b.Helper() + if os.Getenv("FENCE_SANDBOX") == "1" { + b.Skip("already running inside Fence sandbox") + } +} + +func benchConfig(workspace string) *config.Config { + return &config.Config{ + Network: config.NetworkConfig{ + AllowedDomains: []string{}, + }, + Filesystem: config.FilesystemConfig{ + AllowWrite: []string{workspace}, + }, + Command: config.CommandConfig{ + UseDefaults: boolPtr(false), + }, + } +} + +func execBenchCommand(b *testing.B, 
command string, workDir string) { + b.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + shell := "/bin/sh" + if runtime.GOOS == "darwin" { + shell = "/bin/bash" + } + + cmd := exec.CommandContext(ctx, shell, "-c", command) + cmd.Dir = workDir + cmd.Stdout = &bytes.Buffer{} + cmd.Stderr = &bytes.Buffer{} + + if err := cmd.Run(); err != nil { + // Don't fail on command errors - we're measuring timing, not correctness + // (e.g., git status might fail if not in a repo) + _ = err + } +} + +func findGitRepo() string { + // Try current directory and parents + dir, err := os.Getwd() + if err != nil { + return "" + } + + for { + if _, err := os.Stat(filepath.Join(dir, ".git")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + + return "" +} diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh new file mode 100755 index 0000000..3ac64c8 --- /dev/null +++ b/scripts/benchmark.sh @@ -0,0 +1,396 @@ +#!/bin/bash +# benchmark.sh - Comprehensive sandbox benchmarking +# +# This script compares sandbox overhead between: +# - Unsandboxed (baseline) +# - Sandboxed (default mode) +# - Sandboxed with monitor (-m) +# +# Usage: +# ./scripts/benchmark.sh [options] +# +# Options: +# -b, --binary PATH Path to fence binary (default: ./fence or builds one) +# -o, --output DIR Output directory for results (default: ./benchmarks) +# -n, --runs N Minimum runs per benchmark (default: 30) +# -q, --quick Quick mode: fewer runs, skip slow benchmarks +# --network Include network benchmarks (requires local server) +# -h, --help Show this help +# +# Requirements: +# - hyperfine (brew install hyperfine / apt install hyperfine) +# - go (for building fence if needed) +# - Optional: python3 (for local-server.py network benchmarks) + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Defaults +FENCE_BIN="" 
+OUTPUT_DIR="./benchmarks" +MIN_RUNS=30 +WARMUP=3 +QUICK=false +NETWORK=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + -b|--binary) + FENCE_BIN="$2" + shift 2 + ;; + -o|--output) + OUTPUT_DIR="$2" + shift 2 + ;; + -n|--runs) + MIN_RUNS="$2" + shift 2 + ;; + -q|--quick) + QUICK=true + MIN_RUNS=10 + WARMUP=1 + shift + ;; + --network) + NETWORK=true + shift + ;; + -h|--help) + head -30 "$0" | tail -28 + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Find or build fence binary +if [[ -z "$FENCE_BIN" ]]; then + if [[ -x "./fence" ]]; then + FENCE_BIN="./fence" + elif [[ -x "./dist/fence" ]]; then + FENCE_BIN="./dist/fence" + else + echo -e "${BLUE}Building fence...${NC}" + go build -o ./fence ./cmd/fence + FENCE_BIN="./fence" + fi +fi + +if [[ ! -x "$FENCE_BIN" ]]; then + echo -e "${RED}Error: fence binary not found at $FENCE_BIN${NC}" + exit 1 +fi + +# Check for hyperfine +if ! command -v hyperfine &> /dev/null; then + echo -e "${RED}Error: hyperfine not found. Install with:${NC}" + echo " brew install hyperfine # macOS" + echo " apt install hyperfine # Linux" + exit 1 +fi + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Create workspace in current directory (not /tmp, which bwrap overlays) +WORKSPACE=$(mktemp -d -p .) 
+trap 'rm -rf "$WORKSPACE"' EXIT
+
+# Create settings file for sandbox
+SETTINGS_FILE="$WORKSPACE/fence.json"
+cat > "$SETTINGS_FILE" << EOF
+{
+  "filesystem": {
+    "allowWrite": ["$WORKSPACE", "."]
+  }
+}
+EOF
+
+# Platform info
+OS=$(uname -s)
+ARCH=$(uname -m)
+KERNEL=$(uname -r)
+DATE=$(date +%Y-%m-%d)
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+
+# Results file
+# NOTE: avoid ${OS,,} here - it needs bash >= 4, but macOS /bin/bash is 3.2
+RESULTS_JSON="$OUTPUT_DIR/$(echo "$OS" | tr '[:upper:]' '[:lower:]')-${ARCH}-${TIMESTAMP}.json"
+RESULTS_MD="$OUTPUT_DIR/$(echo "$OS" | tr '[:upper:]' '[:lower:]')-${ARCH}-${TIMESTAMP}.md"
+
+echo ""
+echo -e "${BLUE}==========================================${NC}"
+echo -e "${BLUE}Fence Sandbox Benchmarks${NC}"
+echo -e "${BLUE}==========================================${NC}"
+echo ""
+echo "Platform: $OS $ARCH"
+echo "Kernel: $KERNEL"
+echo "Date: $DATE"
+echo "Fence: $FENCE_BIN"
+echo "Output: $OUTPUT_DIR"
+echo "Min runs: $MIN_RUNS"
+echo ""
+
+# Helper to run hyperfine with consistent options
+run_bench() {
+  local name="$1"
+  shift
+  local json_file="$WORKSPACE/${name}.json"
+
+  echo -e "${GREEN}Benchmarking: $name${NC}"
+
+  hyperfine \
+    --warmup "$WARMUP" \
+    --min-runs "$MIN_RUNS" \
+    --export-json "$json_file" \
+    --style basic \
+    "$@"
+
+  echo ""
+}
+
+# ============================================================================
+# Spawn-only benchmarks (minimal process overhead)
+# ============================================================================
+
+echo -e "${YELLOW}=== Spawn-Only Benchmarks ===${NC}"
+echo ""
+
+run_bench "true" \
+  --command-name "unsandboxed" "true" \
+  --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -- true"
+
+run_bench "echo" \
+  --command-name "unsandboxed" "echo hello >/dev/null" \
+  --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c 'echo hello' >/dev/null"
+
+# ============================================================================
+# Tool compatibility benchmarks
+# ============================================================================
+
+echo -e "${YELLOW}=== Tool Compatibility Benchmarks ===${NC}"
+echo "" + +if command -v python3 &> /dev/null; then + run_bench "python" \ + --command-name "unsandboxed" "python3 -c 'pass'" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c \"python3 -c 'pass'\"" +else + echo -e "${YELLOW}Skipping python3 (not found)${NC}" +fi + +if command -v node &> /dev/null && [[ "$QUICK" == "false" ]]; then + run_bench "node" \ + --command-name "unsandboxed" "node -e ''" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c \"node -e ''\"" +else + echo -e "${YELLOW}Skipping node (not found or quick mode)${NC}" +fi + +# ============================================================================ +# Real workload benchmarks +# ============================================================================ + +echo -e "${YELLOW}=== Real Workload Benchmarks ===${NC}" +echo "" + +if command -v git &> /dev/null && [[ -d .git ]]; then + run_bench "git-status" \ + --command-name "unsandboxed" "git status --porcelain >/dev/null" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -- git status --porcelain >/dev/null" +else + echo -e "${YELLOW}Skipping git status (not in a git repo)${NC}" +fi + +if command -v rg &> /dev/null && [[ "$QUICK" == "false" ]]; then + run_bench "ripgrep" \ + --command-name "unsandboxed" "rg -n 'package' -S . >/dev/null 2>&1 || true" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c \"rg -n 'package' -S . 
>/dev/null 2>&1\" || true" +else + echo -e "${YELLOW}Skipping ripgrep (not found or quick mode)${NC}" +fi + +# ============================================================================ +# File I/O benchmarks +# ============================================================================ + +echo -e "${YELLOW}=== File I/O Benchmarks ===${NC}" +echo "" + +run_bench "file-write" \ + --command-name "unsandboxed" "echo 'test' > $WORKSPACE/test.txt" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c \"echo 'test' > $WORKSPACE/test.txt\"" + +run_bench "file-read" \ + --command-name "unsandboxed" "cat $WORKSPACE/test.txt >/dev/null" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -c 'cat $WORKSPACE/test.txt' >/dev/null" + +# ============================================================================ +# Monitor mode benchmarks (optional) +# ============================================================================ + +if [[ "$QUICK" == "false" ]]; then + echo -e "${YELLOW}=== Monitor Mode Benchmarks ===${NC}" + echo "" + + run_bench "monitor-true" \ + --command-name "sandboxed" "$FENCE_BIN -s $SETTINGS_FILE -- true" \ + --command-name "sandboxed+monitor" "$FENCE_BIN -m -s $SETTINGS_FILE -- true" +fi + +# ============================================================================ +# Network benchmarks (optional, requires local server) +# ============================================================================ + +if [[ "$NETWORK" == "true" ]]; then + echo -e "${YELLOW}=== Network Benchmarks ===${NC}" + echo "" + + # Start local server + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + if [[ -f "$SCRIPT_DIR/local-server.py" ]]; then + python3 "$SCRIPT_DIR/local-server.py" & + SERVER_PID=$! 
+        # Re-arm the EXIT trap so the server is killed even on early exit.
+        trap 'kill $SERVER_PID 2>/dev/null || true; rm -rf "$WORKSPACE"' EXIT
+        sleep 1
+
+        # Create network settings
+        NET_SETTINGS="$WORKSPACE/fence-net.json"
+        cat > "$NET_SETTINGS" << EOF
+{
+  "network": {
+    "allowedDomains": ["127.0.0.1", "localhost"]
+  },
+  "filesystem": {
+    "allowWrite": ["$WORKSPACE"]
+  }
+}
+EOF
+
+        if command -v curl &> /dev/null; then
+            run_bench "network-curl" \
+                --command-name "unsandboxed" "curl -s http://127.0.0.1:8765/ >/dev/null" \
+                --command-name "sandboxed" "$FENCE_BIN -s $NET_SETTINGS -c 'curl -s http://127.0.0.1:8765/' >/dev/null"
+        fi
+
+        # Stop the server now that the network runs are done; the EXIT trap
+        # above remains as a backstop for early-exit paths.
+        kill $SERVER_PID 2>/dev/null || true
+    else
+        echo -e "${YELLOW}Skipping network benchmarks (local-server.py not found)${NC}"
+    fi
+fi
+
+# ============================================================================
+# Combine results and generate report
+# ============================================================================
+
+echo -e "${YELLOW}=== Generating Report ===${NC}"
+echo ""
+
+# Combine all per-benchmark hyperfine JSON exports into one JSON document.
+echo "{" > "$RESULTS_JSON"
+echo "  \"platform\": \"$OS\"," >> "$RESULTS_JSON"
+echo "  \"arch\": \"$ARCH\"," >> "$RESULTS_JSON"
+echo "  \"kernel\": \"$KERNEL\"," >> "$RESULTS_JSON"
+echo "  \"date\": \"$DATE\"," >> "$RESULTS_JSON"
+echo "  \"fence_version\": \"$($FENCE_BIN --version 2>/dev/null || echo unknown)\"," >> "$RESULTS_JSON"
+echo "  \"benchmarks\": {" >> "$RESULTS_JSON"
+
+first=true
+for json_file in "$WORKSPACE"/*.json; do
+    [[ -f "$json_file" ]] || continue
+    # The sandbox settings files (fence.json / fence-net.json) also live in
+    # $WORKSPACE; they are not hyperfine results, so skip them.
+    case "$(basename "$json_file")" in fence*.json) continue ;; esac
+    name=$(basename "$json_file" .json)
+    if [[ "$first" == "true" ]]; then
+        first=false
+    else
+        echo "," >> "$RESULTS_JSON"
+    fi
+    echo "  \"$name\": $(cat "$json_file")" >> "$RESULTS_JSON"
+done
+
+echo "" >> "$RESULTS_JSON"
+echo "  }" >> "$RESULTS_JSON"
+echo "}" >> "$RESULTS_JSON"
+
+# Generate Markdown report
+cat > "$RESULTS_MD" << EOF
+# Fence Benchmark Results
+
+**Platform:** $OS $ARCH
+**Kernel:** $KERNEL
+**Date:** $DATE
+**Fence:** $($FENCE_BIN --version 2>/dev/null || echo unknown)
+
+## Summary
+
+| Benchmark | Unsandboxed | Sandboxed | Overhead |
+|-----------|-------------|-----------|----------|
+EOF
+
+# Parse results and add to markdown. Every extraction is defensive so one
+# malformed file cannot abort the whole report.
+if command -v jq &> /dev/null; then
+    for json_file in "$WORKSPACE"/*.json; do
+        [[ -f "$json_file" ]] || continue
+        # Skip sandbox settings files; they are not hyperfine results.
+        case "$(basename "$json_file")" in fence*.json) continue ;; esac
+        name=$(basename "$json_file" .json)
+
+        # Extract mean times, defaulting to empty if not found
+        unsandboxed=$(jq -r '.results[] | select(.command == "unsandboxed") | .mean // empty' "$json_file" 2>/dev/null) || true
+        sandboxed=$(jq -r '.results[] | select(.command == "sandboxed") | .mean // empty' "$json_file" 2>/dev/null) || true
+
+        # Skip if values are missing or null
+        if [[ -z "$unsandboxed" || -z "$sandboxed" || "$unsandboxed" == "null" || "$sandboxed" == "null" ]]; then
+            continue
+        fi
+
+        # Calculate values, catching any bc errors
+        overhead=$(echo "scale=1; $sandboxed / $unsandboxed" | bc 2>/dev/null) || continue
+        unsandboxed_ms=$(echo "scale=2; $unsandboxed * 1000" | bc 2>/dev/null) || continue
+        sandboxed_ms=$(echo "scale=2; $sandboxed * 1000" | bc 2>/dev/null) || continue
+
+        if [[ -n "$overhead" && -n "$unsandboxed_ms" && -n "$sandboxed_ms" ]]; then
+            echo "| $name | ${unsandboxed_ms}ms | ${sandboxed_ms}ms | ${overhead}x |" >> "$RESULTS_MD"
+        fi
+    done
+fi
+
+echo ""
+echo -e "${GREEN}Results saved to:${NC}"
+echo "  JSON: $RESULTS_JSON"
+echo "  Markdown: $RESULTS_MD"
+echo ""
+
+# Print quick summary (errors in this section should not fail the script)
+if command -v jq &> /dev/null; then
+    echo -e "${BLUE}Quick Summary (overhead factors):${NC}"
+    for json_file in "$WORKSPACE"/*.json; do
+        (
+            [[ -f "$json_file" ]] || exit 0
+            # Skip sandbox settings files; they are not hyperfine results.
+            case "$(basename "$json_file")" in fence*.json) exit 0 ;; esac
+            name=$(basename "$json_file" .json)
+
+            # Extract values, defaulting to empty if not found
+            unsandboxed=$(jq -r '.results[] | select(.command == "unsandboxed") | .mean // empty' "$json_file" 2>/dev/null) || exit 0
+            sandboxed=$(jq -r '.results[] | select(.command == "sandboxed") | .mean // empty' "$json_file" 2>/dev/null) || exit 0
+
+            # Skip if either value is missing or null
+            [[ -z "$unsandboxed" || -z "$sandboxed" || "$unsandboxed" == "null" || "$sandboxed" == "null" ]] && exit 0
+
+            # Calculate overhead, catching any bc errors
+            overhead=$(echo "scale=1; $sandboxed / $unsandboxed" | bc 2>/dev/null) || exit 0
+
+            [[ -n "$overhead" ]] && printf "  %-15s %sx\n" "$name:" "$overhead"
+        ) || true # Ignore errors from subshell
+    done
+fi
+
+echo ""
+echo -e "${GREEN}Done!${NC}"
diff --git a/scripts/local-server.py b/scripts/local-server.py
new file mode 100755
index 0000000..84ad84d
--- /dev/null
+++ b/scripts/local-server.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""
+Simple HTTP server for network benchmarking.
+
+Runs on port 8765 and responds to all requests with a minimal JSON response.
+Used by benchmark.sh to measure proxy overhead without internet variability.
+
+Usage:
+    python3 scripts/local-server.py
+    # Server runs on http://127.0.0.1:8765/
+
+    # In another terminal:
+    curl http://127.0.0.1:8765/
+"""
+
+import http.server
+import json
+import socketserver
+import sys
+
+PORT = 8765
+
+
+class BenchmarkHandler(http.server.BaseHTTPRequestHandler):
+    """Minimal HTTP handler for benchmarking."""
+
+    def _send_json(self, payload: dict) -> None:
+        """Serialize *payload* and send it as a 200 application/json response."""
+        body = json.dumps(payload).encode()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        # Content-Length lets clients read the exact body size instead of
+        # relying on connection close to delimit the response.
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_GET(self):
+        """Handle GET requests with minimal response."""
+        self._send_json({"status": "ok", "path": self.path})
+
+    def do_POST(self):
+        """Handle POST requests with minimal response."""
+        content_length = int(self.headers.get("Content-Length", 0))
+        _ = self.rfile.read(content_length)  # Read and discard body
+        self._send_json({"status": "ok", "method": "POST"})
+
+    def log_message(self, format, *args):
+        """Suppress request logging for cleaner benchmark output."""
+        pass
+
+
+def main():
+    # allow_reuse_address avoids "Address already in use" on quick restarts.
+    socketserver.TCPServer.allow_reuse_address = True
+    with socketserver.TCPServer(("127.0.0.1", PORT), BenchmarkHandler) as httpd:
+        print(f"Benchmark server running on http://127.0.0.1:{PORT}/", file=sys.stderr)
+        print("Press Ctrl+C to stop", file=sys.stderr)
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:
+            print("\nShutting down...", file=sys.stderr)
+            httpd.shutdown()
+
+
+if __name__ == "__main__":
+    main()