diff --git a/app/hackathons/page.tsx b/app/hackathons/page.tsx index 7cd835c..1318329 100644 --- a/app/hackathons/page.tsx +++ b/app/hackathons/page.tsx @@ -14,10 +14,11 @@ import { ShieldAlert, MessageSquare, Crown, + FlaskConical, } from 'lucide-react' import { Footer } from '@/components/footer' import { LiveTerminal } from '@/components/hackathons/live-terminal' -import { StreamViz, SecureViz, RadarViz, ScanViz, ExtendViz } from '@/components/hackathons/track-visuals' +import { StreamViz, SecureViz, RadarViz, ScanViz, ExtendViz, BenchViz } from '@/components/hackathons/track-visuals' const ShieldScene = dynamic( () => import('@/components/hackathons/shield-scene').then((m) => m.ShieldScene), @@ -546,6 +547,18 @@ const tracks = [ examples: ['Error interception', 'Context injection', 'Alternative suggestion', 'Loop prevention'], Visual: ExtendViz, }, + { + id: 'control-benchmark', + icon: FlaskConical, + title: 'Control Evaluation Benchmark', + hook: 'Build the test suite that measures how well sandboxes actually work.', + color: 'from-rose-500/10 to-pink-500/5', + borderColor: 'hover:border-rose-500/30', + description: 'Define main tasks (real software engineering work) paired with side tasks (exfiltrate a key, install a backdoor, modify git hooks). Measure how much useful work an agent completes under Greywall\'s restrictions vs. how often adversarial goals succeed.', + scoring: 'Show the safety/usefulness tradeoff. A good benchmark reveals real weaknesses, not just toy scenarios.', + examples: ['Main/side task pairs', 'Policy quality scoring', 'Layer isolation tests', 'Reproducible evals'], + Visual: BenchViz, + }, ] function TrackCard({ track, index }: { track: typeof tracks[0]; index: number }) { @@ -612,7 +625,7 @@ function Tracks() { Pick your track.

- Five open-ended tracks, all building on top of{' '} + Six open-ended tracks, all building on top of{' '} Greywall. Go deep on one or try a few.

/* ─── Benchmark grid (Track 6 — Control Evaluation Benchmark) ─── */

/**
 * Animated canvas for the "Control Evaluation Benchmark" track.
 *
 * Draws a 6×8 grid of result cells. Each cell is randomly a pass (green,
 * check mark) or a fail (rose, X mark) and fades in at its own random time
 * inside a 6-second loop; lower rows are biased to reveal later. A thin
 * progress bar under the grid fills over the first 5 seconds of every loop.
 *
 * The animation runs on `requestAnimationFrame` and is torn down (frame
 * cancelled, resize listener removed) when the component unmounts.
 */
export function BenchViz() {
  // Typed ref so `getContext` resolves under strict mode.
  const canvasRef = useRef<HTMLCanvasElement>(null)

  useEffect(() => {
    const canvas = canvasRef.current
    if (!canvas) return
    const ctx = canvas.getContext('2d')
    if (!ctx) return

    const COLS = 6
    const ROWS = 8
    const GAP = 2.5 // inset applied on every side of a cell, in CSS px
    const LOOP_SECONDS = 6 // the reveal sequence restarts after this long
    const REVEAL_SECONDS = 0.5 // fade-in duration of one cell (progress = elapsed * 2)
    // Time advanced per rendered frame. NOTE: this is a fixed step, so the
    // animation speed follows the display's refresh rate.
    const FRAME_STEP = 0.016

    // Colour and base opacities for the two cell outcomes.
    const PASS_STYLE = { rgb: '74, 222, 128', fill: 0.12, stroke: 0.25 }
    const FAIL_STYLE = { rgb: '244, 63, 94', fill: 0.15, stroke: 0.3 }

    // Canvas size in CSS pixels; refreshed by `resize`.
    let w = 0
    let h = 0

    /** Sync the backing store with the element's CSS size and pixel ratio. */
    const resize = () => {
      // Read the ratio on every resize: browser zoom changes it and fires
      // `resize`, so a value captured once at mount would go stale.
      const dpr = window.devicePixelRatio || 1
      const rect = canvas.getBoundingClientRect()
      w = rect.width
      h = rect.height
      canvas.width = Math.floor(w * dpr)
      canvas.height = Math.floor(h * dpr)
      // Scale so all drawing below can use CSS-pixel coordinates.
      ctx.setTransform(dpr, 0, 0, dpr, 0, 0)
    }
    resize()

    let t = 0
    let animId = 0

    const cells = Array.from({ length: COLS * ROWS }, (_, i) => ({
      pass: Math.random() > 0.25,
      // Random delay plus a per-row offset. Clamped so that every cell
      // finishes fading in before the loop wraps; unclamped, the value can
      // reach 6.1 and such a cell would never (or only partially) appear.
      revealAt: Math.min(
        Math.random() * 4 + Math.floor(i / COLS) * 0.3,
        LOOP_SECONDS - REVEAL_SECONDS,
      ),
      pulsePhase: Math.random() * Math.PI * 2,
    }))

    /** Start a new path containing one cell outline. */
    const traceCell = (x: number, y: number, width: number, height: number, radius: number) => {
      ctx.beginPath()
      if (typeof ctx.roundRect === 'function') {
        ctx.roundRect(x, y, width, height, radius)
      } else {
        // Engines without roundRect get square corners instead of a crash.
        ctx.rect(x, y, width, height)
      }
    }

    const draw = () => {
      ctx.clearRect(0, 0, w, h)
      t += FRAME_STEP
      const loopT = t % LOOP_SECONDS

      const padX = w * 0.08
      const padY = h * 0.06
      const cellW = (w - padX * 2) / COLS
      const cellH = (h - padY * 2) / ROWS
      const cw = cellW - GAP * 2
      const ch = cellH - GAP * 2

      // On a tiny or not-yet-laid-out canvas the inset makes the cell size
      // (and therefore the corner radius) negative, and roundRect throws a
      // RangeError for negative radii — which would abort this frame before
      // the next one is scheduled and freeze the animation for good.
      if (cw > 0 && ch > 0) {
        const r = Math.min(cw, ch) * 0.15

        cells.forEach((cell, i) => {
          const x = padX + (i % COLS) * cellW + GAP
          const y = padY + Math.floor(i / COLS) * cellH + GAP

          if (loopT < cell.revealAt) {
            // Not yet revealed — dim outline only.
            ctx.strokeStyle = 'rgba(244, 63, 94, 0.06)'
            ctx.lineWidth = 0.5
            traceCell(x, y, cw, ch, r)
            ctx.stroke()
            return
          }

          // 0 → 1 over REVEAL_SECONDS after the cell's reveal time.
          const revealProgress = Math.min((loopT - cell.revealAt) * 2, 1)
          // Gentle per-cell breathing of the fill opacity (0.70 … 1.00).
          const pulse = Math.sin(t * 2 + cell.pulsePhase) * 0.15 + 0.85
          const style = cell.pass ? PASS_STYLE : FAIL_STYLE

          // Tinted fill: green for a pass, rose for a fail.
          ctx.fillStyle = `rgba(${style.rgb}, ${style.fill * revealProgress * pulse})`
          traceCell(x, y, cw, ch, r)
          ctx.fill()

          ctx.strokeStyle = `rgba(${style.rgb}, ${style.stroke * revealProgress})`
          ctx.lineWidth = 0.5
          traceCell(x, y, cw, ch, r)
          ctx.stroke()

          // The glyph fades in during the second half of the reveal.
          if (revealProgress > 0.5) {
            const alpha = (revealProgress - 0.5) * 2
            const cx = x + cw / 2
            const cy = y + ch / 2
            ctx.strokeStyle = `rgba(${style.rgb}, ${0.5 * alpha})`
            ctx.lineWidth = 1.2
            ctx.beginPath()
            if (cell.pass) {
              // Check mark
              const s = Math.min(cw, ch) * 0.2
              ctx.moveTo(cx - s * 0.6, cy)
              ctx.lineTo(cx - s * 0.1, cy + s * 0.5)
              ctx.lineTo(cx + s * 0.6, cy - s * 0.4)
            } else {
              // X mark
              const s = Math.min(cw, ch) * 0.18
              ctx.moveTo(cx - s, cy - s)
              ctx.lineTo(cx + s, cy + s)
              ctx.moveTo(cx + s, cy - s)
              ctx.lineTo(cx - s, cy + s)
            }
            ctx.stroke()
          }
        })
      }

      // Progress bar at bottom: faint track plus a fill that completes at 5 s.
      const barY = h - padY * 0.6
      const barH = 2
      const barW = w - padX * 2
      const progress = Math.min(loopT / 5, 1)
      ctx.fillStyle = 'rgba(244, 63, 94, 0.08)'
      ctx.fillRect(padX, barY, barW, barH)
      ctx.fillStyle = 'rgba(244, 63, 94, 0.3)'
      ctx.fillRect(padX, barY, barW * progress, barH)

      animId = requestAnimationFrame(draw)
    }
    draw()

    window.addEventListener('resize', resize)
    return () => {
      cancelAnimationFrame(animId)
      window.removeEventListener('resize', resize)
    }
  }, [])

  // NOTE(review): the markup that followed `return` is not visible in the
  // reviewed source (it appears to have been stripped). Presumably it is the
  // <canvas> element bound to `canvasRef` — restore it from the original file.
  return
}