package daemon

import (
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"net"
	"os"
	"os/exec"
	"os/user"
	"strings"
	"sync"
	"syscall"
	"time"
)

// Protocol types for JSON communication over Unix socket (newline-delimited).

// Request from CLI to daemon. Exactly one Action is dispatched per
// connection; the optional fields below are only read by the handler
// for the matching action.
type Request struct {
	Action     string `json:"action"`                // "create_session", "destroy_session", "status", "start_learning", "stop_learning"
	ProxyURL   string `json:"proxy_url,omitempty"`   // for create_session
	DNSAddr    string `json:"dns_addr,omitempty"`    // for create_session
	SessionID  string `json:"session_id,omitempty"`  // for destroy_session
	LearningID string `json:"learning_id,omitempty"` // for stop_learning
}

// Response from daemon to CLI. OK=false responses carry Error; the
// remaining fields are populated per-action.
type Response struct {
	OK           bool   `json:"ok"`
	Error        string `json:"error,omitempty"`
	SessionID    string `json:"session_id,omitempty"`
	TunDevice    string `json:"tun_device,omitempty"`
	SandboxUser  string `json:"sandbox_user,omitempty"`
	SandboxGroup string `json:"sandbox_group,omitempty"`
	// Status response fields.
	// NOTE(review): omitempty here means Running=false and
	// ActiveSessions=0 are omitted from the encoded JSON entirely —
	// confirm the CLI treats a missing field as the zero value.
	Running        bool `json:"running,omitempty"`
	ActiveSessions int  `json:"active_sessions,omitempty"`
	// Learning response fields.
	LearningID  string `json:"learning_id,omitempty"`
	LearningLog string `json:"learning_log,omitempty"`
}

// Session tracks an active sandbox session.
type Session struct {
	ID        string    // random hex identifier (see generateSessionID)
	ProxyURL  string    // proxy URL supplied by the create_session request
	DNSAddr   string    // DNS upstream for the relay (request value or default)
	CreatedAt time.Time // when the session was created
}

// Server listens on a Unix socket and manages sandbox sessions. It orchestrates
// TunManager (utun + pf) and DNSRelay lifecycle for each session.
type Server struct {
	socketPath    string
	listener      net.Listener
	tunManager    *TunManager          // non-nil only while a session is active
	dnsRelay      *DNSRelay            // non-nil only while a session is active
	sessions      map[string]*Session  // keyed by session ID; Phase 1 allows at most one entry
	mu            sync.Mutex           // guards all mutable state below and above
	done          chan struct{}        // closed once by Stop to signal shutdown
	wg            sync.WaitGroup       // tracks acceptLoop plus in-flight connection handlers
	debug         bool
	tun2socksPath string
	sandboxGID    string // cached numeric GID for the sandbox group

	// Learning mode state
	esloggerCmd     *exec.Cmd  // running eslogger process
	esloggerLogPath string     // temp file path for eslogger output
	esloggerDone    chan error // receives result of cmd.Wait() (set once, reused for stop)
	learningID      string     // current learning session ID
}

// NewServer creates a new daemon server that will listen on the given Unix socket path.
// The listener is not opened until Start is called.
func NewServer(socketPath, tun2socksPath string, debug bool) *Server {
	return &Server{
		socketPath:    socketPath,
		tun2socksPath: tun2socksPath,
		sessions:      make(map[string]*Session),
		done:          make(chan struct{}),
		debug:         debug,
	}
}

// Start begins listening on the Unix socket and accepting connections.
// It removes any stale socket file before binding. It returns after the
// accept loop has been launched in a goroutine.
func (s *Server) Start() error {
	// Pre-resolve the sandbox group GID so session creation is fast
	// and doesn't depend on OpenDirectory latency. A failure here is
	// non-fatal: handleCreateSession retries the lookup per-session.
	grp, err := user.LookupGroup(SandboxGroupName)
	if err != nil {
		Logf("Warning: could not resolve group %s at startup: %v (will retry per-session)", SandboxGroupName, err)
	} else {
		s.sandboxGID = grp.Gid
		Logf("Resolved group %s → GID %s", SandboxGroupName, s.sandboxGID)
	}
	// Remove stale socket file if it exists (e.g. left over from a crash).
	if _, err := os.Stat(s.socketPath); err == nil {
		s.logDebug("Removing stale socket file %s", s.socketPath)
		if err := os.Remove(s.socketPath); err != nil {
			return fmt.Errorf("failed to remove stale socket %s: %w", s.socketPath, err)
		}
	}
	ln, err := net.Listen("unix", s.socketPath)
	if err != nil {
		return fmt.Errorf("failed to listen on %s: %w", s.socketPath, err)
	}
	s.listener = ln
	// Set socket permissions so any local user can connect to the daemon.
	// The socket is localhost-only (Unix domain socket); access control is
	// handled at the daemon protocol level, not file permissions.
	if err := os.Chmod(s.socketPath, 0o666); err != nil { //nolint:gosec // daemon socket needs 0666 so non-root CLI can connect
		_ = ln.Close()
		_ = os.Remove(s.socketPath)
		return fmt.Errorf("failed to set socket permissions: %w", err)
	}
	s.logDebug("Listening on %s", s.socketPath)
	s.wg.Add(1)
	go s.acceptLoop()
	return nil
}

// Stop gracefully shuts down the server. It stops accepting new connections,
// tears down all active sessions, and removes the socket file. Individual
// teardown failures are collected and reported together rather than aborting
// the shutdown.
func (s *Server) Stop() error {
	// Signal shutdown. The select makes Stop idempotent: close(done) only
	// fires if the channel has not already been closed.
	select {
	case <-s.done:
		// Already closed.
	default:
		close(s.done)
	}
	// Close the listener to unblock acceptLoop.
	if s.listener != nil {
		_ = s.listener.Close()
	}
	// Wait for the accept loop and any in-flight handlers to finish.
	// This must happen before taking the lock below, since handlers also
	// lock s.mu.
	s.wg.Wait()
	// Tear down all active sessions and learning.
	s.mu.Lock()
	var errs []string
	// Stop learning session if active
	if s.esloggerCmd != nil && s.esloggerCmd.Process != nil {
		s.logDebug("Stopping eslogger during shutdown")
		_ = s.esloggerCmd.Process.Kill()
		// Drain the Wait result so the child is fully reaped before we
		// drop our reference.
		if s.esloggerDone != nil {
			<-s.esloggerDone
		}
		s.esloggerCmd = nil
		s.esloggerDone = nil
		s.learningID = ""
	}
	if s.esloggerLogPath != "" {
		_ = os.Remove(s.esloggerLogPath)
		s.esloggerLogPath = ""
	}
	for id := range s.sessions {
		s.logDebug("Stopping session %s during shutdown", id)
	}
	if s.tunManager != nil {
		if err := s.tunManager.Stop(); err != nil {
			errs = append(errs, fmt.Sprintf("stop tun manager: %v", err))
		}
		s.tunManager = nil
	}
	if s.dnsRelay != nil {
		s.dnsRelay.Stop()
		s.dnsRelay = nil
	}
	s.sessions = make(map[string]*Session)
	s.mu.Unlock()
	// Remove the socket file.
	if err := os.Remove(s.socketPath); err != nil && !os.IsNotExist(err) {
		errs = append(errs, fmt.Sprintf("remove socket: %v", err))
	}
	if len(errs) > 0 {
		return fmt.Errorf("stop errors: %s", join(errs, "; "))
	}
	s.logDebug("Server stopped")
	return nil
}

// ActiveSessions returns the number of currently active sessions.
func (s *Server) ActiveSessions() int {
	s.mu.Lock()
	defer s.mu.Unlock()
	return len(s.sessions)
}

// acceptLoop runs the main accept loop for the Unix socket listener.
// It exits when the listener is closed during shutdown; any other accept
// error is logged and the loop continues.
func (s *Server) acceptLoop() {
	defer s.wg.Done()
	for {
		conn, err := s.listener.Accept()
		if err != nil {
			// Distinguish shutdown (done closed) from a transient error.
			select {
			case <-s.done:
				return
			default:
			}
			s.logDebug("Accept error: %v", err)
			continue
		}
		// wg.Add happens here (not in the handler) so Stop's wg.Wait
		// cannot miss a handler that has been spawned but not yet started.
		s.wg.Add(1)
		go s.handleConnection(conn)
	}
}

// handleConnection reads a single JSON request from the connection, dispatches
// it to the appropriate handler, and writes the JSON response back. One
// request/response pair per connection.
func (s *Server) handleConnection(conn net.Conn) {
	defer s.wg.Done()
	defer conn.Close() //nolint:errcheck // best-effort close after handling request
	// Set a read deadline to prevent hung connections.
	if err := conn.SetReadDeadline(time.Now().Add(30 * time.Second)); err != nil {
		s.logDebug("Failed to set read deadline: %v", err)
		return
	}
	decoder := json.NewDecoder(conn)
	encoder := json.NewEncoder(conn)
	var req Request
	if err := decoder.Decode(&req); err != nil {
		s.logDebug("Failed to decode request: %v", err)
		resp := Response{OK: false, Error: fmt.Sprintf("invalid request: %v", err)}
		_ = encoder.Encode(resp) // best-effort error response
		return
	}
	Logf("Received request: action=%s", req.Action)
	var resp Response
	switch req.Action {
	case "create_session":
		resp = s.handleCreateSession(req)
	case "destroy_session":
		resp = s.handleDestroySession(req)
	case "start_learning":
		resp = s.handleStartLearning()
	case "stop_learning":
		resp = s.handleStopLearning(req)
	case "status":
		resp = s.handleStatus()
	default:
		resp = Response{OK: false, Error: fmt.Sprintf("unknown action: %q", req.Action)}
	}
	if err := encoder.Encode(resp); err != nil {
		s.logDebug("Failed to encode response: %v", err)
	}
}

// handleCreateSession creates a new sandbox session with a utun tunnel,
// optional DNS relay, and pf rules for the sandbox group. On any step
// failure it rolls back the components started so far before returning
// an error response.
func (s *Server) handleCreateSession(req Request) Response {
	s.mu.Lock()
	defer s.mu.Unlock()
	if req.ProxyURL == "" {
		return Response{OK: false, Error: "proxy_url is required"}
	}
	// Phase 1: only one session at a time.
	if len(s.sessions) > 0 {
		Logf("Rejecting create_session: %d session(s) already active", len(s.sessions))
		return Response{OK: false, Error: "a session is already active (only one session supported in Phase 1)"}
	}
	Logf("Creating session: proxy=%s dns=%s", req.ProxyURL, req.DNSAddr)
	// Step 1: Create and start TunManager.
	tm := NewTunManager(s.tun2socksPath, req.ProxyURL, s.debug)
	if err := tm.Start(); err != nil {
		return Response{OK: false, Error: fmt.Sprintf("failed to start tunnel: %v", err)}
	}
	// Step 2: Create DNS relay. pf rules always redirect DNS (UDP:53) from
	// the sandbox group to the relay address, so we must always start the
	// relay when a proxy session is active. If no explicit DNS address was
	// provided, default to the proxy's DNS resolver.
	dnsTarget := req.DNSAddr
	if dnsTarget == "" {
		dnsTarget = defaultDNSTarget
		Logf("No dns_addr provided, defaulting DNS relay upstream to %s", dnsTarget)
	}
	dr, err := NewDNSRelay(dnsRelayIP+":"+dnsRelayPort, dnsTarget, s.debug)
	if err != nil {
		if stopErr := tm.Stop(); stopErr != nil {
			Logf("Warning: failed to stop tunnel during cleanup: %v", stopErr)
		}
		return Response{OK: false, Error: fmt.Sprintf("failed to create DNS relay: %v", err)}
	}
	if err := dr.Start(); err != nil {
		if stopErr := tm.Stop(); stopErr != nil {
			Logf("Warning: failed to stop tunnel during cleanup: %v", stopErr)
		}
		return Response{OK: false, Error: fmt.Sprintf("failed to start DNS relay: %v", err)}
	}
	// Step 3: Resolve the sandbox group GID. pfctl in the LaunchDaemon
	// context cannot resolve group names via OpenDirectory, so we use the
	// cached GID (resolved at startup) or look it up now.
	sandboxGID := s.sandboxGID
	if sandboxGID == "" {
		grp, err := user.LookupGroup(SandboxGroupName)
		if err != nil {
			_ = tm.Stop()
			dr.Stop()
			return Response{OK: false, Error: fmt.Sprintf("failed to resolve group %s: %v", SandboxGroupName, err)}
		}
		sandboxGID = grp.Gid
		s.sandboxGID = sandboxGID
	}
	Logf("Loading pf rules for group %s (GID %s)", SandboxGroupName, sandboxGID)
	if err := tm.LoadPFRules(sandboxGID); err != nil {
		dr.Stop()
		_ = tm.Stop() // best-effort cleanup
		return Response{OK: false, Error: fmt.Sprintf("failed to load pf rules: %v", err)}
	}
	// Step 4: Generate session ID and store.
	sessionID, err := generateSessionID()
	if err != nil {
		dr.Stop()
		_ = tm.UnloadPFRules() // best-effort cleanup
		_ = tm.Stop()          // best-effort cleanup
		return Response{OK: false, Error: fmt.Sprintf("failed to generate session ID: %v", err)}
	}
	session := &Session{
		ID:        sessionID,
		ProxyURL:  req.ProxyURL,
		DNSAddr:   dnsTarget,
		CreatedAt: time.Now(),
	}
	s.sessions[sessionID] = session
	s.tunManager = tm
	s.dnsRelay = dr
	Logf("Session created: id=%s device=%s group=%s(GID %s)", sessionID, tm.TunDevice(), SandboxGroupName, sandboxGID)
	return Response{
		OK:           true,
		SessionID:    sessionID,
		TunDevice:    tm.TunDevice(),
		SandboxUser:  SandboxUserName,
		SandboxGroup: SandboxGroupName,
	}
}

// handleDestroySession tears down an existing session by unloading pf rules,
// stopping the tunnel, and stopping the DNS relay. Teardown errors are
// collected and reported together; the session entry is removed regardless.
func (s *Server) handleDestroySession(req Request) Response {
	s.mu.Lock()
	defer s.mu.Unlock()
	if req.SessionID == "" {
		return Response{OK: false, Error: "session_id is required"}
	}
	Logf("Destroying session: id=%s", req.SessionID)
	session, ok := s.sessions[req.SessionID]
	if !ok {
		Logf("Session %q not found (active sessions: %d)", req.SessionID, len(s.sessions))
		return Response{OK: false, Error: fmt.Sprintf("session %q not found", req.SessionID)}
	}
	var errs []string
	// Step 1: Unload pf rules.
	if s.tunManager != nil {
		if err := s.tunManager.UnloadPFRules(); err != nil {
			errs = append(errs, fmt.Sprintf("unload pf rules: %v", err))
		}
	}
	// Step 2: Stop tun manager.
	if s.tunManager != nil {
		if err := s.tunManager.Stop(); err != nil {
			errs = append(errs, fmt.Sprintf("stop tun manager: %v", err))
		}
		s.tunManager = nil
	}
	// Step 3: Stop DNS relay.
	if s.dnsRelay != nil {
		s.dnsRelay.Stop()
		s.dnsRelay = nil
	}
	// Step 4: Remove session.
	delete(s.sessions, session.ID)
	if len(errs) > 0 {
		Logf("Session %s destroyed with errors: %v", session.ID, errs)
		return Response{OK: false, Error: fmt.Sprintf("session destroyed with errors: %s", join(errs, "; "))}
	}
	Logf("Session destroyed: id=%s (remaining: %d)", session.ID, len(s.sessions))
	return Response{OK: true}
}

// handleStartLearning starts an eslogger trace for learning mode.
// eslogger uses the Endpoint Security framework and reports real Unix PIDs
// via audit_token.pid, plus fork events for process tree tracking.
func (s *Server) handleStartLearning() Response {
	s.mu.Lock()
	defer s.mu.Unlock()
	// Only one learning session at a time
	if s.learningID != "" {
		return Response{OK: false, Error: "a learning session is already active"}
	}
	// Create temp file for eslogger output.
	// The daemon runs as root but the CLI reads this file as a normal user,
	// so we must make it world-readable.
	logFile, err := os.CreateTemp("", "greywall-eslogger-*.log")
	if err != nil {
		return Response{OK: false, Error: fmt.Sprintf("failed to create temp file: %v", err)}
	}
	logPath := logFile.Name()
	if err := os.Chmod(logPath, 0o644); err != nil { //nolint:gosec // intentionally world-readable so non-root CLI can parse the log
		_ = logFile.Close()
		_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to set log file permissions: %v", err)}
	}
	// Create a separate file for eslogger stderr so we can diagnose failures.
	stderrFile, err := os.CreateTemp("", "greywall-eslogger-stderr-*.log")
	if err != nil {
		_ = logFile.Close()
		_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to create stderr file: %v", err)}
	}
	stderrPath := stderrFile.Name()
	// Start eslogger with filesystem events + fork for process tree tracking.
	// eslogger outputs one JSON object per line to stdout.
	cmd := exec.Command("eslogger", "open", "create", "write", "unlink", "rename", "link", "truncate", "fork") //nolint:gosec // daemon-controlled command
	cmd.Stdout = logFile
	cmd.Stderr = stderrFile
	if err := cmd.Start(); err != nil {
		_ = logFile.Close()
		_ = stderrFile.Close()
		_ = os.Remove(logPath)    //nolint:gosec // logPath from os.CreateTemp, not user input
		_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to start eslogger: %v", err)}
	}
	// Generate learning ID
	learningID, err := generateSessionID()
	if err != nil {
		_ = cmd.Process.Kill()
		_ = logFile.Close()
		_ = stderrFile.Close()
		_ = os.Remove(logPath)    //nolint:gosec // logPath from os.CreateTemp, not user input
		_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to generate learning ID: %v", err)}
	}
	// Wait briefly for eslogger to initialize, then check if it exited early
	// (e.g., missing Full Disk Access permission). exitCh is buffered so the
	// Wait goroutine never blocks, and it is retained as esloggerDone for
	// reuse by handleStopLearning (cmd.Wait may only be called once).
	exitCh := make(chan error, 1)
	go func() { exitCh <- cmd.Wait() }()
	select {
	case waitErr := <-exitCh:
		// eslogger exited during startup — read stderr for the error message
		_ = stderrFile.Close()
		stderrContent, _ := os.ReadFile(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp
		_ = os.Remove(stderrPath)                   //nolint:gosec
		_ = logFile.Close()
		_ = os.Remove(logPath) //nolint:gosec
		errMsg := strings.TrimSpace(string(stderrContent))
		if errMsg == "" {
			errMsg = fmt.Sprintf("eslogger exited: %v", waitErr)
		}
		if strings.Contains(errMsg, "Full Disk Access") {
			errMsg += "\n\nGrant Full Disk Access to /usr/local/bin/greywall:\n" +
				" System Settings → Privacy & Security → Full Disk Access → add /usr/local/bin/greywall\n" +
				"Then reinstall the daemon: sudo greywall daemon uninstall -f && sudo greywall daemon install"
		}
		return Response{OK: false, Error: fmt.Sprintf("eslogger failed to start: %s", errMsg)}
	case <-time.After(500 * time.Millisecond):
		// eslogger is still running after 500ms — good, it initialized successfully
	}
	s.esloggerCmd = cmd
	s.esloggerLogPath = logPath
	s.esloggerDone = exitCh
	s.learningID = learningID
	// Clean up stderr file now that eslogger is running
	_ = stderrFile.Close()
	_ = os.Remove(stderrPath) //nolint:gosec
	Logf("Learning session started: id=%s log=%s pid=%d", learningID, logPath, cmd.Process.Pid)
	return Response{
		OK:          true,
		LearningID:  learningID,
		LearningLog: logPath,
	}
}

// handleStopLearning stops the eslogger trace for a learning session.
func (s *Server) handleStopLearning(req Request) Response { s.mu.Lock() defer s.mu.Unlock() if req.LearningID == "" { return Response{OK: false, Error: "learning_id is required"} } if s.learningID == "" || s.learningID != req.LearningID { return Response{OK: false, Error: fmt.Sprintf("learning session %q not found", req.LearningID)} } if s.esloggerCmd != nil && s.esloggerCmd.Process != nil { // Send SIGINT to eslogger for graceful shutdown (flushes buffers) _ = s.esloggerCmd.Process.Signal(syscall.SIGINT) // Reuse the wait channel from startup (cmd.Wait already called there) if s.esloggerDone != nil { select { case <-s.esloggerDone: // Exited cleanly case <-time.After(5 * time.Second): // Force kill after timeout _ = s.esloggerCmd.Process.Kill() <-s.esloggerDone } } } Logf("Learning session stopped: id=%s", s.learningID) s.esloggerCmd = nil s.esloggerDone = nil s.learningID = "" // Don't remove the log file — the CLI needs to read it s.esloggerLogPath = "" return Response{OK: true} } // handleStatus returns the current daemon status including whether it is running // and how many sessions are active. func (s *Server) handleStatus() Response { s.mu.Lock() defer s.mu.Unlock() return Response{ OK: true, Running: true, ActiveSessions: len(s.sessions), } } // generateSessionID produces a cryptographically random hex session identifier. func generateSessionID() (string, error) { b := make([]byte, 16) if _, err := rand.Read(b); err != nil { return "", fmt.Errorf("failed to read random bytes: %w", err) } return hex.EncodeToString(b), nil } // join concatenates string slices with a separator. This avoids importing // the strings package solely for strings.Join. func join(parts []string, sep string) string { if len(parts) == 0 { return "" } result := parts[0] for _, p := range parts[1:] { result += sep + p } return result } // logDebug writes a timestamped debug message to stderr. 
func (s *Server) logDebug(format string, args ...interface{}) {
	// Guard clause: debug logging is a no-op unless the server was
	// constructed with debug enabled.
	if !s.debug {
		return
	}
	Logf(format, args...)
}