feat: switch macOS learning mode from fs_usage to eslogger

Replace fs_usage (reports Mach thread IDs, requiring process name matching
with false positives) with eslogger (Endpoint Security framework, reports
real Unix PIDs via audit_token.pid plus fork events for process tree tracking).

Key changes:
- Daemon starts eslogger instead of fs_usage, with early-exit detection
  and clear Full Disk Access error messaging
- New two-pass eslogger JSON parser: pass 1 builds PID tree from fork
  events, pass 2 filters filesystem events by PID set
- Remove runtime PID polling (StartPIDTracking, pollDescendantPIDs) —
  process tree is now built post-hoc from the eslogger log
- Platform-specific generateLearnedTemplatePlatform() for darwin/linux/stub
- Refactor TraceResult and GenerateLearnedTemplate to be platform-agnostic
This commit is contained in:
2026-02-26 17:23:43 -06:00
parent e05b54ec1b
commit 9d5d852860
14 changed files with 1434 additions and 70 deletions

View File

@@ -71,6 +71,43 @@ func (c *Client) DestroySession(sessionID string) error {
return nil
}
// StartLearning asks the daemon to begin an eslogger trace for learning mode.
//
// On success the daemon's Response is returned; it carries the learning
// session ID and the path of the log the daemon is writing. A transport
// failure yields a nil Response. If the daemon answers but reports failure,
// both the Response and an error built from Response.Error are returned.
func (c *Client) StartLearning() (*Response, error) {
	resp, err := c.sendRequest(Request{Action: "start_learning"})
	switch {
	case err != nil:
		return nil, fmt.Errorf("start learning request failed: %w", err)
	case !resp.OK:
		// The daemon replied, but refused or failed to start the trace.
		return resp, fmt.Errorf("start learning failed: %s", resp.Error)
	}
	return resp, nil
}
// StopLearning asks the daemon to stop the eslogger trace that belongs to the
// learning session identified by learningID.
//
// It returns an error when the request cannot be delivered or when the daemon
// reports failure (for example, an unknown learning ID).
func (c *Client) StopLearning(learningID string) error {
	resp, err := c.sendRequest(Request{
		Action:     "stop_learning",
		LearningID: learningID,
	})
	switch {
	case err != nil:
		return fmt.Errorf("stop learning request failed: %w", err)
	case !resp.OK:
		return fmt.Errorf("stop learning failed: %s", resp.Error)
	}
	return nil
}
// Status queries the daemon for its current status.
func (c *Client) Status() (*Response, error) {
req := Request{

View File

@@ -7,8 +7,11 @@ import (
"fmt"
"net"
"os"
"os/exec"
"os/user"
"strings"
"sync"
"syscall"
"time"
)
@@ -16,10 +19,11 @@ import (
// Request is the JSON message sent from the CLI to the daemon.
//
// Action selects the operation. Every other field is optional (omitted from
// the JSON encoding when empty) and is only meaningful for the action named
// in its trailing comment.
//
// NOTE(review): Action, ProxyURL, DNSAddr and SessionID appear twice below;
// this looks like both sides of a diff hunk (old lines, then new lines with
// LearningID added) rather than real duplicate fields — confirm against the
// actual source file.
type Request struct {
Action string `json:"action"` // "create_session", "destroy_session", "status"
ProxyURL string `json:"proxy_url,omitempty"` // for create_session
DNSAddr string `json:"dns_addr,omitempty"` // for create_session
SessionID string `json:"session_id,omitempty"` // for destroy_session
Action string `json:"action"` // "create_session", "destroy_session", "status", "start_learning", "stop_learning"
ProxyURL string `json:"proxy_url,omitempty"` // for create_session
DNSAddr string `json:"dns_addr,omitempty"` // for create_session
SessionID string `json:"session_id,omitempty"` // for destroy_session
LearningID string `json:"learning_id,omitempty"` // for stop_learning
}
// Response from daemon to CLI.
@@ -33,6 +37,9 @@ type Response struct {
// Status response fields.
Running bool `json:"running,omitempty"`
ActiveSessions int `json:"active_sessions,omitempty"`
// Learning response fields.
LearningID string `json:"learning_id,omitempty"`
LearningLog string `json:"learning_log,omitempty"`
}
// Session tracks an active sandbox session.
@@ -57,6 +64,11 @@ type Server struct {
debug bool
tun2socksPath string
sandboxGID string // cached numeric GID for the sandbox group
// Learning mode state
esloggerCmd *exec.Cmd // running eslogger process
esloggerLogPath string // temp file path for eslogger output
esloggerDone chan error // receives result of cmd.Wait() (set once, reused for stop)
learningID string // current learning session ID
}
// NewServer creates a new daemon server that will listen on the given Unix socket path.
@@ -133,9 +145,26 @@ func (s *Server) Stop() error {
// Wait for the accept loop and any in-flight handlers to finish.
s.wg.Wait()
// Tear down all active sessions.
// Tear down all active sessions and learning.
s.mu.Lock()
var errs []string
// Stop learning session if active
if s.esloggerCmd != nil && s.esloggerCmd.Process != nil {
s.logDebug("Stopping eslogger during shutdown")
_ = s.esloggerCmd.Process.Kill()
if s.esloggerDone != nil {
<-s.esloggerDone
}
s.esloggerCmd = nil
s.esloggerDone = nil
s.learningID = ""
}
if s.esloggerLogPath != "" {
_ = os.Remove(s.esloggerLogPath)
s.esloggerLogPath = ""
}
for id := range s.sessions {
s.logDebug("Stopping session %s during shutdown", id)
}
@@ -227,6 +256,10 @@ func (s *Server) handleConnection(conn net.Conn) {
resp = s.handleCreateSession(req)
case "destroy_session":
resp = s.handleDestroySession(req)
case "start_learning":
resp = s.handleStartLearning()
case "stop_learning":
resp = s.handleStopLearning(req)
case "status":
resp = s.handleStatus()
default:
@@ -387,6 +420,156 @@ func (s *Server) handleDestroySession(req Request) Response {
return Response{OK: true}
}
// handleStartLearning starts an eslogger trace for learning mode.
//
// eslogger uses the Endpoint Security framework and reports real Unix PIDs
// via audit_token.pid, plus fork events for process tree tracking. Its stdout
// (one JSON object per line) is redirected to a world-readable temp file whose
// path is returned in Response.LearningLog together with a freshly generated
// Response.LearningID.
//
// Only one learning session may be active at a time. The server mutex is held
// for the whole call, including the 500ms startup probe that detects an
// eslogger that exits immediately (e.g. missing Full Disk Access). On any
// failure the temp files are removed and Response.OK is false with a
// descriptive Response.Error.
func (s *Server) handleStartLearning() Response {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Only one learning session at a time.
	if s.learningID != "" {
		return Response{OK: false, Error: "a learning session is already active"}
	}

	// Generate the learning ID before spawning anything, so a failure here
	// can never leave a killed-but-unreaped eslogger process behind.
	learningID, err := generateSessionID()
	if err != nil {
		return Response{OK: false, Error: fmt.Sprintf("failed to generate learning ID: %v", err)}
	}

	// Create temp file for eslogger output.
	// The daemon runs as root but the CLI reads this file as a normal user,
	// so we must make it world-readable.
	logFile, err := os.CreateTemp("", "greywall-eslogger-*.log")
	if err != nil {
		return Response{OK: false, Error: fmt.Sprintf("failed to create temp file: %v", err)}
	}
	logPath := logFile.Name()
	if err := os.Chmod(logPath, 0o644); err != nil { //nolint:gosec // intentionally world-readable so non-root CLI can parse the log
		_ = logFile.Close()
		_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to set log file permissions: %v", err)}
	}

	// Create a separate file for eslogger stderr so we can diagnose failures.
	stderrFile, err := os.CreateTemp("", "greywall-eslogger-stderr-*.log")
	if err != nil {
		_ = logFile.Close()
		_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
		return Response{OK: false, Error: fmt.Sprintf("failed to create stderr file: %v", err)}
	}
	stderrPath := stderrFile.Name()

	// removeTempFiles deletes both temp files; used on every failure path.
	removeTempFiles := func() {
		_ = os.Remove(logPath)    //nolint:gosec // logPath from os.CreateTemp, not user input
		_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input
	}

	// Start eslogger with filesystem events + fork for process tree tracking.
	// eslogger outputs one JSON object per line to stdout.
	cmd := exec.Command("eslogger", "open", "create", "write", "unlink", "rename", "link", "truncate", "fork") //nolint:gosec // daemon-controlled command
	cmd.Stdout = logFile
	cmd.Stderr = stderrFile
	startErr := cmd.Start()

	// Once Start has returned, the child (if any) owns its own copies of both
	// descriptors. Close the daemon's copies now on every path; otherwise each
	// successful learning session would leak the log file descriptor.
	_ = logFile.Close()
	_ = stderrFile.Close()

	if startErr != nil {
		removeTempFiles()
		return Response{OK: false, Error: fmt.Sprintf("failed to start eslogger: %v", startErr)}
	}

	// Wait briefly for eslogger to initialize, then check if it exited early
	// (e.g., missing Full Disk Access permission). The channel is buffered so
	// the goroutine can always deliver the result and exit, and the same
	// channel is reused later when the session is stopped.
	exitCh := make(chan error, 1)
	go func() {
		exitCh <- cmd.Wait()
	}()

	select {
	case waitErr := <-exitCh:
		// eslogger exited during startup — read stderr for the error message.
		stderrContent, _ := os.ReadFile(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp
		removeTempFiles()

		errMsg := strings.TrimSpace(string(stderrContent))
		if errMsg == "" {
			errMsg = fmt.Sprintf("eslogger exited: %v", waitErr)
		}
		if strings.Contains(errMsg, "Full Disk Access") {
			errMsg += "\n\nGrant Full Disk Access to /usr/local/bin/greywall:\n" +
				"  System Settings → Privacy & Security → Full Disk Access → add /usr/local/bin/greywall\n" +
				"Then reinstall the daemon: sudo greywall daemon uninstall -f && sudo greywall daemon install"
		}
		return Response{OK: false, Error: fmt.Sprintf("eslogger failed to start: %s", errMsg)}

	case <-time.After(500 * time.Millisecond):
		// eslogger is still running after 500ms — good, it initialized successfully.
	}

	s.esloggerCmd = cmd
	s.esloggerLogPath = logPath
	s.esloggerDone = exitCh
	s.learningID = learningID

	// The stderr capture was only needed to diagnose startup failures.
	_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input

	Logf("Learning session started: id=%s log=%s pid=%d", learningID, logPath, cmd.Process.Pid)

	return Response{
		OK:          true,
		LearningID:  learningID,
		LearningLog: logPath,
	}
}
// handleStopLearning stops the eslogger trace for the learning session named
// in req.LearningID.
//
// The request is rejected when no ID is supplied or when the ID does not match
// the active session. Otherwise eslogger is sent SIGINT and given five seconds
// to exit before being killed. The server mutex is held throughout. The log
// file is left on disk for the CLI to read; only the server's learning state
// is cleared.
func (s *Server) handleStopLearning(req Request) Response {
	s.mu.Lock()
	defer s.mu.Unlock()

	switch {
	case req.LearningID == "":
		return Response{OK: false, Error: "learning_id is required"}
	case s.learningID != req.LearningID:
		// req.LearningID is non-empty here, so this also covers the case
		// where no learning session is active at all.
		return Response{OK: false, Error: fmt.Sprintf("learning session %q not found", req.LearningID)}
	}

	if proc, exited := s.esloggerCmd, s.esloggerDone; proc != nil && proc.Process != nil {
		// SIGINT lets eslogger shut down gracefully and flush its buffers.
		_ = proc.Process.Signal(syscall.SIGINT)

		// cmd.Wait was already started when the session began; its result
		// arrives on the stored channel, so Wait must not be called again.
		if exited != nil {
			grace := time.After(5 * time.Second)
			select {
			case <-exited:
				// Exited on its own within the grace period.
			case <-grace:
				// Did not react in time: force kill, then reap.
				_ = proc.Process.Kill()
				<-exited
			}
		}
	}

	// The active ID equals req.LearningID at this point.
	Logf("Learning session stopped: id=%s", req.LearningID)

	// Reset learning state. The log path is forgotten but the file itself is
	// deliberately kept — the CLI still needs to read it.
	s.esloggerCmd, s.esloggerDone = nil, nil
	s.learningID, s.esloggerLogPath = "", ""

	return Response{OK: true}
}
// handleStatus returns the current daemon status including whether it is running
// and how many sessions are active.
func (s *Server) handleStatus() Response {