feat: switch macOS learning mode from fs_usage to eslogger
Replace fs_usage (reports Mach thread IDs, requiring process name matching with false positives) with eslogger (Endpoint Security framework, reports real Unix PIDs via audit_token.pid plus fork events for process tree tracking).

Key changes:
- Daemon starts eslogger instead of fs_usage, with early-exit detection and clear Full Disk Access error messaging
- New two-pass eslogger JSON parser: pass 1 builds PID tree from fork events, pass 2 filters filesystem events by PID set
- Remove runtime PID polling (StartPIDTracking, pollDescendantPIDs) — process tree is now built post-hoc from the eslogger log
- Platform-specific generateLearnedTemplatePlatform() for darwin/linux/stub
- Refactor TraceResult and GenerateLearnedTemplate to be platform-agnostic
This commit is contained in:
@@ -71,6 +71,43 @@ func (c *Client) DestroySession(sessionID string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartLearning asks the daemon to start an fs_usage trace for learning mode.
|
||||
func (c *Client) StartLearning() (*Response, error) {
|
||||
req := Request{
|
||||
Action: "start_learning",
|
||||
}
|
||||
|
||||
resp, err := c.sendRequest(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start learning request failed: %w", err)
|
||||
}
|
||||
|
||||
if !resp.OK {
|
||||
return resp, fmt.Errorf("start learning failed: %s", resp.Error)
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// StopLearning asks the daemon to stop the fs_usage trace for the given learning session.
|
||||
func (c *Client) StopLearning(learningID string) error {
|
||||
req := Request{
|
||||
Action: "stop_learning",
|
||||
LearningID: learningID,
|
||||
}
|
||||
|
||||
resp, err := c.sendRequest(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("stop learning request failed: %w", err)
|
||||
}
|
||||
|
||||
if !resp.OK {
|
||||
return fmt.Errorf("stop learning failed: %s", resp.Error)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Status queries the daemon for its current status.
|
||||
func (c *Client) Status() (*Response, error) {
|
||||
req := Request{
|
||||
|
||||
@@ -7,8 +7,11 @@ import (
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -16,10 +19,11 @@ import (
|
||||
|
||||
// Request is the message the CLI sends to the daemon.
//
// Action selects the operation; the remaining fields are only meaningful for
// the actions named in their comments and are omitted from the JSON encoding
// when empty.
type Request struct {
	Action     string `json:"action"`                // "create_session", "destroy_session", "status", "start_learning", "stop_learning"
	ProxyURL   string `json:"proxy_url,omitempty"`   // for create_session
	DNSAddr    string `json:"dns_addr,omitempty"`    // for create_session
	SessionID  string `json:"session_id,omitempty"`  // for destroy_session
	LearningID string `json:"learning_id,omitempty"` // for stop_learning
}
|
||||
|
||||
// Response from daemon to CLI.
|
||||
@@ -33,6 +37,9 @@ type Response struct {
|
||||
// Status response fields.
|
||||
Running bool `json:"running,omitempty"`
|
||||
ActiveSessions int `json:"active_sessions,omitempty"`
|
||||
// Learning response fields.
|
||||
LearningID string `json:"learning_id,omitempty"`
|
||||
LearningLog string `json:"learning_log,omitempty"`
|
||||
}
|
||||
|
||||
// Session tracks an active sandbox session.
|
||||
@@ -57,6 +64,11 @@ type Server struct {
|
||||
debug bool
|
||||
tun2socksPath string
|
||||
sandboxGID string // cached numeric GID for the sandbox group
|
||||
// Learning mode state
|
||||
esloggerCmd *exec.Cmd // running eslogger process
|
||||
esloggerLogPath string // temp file path for eslogger output
|
||||
esloggerDone chan error // receives result of cmd.Wait() (set once, reused for stop)
|
||||
learningID string // current learning session ID
|
||||
}
|
||||
|
||||
// NewServer creates a new daemon server that will listen on the given Unix socket path.
|
||||
@@ -133,9 +145,26 @@ func (s *Server) Stop() error {
|
||||
// Wait for the accept loop and any in-flight handlers to finish.
|
||||
s.wg.Wait()
|
||||
|
||||
// Tear down all active sessions.
|
||||
// Tear down all active sessions and learning.
|
||||
s.mu.Lock()
|
||||
var errs []string
|
||||
|
||||
// Stop learning session if active
|
||||
if s.esloggerCmd != nil && s.esloggerCmd.Process != nil {
|
||||
s.logDebug("Stopping eslogger during shutdown")
|
||||
_ = s.esloggerCmd.Process.Kill()
|
||||
if s.esloggerDone != nil {
|
||||
<-s.esloggerDone
|
||||
}
|
||||
s.esloggerCmd = nil
|
||||
s.esloggerDone = nil
|
||||
s.learningID = ""
|
||||
}
|
||||
if s.esloggerLogPath != "" {
|
||||
_ = os.Remove(s.esloggerLogPath)
|
||||
s.esloggerLogPath = ""
|
||||
}
|
||||
|
||||
for id := range s.sessions {
|
||||
s.logDebug("Stopping session %s during shutdown", id)
|
||||
}
|
||||
@@ -227,6 +256,10 @@ func (s *Server) handleConnection(conn net.Conn) {
|
||||
resp = s.handleCreateSession(req)
|
||||
case "destroy_session":
|
||||
resp = s.handleDestroySession(req)
|
||||
case "start_learning":
|
||||
resp = s.handleStartLearning()
|
||||
case "stop_learning":
|
||||
resp = s.handleStopLearning(req)
|
||||
case "status":
|
||||
resp = s.handleStatus()
|
||||
default:
|
||||
@@ -387,6 +420,156 @@ func (s *Server) handleDestroySession(req Request) Response {
|
||||
return Response{OK: true}
|
||||
}
|
||||
|
||||
// handleStartLearning starts an eslogger trace for learning mode.
|
||||
// eslogger uses the Endpoint Security framework and reports real Unix PIDs
|
||||
// via audit_token.pid, plus fork events for process tree tracking.
|
||||
func (s *Server) handleStartLearning() Response {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
// Only one learning session at a time
|
||||
if s.learningID != "" {
|
||||
return Response{OK: false, Error: "a learning session is already active"}
|
||||
}
|
||||
|
||||
// Create temp file for eslogger output.
|
||||
// The daemon runs as root but the CLI reads this file as a normal user,
|
||||
// so we must make it world-readable.
|
||||
logFile, err := os.CreateTemp("", "greywall-eslogger-*.log")
|
||||
if err != nil {
|
||||
return Response{OK: false, Error: fmt.Sprintf("failed to create temp file: %v", err)}
|
||||
}
|
||||
|
||||
logPath := logFile.Name()
|
||||
if err := os.Chmod(logPath, 0o644); err != nil { //nolint:gosec // intentionally world-readable so non-root CLI can parse the log
|
||||
_ = logFile.Close()
|
||||
_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
|
||||
return Response{OK: false, Error: fmt.Sprintf("failed to set log file permissions: %v", err)}
|
||||
}
|
||||
|
||||
// Create a separate file for eslogger stderr so we can diagnose failures.
|
||||
stderrFile, err := os.CreateTemp("", "greywall-eslogger-stderr-*.log")
|
||||
if err != nil {
|
||||
_ = logFile.Close()
|
||||
_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
|
||||
return Response{OK: false, Error: fmt.Sprintf("failed to create stderr file: %v", err)}
|
||||
}
|
||||
stderrPath := stderrFile.Name()
|
||||
|
||||
// Start eslogger with filesystem events + fork for process tree tracking.
|
||||
// eslogger outputs one JSON object per line to stdout.
|
||||
cmd := exec.Command("eslogger", "open", "create", "write", "unlink", "rename", "link", "truncate", "fork") //nolint:gosec // daemon-controlled command
|
||||
cmd.Stdout = logFile
|
||||
cmd.Stderr = stderrFile
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
_ = logFile.Close()
|
||||
_ = stderrFile.Close()
|
||||
_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
|
||||
_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input
|
||||
return Response{OK: false, Error: fmt.Sprintf("failed to start eslogger: %v", err)}
|
||||
}
|
||||
|
||||
// Generate learning ID
|
||||
learningID, err := generateSessionID()
|
||||
if err != nil {
|
||||
_ = cmd.Process.Kill()
|
||||
_ = logFile.Close()
|
||||
_ = stderrFile.Close()
|
||||
_ = os.Remove(logPath) //nolint:gosec // logPath from os.CreateTemp, not user input
|
||||
_ = os.Remove(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp, not user input
|
||||
return Response{OK: false, Error: fmt.Sprintf("failed to generate learning ID: %v", err)}
|
||||
}
|
||||
|
||||
// Wait briefly for eslogger to initialize, then check if it exited early
|
||||
// (e.g., missing Full Disk Access permission).
|
||||
exitCh := make(chan error, 1)
|
||||
go func() {
|
||||
exitCh <- cmd.Wait()
|
||||
}()
|
||||
|
||||
select {
|
||||
case waitErr := <-exitCh:
|
||||
// eslogger exited during startup — read stderr for the error message
|
||||
_ = stderrFile.Close()
|
||||
stderrContent, _ := os.ReadFile(stderrPath) //nolint:gosec // stderrPath from os.CreateTemp
|
||||
_ = os.Remove(stderrPath) //nolint:gosec
|
||||
_ = logFile.Close()
|
||||
_ = os.Remove(logPath) //nolint:gosec
|
||||
errMsg := strings.TrimSpace(string(stderrContent))
|
||||
if errMsg == "" {
|
||||
errMsg = fmt.Sprintf("eslogger exited: %v", waitErr)
|
||||
}
|
||||
if strings.Contains(errMsg, "Full Disk Access") {
|
||||
errMsg += "\n\nGrant Full Disk Access to /usr/local/bin/greywall:\n" +
|
||||
" System Settings → Privacy & Security → Full Disk Access → add /usr/local/bin/greywall\n" +
|
||||
"Then reinstall the daemon: sudo greywall daemon uninstall -f && sudo greywall daemon install"
|
||||
}
|
||||
return Response{OK: false, Error: fmt.Sprintf("eslogger failed to start: %s", errMsg)}
|
||||
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// eslogger is still running after 500ms — good, it initialized successfully
|
||||
}
|
||||
|
||||
s.esloggerCmd = cmd
|
||||
s.esloggerLogPath = logPath
|
||||
s.esloggerDone = exitCh
|
||||
s.learningID = learningID
|
||||
|
||||
// Clean up stderr file now that eslogger is running
|
||||
_ = stderrFile.Close()
|
||||
_ = os.Remove(stderrPath) //nolint:gosec
|
||||
|
||||
Logf("Learning session started: id=%s log=%s pid=%d", learningID, logPath, cmd.Process.Pid)
|
||||
|
||||
return Response{
|
||||
OK: true,
|
||||
LearningID: learningID,
|
||||
LearningLog: logPath,
|
||||
}
|
||||
}
|
||||
|
||||
// handleStopLearning stops the eslogger trace for a learning session.
|
||||
func (s *Server) handleStopLearning(req Request) Response {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if req.LearningID == "" {
|
||||
return Response{OK: false, Error: "learning_id is required"}
|
||||
}
|
||||
|
||||
if s.learningID == "" || s.learningID != req.LearningID {
|
||||
return Response{OK: false, Error: fmt.Sprintf("learning session %q not found", req.LearningID)}
|
||||
}
|
||||
|
||||
if s.esloggerCmd != nil && s.esloggerCmd.Process != nil {
|
||||
// Send SIGINT to eslogger for graceful shutdown (flushes buffers)
|
||||
_ = s.esloggerCmd.Process.Signal(syscall.SIGINT)
|
||||
|
||||
// Reuse the wait channel from startup (cmd.Wait already called there)
|
||||
if s.esloggerDone != nil {
|
||||
select {
|
||||
case <-s.esloggerDone:
|
||||
// Exited cleanly
|
||||
case <-time.After(5 * time.Second):
|
||||
// Force kill after timeout
|
||||
_ = s.esloggerCmd.Process.Kill()
|
||||
<-s.esloggerDone
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Logf("Learning session stopped: id=%s", s.learningID)
|
||||
|
||||
s.esloggerCmd = nil
|
||||
s.esloggerDone = nil
|
||||
s.learningID = ""
|
||||
// Don't remove the log file — the CLI needs to read it
|
||||
s.esloggerLogPath = ""
|
||||
|
||||
return Response{OK: true}
|
||||
}
|
||||
|
||||
// handleStatus returns the current daemon status including whether it is running
|
||||
// and how many sessions are active.
|
||||
func (s *Server) handleStatus() Response {
|
||||
|
||||
Reference in New Issue
Block a user