greywall/internal/sandbox/linux_seccomp.go

//go:build linux

// Package sandbox provides sandboxing functionality for macOS and Linux.
package sandbox

import (
	"fmt"
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
)

// SeccompFilter builds seccomp BPF filter files and removes them again when
// they are no longer needed.
type SeccompFilter struct {
	// debug turns on diagnostic messages written to stderr.
	debug bool
}

// NewSeccompFilter returns a filter generator. Pass debug=true to have the
// generator report on stderr what it produced.
func NewSeccompFilter(debug bool) *SeccompFilter {
	filter := new(SeccompFilter)
	filter.debug = debug
	return filter
}

// DangerousSyscalls lists, by name, the syscalls the generated seccomp filter
// denies. writeBPFProgram emits one deny rule per entry, in this order, and
// skips any name that getSyscallNumber cannot resolve for the running
// architecture.
var DangerousSyscalls = []string{
	"ptrace",            // Process debugging/injection
	"process_vm_readv",  // Read another process's memory
	"process_vm_writev", // Write another process's memory
	"keyctl",            // Kernel keyring operations
	"add_key",           // Add key to keyring
	"request_key",       // Request key from keyring
	"personality",       // Change execution domain (can bypass ASLR)
	"userfaultfd",       // User-space page fault handling (potential sandbox escape)
	"perf_event_open",   // Performance monitoring (info leak)
	"bpf",               // eBPF operations (without CAP_BPF)
	"kexec_load",        // Load new kernel
	"kexec_file_load",   // Load new kernel from file
	"reboot",            // Reboot system
	"syslog",            // Kernel log access
	"acct",              // Process accounting
	"mount",             // Mount filesystems
	"umount2",           // Unmount filesystems
	"pivot_root",        // Change root filesystem
	"swapon",            // Enable swap
	"swapoff",           // Disable swap
	"sethostname",       // Change hostname
	"setdomainname",     // Change domain name
	"init_module",       // Load kernel module
	"finit_module",      // Load kernel module from file
	"delete_module",     // Unload kernel module
	"ioperm",            // I/O port permissions
	"iopl",              // I/O privilege level
}

// GenerateBPFFilter writes a seccomp-bpf filter that denies the syscalls in
// DangerousSyscalls and returns the path of the file that holds it.
//
// The file is created as <tmp>/fence-seccomp/fence-seccomp-<pid>.bpf. Because
// the system temp directory is usually shared between users, the
// fence-seccomp directory is only used when it is a real directory (not a
// symlink), owned by the current user and not writable by anyone else;
// otherwise another local user could plant or swap the filter that is later
// handed to the sandbox.
//
// The caller owns the returned file and should release it with CleanupFilter.
//
// An error is returned when seccomp is unavailable, when the filter directory
// cannot be created or fails the ownership check, or when the program cannot
// be written. No filter file is left behind on failure.
func (s *SeccompFilter) GenerateBPFFilter() (string, error) {
	features := DetectLinuxFeatures()
	if !features.HasSeccomp {
		return "", fmt.Errorf("seccomp not available on this system")
	}

	tmpDir := filepath.Join(os.TempDir(), "fence-seccomp")
	if err := os.MkdirAll(tmpDir, 0o700); err != nil {
		return "", fmt.Errorf("failed to create seccomp dir: %w", err)
	}
	// MkdirAll reports success for a pre-existing path, including one created
	// by somebody else, so the directory has to be vetted explicitly.
	if err := verifyPrivateSeccompDir(tmpDir); err != nil {
		return "", err
	}

	filterPath := filepath.Join(tmpDir, fmt.Sprintf("fence-seccomp-%d.bpf", os.Getpid()))

	// The filter is a raw sock_filter array, the format bwrap's --seccomp
	// option reads from a file descriptor.
	if err := s.writeBPFProgram(filterPath); err != nil {
		// Best effort: never leave a truncated filter lying around.
		_ = os.Remove(filterPath)
		return "", fmt.Errorf("failed to write BPF program: %w", err)
	}

	if s.debug {
		fmt.Fprintf(os.Stderr, "[fence:seccomp] Generated BPF filter at %s\n", filterPath)
	}

	return filterPath, nil
}

// verifyPrivateSeccompDir reports an error unless dir is a real directory that
// belongs to the effective user and is not writable by group or others.
func verifyPrivateSeccompDir(dir string) error {
	var st unix.Stat_t
	// Lstat, not Stat: a symlink at this path must be rejected, not followed.
	if err := unix.Lstat(dir, &st); err != nil {
		return fmt.Errorf("failed to stat seccomp dir %q: %w", dir, err)
	}

	switch {
	case st.Mode&unix.S_IFMT != unix.S_IFDIR:
		return fmt.Errorf("seccomp dir %q is not a directory", dir)
	case int(st.Uid) != os.Geteuid():
		return fmt.Errorf("seccomp dir %q is owned by uid %d, not the current user", dir, st.Uid)
	case st.Mode&0o022 != 0:
		return fmt.Errorf("seccomp dir %q is writable by other users (mode %#o)", dir, st.Mode&0o7777)
	}
	return nil
}

// writeBPFProgram writes to path the classic-BPF seccomp program that denies
// every entry of DangerousSyscalls known for the running architecture and
// allows everything else. The file is a raw array of 8-byte sock_filter
// records, the format bwrap --seccomp reads from a file descriptor.
//
// Program layout:
//  1. Load seccomp_data.arch; deny the call unless it is the native ABI.
//  2. Load seccomp_data.nr.
//  3. On x86_64 only: deny x32 calls (syscall numbers with bit 30 set).
//  4. For each dangerous syscall: if nr matches, return ERRNO(EPERM).
//  5. Default: allow.
//
// Steps 1 and 3 are required for the deny-list to mean anything: syscall
// numbers differ per ABI, so without them a process can reach a blocked
// syscall through a secondary ABI (for example int 0x80 on x86_64) under a
// number the list does not contain. Calls made through a non-native ABI are
// therefore denied wholesale with EPERM.
//
// It returns an error when no dangerous syscall can be resolved, when the
// running architecture is not supported, or when the file cannot be written.
func (s *SeccompFilter) writeBPFProgram(path string) error {
	const (
		seccompDataNr   = 0          // offsetof(struct seccomp_data, nr)
		seccompDataArch = 4          // offsetof(struct seccomp_data, arch)
		bpfJGE          = 0x30       // BPF_JGE: jump if A >= K (unsigned)
		x32SyscallBit   = 0x40000000 // __X32_SYSCALL_BIT
	)

	// Resolve the numbers in DangerousSyscalls order so the generated program
	// is deterministic. Names unknown on this architecture are skipped.
	nums := make([]uint32, 0, len(DangerousSyscalls))
	for _, name := range DangerousSyscalls {
		if num, ok := getSyscallNumber(name); ok {
			nums = append(nums, uint32(num))
		}
	}
	if len(nums) == 0 {
		// No syscalls to block (unknown architecture?)
		return fmt.Errorf("no syscall numbers found for dangerous syscalls")
	}

	auditArch, rejectX32, err := seccompNativeArch()
	if err != nil {
		return err
	}

	// SECCOMP_RET_ERRNO makes the syscall fail with the errno carried in the
	// low 16 bits. SECCOMP_RET_LOG would log and then ALLOW the call, which is
	// not what we want.
	deny := bpfInstruction{
		code: BPF_RET | BPF_K,
		k:    uint32(SECCOMP_RET_ERRNO) | (uint32(unix.EPERM) & 0xFFFF),
	}

	program := make([]bpfInstruction, 0, 2*len(nums)+7)
	program = append(program,
		// A = seccomp_data.arch
		bpfInstruction{code: BPF_LD | BPF_W | BPF_ABS, k: seccompDataArch},
		// Native ABI: skip the deny below. Anything else: fall into it.
		bpfInstruction{code: BPF_JMP | BPF_JEQ | BPF_K, jt: 1, jf: 0, k: auditArch},
		deny,
		// A = seccomp_data.nr
		bpfInstruction{code: BPF_LD | BPF_W | BPF_ABS, k: seccompDataNr},
	)

	if rejectX32 {
		// x32 shares AUDIT_ARCH_X86_64 and is only distinguishable by the
		// high bit in the syscall number.
		program = append(program,
			bpfInstruction{code: BPF_JMP | bpfJGE | BPF_K, jt: 0, jf: 1, k: x32SyscallBit},
			deny,
		)
	}

	for _, num := range nums {
		program = append(program,
			// Match: fall through to the deny. No match: skip over it.
			bpfInstruction{code: BPF_JMP | BPF_JEQ | BPF_K, jt: 0, jf: 1, k: num},
			deny,
		)
	}

	// Default: allow
	program = append(program, bpfInstruction{
		code: BPF_RET | BPF_K,
		k:    SECCOMP_RET_ALLOW,
	})

	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
	if err != nil {
		return err
	}
	for i := range program {
		if err := program[i].writeTo(f); err != nil {
			_ = f.Close() // the write error is the one worth reporting
			return err
		}
	}
	// A failed close can mean the data never reached the file; surface it.
	return f.Close()
}

// seccompNativeArch returns the AUDIT_ARCH_* value seccomp reports for native
// syscalls on the running kernel, and whether that ABI has an x32 variant that
// must be rejected separately. Unsupported machines yield an error.
func seccompNativeArch() (uint32, bool, error) {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		return 0, false, fmt.Errorf("uname: %w", err)
	}

	machine := unix.ByteSliceToString(uts.Machine[:])
	switch machine {
	case "x86_64", "amd64":
		return 0xc000003e, true, nil // AUDIT_ARCH_X86_64
	case "aarch64", "arm64":
		return 0xc00000b7, false, nil // AUDIT_ARCH_AARCH64
	}
	return 0, false, fmt.Errorf("unsupported architecture %q for seccomp filter", machine)
}

// CleanupFilter deletes the filter file at path. An empty path is a no-op,
// and removal is best effort: a failure to delete is deliberately ignored.
func (s *SeccompFilter) CleanupFilter(path string) {
	if path == "" {
		return
	}
	_ = os.Remove(path)
}

// Classic BPF opcode components (values from linux/bpf_common.h). They are
// OR-ed together to form the code field of a bpfInstruction.
const (
	BPF_LD  = 0x00 // instruction class: load into the accumulator
	BPF_JMP = 0x05 // instruction class: jump
	BPF_RET = 0x06 // instruction class: return
	BPF_W   = 0x00 // operand size: 32-bit word
	BPF_ABS = 0x20 // addressing mode: absolute offset
	BPF_JEQ = 0x10 // jump condition: accumulator == operand
	BPF_K   = 0x00 // operand source: the instruction's k field
)

// Seccomp filter return actions (values from linux/seccomp.h). The action
// occupies the high 16 bits of the value a filter returns.
const (
	SECCOMP_RET_ALLOW = 0x7fff0000 // let the syscall proceed
	SECCOMP_RET_ERRNO = 0x00050000 // fail the syscall; the errno goes in the low 16 bits
	SECCOMP_RET_LOG   = 0x7ffc0000 // log the syscall, then let it proceed
)

// bpfInstruction is a single classic-BPF instruction, with the same fields
// as the 8-byte sock_filter record it is serialized to.
type bpfInstruction struct {
	code uint16 // opcode
	jt   uint8  // relative jump taken when the comparison holds
	jf   uint8  // relative jump taken when it does not
	k    uint32 // immediate operand
}

// writeTo emits the instruction to f as 8 bytes: code (2), jt (1), jf (1) and
// k (4), with the multi-byte fields in little-endian order. It returns the
// error from the underlying write, if any.
func (inst *bpfInstruction) writeTo(f *os.File) error {
	encoded := [8]byte{
		byte(inst.code), byte(inst.code >> 8),
		inst.jt,
		inst.jf,
		byte(inst.k), byte(inst.k >> 8), byte(inst.k >> 16), byte(inst.k >> 24),
	}
	_, err := f.Write(encoded[:])
	return err
}

// arm64SyscallNumbers maps syscall names to their ARM64 numbers (from
// asm-generic/unistd.h). ioperm and iopl are absent on purpose: they do not
// exist on ARM64.
var arm64SyscallNumbers = map[string]int{
	"ptrace":            117,
	"process_vm_readv":  270,
	"process_vm_writev": 271,
	"keyctl":            219,
	"add_key":           217,
	"request_key":       218,
	"personality":       92,
	"userfaultfd":       282,
	"perf_event_open":   241,
	"bpf":               280,
	"kexec_load":        104,
	"kexec_file_load":   294,
	"reboot":            142,
	"syslog":            116,
	"acct":              89,
	"mount":             40,
	"umount2":           39,
	"pivot_root":        41,
	"swapon":            224,
	"swapoff":           225,
	"sethostname":       161,
	"setdomainname":     162,
	"init_module":       105,
	"finit_module":      273,
	"delete_module":     106,
}

// amd64SyscallNumbers maps syscall names to their x86_64 numbers.
var amd64SyscallNumbers = map[string]int{
	"ptrace":            101,
	"process_vm_readv":  310,
	"process_vm_writev": 311,
	"keyctl":            250,
	"add_key":           248,
	"request_key":       249,
	"personality":       135,
	"userfaultfd":       323,
	"perf_event_open":   298,
	"bpf":               321,
	"kexec_load":        246,
	"kexec_file_load":   320,
	"reboot":            169,
	"syslog":            103,
	"acct":              163,
	"mount":             165,
	"umount2":           166,
	"pivot_root":        155,
	"swapon":            167,
	"swapoff":           168,
	"sethostname":       170,
	"setdomainname":     171,
	"init_module":       175,
	"finit_module":      313,
	"delete_module":     176,
	"ioperm":            173,
	"iopl":              172,
}

// getSyscallNumber returns the number of the named syscall on the machine
// reported by uname(2).
//
// The second result is false when uname fails, when the syscall is not in the
// table for this architecture, or when the architecture is neither x86_64 nor
// ARM64. Unknown architectures deliberately resolve nothing: handing out
// x86_64 numbers there would make the filter block unrelated syscalls.
func getSyscallNumber(name string) (int, bool) {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		return 0, false
	}

	var table map[string]int
	// Machine is a NUL-padded fixed-size array; keep only the text before
	// the first NUL.
	switch unix.ByteSliceToString(uts.Machine[:]) {
	case "aarch64", "arm64":
		table = arm64SyscallNumbers
	case "x86_64", "amd64":
		table = amd64SyscallNumbers
	default:
		return 0, false
	}

	num, ok := table[name]
	return num, ok
}

// Note: SeccompMonitor was removed because SECCOMP_RET_ERRNO (which we use to block
// syscalls) is completely silent - it doesn't log to dmesg, audit, or anywhere else.
// The monitor code attempted to parse dmesg for seccomp events, but those only appear
// with SECCOMP_RET_LOG (allows the syscall) or SECCOMP_RET_KILL (kills the process).
//
// Alternative approaches considered:
// - SECCOMP_RET_USER_NOTIF: Complex supervisor architecture with latency on every blocked call
// - auditd integration: Requires audit daemon setup and root access
// - SECCOMP_RET_LOG: Logs but doesn't block (defeats the purpose)
//
// The eBPF monitor in linux_ebpf.go now handles syscall failure detection instead,
// which catches EPERM/EACCES errors regardless of their source.