Enhance Linux sandbox security features with Landlock, seccomp, and eBPF monitoring

2025-12-25 17:33:55 -08:00
parent a8158a39b3
commit 08ed28f88f
20 changed files with 2820 additions and 64 deletions
--- a/internal/sandbox/linux_seccomp.go
+++ b/internal/sandbox/linux_seccomp.go
@@ -0,0 +1,316 @@
+//go:build linux
+
+// Package sandbox provides sandboxing functionality for macOS and Linux.
+package sandbox
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"golang.org/x/sys/unix"
+)
+
+// SeccompFilter generates and manages seccomp BPF filters.
+type SeccompFilter struct {
+	debug bool
+}
+
+// NewSeccompFilter creates a new seccomp filter generator.
+func NewSeccompFilter(debug bool) *SeccompFilter {
+	return &SeccompFilter{debug: debug}
+}
+
+// DangerousSyscalls lists syscalls that should be blocked for security.
+var DangerousSyscalls = []string{
+	"ptrace",            // Process debugging/injection
+	"process_vm_readv",  // Read another process's memory
+	"process_vm_writev", // Write another process's memory
+	"keyctl",            // Kernel keyring operations
+	"add_key",           // Add key to keyring
+	"request_key",       // Request key from keyring
+	"personality",       // Change execution domain (can bypass ASLR)
+	"userfaultfd",       // User-space page fault handling (potential sandbox escape)
+	"perf_event_open",   // Performance monitoring (info leak)
+	"bpf",               // eBPF operations (without CAP_BPF)
+	"kexec_load",        // Load new kernel
+	"kexec_file_load",   // Load new kernel from file
+	"reboot",            // Reboot system
+	"syslog",            // Kernel log access
+	"acct",              // Process accounting
+	"mount",             // Mount filesystems
+	"umount2",           // Unmount filesystems
+	"pivot_root",        // Change root filesystem
+	"swapon",            // Enable swap
+	"swapoff",           // Disable swap
+	"sethostname",       // Change hostname
+	"setdomainname",     // Change domain name
+	"init_module",       // Load kernel module
+	"finit_module",      // Load kernel module from file
+	"delete_module",     // Unload kernel module
+	"ioperm",            // I/O port permissions
+	"iopl",              // I/O privilege level
+}
+
+// GenerateBPFFilter generates a seccomp-bpf filter that blocks dangerous syscalls.
+// Returns the path to the generated BPF filter file.
+func (s *SeccompFilter) GenerateBPFFilter() (string, error) {
+	features := DetectLinuxFeatures()
+	if !features.HasSeccomp {
+		return "", fmt.Errorf("seccomp not available on this system")
+	}
+
+	// Create a temporary directory for the filter
+	tmpDir := filepath.Join(os.TempDir(), "fence-seccomp")
+	if err := os.MkdirAll(tmpDir, 0o700); err != nil {
+		return "", fmt.Errorf("failed to create seccomp dir: %w", err)
+	}
+
+	filterPath := filepath.Join(tmpDir, fmt.Sprintf("fence-seccomp-%d.bpf", os.Getpid()))
+
+	// Generate the filter using the seccomp library or raw BPF
+	// For now, we'll use bwrap's built-in seccomp support via --seccomp
+	// which accepts a file descriptor with a BPF program
+
+	// Write a simple seccomp policy using bpf assembly
+	if err := s.writeBPFProgram(filterPath); err != nil {
+		return "", fmt.Errorf("failed to write BPF program: %w", err)
+	}
+
+	if s.debug {
+		fmt.Fprintf(os.Stderr, "[fence:seccomp] Generated BPF filter at %s\n", filterPath)
+	}
+
+	return filterPath, nil
+}
+
+// writeBPFProgram writes a BPF program that blocks dangerous syscalls.
+// This generates a compact BPF program in the format expected by bwrap --seccomp.
+func (s *SeccompFilter) writeBPFProgram(path string) error {
+	// For bwrap, we need to pass the seccomp filter via file descriptor
+	// The filter format is: struct sock_filter array
+	//
+	// We'll build a simple filter:
+	// 1. Load syscall number
+	// 2. For each dangerous syscall: if match, return ERRNO(EPERM) or LOG+ERRNO
+	// 3. Default: allow
+
+	// Get syscall numbers for the current architecture
+	syscallNums := make(map[string]int)
+	for _, name := range DangerousSyscalls {
+		if num, ok := getSyscallNumber(name); ok {
+			syscallNums[name] = num
+		}
+	}
+
+	if len(syscallNums) == 0 {
+		// No syscalls to block (unknown architecture?)
+		return fmt.Errorf("no syscall numbers found for dangerous syscalls")
+	}
+
+	// Build BPF program
+	var program []bpfInstruction
+
+	// Load syscall number from seccomp_data
+	// BPF_LD | BPF_W | BPF_ABS: load word from absolute offset
+	program = append(program, bpfInstruction{
+		code: BPF_LD | BPF_W | BPF_ABS,
+		k:    0, // offsetof(struct seccomp_data, nr)
+	})
+
+	// For each dangerous syscall, add a comparison and block
+	// Note: SECCOMP_RET_ERRNO returns -1 with errno in the low 16 bits
+	// SECCOMP_RET_LOG means "log and allow" which is NOT what we want
+	// We use SECCOMP_RET_ERRNO to block with EPERM
+	action := SECCOMP_RET_ERRNO | (unix.EPERM & 0xFFFF)
+
+	for _, name := range DangerousSyscalls {
+		num, ok := syscallNums[name]
+		if !ok {
+			continue
+		}
+
+		// BPF_JMP | BPF_JEQ | BPF_K: if A == K, jump jt else jump jf
+		program = append(program, bpfInstruction{
+			code: BPF_JMP | BPF_JEQ | BPF_K,
+			jt:   0, // if match, go to next instruction (block)
+			jf:   1, // if not match, skip the block instruction
+			k:    uint32(num),
+		})
+
+		// Return action (block with EPERM)
+		program = append(program, bpfInstruction{
+			code: BPF_RET | BPF_K,
+			k:    uint32(action),
+		})
+	}
+
+	// Default: allow
+	program = append(program, bpfInstruction{
+		code: BPF_RET | BPF_K,
+		k:    SECCOMP_RET_ALLOW,
+	})
+
+	// Write the program to file
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	for _, inst := range program {
+		if err := inst.writeTo(f); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// CleanupFilter removes a generated filter file.
+func (s *SeccompFilter) CleanupFilter(path string) {
+	if path != "" {
+		_ = os.Remove(path)
+	}
+}
+
+// BPF instruction codes
+const (
+	BPF_LD  = 0x00
+	BPF_JMP = 0x05
+	BPF_RET = 0x06
+	BPF_W   = 0x00
+	BPF_ABS = 0x20
+	BPF_JEQ = 0x10
+	BPF_K   = 0x00
+)
+
+// Seccomp return values
+const (
+	SECCOMP_RET_ALLOW = 0x7fff0000
+	SECCOMP_RET_ERRNO = 0x00050000
+	SECCOMP_RET_LOG   = 0x7ffc0000
+)
+
+// bpfInstruction represents a single BPF instruction
+type bpfInstruction struct {
+	code uint16
+	jt   uint8
+	jf   uint8
+	k    uint32
+}
+
+func (i *bpfInstruction) writeTo(f *os.File) error {
+	// BPF instruction is 8 bytes: code(2) + jt(1) + jf(1) + k(4)
+	buf := make([]byte, 8)
+	buf[0] = byte(i.code)
+	buf[1] = byte(i.code >> 8)
+	buf[2] = i.jt
+	buf[3] = i.jf
+	buf[4] = byte(i.k)
+	buf[5] = byte(i.k >> 8)
+	buf[6] = byte(i.k >> 16)
+	buf[7] = byte(i.k >> 24)
+	_, err := f.Write(buf)
+	return err
+}
+
+// getSyscallNumber returns the syscall number for the current architecture.
+func getSyscallNumber(name string) (int, bool) {
+	// Detect architecture using uname
+	var utsname unix.Utsname
+	if err := unix.Uname(&utsname); err != nil {
+		return 0, false
+	}
+
+	// Convert machine to string
+	machine := string(utsname.Machine[:])
+	// Trim null bytes
+	for i, c := range machine {
+		if c == 0 {
+			machine = machine[:i]
+			break
+		}
+	}
+
+	var syscallMap map[string]int
+
+	if machine == "aarch64" || machine == "arm64" {
+		// ARM64 syscall numbers (from asm-generic/unistd.h)
+		syscallMap = map[string]int{
+			"ptrace":            117,
+			"process_vm_readv":  270,
+			"process_vm_writev": 271,
+			"keyctl":            219,
+			"add_key":           217,
+			"request_key":       218,
+			"personality":       92,
+			"userfaultfd":       282,
+			"perf_event_open":   241,
+			"bpf":               280,
+			"kexec_load":        104,
+			"kexec_file_load":   294,
+			"reboot":            142,
+			"syslog":            116,
+			"acct":              89,
+			"mount":             40,
+			"umount2":           39,
+			"pivot_root":        41,
+			"swapon":            224,
+			"swapoff":           225,
+			"sethostname":       161,
+			"setdomainname":     162,
+			"init_module":       105,
+			"finit_module":      273,
+			"delete_module":     106,
+			// ioperm and iopl don't exist on ARM64
+		}
+	} else {
+		// x86_64 syscall numbers
+		syscallMap = map[string]int{
+			"ptrace":            101,
+			"process_vm_readv":  310,
+			"process_vm_writev": 311,
+			"keyctl":            250,
+			"add_key":           248,
+			"request_key":       249,
+			"personality":       135,
+			"userfaultfd":       323,
+			"perf_event_open":   298,
+			"bpf":               321,
+			"kexec_load":        246,
+			"kexec_file_load":   320,
+			"reboot":            169,
+			"syslog":            103,
+			"acct":              163,
+			"mount":             165,
+			"umount2":           166,
+			"pivot_root":        155,
+			"swapon":            167,
+			"swapoff":           168,
+			"sethostname":       170,
+			"setdomainname":     171,
+			"init_module":       175,
+			"finit_module":      313,
+			"delete_module":     176,
+			"ioperm":            173,
+			"iopl":              172,
+		}
+	}
+
+	num, ok := syscallMap[name]
+	return num, ok
+}
+
+// Note: SeccompMonitor was removed because SECCOMP_RET_ERRNO (which we use to block
+// syscalls) is completely silent - it doesn't log to dmesg, audit, or anywhere else.
+// The monitor code attempted to parse dmesg for seccomp events, but those only appear
+// with SECCOMP_RET_LOG (allows the syscall) or SECCOMP_RET_KILL (kills the process).
+//
+// Alternative approaches considered:
+// - SECCOMP_RET_USER_NOTIF: Complex supervisor architecture with latency on every blocked call
+// - auditd integration: Requires audit daemon setup and root access
+// - SECCOMP_RET_LOG: Logs but doesn't block (defeats the purpose)
+//
+// The eBPF monitor in linux_ebpf.go now handles syscall failure detection instead,
+// which catches EPERM/EACCES errors regardless of their source.