//go:build linux

// Package sandbox provides sandboxing functionality for macOS and Linux.
package sandbox
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// SeccompFilter generates and manages seccomp BPF filters.
type SeccompFilter struct {
	// debug enables diagnostic messages on stderr while generating filters.
	debug bool
}

// NewSeccompFilter creates a new seccomp filter generator.
// The debug flag is stored on the returned value unchanged.
func NewSeccompFilter(debug bool) *SeccompFilter {
	filter := new(SeccompFilter)
	filter.debug = debug
	return filter
}
|
|
|
|
// DangerousSyscalls lists syscalls that should be blocked for security.
// Entries are grouped by the kind of capability they expose; the order of
// the slice is the order in which the generated filter tests them.
var DangerousSyscalls = []string{
	// Inspecting or tampering with other processes.
	"ptrace",            // debugging / code injection
	"process_vm_readv",  // read another process's memory
	"process_vm_writev", // write another process's memory

	// Kernel keyring.
	"keyctl",      // keyring operations
	"add_key",     // add a key to a keyring
	"request_key", // request a key from a keyring

	// Kernel features with a history of abuse.
	"personality",     // change execution domain (can bypass ASLR)
	"userfaultfd",     // user-space page fault handling (potential sandbox escape)
	"perf_event_open", // performance monitoring (info leak)
	"bpf",             // eBPF operations (without CAP_BPF)

	// Replacing or restarting the running kernel.
	"kexec_load",      // load a new kernel
	"kexec_file_load", // load a new kernel from a file
	"reboot",          // reboot the system

	// System-wide logging and accounting.
	"syslog", // kernel log access
	"acct",   // process accounting

	// Filesystem topology.
	"mount",      // mount filesystems
	"umount2",    // unmount filesystems
	"pivot_root", // change the root filesystem

	// Swap management.
	"swapon",  // enable swap
	"swapoff", // disable swap

	// Host identity.
	"sethostname",   // change the hostname
	"setdomainname", // change the domain name

	// Kernel modules.
	"init_module",   // load a kernel module
	"finit_module",  // load a kernel module from a file
	"delete_module", // unload a kernel module

	// Raw I/O port access.
	"ioperm", // I/O port permissions
	"iopl",   // I/O privilege level
}
|
|
|
|
// GenerateBPFFilter generates a seccomp-bpf filter that blocks dangerous syscalls.
|
|
// Returns the path to the generated BPF filter file.
|
|
func (s *SeccompFilter) GenerateBPFFilter() (string, error) {
|
|
features := DetectLinuxFeatures()
|
|
if !features.HasSeccomp {
|
|
return "", fmt.Errorf("seccomp not available on this system")
|
|
}
|
|
|
|
// Create a temporary directory for the filter
|
|
tmpDir := filepath.Join(os.TempDir(), "fence-seccomp")
|
|
if err := os.MkdirAll(tmpDir, 0o700); err != nil {
|
|
return "", fmt.Errorf("failed to create seccomp dir: %w", err)
|
|
}
|
|
|
|
filterPath := filepath.Join(tmpDir, fmt.Sprintf("fence-seccomp-%d.bpf", os.Getpid()))
|
|
|
|
// Generate the filter using the seccomp library or raw BPF
|
|
// For now, we'll use bwrap's built-in seccomp support via --seccomp
|
|
// which accepts a file descriptor with a BPF program
|
|
|
|
// Write a simple seccomp policy using bpf assembly
|
|
if err := s.writeBPFProgram(filterPath); err != nil {
|
|
return "", fmt.Errorf("failed to write BPF program: %w", err)
|
|
}
|
|
|
|
if s.debug {
|
|
fmt.Fprintf(os.Stderr, "[fence:seccomp] Generated BPF filter at %s\n", filterPath)
|
|
}
|
|
|
|
return filterPath, nil
|
|
}
|
|
|
|
// writeBPFProgram writes a BPF program that blocks dangerous syscalls.
|
|
// This generates a compact BPF program in the format expected by bwrap --seccomp.
|
|
func (s *SeccompFilter) writeBPFProgram(path string) error {
|
|
// For bwrap, we need to pass the seccomp filter via file descriptor
|
|
// The filter format is: struct sock_filter array
|
|
//
|
|
// We'll build a simple filter:
|
|
// 1. Load syscall number
|
|
// 2. For each dangerous syscall: if match, return ERRNO(EPERM) or LOG+ERRNO
|
|
// 3. Default: allow
|
|
|
|
// Get syscall numbers for the current architecture
|
|
syscallNums := make(map[string]int)
|
|
for _, name := range DangerousSyscalls {
|
|
if num, ok := getSyscallNumber(name); ok {
|
|
syscallNums[name] = num
|
|
}
|
|
}
|
|
|
|
if len(syscallNums) == 0 {
|
|
// No syscalls to block (unknown architecture?)
|
|
return fmt.Errorf("no syscall numbers found for dangerous syscalls")
|
|
}
|
|
|
|
// Build BPF program
|
|
var program []bpfInstruction
|
|
|
|
// Load syscall number from seccomp_data
|
|
// BPF_LD | BPF_W | BPF_ABS: load word from absolute offset
|
|
program = append(program, bpfInstruction{
|
|
code: BPF_LD | BPF_W | BPF_ABS,
|
|
k: 0, // offsetof(struct seccomp_data, nr)
|
|
})
|
|
|
|
// For each dangerous syscall, add a comparison and block
|
|
// Note: SECCOMP_RET_ERRNO returns -1 with errno in the low 16 bits
|
|
// SECCOMP_RET_LOG means "log and allow" which is NOT what we want
|
|
// We use SECCOMP_RET_ERRNO to block with EPERM
|
|
action := SECCOMP_RET_ERRNO | (unix.EPERM & 0xFFFF)
|
|
|
|
for _, name := range DangerousSyscalls {
|
|
num, ok := syscallNums[name]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
// BPF_JMP | BPF_JEQ | BPF_K: if A == K, jump jt else jump jf
|
|
program = append(program, bpfInstruction{
|
|
code: BPF_JMP | BPF_JEQ | BPF_K,
|
|
jt: 0, // if match, go to next instruction (block)
|
|
jf: 1, // if not match, skip the block instruction
|
|
k: uint32(num),
|
|
})
|
|
|
|
// Return action (block with EPERM)
|
|
program = append(program, bpfInstruction{
|
|
code: BPF_RET | BPF_K,
|
|
k: uint32(action),
|
|
})
|
|
}
|
|
|
|
// Default: allow
|
|
program = append(program, bpfInstruction{
|
|
code: BPF_RET | BPF_K,
|
|
k: SECCOMP_RET_ALLOW,
|
|
})
|
|
|
|
// Write the program to file
|
|
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
|
|
for _, inst := range program {
|
|
if err := inst.writeTo(f); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// CleanupFilter removes a generated filter file.
|
|
func (s *SeccompFilter) CleanupFilter(path string) {
|
|
if path != "" {
|
|
_ = os.Remove(path)
|
|
}
|
|
}
|
|
|
|
// BPF instruction codes. An opcode is built by OR-ing one value from each
// relevant group, e.g. BPF_LD | BPF_W | BPF_ABS.
const (
	// Instruction classes.
	BPF_LD  = 0x00
	BPF_JMP = 0x05
	BPF_RET = 0x06

	// Load size.
	BPF_W = 0x00

	// Load addressing mode.
	BPF_ABS = 0x20

	// Jump condition.
	BPF_JEQ = 0x10

	// Operand source.
	BPF_K = 0x00
)
|
|
|
|
// Seccomp return values: the action lives in the upper 16 bits; for
// SECCOMP_RET_ERRNO the errno to report is OR-ed into the lower 16 bits.
const (
	SECCOMP_RET_ERRNO = 0x00050000 // fail the syscall with an errno
	SECCOMP_RET_LOG   = 0x7ffc0000 // log and allow the syscall
	SECCOMP_RET_ALLOW = 0x7fff0000 // allow the syscall
)
|
|
|
|
// bpfInstruction represents a single BPF instruction.
type bpfInstruction struct {
	code uint16 // opcode
	jt   uint8  // jump offset when the comparison is true
	jf   uint8  // jump offset when the comparison is false
	k    uint32 // immediate operand
}

// writeTo appends the 8-byte encoding of the instruction to f:
// code(2) + jt(1) + jf(1) + k(4), with the multi-byte fields written
// least-significant byte first. It returns the error from the write, if any.
func (i *bpfInstruction) writeTo(f *os.File) error {
	encoded := [8]byte{
		byte(i.code), byte(i.code >> 8),
		i.jt,
		i.jf,
		byte(i.k), byte(i.k >> 8), byte(i.k >> 16), byte(i.k >> 24),
	}
	_, err := f.Write(encoded[:])
	return err
}
|
|
|
|
// getSyscallNumber returns the syscall number for the current architecture.
|
|
func getSyscallNumber(name string) (int, bool) {
|
|
// Detect architecture using uname
|
|
var utsname unix.Utsname
|
|
if err := unix.Uname(&utsname); err != nil {
|
|
return 0, false
|
|
}
|
|
|
|
// Convert machine to string
|
|
machine := string(utsname.Machine[:])
|
|
// Trim null bytes
|
|
for i, c := range machine {
|
|
if c == 0 {
|
|
machine = machine[:i]
|
|
break
|
|
}
|
|
}
|
|
|
|
var syscallMap map[string]int
|
|
|
|
if machine == "aarch64" || machine == "arm64" {
|
|
// ARM64 syscall numbers (from asm-generic/unistd.h)
|
|
syscallMap = map[string]int{
|
|
"ptrace": 117,
|
|
"process_vm_readv": 270,
|
|
"process_vm_writev": 271,
|
|
"keyctl": 219,
|
|
"add_key": 217,
|
|
"request_key": 218,
|
|
"personality": 92,
|
|
"userfaultfd": 282,
|
|
"perf_event_open": 241,
|
|
"bpf": 280,
|
|
"kexec_load": 104,
|
|
"kexec_file_load": 294,
|
|
"reboot": 142,
|
|
"syslog": 116,
|
|
"acct": 89,
|
|
"mount": 40,
|
|
"umount2": 39,
|
|
"pivot_root": 41,
|
|
"swapon": 224,
|
|
"swapoff": 225,
|
|
"sethostname": 161,
|
|
"setdomainname": 162,
|
|
"init_module": 105,
|
|
"finit_module": 273,
|
|
"delete_module": 106,
|
|
// ioperm and iopl don't exist on ARM64
|
|
}
|
|
} else {
|
|
// x86_64 syscall numbers
|
|
syscallMap = map[string]int{
|
|
"ptrace": 101,
|
|
"process_vm_readv": 310,
|
|
"process_vm_writev": 311,
|
|
"keyctl": 250,
|
|
"add_key": 248,
|
|
"request_key": 249,
|
|
"personality": 135,
|
|
"userfaultfd": 323,
|
|
"perf_event_open": 298,
|
|
"bpf": 321,
|
|
"kexec_load": 246,
|
|
"kexec_file_load": 320,
|
|
"reboot": 169,
|
|
"syslog": 103,
|
|
"acct": 163,
|
|
"mount": 165,
|
|
"umount2": 166,
|
|
"pivot_root": 155,
|
|
"swapon": 167,
|
|
"swapoff": 168,
|
|
"sethostname": 170,
|
|
"setdomainname": 171,
|
|
"init_module": 175,
|
|
"finit_module": 313,
|
|
"delete_module": 176,
|
|
"ioperm": 173,
|
|
"iopl": 172,
|
|
}
|
|
}
|
|
|
|
num, ok := syscallMap[name]
|
|
return num, ok
|
|
}
|
|
|
|
// Note: SeccompMonitor was removed because SECCOMP_RET_ERRNO (which we use to block
// syscalls) is completely silent - it doesn't log to dmesg, audit, or anywhere else.
// The monitor code attempted to parse dmesg for seccomp events, but those only appear
// with SECCOMP_RET_LOG (allows the syscall) or SECCOMP_RET_KILL (kills the process).
//
// Alternative approaches considered:
// - SECCOMP_RET_USER_NOTIF: Complex supervisor architecture with latency on every blocked call
// - auditd integration: Requires audit daemon setup and root access
// - SECCOMP_RET_LOG: Logs but doesn't block (defeats the purpose)
//
// The eBPF monitor in linux_ebpf.go now handles syscall failure detection instead,
// which catches EPERM/EACCES errors regardless of their source.