| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Container Security Monitor module |
| * |
| * Copyright (c) 2018 Google, Inc |
| */ |
| |
| #include "monitor.h" |
| |
| #include <linux/mount.h> |
| #include <linux/pid_namespace.h> |
| #include <linux/timekeeping.h> |
| #include <linux/xattr.h> |
| #include <linux/random.h> |
| #include <overlayfs/overlayfs.h> |
| #include <uapi/linux/magic.h> |
| #include <linux/mempool.h> |
| #include <linux/highmem.h> |
| #include <linux/mm.h> |
| #include <linux/sched/signal.h> |
| #include <linux/audit.h> |
| #include <linux/net.h> |
| #include <linux/file.h> |
| #include <linux/vmalloc.h> |
| |
| /* Configuration options for execute collector. */ |
| struct execute_config csm_execute_config; |
| |
| /* unique atomic value for the machine boot instance */ |
| static atomic_t machine_rand = ATOMIC_INIT(0); |
| |
| /* sequential container identifier */ |
| static atomic_t contid = ATOMIC_INIT(0); |
| |
| static void *kmap_argument_stack(struct linux_binprm *bprm, void **ctx) |
| { |
| char *argv; |
| int err; |
| unsigned long i, pos; |
| void *map; |
| struct page *page; |
| |
| /* vma_pages holds the number of pages reserved for the stack */ |
| if (likely(bprm->vma_pages == 1)) { |
| err = get_user_pages_remote(current, bprm->mm, bprm->p, 1, |
| FOLL_FORCE, &page, NULL, NULL); |
| if (err != 1) |
| return NULL; |
| |
| argv = kmap(page); |
| *ctx = page; |
| } else { |
| /* |
| * If more than one pages is needed, copy all of them to a set |
| * of pages. Parsing the argument across kmap pages in different |
| * addresses would make it impractical. |
| */ |
| argv = vmalloc(bprm->vma_pages * PAGE_SIZE); |
| if (!argv) |
| return NULL; |
| |
| for (i = 0; i < bprm->vma_pages; i++) { |
| pos = ALIGN_DOWN(bprm->p, PAGE_SIZE) + i * PAGE_SIZE; |
| err = get_user_pages_remote(current, bprm->mm, pos, 1, |
| FOLL_FORCE, &page, NULL, |
| NULL); |
| if (err <= 0) { |
| free_pages((unsigned long)argv, |
| bprm->vma_pages); |
| return NULL; |
| } |
| |
| map = kmap(page); |
| memcpy(argv + i * PAGE_SIZE, map, PAGE_SIZE); |
| kunmap(page); |
| put_page(page); |
| } |
| *ctx = bprm; |
| } |
| |
| return argv; |
| } |
| |
| static void kunmap_argument_stack(struct linux_binprm *bprm, void *addr, |
| void *ctx) |
| { |
| struct page *page; |
| |
| if (!addr) |
| return; |
| |
| if (likely(bprm->vma_pages == 1)) { |
| page = (struct page *)ctx; |
| kunmap(page); |
| put_page(ctx); |
| } else { |
| vfree(addr); |
| } |
| } |
| |
/*
 * Return the NUL-terminated string starting at @array + *@offset, provided
 * its terminator lies before @end. On success *@offset is advanced past the
 * terminator so the next call yields the following entry. Returns NULL, and
 * leaves *@offset untouched, when the start is out of bounds or no
 * terminator is found before @end.
 */
static char *find_array_next_entry(char *array, unsigned long *offset,
				   unsigned long end)
{
	unsigned long start = *offset;
	unsigned long cur;

	for (cur = start; cur < end; cur++) {
		if (array[cur] == '\0') {
			/* Step over the terminator for the next lookup. */
			*offset = cur + 1;
			return array + start;
		}
	}

	return NULL;
}
| |
/*
 * Context handed to the argv/envp encode callbacks: the exec being reported
 * and the argument stack buffer to parse.
 */
struct string_arr_ctx {
	/* Binary parameters of the exec being reported. */
	struct linux_binprm *bprm;
	/* Buffer returned by kmap_argument_stack(). */
	void *stack;
};
| |
| static size_t get_config_limit(size_t *config_ptr) |
| { |
| lockdep_assert_held_read(&csm_rwsem_config); |
| |
| /* |
| * If execute is not enabled, do not capture arguments. |
| * The vsock packet won't be sent anyway. |
| */ |
| if (!csm_execute_enabled) |
| return 0; |
| |
| return *config_ptr; |
| } |
| |
| static bool encode_current_argv(pb_ostream_t *stream, const pb_field_t *field, |
| void * const *arg) |
| { |
| struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg; |
| int i; |
| struct linux_binprm *bprm = ctx->bprm; |
| unsigned long offset = bprm->p % PAGE_SIZE; |
| unsigned long end = bprm->vma_pages * PAGE_SIZE; |
| char *argv = ctx->stack; |
| char *entry; |
| size_t limit, used = 0; |
| ssize_t ret; |
| |
| limit = get_config_limit(&csm_execute_config.argv_limit); |
| if (!limit) |
| return true; |
| |
| for (i = 0; i < bprm->argc; i++) { |
| entry = find_array_next_entry(argv, &offset, end); |
| if (!entry) |
| return false; |
| |
| ret = pb_encode_string_field_limit(stream, field, |
| (void * const *)&entry, |
| limit - used); |
| if (ret < 0) |
| return false; |
| |
| used += ret; |
| |
| if (used >= limit) |
| break; |
| } |
| |
| return true; |
| } |
| |
| static bool check_envp_allowlist(char *envp) |
| { |
| bool ret = false; |
| char *strs, *equal; |
| size_t str_size, equal_pos; |
| |
| /* If execute is not enabled, skip all. */ |
| if (!csm_execute_enabled) |
| goto out; |
| |
| /* No filter, allow all. */ |
| strs = csm_execute_config.envp_allowlist; |
| if (!strs) { |
| ret = true; |
| goto out; |
| } |
| |
| /* |
| * Identify the key=value separation. |
| * If none exists use the whole string as a key. |
| */ |
| equal = strchr(envp, '='); |
| equal_pos = equal ? (equal - envp) : strlen(envp); |
| |
| /* Default to skip if no match found. */ |
| ret = false; |
| |
| do { |
| str_size = strlen(strs); |
| |
| /* |
| * If the filter length align with the key value equal sign, |
| * it might be a match, check the key value. |
| */ |
| if (str_size == equal_pos && |
| !strncmp(strs, envp, str_size)) { |
| ret = true; |
| goto out; |
| } |
| |
| strs += str_size + 1; |
| } while (*strs != 0); |
| |
| out: |
| return ret; |
| } |
| |
| static bool encode_current_envp(pb_ostream_t *stream, const pb_field_t *field, |
| void * const *arg) |
| { |
| struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg; |
| int i; |
| struct linux_binprm *bprm = ctx->bprm; |
| unsigned long offset = bprm->p % PAGE_SIZE; |
| unsigned long end = bprm->vma_pages * PAGE_SIZE; |
| char *argv = ctx->stack; |
| char *entry; |
| size_t limit, used = 0; |
| ssize_t ret; |
| |
| limit = get_config_limit(&csm_execute_config.envp_limit); |
| if (!limit) |
| return true; |
| |
| /* Skip arguments */ |
| for (i = 0; i < bprm->argc; i++) { |
| if (!find_array_next_entry(argv, &offset, end)) |
| return false; |
| } |
| |
| for (i = 0; i < bprm->envc; i++) { |
| entry = find_array_next_entry(argv, &offset, end); |
| if (!entry) |
| return false; |
| |
| if (!check_envp_allowlist(entry)) |
| continue; |
| |
| ret = pb_encode_string_field_limit(stream, field, |
| (void * const *)&entry, |
| limit - used); |
| if (ret < 0) |
| return false; |
| |
| used += ret; |
| |
| if (used >= limit) |
| break; |
| } |
| |
| return true; |
| } |
| |
| static bool is_overlayfs_mounted(struct file *file) |
| { |
| struct vfsmount *mnt; |
| struct super_block *mnt_sb; |
| |
| mnt = file->f_path.mnt; |
| if (mnt == NULL) |
| return false; |
| |
| mnt_sb = mnt->mnt_sb; |
| if (mnt_sb == NULL || mnt_sb->s_magic != OVERLAYFS_SUPER_MAGIC) |
| return false; |
| |
| return true; |
| } |
| |
| /* |
| * Before the process starts, identify a possible container by checking if the |
| * task is on a pid namespace and the target file is using an overlayfs mounting |
| * point. This check is valid for COS and GKE but not all existing containers. |
| */ |
| static bool is_possible_container(struct task_struct *task, |
| struct file *file) |
| { |
| if (task_active_pid_ns(task) == &init_pid_ns) |
| return false; |
| |
| return is_overlayfs_mounted(file); |
| } |
| |
| /* |
| * Generates a random identifier for this boot instance. |
| * This identifier is generated only when needed to increase the entropy |
| * available compared to doing it at early boot. |
| */ |
| static u32 get_machine_id(void) |
| { |
| int machineid, old; |
| |
| machineid = atomic_read(&machine_rand); |
| |
| if (unlikely(machineid == 0)) { |
| machineid = (int)get_random_int(); |
| if (machineid == 0) |
| machineid = 1; |
| old = atomic_cmpxchg(&machine_rand, 0, machineid); |
| |
| /* If someone beat us, use their value. */ |
| if (old != 0) |
| machineid = old; |
| } |
| |
| return (u32)machineid; |
| } |
| |
| /* |
| * Generate a 128-bit unique identifier for the process by appending: |
| * - A machine identifier unique per boot. |
| * - The start time of the process in nanoseconds. |
| * - The tgid for the set of threads in a process. |
| */ |
| static int get_process_uuid(struct task_struct *task, char *buffer, size_t size) |
| { |
| union process_uuid *id = (union process_uuid *)buffer; |
| |
| memset(buffer, 0, size); |
| |
| if (WARN_ON(size < PROCESS_UUID_SIZE)) |
| return -EINVAL; |
| |
| id->machineid = get_machine_id(); |
| id->start_time = ktime_mono_to_real(task->start_time); |
| id->tgid = task_tgid_nr(task); |
| |
| return 0; |
| } |
| |
| int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size) |
| { |
| int err; |
| struct task_struct *task = NULL; |
| |
| rcu_read_lock(); |
| task = find_task_by_pid_ns(pid_nr, &init_pid_ns); |
| if (!task) { |
| err = -ENOENT; |
| goto out; |
| } |
| err = get_process_uuid(task, buffer, size); |
| out: |
| rcu_read_unlock(); |
| return err; |
| } |
| |
| u64 csm_set_contid(struct task_struct *task) |
| { |
| u64 cid; |
| |
| /* TODO: Report error to the backend */ |
| if (WARN_ON(!task->audit)) |
| return AUDIT_CID_UNSET; |
| |
| cid = atomic_inc_return(&contid); |
| task->audit->contid = cid; |
| return cid; |
| } |
| |
| static u32 get_fd_inode_mode(int fd) |
| { |
| u32 mode = 0; |
| struct fd sfd; |
| |
| sfd = fdget(fd); |
| if (sfd.file) |
| mode = file_inode(sfd.file)->i_mode; |
| fdput(sfd); |
| |
| return mode; |
| } |
| |
| int csm_bprm_check_security(struct linux_binprm *bprm) |
| { |
| char *buf; |
| struct dentry *dentry; |
| char *path; |
| char uuid[PROCESS_UUID_SIZE]; |
| char parent_uuid[PROCESS_UUID_SIZE]; |
| int err; |
| schema_Event event = schema_Event_init_zero; |
| schema_Process *proc; |
| schema_Overlay *overlayfs; |
| struct string_arr_ctx argv_ctx; |
| void *stack = NULL, *ctx = NULL; |
| u64 cid; |
| |
| /* |
| * Always create a container-id for containerized processes. |
| * If the LSM is enabled later, we can track existing containers. |
| */ |
| cid = audit_get_contid(current); |
| |
| if (cid == AUDIT_CID_UNSET) { |
| if (!is_possible_container(current, bprm->file)) |
| return 0; |
| |
| cid = csm_set_contid(current); |
| |
| if (cid == AUDIT_CID_UNSET) |
| return 0; |
| } |
| |
| if (!csm_execute_enabled) |
| return 0; |
| |
| buf = (char *)__get_free_page(GFP_KERNEL); |
| if (!buf) |
| return -ENOMEM; |
| |
| path = d_path(&bprm->file->f_path, buf, PAGE_SIZE); |
| if (IS_ERR(path)) { |
| err = PTR_ERR(path); |
| goto out_free_buf; |
| } |
| |
| proc = &event.event.execute.proc; |
| proc->creation_timestamp = ktime_get_real_ns(); |
| proc->container_id = cid; |
| |
| /* Add information about pid in different namespaces */ |
| proc->pid = task_pid_nr(current); |
| proc->parent_pid = task_ppid_nr(current); |
| proc->container_pid = task_pid_nr_ns(current, NULL); |
| proc->container_parent_pid = task_ppid_nr_ns(current, NULL); |
| |
| /* Generate unique identifier for the process and its parent */ |
| err = get_process_uuid(current, uuid, sizeof(uuid)); |
| if (err) |
| goto out_free_buf; |
| |
| proc->uuid.funcs.encode = pb_encode_string_field; |
| proc->uuid.arg = uuid; |
| |
| if (proc->parent_pid) { |
| err = get_process_uuid_by_pid(proc->parent_pid, parent_uuid, |
| sizeof(parent_uuid)); |
| /* Report the parent process if available. */ |
| if (!err) { |
| proc->parent_uuid.funcs.encode = pb_encode_string_field; |
| proc->parent_uuid.arg = parent_uuid; |
| } else if (err != -ENOENT) { |
| goto out_free_buf; |
| } |
| } |
| |
| /* Provide information about the launched binary */ |
| proc->binary.fullpath.funcs.encode = pb_encode_string_field; |
| proc->binary.fullpath.arg = path; |
| |
| if (is_overlayfs_mounted(bprm->file)) { |
| dentry = bprm->file->f_path.dentry; |
| overlayfs = &proc->binary.filesystem.overlayfs; |
| overlayfs->lower_layer = ovl_dentry_lower(dentry); |
| overlayfs->upper_layer = ovl_dentry_upper(dentry); |
| } |
| proc->binary.which_filesystem = schema_File_overlayfs_tag; |
| |
| stack = kmap_argument_stack(bprm, &ctx); |
| if (!stack) { |
| err = -EFAULT; |
| goto out_free_buf; |
| } |
| |
| /* Capture process argument */ |
| argv_ctx.bprm = bprm; |
| argv_ctx.stack = stack; |
| proc->args.argv.funcs.encode = encode_current_argv; |
| proc->args.argv.arg = &argv_ctx; |
| |
| /* Capture process environment variables */ |
| proc->args.envp.funcs.encode = encode_current_envp; |
| proc->args.envp.arg = &argv_ctx; |
| |
| /* Information about streams */ |
| proc->streams.stdin.mode = get_fd_inode_mode(STDIN_FILENO); |
| proc->streams.stdout.mode = get_fd_inode_mode(STDOUT_FILENO); |
| proc->streams.stderr.mode = get_fd_inode_mode(STDERR_FILENO); |
| |
| event.which_event = schema_Event_execute_tag; |
| |
| /* |
| * Configurations options are checked when computing the serialized |
| * protobufs. |
| */ |
| down_read(&csm_rwsem_config); |
| err = csm_sendeventproto(schema_Event_fields, &event); |
| up_read(&csm_rwsem_config); |
| |
| if (err) |
| pr_err("csm_sendeventproto returned %d on execve\n", err); |
| err = 0; |
| |
| out_free_buf: |
| kunmap_argument_stack(bprm, stack, ctx); |
| free_page((unsigned long)buf); |
| |
| /* |
| * On failure, enforce it only if the execute config is enabled. |
| * If the collector was disabled, prefer to succeed to not impact the |
| * system. |
| */ |
| if (err < 0 && !csm_execute_enabled) |
| err = 0; |
| |
| return err; |
| } |
| |
| void csm_task_exit(struct task_struct *task) |
| { |
| int err; |
| schema_Event event = schema_Event_init_zero; |
| schema_ExitEvent *exit; |
| char uuid[PROCESS_UUID_SIZE]; |
| |
| /* Catch the last thread of a process with a container identifier. */ |
| if (!csm_execute_enabled || |
| get_nr_threads(task) > 1 || |
| audit_get_contid(task) == AUDIT_CID_UNSET) |
| return; |
| |
| exit = &event.event.exit; |
| |
| /* Fetch the unique identifier for this process */ |
| err = get_process_uuid(task, uuid, sizeof(uuid)); |
| if (err) { |
| pr_err("failed to get process uuid on exit\n"); |
| return; |
| } |
| |
| exit->process_uuid.funcs.encode = pb_encode_string_field; |
| exit->process_uuid.arg = uuid; |
| |
| event.which_event = schema_Event_exit_tag; |
| |
| err = csm_sendeventproto(schema_Event_fields, &event); |
| if (err) |
| pr_err("csm_sendeventproto returned %d on exit\n", err); |
| } |