// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#define _XOPEN_SOURCE 500
#include "library.h"
#include <algorithm>
#include <elf.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <set>
#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "allocator.h"
#include "debug.h"
#include "sandbox_impl.h"
#include "syscall_entrypoint.h"
#include "system_call_table.h"
#include "x86_decode.h"
#if defined(__x86_64__)
typedef Elf64_Phdr Elf_Phdr;
typedef Elf64_Rela Elf_Rel;
typedef Elf64_Half Elf_Half;
typedef Elf64_Word Elf_Word;
typedef Elf64_Sword Elf_Sword;
typedef Elf64_Xword Elf_Xword;
typedef Elf64_Sxword Elf_Sxword;
typedef Elf64_Off Elf_Off;
typedef Elf64_Section Elf_Section;
typedef Elf64_Versym Elf_Versym;
#elif defined(__i386__)
typedef Elf32_Phdr Elf_Phdr;
typedef Elf32_Rel Elf_Rel;
typedef Elf32_Half Elf_Half;
typedef Elf32_Word Elf_Word;
typedef Elf32_Sword Elf_Sword;
typedef Elf32_Xword Elf_Xword;
typedef Elf32_Sxword Elf_Sxword;
typedef Elf32_Off Elf_Off;
typedef Elf32_Section Elf_Section;
typedef Elf32_Versym Elf_Versym;
#else
#error Unsupported target platform
#endif
namespace playground {
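// Locations of the well-known VDSO entry points. They are filled in by
// parseSymbols() when the VDSO library is parsed, and are shared by all
// Library instances.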
char* Library::__kernel_vsyscall;
char* Library::__kernel_sigreturn;
char* Library::__kernel_rt_sigreturn;
Library::Library() :
valid_(false),
isVDSO_(false),
asr_offset_(0),
vsys_offset_(0),
image_(0),
image_size_(0),
maps_(NULL) {
}
Library::~Library() {
if (image_size_) {
    // We no longer need access to a full mapping of the underlying library
    // file. Move the temporarily extended mapping back to where we
    // originally found it. Make sure to preserve any changes that we might
    // have made since.
Sandbox::SysCalls sys;
sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
if (memcmp(image_, memory_ranges_.rbegin()->second.start, 4096)) {
      // Only copy the data if we made any changes to it. Otherwise there
      // is no need to create another modified COW mapping.
memcpy(image_, memory_ranges_.rbegin()->second.start, 4096);
}
sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC);
sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED,
memory_ranges_.rbegin()->second.start);
}
}
// Read up to "len" bytes from "src" and copy them to "dst". Short
// copies are possible if we are at the end of a mapping. Returns
// NULL if the operation failed completely.
char* Library::getBytes(char* dst, const char* src, ssize_t len) {
  // Some kernels don't allow accessing the VDSO from write()
if (isVDSO_ &&
src >= memory_ranges_.begin()->second.start &&
src <= memory_ranges_.begin()->second.stop) {
ssize_t max =
reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - src;
if (len > max) {
len = max;
}
memcpy(dst, src, len);
return dst;
}
static int helper_socket[2];
Sandbox::SysCalls sys;
if (!helper_socket[0] && !helper_socket[1]) {
// Copy data through a socketpair, as this allows us to access it
// without incurring a segmentation fault.
sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket);
}
char* ptr = dst;
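  // Transfer page-sized chunks, and fall back to single-byte writes as
  // soon as we approach the edge of a mapping, so that we can detect
  // exactly where accessible memory ends.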
int inc = 4096;
while (len > 0) {
ssize_t l = inc == 1 ? inc : 4096 - (reinterpret_cast<long>(src) & 0xFFF);
if (l > len) {
l = len;
}
l = NOINTR_SYS(sys.write(helper_socket[0], src, l));
if (l == -1) {
if (sys.my_errno == EFAULT) {
if (inc == 1) {
if (ptr == dst) {
return NULL;
}
break;
}
inc = 1;
continue;
} else {
return NULL;
}
}
l = sys.read(helper_socket[1], ptr, l);
if (l <= 0) {
return NULL;
}
ptr += l;
src += l;
len -= l;
}
return dst;
}
char *Library::get(Elf_Addr offset, char *buf, size_t len) {
if (!valid_) {
memset(buf, 0, len);
return NULL;
}
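  // memory_ranges_ is keyed by start offset in descending order, so
  // lower_bound() returns the range that starts at or below "offset".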
RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
if (iter == memory_ranges_.end()) {
memset(buf, 0, len);
return NULL;
}
offset -= iter->first;
long size = reinterpret_cast<char *>(iter->second.stop) -
reinterpret_cast<char *>(iter->second.start);
if (offset > size - len) {
memset(buf, 0, len);
return NULL;
}
char *src = reinterpret_cast<char *>(iter->second.start) + offset;
memset(buf, 0, len);
if (!getBytes(buf, src, len)) {
return NULL;
}
return buf;
}
Library::string Library::get(Elf_Addr offset) {
if (!valid_) {
return "";
}
RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
if (iter == memory_ranges_.end()) {
return "";
}
offset -= iter->first;
const char *start = reinterpret_cast<char *>(iter->second.start) + offset;
const char *stop = reinterpret_cast<char *>(iter->second.stop) + offset;
char buf[4096] = { 0 };
getBytes(buf, start, stop - start >= (int)sizeof(buf) ?
sizeof(buf) - 1 : stop - start);
start = buf;
stop = buf;
while (*stop) {
++stop;
}
string s = stop > start ? string(start, stop - start) : "";
return s;
}
char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) {
if (!valid_) {
memset(buf, 0, len);
return NULL;
}
Sandbox::SysCalls sys;
if (!image_ && !isVDSO_ && !memory_ranges_.empty() &&
memory_ranges_.rbegin()->first == 0) {
// Extend the mapping of the very first page of the underlying library
// file. This way, we can read the original file contents of the entire
// library.
// We have to be careful, because doing so temporarily removes the first
// 4096 bytes of the library from memory. And we don't want to accidentally
// unmap code that we are executing. So, only use functions that can be
// inlined.
void* start = memory_ranges_.rbegin()->second.start;
image_size_ = memory_ranges_.begin()->first +
(reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) -
reinterpret_cast<char *>(memory_ranges_.begin()->second.start));
if (image_size_ < 8192) {
// It is possible to create a library that is only a single page in
// size. In that case, we have to make sure that we artificially map
// one extra page past the end of it, as our code relies on mremap()
// actually moving the mapping.
image_size_ = 8192;
}
image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
MREMAP_MAYMOVE));
if (image_size_ == 8192 && image_ == start) {
      // We really mean it when we say we want the memory to be moved.
image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
MREMAP_MAYMOVE));
sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096);
}
if (image_ == MAP_FAILED) {
image_ = NULL;
} else {
sys.MMAP(start, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
      // Copy the page contents back in, one word at a time. We must not
      // call out to libc here, as the first page of the library is
      // unmapped while this loop runs. Using "i--" in the loop condition
      // ensures that word zero is copied, too.
      for (int i = 4096 / sizeof(long); i--; ) {
        reinterpret_cast<long *>(start)[i] =
            reinterpret_cast<long *>(image_)[i];
      }
}
}
if (image_) {
if (offset + len > image_size_) {
// It is quite likely that we initially did not map the entire file as
// we did not know how large it is. So, if necessary, try to extend the
// mapping.
size_t new_size = (offset + len + 4095) & ~4095;
char* tmp =
reinterpret_cast<char *>(sys.mremap(image_, image_size_, new_size,
MREMAP_MAYMOVE));
if (tmp != MAP_FAILED) {
image_ = tmp;
image_size_ = new_size;
}
}
if (buf && offset + len <= image_size_) {
return reinterpret_cast<char *>(memcpy(buf, image_ + offset, len));
}
return NULL;
}
return buf ? get(offset, buf, len) : NULL;
}
Library::string Library::getOriginal(Elf_Addr offset) {
if (!valid_) {
return "";
}
// Make sure we actually have a mapping that we can access. If the string
// is located at the end of the image, we might not yet have extended the
// mapping sufficiently.
if (!image_ || image_size_ <= offset) {
getOriginal(offset, NULL, 1);
}
if (image_) {
if (offset < image_size_) {
char* start = image_ + offset;
char* stop = start;
while (stop < image_ + image_size_ && *stop) {
++stop;
if (stop >= image_ + image_size_) {
getOriginal(stop - image_, NULL, 1);
}
}
return string(start, stop - start);
}
return "";
}
return get(offset);
}
const Elf_Ehdr* Library::getEhdr() {
if (!valid_) {
return NULL;
}
return &ehdr_;
}
const Elf_Shdr* Library::getSection(const string& section) {
if (!valid_) {
return NULL;
}
SectionTable::const_iterator iter = section_table_.find(section);
if (iter == section_table_.end()) {
return NULL;
}
return &iter->second.second;
}
void Library::makeWritable(bool state) const {
for (RangeMap::const_iterator iter = memory_ranges_.begin();
iter != memory_ranges_.end(); ++iter) {
const Range& range = iter->second;
long length = reinterpret_cast<char *>(range.stop) -
reinterpret_cast<char *>(range.start);
Sandbox::SysCalls sys;
sys.mprotect(range.start, length,
range.prot | (state ? PROT_WRITE : 0));
}
}
bool Library::isSafeInsn(unsigned short insn) {
// Check if the instruction has no unexpected side-effects. If so, it can
// be safely relocated from the function that we are patching into the
// out-of-line scratch space that we are setting up. This is often necessary
// to make room for the JMP into the scratch space.
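  // Note that next_inst() reports two-byte 0x0F-prefixed opcodes as a
  // single 16-bit value (e.g. 0x0F05 for SYSCALL), which is why this
  // table mixes one-byte and two-byte opcode values.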
return ((insn & 0x7) < 0x6 && (insn & 0xF0) < 0x40
/* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) ||
#if defined(__x86_64__)
insn == 0x63 /* MOVSXD */ ||
#endif
(insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC,
SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) ||
(insn == 0x90) || /* NOP */
(insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ ||
(insn >= 0xB0 && insn <= 0xBF /* MOV */) ||
(insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */
(insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */
(insn >= 0xC6 && insn <= 0xC7 /* MOV */) ||
(insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */ ||
          (insn >= 0x0F19 && insn <= 0x0F1F) /* long NOP */;
}
char* Library::getScratchSpace(const Maps* maps, char* near, int needed,
char** extraSpace, int* extraLength) {
if (needed > *extraLength ||
labs(*extraSpace - reinterpret_cast<char *>(near)) > (1536 << 20)) {
if (*extraSpace) {
// Start a new scratch page and mark any previous page as write-protected
Sandbox::SysCalls sys;
sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC);
}
// Our new scratch space is initially executable and writable.
*extraLength = 4096;
*extraSpace = maps->allocNearAddr(near, *extraLength,
PROT_READ|PROT_WRITE|PROT_EXEC);
}
if (*extraSpace) {
*extraLength -= needed;
return *extraSpace + *extraLength;
}
Sandbox::die("Insufficient space to intercept system call");
}
#if defined(__x86_64__)
static bool isCallToVsyscallPage(char* code) {
// Look for these instructions, which are a call to the x86-64
// vsyscall page, which the kernel puts at a fixed address:
//
// 48 c7 c0 00 XX 60 ff mov $0xffffffffff60XX00,%rax
// ff d0 callq *%rax
//
// This will not catch all calls to the vsyscall page, but it
// handles the important cases that glibc contains. The vsyscall
// page is deprecated, so it is unlikely that new instruction
// sequences for calling it will be introduced.
return (code[0] == '\x48' &&
code[1] == '\xc7' &&
code[2] == '\xc0' &&
code[3] == '\x00' &&
(code[4] == '\x00' || code[4] == '\x04' || code[4] == '\x08') &&
code[5] == '\x60' &&
code[6] == '\xff' &&
code[7] == '\xff' &&
code[8] == '\xd0');
}
static void patchCallToVsyscallPage(char* code) {
// We replace the mov+callq with these instructions:
//
// b8 XX XX XX XX mov $X, %eax // where X is the syscall number
// 0f 05 syscall
// 90 nop
// 90 nop
//
// The syscall instruction will later be patched by the general case.
if (code[4] == '\x00') {
// Use __NR_gettimeofday == 96 == 0x60.
const char replacement[] = "\xb8\x60\x00\x00\x00\x0f\x05\x90\x90";
memcpy(code, replacement, sizeof(replacement) - 1);
} else if (code[4] == '\x04') {
// Use __NR_time == 201 == 0xc9.
const char replacement[] = "\xb8\xc9\x00\x00\x00\x0f\x05\x90\x90";
memcpy(code, replacement, sizeof(replacement) - 1);
} else if (code[4] == '\x08') {
// Use __NR_getcpu == 309 == 0x135.
const char replacement[] = "\xb8\x35\x01\x00\x00\x0f\x05\x90\x90";
memcpy(code, replacement, sizeof(replacement) - 1);
}
}
#endif
void Library::patchSystemCallsInFunction(const Maps* maps, int vsys_offset,
char* start, char* end,
char** extraSpace, int* extraLength) {
typedef std::set<char *, std::less<char *>, SystemAllocator<char *> >
BranchTargets;
BranchTargets branch_targets;
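  // First pass: collect all branch targets within the function, so that
  // we never relocate an instruction that some other instruction branches
  // to.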
for (char *ptr = start; ptr < end; ) {
unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64);
char *target;
if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) {
target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
} else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
(insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
target = ptr + (reinterpret_cast<int *>(ptr))[-1];
} else {
continue;
}
branch_targets.insert(target);
}
struct Code {
char* addr;
int len;
unsigned short insn;
bool is_ip_relative;
} code[5] = { { 0 } };
int codeIdx = 0;
char* ptr = start;
while (ptr < end) {
#if defined(__x86_64__)
if (isCallToVsyscallPage(ptr)) {
patchCallToVsyscallPage(ptr);
}
#endif
    // Keep a ring-buffer of the last few instructions in order to find the
    // correct place to patch the code.
char *mod_rm;
code[codeIdx].addr = ptr;
code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64,
0, 0, &mod_rm, 0, 0);
code[codeIdx].len = ptr - code[codeIdx].addr;
code[codeIdx].is_ip_relative =
#if defined(__x86_64__)
mod_rm && (*mod_rm & 0xC7) == 0x5;
#else
false;
#endif
// Whenever we find a system call, we patch it with a jump to out-of-line
// code that redirects to our system call entrypoint.
bool is_syscall = true;
#if defined(__x86_64__)
bool is_indirect_call = false;
if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ ||
// In addition, on x86-64, we need to redirect all CALLs between the
// VDSO and the VSyscalls page. We want these to jump to our own
// modified copy of the VSyscalls. As we know that the VSyscalls are
// always more than 2GB away from the VDSO, the compiler has to
// generate some form of indirect jumps. We can find all indirect
// CALLs and redirect them to a separate scratch area, where we can
// inspect the destination address. If it indeed points to the
// VSyscall area, we then adjust the destination address accordingly.
(is_indirect_call =
(vsys_offset && code[codeIdx].insn == 0xFF &&
!code[codeIdx].is_ip_relative &&
mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) {
is_syscall = !is_indirect_call;
#elif defined(__i386__)
bool is_gs_call = false;
if (code[codeIdx].len == 7 &&
code[codeIdx].insn == 0xFF &&
code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ &&
code[codeIdx].addr[0] == '\x65' /* %gs prefix */) {
char* target;
asm volatile("mov %%gs:(%1), %0\n"
: "=a"(target)
: "c"(*reinterpret_cast<int *>(code[codeIdx].addr+3)));
if (target == __kernel_vsyscall) {
is_gs_call = true;
// TODO(markus): also handle the other vsyscalls
}
}
if (is_gs_call ||
(code[codeIdx].insn == 0xCD &&
code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) {
#else
#error Unsupported target platform
#endif
// Found a system call. Search backwards to figure out how to redirect
// the code. We will need to overwrite a couple of instructions and,
// of course, move these instructions somewhere else.
int startIdx = codeIdx;
int length = code[codeIdx].len;
for (int idx = codeIdx;
(idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) %
(sizeof(code) / sizeof(struct Code))) != codeIdx; ) {
BranchTargets::const_iterator iter =
std::upper_bound(branch_targets.begin(), branch_targets.end(),
code[idx].addr);
if (iter != branch_targets.end() && *iter < ptr) {
// Found a branch pointing to somewhere past our instruction. This
// instruction cannot be moved safely. Leave it in place.
break;
}
if (code[idx].addr && !code[idx].is_ip_relative &&
isSafeInsn(code[idx].insn)) {
// These are all benign instructions with no side-effects and no
// dependency on the program counter. We should be able to safely
// relocate them.
startIdx = idx;
length = ptr - code[startIdx].addr;
} else {
break;
}
}
// Search forward past the system call, too. Sometimes, we can only
// find relocatable instructions following the system call.
#if defined(__i386__)
findEndIdx:
#endif
char *next = ptr;
for (int i = codeIdx;
next < end &&
(i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx;
) {
BranchTargets::const_iterator iter =
std::lower_bound(branch_targets.begin(), branch_targets.end(),
next);
if (iter != branch_targets.end() && *iter == next) {
// Found branch target pointing to our instruction
break;
}
char *tmp_rm;
code[i].addr = next;
code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64,
0, 0, &tmp_rm, 0, 0);
code[i].len = next - code[i].addr;
code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5;
if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) {
length = next - code[startIdx].addr;
} else {
break;
}
}
      // We now know how many instructions neighboring the system call we
      // can safely overwrite. On x86-32 we need six bytes, and on x86-64
      // we need five bytes to insert a JMPQ and a 32bit address. We then
// jump to a code fragment that safely forwards to our system call
// entrypoint.
// On x86-64, this is complicated by the fact that the API allows up
// to 128 bytes of red-zones below the current stack pointer. So, we
// cannot write to the stack until we have adjusted the stack
// pointer.
// On both x86-32 and x86-64 we take care to leave the stack unchanged
// while we are executing the preamble and postamble. This allows us
// to treat instructions that reference %esp/%rsp as safe for
// relocation.
// In particular, this means that on x86-32 we cannot use CALL, but
// have to use a PUSH/RET combination to change the instruction pointer.
// On x86-64, we can instead use a 32bit JMPQ.
//
// .. .. .. .. ; any leading instructions copied from original code
// 48 81 EC 80 00 00 00 SUB $0x80, %rsp
// 50 PUSH %rax
// 48 8D 05 .. .. .. .. LEA ...(%rip), %rax
// 50 PUSH %rax
// 48 B8 .. .. .. .. MOV $syscallEntryPointWithFrame, %rax
// .. .. .. ..
// 50 PUSH %rax
// 48 8D 05 06 00 00 00 LEA 6(%rip), %rax
// 48 87 44 24 10 XCHG %rax, 16(%rsp)
// C3 RETQ
// 48 81 C4 80 00 00 00 ADD $0x80, %rsp
// .. .. .. .. ; any trailing instructions copied from original code
// E9 .. .. .. .. JMPQ ...
//
// Total: 52 bytes + any bytes that were copied
//
// On x86-32, the stack is available and we can do:
//
// TODO(markus): Try to maintain frame pointers on x86-32
//
// .. .. .. .. ; any leading instructions copied from original code
// 68 .. .. .. .. PUSH . + 11
// 68 .. .. .. .. PUSH return_addr
// 68 .. .. .. .. PUSH $syscallEntryPointWithFrame
// C3 RET
// .. .. .. .. ; any trailing instructions copied from original code
// 68 .. .. .. .. PUSH return_addr
// C3 RET
//
// Total: 22 bytes + any bytes that were copied
//
// For indirect jumps from the VDSO to the VSyscall page, we instead
// replace the following code (this is only necessary on x86-64). This
// time, we don't have to worry about red zones:
//
// .. .. .. .. ; any leading instructions copied from original code
// E8 00 00 00 00 CALL .
// 48 83 04 24 .. ADDQ $.., (%rsp)
// FF .. .. .. .. .. PUSH .. ; from original CALL instruction
// 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp)
// 72 10 JB . + 16
// 81 2C 24 .. .. .. .. SUBL ..., 0(%rsp)
// C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp)
// C3 RETQ
// 48 87 04 24 XCHG %rax,(%rsp)
// 48 89 44 24 08 MOV %rax, 8(%rsp)
// 58 POP %rax
// C3 RETQ
// .. .. .. .. ; any trailing instructions copied from original code
// E9 .. .. .. .. JMPQ ...
//
// Total: 52 bytes + any bytes that were copied
if (length < (__WORDSIZE == 32 ? 6 : 5)) {
// There are a very small number of instruction sequences that we
// cannot easily intercept, and that have been observed in real world
// examples. Handle them here:
#if defined(__i386__)
int diff;
if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) &&
(diff = *reinterpret_cast<signed char *>(
code[codeIdx].addr + 3)) < 0 && diff >= -6) {
// We have seen...
// for (;;) {
// _exit(0);
// }
// ..get compiled to:
// B8 01 00 00 00 MOV $__NR_exit, %eax
// 66 90 XCHG %ax, %ax
// 31 DB 0:XOR %ebx, %ebx
// CD 80 INT $0x80
// EB FA JMP 0b
// The JMP is really superfluous as the system call never returns.
// And there are in fact no returning system calls that need to be
// unconditionally repeated in an infinite loop.
// If we replace the JMP with NOPs, the system call can successfully
// be intercepted.
*reinterpret_cast<unsigned short *>(code[codeIdx].addr + 2) = 0x9090;
goto findEndIdx;
}
#elif defined(__x86_64__)
BranchTargets::const_iterator iter;
#endif
      // If we cannot figure out any other way to intercept this system call,
      // we replace it with an INT $0 instruction. This causes a SEGV which
      // we then handle in the signal handler. That's a lot slower than
      // rewriting the instruction with a jump, but it should only happen
      // very rarely.
if (is_syscall) {
        memcpy(code[codeIdx].addr, "\xCD\x00" /* INT $0 */, 2);
if (code[codeIdx].len > 2) {
memset(code[codeIdx].addr + 2, 0x90 /* NOP */,
code[codeIdx].len - 2);
}
goto replaced;
}
#if defined(__x86_64__)
// On x86-64, we occasionally see code like this in the VDSO:
// 48 8B 05 CF FE FF FF MOV -0x131(%rip),%rax
// FF 50 20 CALLQ *0x20(%rax)
// By default, we would not replace the MOV instruction, as it is
// IP relative. But if the following instruction is also IP relative,
// we are left with only three bytes which is not enough to insert a
// jump.
// We recognize this particular situation, and as long as the CALLQ
// is not a branch target, we decide to still relocate the entire
// sequence. We just have to make sure that we then patch up the
// IP relative addressing.
else if (is_indirect_call && startIdx == codeIdx &&
code[startIdx = (startIdx + (sizeof(code) /
sizeof(struct Code)) - 1) %
(sizeof(code) / sizeof(struct Code))].addr &&
ptr - code[startIdx].addr >= 5 &&
code[startIdx].is_ip_relative &&
isSafeInsn(code[startIdx].insn) &&
((iter = std::upper_bound(branch_targets.begin(),
branch_targets.end(),
code[startIdx].addr)) ==
branch_targets.end() || *iter >= ptr)) {
// We changed startIdx to include the IP relative instruction.
// When copying this preamble, we make sure to patch up the
// offset.
}
#endif
else {
Sandbox::die("Cannot intercept system call");
}
}
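      // Determine the window of instructions that needs to be relocated.
      // We prefer relocating instructions that precede the system call
      // ("first"), and only extend past it ("second") if that still does
      // not provide enough room for the jump.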
int needed = (__WORDSIZE == 32 ? 6 : 5) - code[codeIdx].len;
int first = codeIdx;
while (needed > 0 && first != startIdx) {
first = (first + (sizeof(code) / sizeof(struct Code)) - 1) %
(sizeof(code) / sizeof(struct Code));
needed -= code[first].len;
}
int second = codeIdx;
while (needed > 0) {
second = (second + 1) % (sizeof(code) / sizeof(struct Code));
needed -= code[second].len;
}
int preamble = code[codeIdx].addr - code[first].addr;
int postamble = code[second].addr + code[second].len -
code[codeIdx].addr - code[codeIdx].len;
      // The following is all the code that constructs the various bits of
      // assembly code.
#if defined(__x86_64__)
if (is_indirect_call) {
needed = 52 + preamble + code[codeIdx].len + postamble;
} else {
needed = 52 + preamble + postamble;
}
#elif defined(__i386__)
needed = 22 + preamble + postamble;
#else
#error Unsupported target platform
#endif
// Allocate scratch space and copy the preamble of code that was moved
// from the function that we are patching.
char* dest = getScratchSpace(maps, code[first].addr, needed,
extraSpace, extraLength);
memcpy(dest, code[first].addr, preamble);
// For jumps from the VDSO to the VSyscalls we sometimes allow exactly
// one IP relative instruction in the preamble.
if (code[first].is_ip_relative) {
*reinterpret_cast<int *>(dest + (code[codeIdx].addr -
code[first].addr) - 4)
-= dest - code[first].addr;
}
// For indirect calls, we need to copy the actual CALL instruction and
// turn it into a PUSH instruction.
#if defined(__x86_64__)
if (is_indirect_call) {
memcpy(dest + preamble,
"\xE8\x00\x00\x00\x00" // CALL .
"\x48\x83\x04\x24", // ADDQ $.., (%rsp)
9);
dest[preamble + 9] = code[codeIdx].len + 42;
memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len);
// Convert CALL -> PUSH
dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20;
preamble += 10 + code[codeIdx].len;
}
#endif
// Copy the static body of the assembly code.
memcpy(dest + preamble,
#if defined(__x86_64__)
is_indirect_call ?
"\x48\x81\x3C\x24\x00\x00\x00\xFF"// CMPQ $0xFFFFFFFFFF000000,0(rsp)
"\x72\x10" // JB . + 16
"\x81\x2C\x24\x00\x00\x00\x00" // SUBL ..., 0(%rsp)
"\xC7\x44\x24\x04\x00\x00\x00\x00"// MOVL $0, 4(%rsp)
"\xC3" // RETQ
"\x48\x87\x04\x24" // XCHG %rax, (%rsp)
"\x48\x89\x44\x24\x08" // MOV %rax, 8(%rsp)
"\x58" // POP %rax
"\xC3" : // RETQ
"\x48\x81\xEC\x80\x00\x00\x00" // SUB $0x80, %rsp
"\x50" // PUSH %rax
"\x48\x8D\x05\x00\x00\x00\x00" // LEA ...(%rip), %rax
"\x50" // PUSH %rax
"\x48\xB8\x00\x00\x00\x00\x00" // MOV $syscallEntryPointWithFrm,
"\x00\x00\x00" // %rax
"\x50" // PUSH %rax
"\x48\x8D\x05\x06\x00\x00\x00" // LEA 6(%rip), %rax
"\x48\x87\x44\x24\x10" // XCHG %rax, 16(%rsp)
"\xC3" // RETQ
"\x48\x81\xC4\x80\x00\x00", // ADD $0x80, %rsp
is_indirect_call ? 37 : 47
#elif defined(__i386__)
"\x68\x00\x00\x00\x00" // PUSH . + 11
"\x68\x00\x00\x00\x00" // PUSH return_addr
"\x68\x00\x00\x00\x00" // PUSH $syscallEntryPointWithFrm
"\xC3", // RET
16
#else
#error Unsupported target platform
#endif
);
// Copy the postamble that was moved from the function that we are
// patching.
memcpy(dest + preamble +
#if defined(__x86_64__)
(is_indirect_call ? 37 : 47),
#elif defined(__i386__)
16,
#else
#error Unsupported target platform
#endif
code[codeIdx].addr + code[codeIdx].len,
postamble);
// Patch up the various computed values
#if defined(__x86_64__)
int post = preamble + (is_indirect_call ? 37 : 47) + postamble;
dest[post] = '\xE9'; // JMPQ
*reinterpret_cast<int *>(dest + post + 1) =
(code[second].addr + code[second].len) - (dest + post + 5);
if (is_indirect_call) {
*reinterpret_cast<int *>(dest + preamble + 13) = vsys_offset;
} else {
*reinterpret_cast<int *>(dest + preamble + 11) =
(code[second].addr + code[second].len) - (dest + preamble + 15);
*reinterpret_cast<void **>(dest + preamble + 18) =
reinterpret_cast<void *>(&syscallEntryPointWithFrame);
}
#elif defined(__i386__)
*(dest + preamble + 16 + postamble) = '\x68'; // PUSH
*reinterpret_cast<char **>(dest + preamble + 17 + postamble) =
code[second].addr + code[second].len;
*(dest + preamble + 21 + postamble) = '\xC3'; // RET
*reinterpret_cast<char **>(dest + preamble + 1) =
dest + preamble + 16;
*reinterpret_cast<char **>(dest + preamble + 6) =
code[second].addr + code[second].len;
*reinterpret_cast<void (**)()>(dest + preamble + 11) =
syscallEntryPointWithFrame;
#else
#error Unsupported target platform
#endif
// Pad unused space in the original function with NOPs
memset(code[first].addr, 0x90 /* NOP */,
code[second].addr + code[second].len - code[first].addr);
// Replace the system call with an unconditional jump to our new code.
#if defined(__x86_64__)
*code[first].addr = '\xE9'; // JMPQ
*reinterpret_cast<int *>(code[first].addr + 1) =
dest - (code[first].addr + 5);
#elif defined(__i386__)
code[first].addr[0] = '\x68'; // PUSH
*reinterpret_cast<char **>(code[first].addr + 1) = dest;
code[first].addr[5] = '\xC3'; // RET
#else
#error Unsupported target platform
#endif
}
replaced:
codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code));
}
}
void Library::patchVDSO(char** extraSpace, int* extraLength) {
#if defined(__i386__)
Sandbox::SysCalls sys;
if (!__kernel_vsyscall ||
sys.mprotect(reinterpret_cast<void *>(
reinterpret_cast<long>(__kernel_vsyscall) & ~0xFFF),
4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
return;
}
// x86-32 has a small number of well-defined functions in the VDSO library.
// These functions do not easily lend themselves to be rewritten by the
// automatic code. Instead, we explicitly find new definitions for them.
//
  // We don't bother optimizing the system call instruction; instead, we
  // always use INT $0x80, no matter whether the hardware supports more
  // modern calling conventions.
//
// TODO(markus): Investigate whether it is worthwhile to optimize this
// code path and use the platform-specific entry code.
if (__kernel_vsyscall) {
// Replace the kernel entry point with:
//
// E9 .. .. .. .. JMP syscallEntryPointNoFrame
    *__kernel_vsyscall = '\xE9'; // JMP
*reinterpret_cast<long *>(__kernel_vsyscall + 1) =
reinterpret_cast<char *>(&syscallEntryPointNoFrame) -
reinterpret_cast<char *>(__kernel_vsyscall + 5);
}
if (__kernel_sigreturn) {
// Replace the sigreturn() system call with a jump to code that does:
//
// 58 POP %eax
// B8 77 00 00 00 MOV $0x77, %eax
// E8 .. .. .. .. CALL syscallEntryPointNoFrame
char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace,
extraLength);
memcpy(dest,
"\x58" // POP %eax
"\xB8\x77\x00\x00\x00" // MOV %0x77, %eax
"\xE8", // CALL syscallEntryPointNoFrame
7);
*reinterpret_cast<long *>(dest + 7) =
        reinterpret_cast<char *>(&syscallEntryPointNoFrame) - dest - 11;
    *__kernel_sigreturn = '\xE9'; // JMP
*reinterpret_cast<long *>(__kernel_sigreturn + 1) =
dest - reinterpret_cast<char *>(__kernel_sigreturn) - 5;
}
if (__kernel_rt_sigreturn) {
// Replace the rt_sigreturn() system call with a jump to code that does:
//
// B8 AD 00 00 00 MOV $0xAD, %eax
// E8 .. .. .. .. CALL syscallEntryPointNoFrame
char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace,
extraLength);
memcpy(dest,
"\xB8\xAD\x00\x00\x00" // MOV $0xAD, %eax
"\xE8", // CALL syscallEntryPointNoFrame
6);
*reinterpret_cast<long *>(dest + 6) =
reinterpret_cast<char *>(&syscallEntryPointNoFrame) - dest - 10;
    *__kernel_rt_sigreturn = '\xE9'; // JMP
*reinterpret_cast<long *>(__kernel_rt_sigreturn + 1) =
dest - reinterpret_cast<char *>(__kernel_rt_sigreturn) - 5;
}
#endif
}
int Library::patchVSystemCalls() {
#if defined(__x86_64__)
// VSyscalls live in a shared 4kB page at the top of the address space. This
// page cannot be unmapped nor remapped. We have to create a copy within
// 2GB of the page, and rewrite all IP-relative accesses to shared variables.
// As the top of the address space is not accessible by mmap(), this means
// that we need to wrap around addresses to the bottom 2GB of the address
// space.
// Only x86-64 has VSyscalls.
if (maps_->vsyscall()) {
char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000,
PROT_READ|PROT_WRITE|PROT_EXEC);
char* extraSpace = copy;
int extraLength = 0x1000;
memcpy(copy, maps_->vsyscall(), 0x1000);
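    // Compute the delta between the original VSyscall page and our copy.
    // It is used below to fix up IP relative references, so that they
    // keep pointing at the variables in the original page.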
long adjust = (long)maps_->vsyscall() - (long)copy;
for (int vsys = 0; vsys < 0x1000; vsys += 0x400) {
char* start = copy + vsys;
char* end = start + 0x400;
      // There can be up to four VSyscalls, at offsets of n*0x400 each.
      // VSyscalls are invoked by functions in the VDSO and provide fast
      // implementations of a time source. We don't know exactly where the
      // code and the data are located in the VSyscalls page. So, we
      // disassemble the code for each function and find all branch targets
      // within the function in order to find the last address of the
      // function.
for (char *last = start, *vars = end, *ptr = start; ptr < end; ) {
new_function:
char* mod_rm;
unsigned short insn = next_inst((const char **)&ptr, true, 0, 0,
&mod_rm, 0, 0);
if (mod_rm && (*mod_rm & 0xC7) == 0x5) {
// Instruction has IP relative addressing mode. Adjust to reference
// the variables in the original VSyscall segment.
long offset = *reinterpret_cast<int *>(mod_rm + 1);
char* var = ptr + offset;
if (var >= ptr && var < vars) {
// Variables are stored somewhere past all the functions. Remember
// the first variable in the VSyscall slot, so that we stop
// scanning for instructions once we reach that address.
vars = var;
}
offset += adjust;
if ((offset >> 32) && (offset >> 32) != -1) {
Sandbox::die("Cannot patch [vsystemcall]");
}
*reinterpret_cast<int *>(mod_rm + 1) = offset;
}
// Check for jump targets to higher addresses (but within our own
// VSyscall slot). They extend the possible end-address of this
// function.
char *target = 0;
if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ ||
insn == 0xEB /* JMP */) {
target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
} else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
(insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
target = ptr + (reinterpret_cast<int *>(ptr))[-1];
}
// The function end is found, once the loop reaches the last valid
// address in the VSyscall slot, or once it finds a RET instruction
// that is not followed by any jump targets. Unconditional jumps that
// point backwards are treated the same as a RET instruction.
if (insn == 0xC3 /* RET */ ||
(target < ptr &&
(insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) {
if (last >= ptr) {
continue;
} else {
// The function can optionally be followed by more functions in
// the same VSyscall slot. Allow for alignment to a 16 byte
// boundary. If we then find more non-zero bytes, and if this is
// not the known start of the variables, assume a new function
// started.
for (; ptr < vars; ++ptr) {
if ((long)ptr & 0xF) {
if (*ptr && *ptr != '\x90' /* NOP */) {
goto new_function;
}
*ptr = '\x90'; // NOP
} else {
if (*ptr && *ptr != '\x90' /* NOP */) {
goto new_function;
}
break;
}
}
// Translate all SYSCALLs to jumps into our system call handler.
patchSystemCallsInFunction(NULL, 0, start, ptr,
&extraSpace, &extraLength);
break;
}
}
// Adjust assumed end address for this function, if a valid jump
// target has been found that originates from the current instruction.
if (target > last && target < start + 0x100) {
last = target;
}
}
}
// We are done. Write-protect our code and make it executable.
Sandbox::SysCalls sys;
sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC);
return maps_->vsyscall() - copy;
}
#endif
return 0;
}
void Library::patchSystemCalls() {
if (!valid_) {
return;
}
int extraLength = 0;
char* extraSpace = NULL;
if (isVDSO_) {
    // patchVDSO() calls patchSystemCallsInFunction(), which needs
    // vsys_offset_ when processing the VDSO library. So, make sure we call
    // patchVSystemCalls() first.
vsys_offset_ = patchVSystemCalls();
#if defined(__i386__)
patchVDSO(&extraSpace, &extraLength);
return;
#endif
}
SectionTable::const_iterator iter;
if ((iter = section_table_.find(".text")) == section_table_.end()) {
return;
}
const Elf_Shdr& shdr = iter->second.second;
char* start = reinterpret_cast<char *>(shdr.sh_addr + asr_offset_);
char* stop = start + shdr.sh_size;
patchSystemCallsInRange(start, stop, &extraSpace, &extraLength);
// Mark our scratch space as write-protected and executable.
if (extraSpace) {
Sandbox::SysCalls sys;
sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC);
}
}
void Library::patchSystemCallsInRange(char* start, char* stop,
char** extraSpace, int* extraLength) {
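  // Quickly scan the range for potential system call instructions. "func"
  // tracks the start of the function currently being scanned; a run of
  // NOPs ending at a 16 byte boundary marks a function boundary.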
char* func = start;
int nopcount = 0;
bool has_syscall = false;
for (char *ptr = start; ptr < stop; ptr++) {
#if defined(__x86_64__)
if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) ||
(isVDSO_ && *ptr == '\xFF') ||
isCallToVsyscallPage(ptr)) {
#elif defined(__i386__)
if ((*ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) ||
(*ptr == '\x65' && ptr[1] == '\xFF' &&
ptr[2] == '\x15' /* CALL %gs:.. */)) {
#else
#error Unsupported target platform
#endif
ptr++;
has_syscall = true;
nopcount = 0;
} else if (*ptr == '\x90' /* NOP */) {
nopcount++;
} else if (!(reinterpret_cast<long>(ptr) & 0xF)) {
if (nopcount > 2) {
// This is very likely the beginning of a new function. Functions
// are aligned on 16 byte boundaries and the preceding function is
// padded out with NOPs.
//
// For performance reasons, we quickly scan the entire text segment
// for potential SYSCALLs, and then patch the code in increments of
// individual functions.
if (has_syscall) {
has_syscall = false;
// Our quick scan of the function found a potential system call.
// Do a more thorough scan, now.
patchSystemCallsInFunction(maps_, isVDSO_ ? vsys_offset_ : 0, func,
ptr, extraSpace, extraLength);
}
func = ptr;
}
nopcount = 0;
} else {
nopcount = 0;
}
}
if (has_syscall) {
// Patch any remaining system calls that were in the last function before
// the loop terminated.
patchSystemCallsInFunction(maps_, isVDSO_ ? vsys_offset_ : 0, func, stop,
extraSpace, extraLength);
}
}
bool Library::parseElf() {
valid_ = true;
// Verify ELF header
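  // Section header of the string table that holds the section names.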
Elf_Shdr str_shdr;
if (!getOriginal(0, &ehdr_) ||
ehdr_.e_ehsize < sizeof(Elf_Ehdr) ||
ehdr_.e_phentsize < sizeof(Elf_Phdr) ||
ehdr_.e_shentsize < sizeof(Elf_Shdr) ||
!getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize,
&str_shdr)) {
// Not all memory mappings are necessarily ELF files. Skip memory
// mappings that we cannot identify.
error:
valid_ = false;
return false;
}
// Parse section table and find all sections in this ELF file
for (int i = 0; i < ehdr_.e_shnum; i++) {
Elf_Shdr shdr;
if (!getOriginal(ehdr_.e_shoff + i*ehdr_.e_shentsize, &shdr)) {
continue;
}
section_table_.insert(
std::make_pair(getOriginal(str_shdr.sh_offset + shdr.sh_name),
std::make_pair(i, shdr)));
}
// Compute the offset of entries in the .text segment
const Elf_Shdr* text = getSection(".text");
if (text == NULL) {
    // On x86-32, the VDSO is unusual in that it does not have a single
    // ".text" section. Instead, it has one section per function. Each
// section name starts with ".text". We just need to pick an arbitrary
// one in order to find the asr_offset_ -- which would typically be zero
// for the VDSO.
for (SectionTable::const_iterator iter = section_table_.begin();
iter != section_table_.end(); ++iter) {
if (!strncmp(iter->first.c_str(), ".text", 5)) {
text = &iter->second.second;
break;
}
}
}
// Now that we know where the .text segment is located, we can compute the
// asr_offset_.
if (text) {
RangeMap::const_iterator iter =
memory_ranges_.lower_bound(text->sh_offset);
if (iter != memory_ranges_.end()) {
asr_offset_ = reinterpret_cast<char *>(iter->second.start) -
(text->sh_addr - (text->sh_offset - iter->first));
} else {
goto error;
}
} else {
goto error;
}
return !isVDSO_ || parseSymbols();
}
bool Library::parseSymbols() {
if (!valid_) {
return false;
}
Elf_Shdr str_shdr;
getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, &str_shdr);
// Find symbol table
const Elf_Shdr* symtab = getSection(".dynsym");
Elf_Shdr strtab = { 0 };
if (symtab) {
if (symtab->sh_link >= ehdr_.e_shnum ||
!getOriginal(ehdr_.e_shoff + symtab->sh_link * ehdr_.e_shentsize,
&strtab)) {
Debug::message("Cannot find valid symbol table\n");
valid_ = false;
return false;
}
// Parse symbol table and add its entries
for (Elf_Addr addr = 0; addr < symtab->sh_size; addr += sizeof(Elf_Sym)) {
Elf_Sym sym;
if (!getOriginal(symtab->sh_offset + addr, &sym) ||
(sym.st_shndx >= ehdr_.e_shnum &&
sym.st_shndx < SHN_LORESERVE)) {
Debug::message("Encountered invalid symbol\n");
valid_ = false;
return false;
}
string name = getOriginal(strtab.sh_offset + sym.st_name);
if (name.empty()) {
continue;
}
symbols_.insert(std::make_pair(name, sym));
}
}
SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall");
if (iter != symbols_.end() && iter->second.st_value) {
__kernel_vsyscall = asr_offset_ + iter->second.st_value;
}
iter = symbols_.find("__kernel_sigreturn");
if (iter != symbols_.end() && iter->second.st_value) {
__kernel_sigreturn = asr_offset_ + iter->second.st_value;
}
iter = symbols_.find("__kernel_rt_sigreturn");
if (iter != symbols_.end() && iter->second.st_value) {
__kernel_rt_sigreturn = asr_offset_ + iter->second.st_value;
}
return true;
}
} // namespace playground