// syscall_entrypoint.cc - external/seccompsandbox - Git at Google

 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "syscall_entrypoint.h"

 #include "debug.h"
 #include "sandbox_impl.h"
 #include "system_call_table.h"

 namespace playground {

 // TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file
 //
 // File-scope assembly that emits three internal function symbols into .text:
 //   playground$sandbox_clone
 //       wrapper that adjusts one argument and tail-jumps to
 //       playground$sandbox__clone
 //   playground$syscallEntryPointNoFrame
 //       pushes a duplicate of the return address and falls through into
 //       playground$syscallEntryPointWithFrame
 //   playground$syscallEntryPointWithFrame
 //       saves registers, dispatches the system call through syscallTable or
 //       defaultSystemCallHandler(), restores registers and returns
 // Each symbol has a separate x86-64 and i386 body, selected by the
 // preprocessor; any other target is a compile-time error.
 asm(
     ".pushsection .text, \"ax\", @progbits\n"

     // This is the special wrapper for the clone() system call. The code
     // relies on the stack layout of the system call entrypoint (c.f. below). It
     // passes the stack pointer as an additional argument to sandbox__clone(),
     // so that upon starting the child, register values can be restored and
     // the child can start executing at the correct IP, instead of trying to
     // run in the trusted thread.
     "playground$sandbox_clone:"
     ".internal playground$sandbox_clone\n"
     ".globl playground$sandbox_clone\n"
     ".type playground$sandbox_clone, @function\n"
     #if defined(__x86_64__)
     // Skip the 8 byte return address into the system call entrypoint. The
     // following bytes are the saved register values that we need to restore
     // upon return from clone() in the new thread.
     "lea 8(%rsp), %r9\n"
     "jmp playground$sandbox__clone\n"
     #elif defined(__i386__)
     // As i386 passes function arguments on the stack, we need to skip a few
     // more values before we can get to the saved registers.
     // The word at 28(%esp) is copied into the slot at 24(%esp) before the
     // tail-jump.
     "mov 28(%esp), %eax\n"
     "mov %eax, 24(%esp)\n"
     "jmp playground$sandbox__clone\n"
     #else
     #error Unsupported target platform
     #endif
     ".size playground$sandbox_clone, .-playground$sandbox_clone\n"


     // This is the entrypoint which is called by the untrusted code, trying to
     // make a system call.
     // It only pushes a second copy of the return address and then runs
     // straight into syscallEntryPointWithFrame; there is no jump or return
     // between the two symbols.
     "playground$syscallEntryPointNoFrame:"
     ".internal playground$syscallEntryPointNoFrame\n"
     ".globl playground$syscallEntryPointNoFrame\n"
     ".type playground$syscallEntryPointNoFrame, @function\n"
     #if defined(__x86_64__)
     "mov  0(%rsp), %r11\n"         // add fake return address by duplicating
     "push %r11\n"                  // real return address
     /* fall through */
     #elif defined(__i386__)
     "push %eax\n"                  // add fake return address, which in this
     "mov  4(%esp), %eax\n"         // case is identical to the real return
     "xchg %eax, 0(%esp)\n"         // address (xchg also restores %eax)
     /* fall through */
     #else
     #error Unsupported target platform
     #endif
     ".size playground$syscallEntryPointNoFrame, "
         ".-playground$syscallEntryPointNoFrame\n"

     // Entry point that expects two return addresses on the stack: the fake
     // one on top and the real one below it (see the "Remove fake return
     // address" step before each "ret").
     "playground$syscallEntryPointWithFrame:"
     ".internal playground$syscallEntryPointWithFrame\n"
     ".globl playground$syscallEntryPointWithFrame\n"
     ".type playground$syscallEntryPointWithFrame, @function\n"
     #if defined(__x86_64__)
     // Check for rt_sigreturn(). It needs to be handled specially.
     "cmp  $15, %rax\n"             // NR_rt_sigreturn
     "jnz  1f\n"
     // 0x90 = two 8 byte return addresses + 128 bytes
     "add  $0x90, %rsp\n"           // pop return addresses and red zone
   "0:syscall\n"                    // rt_sigreturn() is unrestricted
     "mov  $66, %edi\n"             // rt_sigreturn() should never return
     "mov  $231, %eax\n"            // NR_exit_group
     "jmp  0b\n"

     // Save all registers
   "1:push %rbp\n"
     "movq  $0xDEADBEEFDEADBEEF, %rbp\n" // marker used by breakpad to remove
     "push %rbp\n"                  // seccomp-sandbox's stack frame from dumps
     "mov  %rsp, %rbp\n"            // frame pointer points at the marker
     "push %rbx\n"
     "push %rcx\n"
     "push %rdx\n"
     "push %rsi\n"
     "push %rdi\n"
     "push %r8\n"
     "push %r9\n"
     "push %r10\n"
     "push %r11\n"
     "push %r12\n"
     "push %r13\n"
     "push %r14\n"
     "push %r15\n"

     // TODO(markus): On x86-32 we have to explicitly align the stack. Do we
     //               also have to do this on x86-64? Empirical evidence
     //               suggests, we are OK -- but we might have to revisit this
     //               decision.

     // Check range of system call
     // The comparison is unsigned ("ja"), so every value above maxSyscall goes
     // to the default handler at label 3.
     "mov playground$maxSyscall@GOTPCREL(%rip), %rcx\n"
     "cmp 0(%rcx), %eax\n"
     "ja  3f\n"

     // Retrieve function call from system call table (c.f.system_call_table.cc)
     // We have three different types of entries; zero for denied system calls,
     // that should be handled by the defaultSystemCallHandler(); minus one
     // for unrestricted system calls that need to be forwarded to the trusted
     // thread; and function pointers to specific handler functions.
     // NOTE(review): the check below sends the values 0 and 1 to the fallback
     // handler, which suggests UNRESTRICTED_SYSCALL is 1 rather than minus
     // one -- confirm against system_call_table.
     // Table entries are indexed with a stride of 16 bytes (shl $4); the
     // handler pointer is read from offset 0 of the entry.
     "mov %rax, %rcx\n"
     "shl $4, %rcx\n"
     "mov playground$syscallTable@GOTPCREL(%rip), %r11\n"
     "mov 0(%r11), %r11\n"
     "add %r11, %rcx\n"
     "mov 0(%rcx), %rcx\n"

     // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
     // jump to fallback handler.
     "cmp $1, %rcx\n"
     "jbe 3f\n"
     "xchg %r10, %rcx\n"            // Syscall to userspace calling conventions
     "call *%r10\n"
   "2:"

     // Restore CPU registers, except for %rax which was set by the system call.
     "pop %r15\n"
     "pop %r14\n"
     "pop %r13\n"
     "pop %r12\n"
     "pop %r11\n"
     "pop %r10\n"
     "pop %r9\n"
     "pop %r8\n"
     "pop %rdi\n"
     "pop %rsi\n"
     "pop %rdx\n"
     "pop %rcx\n"
     "pop %rbx\n"
     "pop %rbp\n"                   // 0xDEADBEEF marker
     "pop %rbp\n"

     // Remove fake return address. This is added in the patching code in
     // library.cc and it makes stack traces a little cleaner.
     "add $8, %rsp\n"

     // Return to caller
     "ret\n"

   "3:"
     // If we end up calling a specific handler, we don't need to know the
     // system call number. However, in the generic case, we do. Shift
     // registers so that the system call number becomes visible as the
     // first function argument.
     // The original %r9 is pushed so that it is passed on the stack; it is
     // popped again after the call.
     "push %r9\n"
     "mov  %r8, %r9\n"
     "mov  %r10, %r8\n"
     "mov  %rdx, %rcx\n"
     "mov  %rsi, %rdx\n"
     "mov  %rdi, %rsi\n"
     "mov  %rax, %rdi\n"

     // Call default handler.
     "call playground$defaultSystemCallHandler\n"
     "pop  %r9\n"
     "jmp 2b\n"
     #elif defined(__i386__)
     "cmp  $119, %eax\n"            // NR_sigreturn
     "jnz  1f\n"
     "add  $0x8, %esp\n"            // pop return address
   "0:int  $0x80\n"                 // sigreturn() is unrestricted
     "mov  $66, %ebx\n"             // sigreturn() should never return
     // NOTE(review): this copies the 66 that was just placed in %ebx into
     // %eax, while the comment names NR_exit -- confirm the intended system
     // call number.
     "mov  %ebx, %eax\n"            // NR_exit
     "jmp  0b\n"
   "1:cmp  $173, %eax\n"            // NR_rt_sigreturn
     "jnz  4f\n"

     // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
     // This is possible since the first part of signal stack frames have
     // stayed very stable since the earliest kernel versions. While never
     // officially documented, lots of user space applications rely on this
     // part of the ABI, and kernel developers have been careful to maintain
     // backwards compatibility.
     // In general, the rt_sigframe includes a lot of extra information that
     // the signal handler can look at. Most notably, this means a complete
     // siginfo record.
     // Fortunately though, the kernel doesn't look at any of this extra data
     // when returning from a signal handler. So, we can safely convert an
     // rt_sigframe to a legacy sigframe, discarding the extra data in the
     // process. Interestingly, the legacy signal frame is actually larger than
     // the rt signal frame, as it includes a lot more padding.
     "sub  $0x1C4, %esp\n"          // a legacy signal stack is much larger
     "mov  0x1CC(%esp), %eax\n"     // push signal number
     "push %eax\n"
     "lea  0x270(%esp), %esi\n"     // copy siginfo register values
     "lea  0x4(%esp), %edi\n"       //     into new location
     "mov  $0x16, %ecx\n"           // 0x16 32-bit words
     "cld\n"
     "rep movsl\n"
     "mov  0x2C8(%esp), %ebx\n"     // copy first half of signal mask
     "mov  %ebx, 0x54(%esp)\n"
     "call 2f\n"                    // call/pop yields the address of label 2
   "2:pop  %esi\n"
     "add  $(3f-2b), %esi\n"        // %esi = address of label 3
     "push %esi\n"                  // push restorer function
     "lea  0x2D4(%esp), %edi\n"     // patch up retcode magic numbers
     "movb $2, %cl\n"               // by copying two words from label 3
     "rep movsl\n"
     "ret\n"                        // return to restorer function
   "3:pop  %eax\n"                  // remove dummy argument (signo)
     "mov  $119, %eax\n"            // NR_sigreturn (same number as above)
     "int  $0x80\n"


     // Preserve all registers
   "4:push %ebp\n"
     "push $0xDEADBEEF\n"           // marker used by breakpad
     "push %ebx\n"
     "push %ecx\n"
     "push %edx\n"
     "push %esi\n"
     "push %edi\n"

     // Align stack pointer, so that called functions can push SSE registers
     // onto stack. This apparently is a requirement of the x86-32 ABI.
     "mov  %esp, %ebp\n"
     "and  $-16, %esp\n"
     "sub $4, %esp\n"
     "push %ebp\n"                  // push old un-aligned stack pointer
     "lea  0x14(%ebp), %ebp\n"      // frame pointer points to 0xDEADBEEF
     "push %eax\n"
     "mov  4(%ebp), %eax\n"         // push original value of %ebp
     "xchg %eax, 0(%esp)\n"         // (xchg also restores %eax)

     // Convert from syscall calling conventions to C calling conventions
     "push %edi\n"
     "push %esi\n"
     "push %edx\n"
     "push %ecx\n"
     "push %ebx\n"
     "push %eax\n"

     // Check range of system call
     // call/pop obtains the current address for position-independent access
     // to the GOT. The comparison is unsigned ("ja").
     "call 5f\n"
   "5:pop  %edx\n"
     "add $(_GLOBAL_OFFSET_TABLE_+(.-5b)), %edx\n"
     "mov playground$maxSyscall@GOT(%edx), %edx\n"
     "cmp 0(%edx), %eax\n"
     "ja  14f\n"

     // We often have long sequences of calls to gettimeofday(). This is
     // needlessly expensive. Coalesce them into a single call.
     //
     // We keep track of state in TLS storage that we can access through
     // the %fs segment register. See trusted_thread.cc for the exact
     // memory layout.
     //
     // TODO(markus): maybe, we should proactively call gettimeofday() and
     //               clock_gettime(), whenever we talk to the trusted thread?
     //               or maybe, if we have recently seen requests to compute
     //               the time. There might be a repeated pattern of those.
     "cmp  $78, %eax\n"             // __NR_gettimeofday
     "jnz  10f\n"
     "cmp  %eax, %fs:0x102C-0x58\n" // last system call
     "jnz  7f\n"

     // This system call and the last system call prior to this one both are
     // calls to gettimeofday(). Try to avoid making the new call and just
     // return the same result as in the previous call.
     // Just in case the caller is spinning on the result from gettimeofday(),
     // every so often, call the actual system call.
     "decl %fs:0x1030-0x58\n"       // countdown calls to gettimeofday()
     "jz   7f\n"

     // Atomically read the 64bit word representing last-known timestamp and
     // return it to the caller. On x86-32 this is a little more complicated and
     // requires the use of the cmpxchg8b instruction.
     // With %edx:%eax set equal to %ecx:%ebx, cmpxchg8b either rewrites the
     // value that is already stored or loads the stored value into %edx:%eax;
     // in both cases %edx:%eax ends up holding the stored 64bit word. It is
     // then written through %ebx (the first system call argument).
     "mov  %ebx, %eax\n"
     "mov  %ecx, %edx\n"
     "call 6f\n"
   "6:pop %ebp\n"
     "add $(100f-6b), %ebp\n"       // %ebp = address of the slot at label 100
     "lock; cmpxchg8b 0(%ebp)\n"
     "mov  %eax, 0(%ebx)\n"
     "mov  %edx, 4(%ebx)\n"
     "xor  %eax, %eax\n"            // return value 0
     "add  $28, %esp\n"             // drop six arguments and saved %ebp value
     "jmp  13f\n"

     // This is a call to gettimeofday(), but we don't have a valid cached
     // result, yet.
   "7:mov  %eax, %fs:0x102C-0x58\n" // remember syscall number
     "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
     "call playground$defaultSystemCallHandler@PLT\n"

     // Returned from gettimeofday(). Remember return value, in case the
     // application calls us again right away.
     // Again, this has to happen atomically and requires cmpxchg8b.
     // The loop at label 9 retries until the new value in %ecx:%ebx has been
     // stored.
     "mov 4(%ebx), %ecx\n"
     "mov 0(%ebx), %ebx\n"
     "call 8f\n"
   "8:pop %ebp\n"
     "add $(100f-8b), %ebp\n"
     "mov 0(%ebp), %eax\n"
     "mov 4(%ebp), %edx\n"
   "9:lock; cmpxchg8b 0(%ebp)\n"
     "jnz 9b\n"
     "xor %eax, %eax\n"
     "jmp 15f\n"

     // Remember the number of the last system call made. We deliberately do
     // not remember calls to gettid(), as we have often seen long sequences
     // of calls to just gettimeofday() and gettid(). In that situation, we
     // would still like to coalesce the gettimeofday() calls.
  "10:cmp $224, %eax\n"             // __NR_gettid
     "jz  11f\n"
     "mov  %eax, %fs:0x102C-0x58\n" // remember syscall number

     // Retrieve function call from system call table (c.f.system_call_table.cc)
     // We have three different types of entries; zero for denied system calls,
     // that should be handled by the defaultSystemCallHandler(); minus one
     // for unrestricted system calls that need to be forwarded to the trusted
     // thread; and function pointers to specific handler functions.
     // NOTE(review): the check below sends the values 0 and 1 to the fallback
     // handler, which suggests UNRESTRICTED_SYSCALL is 1 rather than minus
     // one -- confirm against system_call_table.
     // Table entries are indexed with a stride of 8 bytes (shl $3); the
     // handler pointer is read from offset 0 of the entry.
  "11:shl  $3, %eax\n"
     "call 12f\n"
  "12:pop  %ebx\n"
     "add  $(_GLOBAL_OFFSET_TABLE_+(.-12b)), %ebx\n"
     "mov  playground$syscallTable@GOT(%ebx), %ebx\n"
     "add  0(%ebx), %eax\n"
     "mov  0(%eax), %eax\n"

     // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
     // jump to fallback handler.
     "cmp  $1, %eax\n"
     "jbe  14f\n"
     "add  $4, %esp\n"              // drop the system call number argument
     "call *%eax\n"
     "add  $24, %esp\n"             // drop remaining arguments and %ebp value

     // Restore CPU registers, except for %eax which was set by the system call.
     // The first pop reloads the un-aligned stack pointer saved during
     // alignment.
  "13:pop  %esp\n"
     "pop  %edi\n"
     "pop  %esi\n"
     "pop  %edx\n"
     "pop  %ecx\n"
     "pop  %ebx\n"
     "pop  %ebp\n"                  // 0xDEADBEEF marker
     "pop  %ebp\n"

     // Remove fake return address. This is added in the patching code in
     // library.cc and it makes stack traces a little cleaner.
     "add  $4, %esp\n"

     // Return to caller
     "ret\n"

     // Call default handler.
  "14:call playground$defaultSystemCallHandler@PLT\n"
  "15:add  $28, %esp\n"             // drop six arguments and saved %ebp value
     "jmp  13b\n"

     // 8 byte aligned, zero-initialized 8 byte slot (label 100) that the
     // cmpxchg8b sequences above use for the last gettimeofday() result.
     ".pushsection \".bss\"\n"
     ".balign 8\n"
 "100:.byte 0, 0, 0, 0, 0, 0, 0, 0\n"
     ".popsection\n"

     #else
     #error Unsupported target platform
     #endif
     ".size playground$syscallEntryPointWithFrame,"
         ".-playground$syscallEntryPointWithFrame\n"
     ".popsection\n"
 );


 void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
                                         void* arg2, void* arg3, void* arg4,
                                         void* arg5) {
   // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.

   // We try to avoid intercepting read(), and write(), as these system calls
   // are not restricted in Seccomp mode. But depending on the exact
   // instruction sequence in libc, we might not be able to reliably
   // filter out these system calls at the time when we instrument the code.
   SysCalls  sys;
   long      rc;
   long long tm;
   switch (syscallNum) {
     case __NR_read:
       Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
       rc             = sys.read((long)arg0, arg1, (size_t)arg2);
       break;
     case __NR_write:
       Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
       rc             = sys.write((long)arg0, arg1, (size_t)arg2);
       break;
     default:
       if (Debug::isEnabled()) {
         // In debug mode, prevent stderr from being closed
         if (syscallNum == __NR_close && arg0 == (void *)2)
           return 0;
       }

       if ((unsigned)syscallNum <= SyscallTable::maxSyscall &&
          SyscallTable::syscallTable[syscallNum].handler==UNRESTRICTED_SYSCALL){
         Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
      perform_unrestricted:
         struct {
           int          sysnum;
           void*        unrestricted_req[6];
         } __attribute__((packed)) request = {
           syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };

         int   thread = threadFdPub();
         void* rc;
         if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||
             read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) {
           die("Failed to forward unrestricted system call");
         }
         Debug::elapsed(tm, syscallNum);
         return rc;
       } else if (Debug::isEnabled()) {
         Debug::syscall(&tm, syscallNum,
                        "In production mode, this call would be disallowed");
         goto perform_unrestricted;
       } else {
         return (void *)-ENOSYS;
       }
   }
   if (rc < 0) {
     rc               = -sys.my_errno;
   }
   Debug::elapsed(tm, syscallNum);
   return (void *)rc;
 }

 } // namespace
	// Copyright (c) 2010 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "syscall_entrypoint.h"

	#include "debug.h"
	#include "sandbox_impl.h"
	#include "system_call_table.h"

	namespace playground {

	// TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file
	//
	// File-scope assembly that emits three internal function symbols into .text:
	// playground$sandbox_clone: wrapper that adjusts one argument and
	// tail-jumps to playground$sandbox__clone.
	// playground$syscallEntryPointNoFrame: pushes a duplicate of the return
	// address and falls through into playground$syscallEntryPointWithFrame.
	// playground$syscallEntryPointWithFrame: saves registers, dispatches the
	// system call through syscallTable or defaultSystemCallHandler(), restores
	// registers and returns.
	// Each symbol has a separate x86-64 and i386 body, selected by the
	// preprocessor; any other target is a compile-time error.
	asm(
	".pushsection .text, \"ax\", @progbits\n"

	// This is the special wrapper for the clone() system call. The code
	// relies on the stack layout of the system call entrypoint (c.f. below). It
	// passes the stack pointer as an additional argument to sandbox__clone(),
	// so that upon starting the child, register values can be restored and
	// the child can start executing at the correct IP, instead of trying to
	// run in the trusted thread.
	"playground$sandbox_clone:"
	".internal playground$sandbox_clone\n"
	".globl playground$sandbox_clone\n"
	".type playground$sandbox_clone, @function\n"
	#if defined(__x86_64__)
	// Skip the 8 byte return address into the system call entrypoint. The
	// following bytes are the saved register values that we need to restore
	// upon return from clone() in the new thread.
	"lea 8(%rsp), %r9\n"
	"jmp playground$sandbox__clone\n"
	#elif defined(__i386__)
	// As i386 passes function arguments on the stack, we need to skip a few
	// more values before we can get to the saved registers.
	// The word at 28(%esp) is copied into the slot at 24(%esp) before the
	// tail-jump.
	"mov 28(%esp), %eax\n"
	"mov %eax, 24(%esp)\n"
	"jmp playground$sandbox__clone\n"
	#else
	#error Unsupported target platform
	#endif
	".size playground$sandbox_clone, .-playground$sandbox_clone\n"


	// This is the entrypoint which is called by the untrusted code, trying to
	// make a system call.
	// It only pushes a second copy of the return address and then runs
	// straight into syscallEntryPointWithFrame; there is no jump or return
	// between the two symbols.
	"playground$syscallEntryPointNoFrame:"
	".internal playground$syscallEntryPointNoFrame\n"
	".globl playground$syscallEntryPointNoFrame\n"
	".type playground$syscallEntryPointNoFrame, @function\n"
	#if defined(__x86_64__)
	"mov 0(%rsp), %r11\n" // add fake return address by duplicating
	"push %r11\n" // real return address
	/* fall through */
	#elif defined(__i386__)
	"push %eax\n" // add fake return address, which in this
	"mov 4(%esp), %eax\n" // case is identical to the real return
	"xchg %eax, 0(%esp)\n" // address (xchg also restores %eax)
	/* fall through */
	#else
	#error Unsupported target platform
	#endif
	".size playground$syscallEntryPointNoFrame, "
	".-playground$syscallEntryPointNoFrame\n"

	// Entry point that expects two return addresses on the stack: the fake
	// one on top and the real one below it (see the "Remove fake return
	// address" step before each "ret").
	"playground$syscallEntryPointWithFrame:"
	".internal playground$syscallEntryPointWithFrame\n"
	".globl playground$syscallEntryPointWithFrame\n"
	".type playground$syscallEntryPointWithFrame, @function\n"
	#if defined(__x86_64__)
	// Check for rt_sigreturn(). It needs to be handled specially.
	"cmp $15, %rax\n" // NR_rt_sigreturn
	"jnz 1f\n"
	// 0x90 = two 8 byte return addresses + 128 bytes
	"add $0x90, %rsp\n" // pop return addresses and red zone
	"0:syscall\n" // rt_sigreturn() is unrestricted
	"mov $66, %edi\n" // rt_sigreturn() should never return
	"mov $231, %eax\n" // NR_exit_group
	"jmp 0b\n"

	// Save all registers
	"1:push %rbp\n"
	"movq $0xDEADBEEFDEADBEEF, %rbp\n" // marker used by breakpad to remove
	"push %rbp\n" // seccomp-sandbox's stack frame from dumps
	"mov %rsp, %rbp\n" // frame pointer points at the marker
	"push %rbx\n"
	"push %rcx\n"
	"push %rdx\n"
	"push %rsi\n"
	"push %rdi\n"
	"push %r8\n"
	"push %r9\n"
	"push %r10\n"
	"push %r11\n"
	"push %r12\n"
	"push %r13\n"
	"push %r14\n"
	"push %r15\n"

	// TODO(markus): On x86-32 we have to explicitly align the stack. Do we
	// also have to do this on x86-64? Empirical evidence
	// suggests, we are OK -- but we might have to revisit this
	// decision.

	// Check range of system call
	// The comparison is unsigned ("ja"), so every value above maxSyscall goes
	// to the default handler at label 3.
	"mov playground$maxSyscall@GOTPCREL(%rip), %rcx\n"
	"cmp 0(%rcx), %eax\n"
	"ja 3f\n"

	// Retrieve function call from system call table (c.f.system_call_table.cc)
	// We have three different types of entries; zero for denied system calls,
	// that should be handled by the defaultSystemCallHandler(); minus one
	// for unrestricted system calls that need to be forwarded to the trusted
	// thread; and function pointers to specific handler functions.
	// NOTE(review): the check below sends the values 0 and 1 to the fallback
	// handler, which suggests UNRESTRICTED_SYSCALL is 1 rather than minus
	// one -- confirm against system_call_table.
	// Table entries are indexed with a stride of 16 bytes (shl $4); the
	// handler pointer is read from offset 0 of the entry.
	"mov %rax, %rcx\n"
	"shl $4, %rcx\n"
	"mov playground$syscallTable@GOTPCREL(%rip), %r11\n"
	"mov 0(%r11), %r11\n"
	"add %r11, %rcx\n"
	"mov 0(%rcx), %rcx\n"

	// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
	// jump to fallback handler.
	"cmp $1, %rcx\n"
	"jbe 3f\n"
	"xchg %r10, %rcx\n" // Syscall to userspace calling conventions
	"call *%r10\n"
	"2:"

	// Restore CPU registers, except for %rax which was set by the system call.
	"pop %r15\n"
	"pop %r14\n"
	"pop %r13\n"
	"pop %r12\n"
	"pop %r11\n"
	"pop %r10\n"
	"pop %r9\n"
	"pop %r8\n"
	"pop %rdi\n"
	"pop %rsi\n"
	"pop %rdx\n"
	"pop %rcx\n"
	"pop %rbx\n"
	"pop %rbp\n" // 0xDEADBEEF marker
	"pop %rbp\n"

	// Remove fake return address. This is added in the patching code in
	// library.cc and it makes stack traces a little cleaner.
	"add $8, %rsp\n"

	// Return to caller
	"ret\n"

	"3:"
	// If we end up calling a specific handler, we don't need to know the
	// system call number. However, in the generic case, we do. Shift
	// registers so that the system call number becomes visible as the
	// first function argument.
	// The original %r9 is pushed so that it is passed on the stack; it is
	// popped again after the call.
	"push %r9\n"
	"mov %r8, %r9\n"
	"mov %r10, %r8\n"
	"mov %rdx, %rcx\n"
	"mov %rsi, %rdx\n"
	"mov %rdi, %rsi\n"
	"mov %rax, %rdi\n"

	// Call default handler.
	"call playground$defaultSystemCallHandler\n"
	"pop %r9\n"
	"jmp 2b\n"
	#elif defined(__i386__)
	"cmp $119, %eax\n" // NR_sigreturn
	"jnz 1f\n"
	"add $0x8, %esp\n" // pop return address
	"0:int $0x80\n" // sigreturn() is unrestricted
	"mov $66, %ebx\n" // sigreturn() should never return
	// NOTE(review): this copies the 66 that was just placed in %ebx into
	// %eax, while the comment names NR_exit -- confirm the intended system
	// call number.
	"mov %ebx, %eax\n" // NR_exit
	"jmp 0b\n"
	"1:cmp $173, %eax\n" // NR_rt_sigreturn
	"jnz 4f\n"

	// Convert rt_sigframe into sigframe, allowing us to call sigreturn().
	// This is possible since the first part of signal stack frames have
	// stayed very stable since the earliest kernel versions. While never
	// officially documented, lots of user space applications rely on this
	// part of the ABI, and kernel developers have been careful to maintain
	// backwards compatibility.
	// In general, the rt_sigframe includes a lot of extra information that
	// the signal handler can look at. Most notably, this means a complete
	// siginfo record.
	// Fortunately though, the kernel doesn't look at any of this extra data
	// when returning from a signal handler. So, we can safely convert an
	// rt_sigframe to a legacy sigframe, discarding the extra data in the
	// process. Interestingly, the legacy signal frame is actually larger than
	// the rt signal frame, as it includes a lot more padding.
	"sub $0x1C4, %esp\n" // a legacy signal stack is much larger
	"mov 0x1CC(%esp), %eax\n" // push signal number
	"push %eax\n"
	"lea 0x270(%esp), %esi\n" // copy siginfo register values
	"lea 0x4(%esp), %edi\n" // into new location
	"mov $0x16, %ecx\n" // 0x16 32-bit words
	"cld\n"
	"rep movsl\n"
	"mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask
	"mov %ebx, 0x54(%esp)\n"
	"call 2f\n" // call/pop yields the address of label 2
	"2:pop %esi\n"
	"add $(3f-2b), %esi\n" // %esi = address of label 3
	"push %esi\n" // push restorer function
	"lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers
	"movb $2, %cl\n" // by copying two words from label 3
	"rep movsl\n"
	"ret\n" // return to restorer function
	"3:pop %eax\n" // remove dummy argument (signo)
	"mov $119, %eax\n" // NR_sigreturn (same number as above)
	"int $0x80\n"


	// Preserve all registers
	"4:push %ebp\n"
	"push $0xDEADBEEF\n" // marker used by breakpad
	"push %ebx\n"
	"push %ecx\n"
	"push %edx\n"
	"push %esi\n"
	"push %edi\n"

	// Align stack pointer, so that called functions can push SSE registers
	// onto stack. This apparently is a requirement of the x86-32 ABI.
	"mov %esp, %ebp\n"
	"and $-16, %esp\n"
	"sub $4, %esp\n"
	"push %ebp\n" // push old un-aligned stack pointer
	"lea 0x14(%ebp), %ebp\n" // frame pointer points to 0xDEADBEEF
	"push %eax\n"
	"mov 4(%ebp), %eax\n" // push original value of %ebp
	"xchg %eax, 0(%esp)\n" // (xchg also restores %eax)

	// Convert from syscall calling conventions to C calling conventions
	"push %edi\n"
	"push %esi\n"
	"push %edx\n"
	"push %ecx\n"
	"push %ebx\n"
	"push %eax\n"

	// Check range of system call
	// call/pop obtains the current address for position-independent access
	// to the GOT. The comparison is unsigned ("ja").
	"call 5f\n"
	"5:pop %edx\n"
	"add $(_GLOBAL_OFFSET_TABLE_+(.-5b)), %edx\n"
	"mov playground$maxSyscall@GOT(%edx), %edx\n"
	"cmp 0(%edx), %eax\n"
	"ja 14f\n"

	// We often have long sequences of calls to gettimeofday(). This is
	// needlessly expensive. Coalesce them into a single call.
	//
	// We keep track of state in TLS storage that we can access through
	// the %fs segment register. See trusted_thread.cc for the exact
	// memory layout.
	//
	// TODO(markus): maybe, we should proactively call gettimeofday() and
	// clock_gettime(), whenever we talk to the trusted thread?
	// or maybe, if we have recently seen requests to compute
	// the time. There might be a repeated pattern of those.
	"cmp $78, %eax\n" // __NR_gettimeofday
	"jnz 10f\n"
	"cmp %eax, %fs:0x102C-0x58\n" // last system call
	"jnz 7f\n"

	// This system call and the last system call prior to this one both are
	// calls to gettimeofday(). Try to avoid making the new call and just
	// return the same result as in the previous call.
	// Just in case the caller is spinning on the result from gettimeofday(),
	// every so often, call the actual system call.
	"decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday()
	"jz 7f\n"

	// Atomically read the 64bit word representing last-known timestamp and
	// return it to the caller. On x86-32 this is a little more complicated and
	// requires the use of the cmpxchg8b instruction.
	// With %edx:%eax set equal to %ecx:%ebx, cmpxchg8b either rewrites the
	// value that is already stored or loads the stored value into %edx:%eax;
	// in both cases %edx:%eax ends up holding the stored 64bit word. It is
	// then written through %ebx (the first system call argument).
	"mov %ebx, %eax\n"
	"mov %ecx, %edx\n"
	"call 6f\n"
	"6:pop %ebp\n"
	"add $(100f-6b), %ebp\n" // %ebp = address of the slot at label 100
	"lock; cmpxchg8b 0(%ebp)\n"
	"mov %eax, 0(%ebx)\n"
	"mov %edx, 4(%ebx)\n"
	"xor %eax, %eax\n" // return value 0
	"add $28, %esp\n" // drop six arguments and saved %ebp value
	"jmp 13f\n"

	// This is a call to gettimeofday(), but we don't have a valid cached
	// result, yet.
	"7:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
	"movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
	"call playground$defaultSystemCallHandler@PLT\n"

	// Returned from gettimeofday(). Remember return value, in case the
	// application calls us again right away.
	// Again, this has to happen atomically and requires cmpxchg8b.
	// The loop at label 9 retries until the new value in %ecx:%ebx has been
	// stored.
	"mov 4(%ebx), %ecx\n"
	"mov 0(%ebx), %ebx\n"
	"call 8f\n"
	"8:pop %ebp\n"
	"add $(100f-8b), %ebp\n"
	"mov 0(%ebp), %eax\n"
	"mov 4(%ebp), %edx\n"
	"9:lock; cmpxchg8b 0(%ebp)\n"
	"jnz 9b\n"
	"xor %eax, %eax\n"
	"jmp 15f\n"

	// Remember the number of the last system call made. We deliberately do
	// not remember calls to gettid(), as we have often seen long sequences
	// of calls to just gettimeofday() and gettid(). In that situation, we
	// would still like to coalesce the gettimeofday() calls.
	"10:cmp $224, %eax\n" // __NR_gettid
	"jz 11f\n"
	"mov %eax, %fs:0x102C-0x58\n" // remember syscall number

	// Retrieve function call from system call table (c.f.system_call_table.cc)
	// We have three different types of entries; zero for denied system calls,
	// that should be handled by the defaultSystemCallHandler(); minus one
	// for unrestricted system calls that need to be forwarded to the trusted
	// thread; and function pointers to specific handler functions.
	// NOTE(review): the check below sends the values 0 and 1 to the fallback
	// handler, which suggests UNRESTRICTED_SYSCALL is 1 rather than minus
	// one -- confirm against system_call_table.
	// Table entries are indexed with a stride of 8 bytes (shl $3); the
	// handler pointer is read from offset 0 of the entry.
	"11:shl $3, %eax\n"
	"call 12f\n"
	"12:pop %ebx\n"
	"add $(_GLOBAL_OFFSET_TABLE_+(.-12b)), %ebx\n"
	"mov playground$syscallTable@GOT(%ebx), %ebx\n"
	"add 0(%ebx), %eax\n"
	"mov 0(%eax), %eax\n"

	// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
	// jump to fallback handler.
	"cmp $1, %eax\n"
	"jbe 14f\n"
	"add $4, %esp\n" // drop the system call number argument
	"call *%eax\n"
	"add $24, %esp\n" // drop remaining arguments and %ebp value

	// Restore CPU registers, except for %eax which was set by the system call.
	// The first pop reloads the un-aligned stack pointer saved during
	// alignment.
	"13:pop %esp\n"
	"pop %edi\n"
	"pop %esi\n"
	"pop %edx\n"
	"pop %ecx\n"
	"pop %ebx\n"
	"pop %ebp\n" // 0xDEADBEEF marker
	"pop %ebp\n"

	// Remove fake return address. This is added in the patching code in
	// library.cc and it makes stack traces a little cleaner.
	"add $4, %esp\n"

	// Return to caller
	"ret\n"

	// Call default handler.
	"14:call playground$defaultSystemCallHandler@PLT\n"
	"15:add $28, %esp\n" // drop six arguments and saved %ebp value
	"jmp 13b\n"

	// 8 byte aligned, zero-initialized 8 byte slot (label 100) that the
	// cmpxchg8b sequences above use for the last gettimeofday() result.
	".pushsection \".bss\"\n"
	".balign 8\n"
	"100:.byte 0, 0, 0, 0, 0, 0, 0, 0\n"
	".popsection\n"

	#else
	#error Unsupported target platform
	#endif
	".size playground$syscallEntryPointWithFrame,"
	".-playground$syscallEntryPointWithFrame\n"
	".popsection\n"
	);


	// Fallback handler for system calls that have no dedicated handler function.
	//
	// syscallNum is the system call number and arg0..arg5 are its raw arguments.
	// The return value is the system call result, or a negated errno value on
	// failure, packed into a void*.
	//
	//  - read() and write() are executed directly through SysCalls.
	//  - In debug mode, close(2) is ignored so that stderr stays open.
	//  - Calls marked UNRESTRICTED_SYSCALL in the system call table are forwarded
	//    over the descriptor returned by threadFdPub() and the reply is returned.
	//  - Anything else yields -ENOSYS, unless debug mode is enabled, in which
	//    case the call is logged and forwarded as well.
	void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
	                                        void* arg2, void* arg3, void* arg4,
	                                        void* arg5) {
	  // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.

	  // We try to avoid intercepting read(), and write(), as these system calls
	  // are not restricted in Seccomp mode. But depending on the exact
	  // instruction sequence in libc, we might not be able to reliably
	  // filter out these system calls at the time when we instrument the code.
	  SysCalls  sys;
	  long      rc;
	  long long tm;  // filled in by Debug::syscall(), used by Debug::elapsed()
	  switch (syscallNum) {
	    case __NR_read:
	      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
	      rc             = sys.read((long)arg0, arg1, (size_t)arg2);
	      break;
	    case __NR_write:
	      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
	      rc             = sys.write((long)arg0, arg1, (size_t)arg2);
	      break;
	    default:
	      if (Debug::isEnabled()) {
	        // In debug mode, prevent stderr from being closed
	        if (syscallNum == __NR_close && arg0 == (void *)2)
	          return 0;
	      }

	      if ((unsigned)syscallNum <= SyscallTable::maxSyscall &&
	          SyscallTable::syscallTable[syscallNum].handler==UNRESTRICTED_SYSCALL){
	        Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
	      perform_unrestricted:
	        // Forward the call: write the packed request, then read back the
	        // result.
	        struct {
	          int          sysnum;
	          void*        unrestricted_req[6];
	        } __attribute__((packed)) request = {
	          syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };

	        int   thread = threadFdPub();
	        void* rc;
	        if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||
	            read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) {
	          die("Failed to forward unrestricted system call");
	        }
	        Debug::elapsed(tm, syscallNum);
	        return rc;
	      } else if (Debug::isEnabled()) {
	        Debug::syscall(&tm, syscallNum,
	                       "In production mode, this call would be disallowed");
	        goto perform_unrestricted;
	      } else {
	        return (void *)-ENOSYS;
	      }
	  }
	  // Only read() and write() reach this point; report failures as a negated
	  // errno value.
	  if (rc < 0) {
	    rc               = -sys.my_errno;
	  }
	  Debug::elapsed(tm, syscallNum);
	  return (void *)rc;
	}

	} // namespace