// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"bytes"
	"fmt"
	"runtime"
	"runtime/trace"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/goid"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
type taskRunState interface {
	// execute executes the code associated with this state over the given
	// task and returns the following state. If execute returns nil, the
	// task goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}
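
// For illustration only, a sketch (not a state used by this package) of the
// typecast-nil pattern described above: a data-free state implements execute
// on a pointer receiver, and transitions return typecast nils, so no
// allocation occurs.
//
//	type runExample struct{} // hypothetical
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// ... this state's work; the nil receiver is never dereferenced ...
//		return (*runApp)(nil) // data-free next state: typecast nil
//	}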

// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace
// to make it visible in stack dumps. The goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	atomic.StoreInt64(&t.goid, goid.Get())

	// Construct t.blockingTimer here rather than in Task.afterLoad(): it
	// can't be reconstructed during restore because
	// kernel.timekeeper.SetClocks() hasn't been called yet at that point.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan

	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that, when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		// before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		// exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		// ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()
			t.p.Release()

			// Deferring this store triggers a false positive in the race
			// detector (https://github.com/golang/go/issues/42599).
			atomic.StoreInt64(&t.goid, 0)
			// Keep the argument alive because stack traces for dead
			// variables may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}
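
// As a sketch of how threadID is used: in a goroutine dump, the task
// goroutine for (say) TID 123 can be located by run's argument value; the
// exact formatting depends on the Go runtime version:
//
//	goroutine 42 [running]:
//	gvisor.dev/gvisor/pkg/sentry/kernel.(*Task).run(0xc000123400, 0x7b)
//
// where 0x7b == 123 is the threadID argument.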

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	if atomic.LoadInt32(&t.stopCount) == 0 {
		return
	}
	t.Deactivate()
	// NOTE(b/30316266): t.Activate() must be called without any locks held,
	// so this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount > 0 {
		t.endStopCond.Wait()
	}
}
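
// For orientation, a sketch of how another goroutine raises and releases a
// stop, assuming the TaskStop helpers in task_stop.go:
//
//	t.tg.signalHandlers.mu.Lock()
//	t.beginInternalStopLocked(someStop) // raises t.stopCount, interrupts t
//	t.tg.signalHandlers.mu.Unlock()
//	// ... the task goroutine parks in doStop ...
//	t.tg.signalHandlers.mu.Lock()
//	t.endInternalStopLocked() // drops stopCount; endStopCond wakes doStop
//	t.tg.signalHandlers.mu.Unlock()
//
// someStop stands in for any TaskStop implementation.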

func (*runApp) handleCPUIDInstruction(t *Task) error {
	if len(arch.CPUIDInstruction) == 0 {
		// CPUID emulation isn't supported, but this code can still be
		// reached, because the ptrace platform returns
		// ErrContextSignalCPUID on page faults too. See
		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for details.
		return platform.ErrContextSignal
	}
	// Is this a CPUID instruction?
	region := trace.StartRegion(t.traceContext, cpuidRegion)
	expected := arch.CPUIDInstruction[:]
	found := make([]byte, len(expected))
	_, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
	if err == nil && bytes.Equal(expected, found) {
		// Emulate the CPUID instruction, then skip past it.
		t.Arch().CPUIDEmulate(t)
		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
		region.End()

		return nil
	}
	region.End() // Not an actual CPUID, but the copy-in was still required.
	return platform.ErrContextSignal
}
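
// For reference: on amd64, arch.CPUIDInstruction is the two-byte CPUID
// opcode, so the comparison above reduces to
//
//	bytes.Equal(found, []byte{0x0f, 0xa2})
//
// at the faulting instruction pointer.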

// The runApp state checks for interrupts before executing untrusted
// application code.
//
// +stateify savable
type runApp struct{}

func (app *runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so
		// we must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// Execute any task work callbacks before returning to user space.
	if atomic.LoadInt32(&t.taskWorkCount) > 0 {
		t.taskWorkMu.Lock()
		queue := t.taskWork
		t.taskWork = nil
		atomic.StoreInt32(&t.taskWorkCount, 0)
		t.taskWorkMu.Unlock()

		// Do not hold taskWorkMu while executing task work, which may
		// register more work.
		for _, work := range queue {
			work.TaskWork(t)
		}
	}
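
	// The block above mirrors Linux's task_work mechanism: callbacks queued
	// by other goroutines (see task_work.go) run here, on the task
	// goroutine, before the next switch to user mode.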

	// We're about to switch to the application again. If there's still an
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may
	// unblock a pending signal, causing another interruption, but that
	// signal should not interact with the interrupted syscall.)
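	//
	// For example, assuming standard Linux restart semantics: a blocking
	// read(2) interrupted by a signal whose handler was installed with
	// SA_RESTART leaves ERESTARTSYS in the return register, and the code
	// below rewinds the instruction pointer so the application re-executes
	// the read.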
	if t.haveSyscallReturn {
		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
			if sre == syserror.ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}

	// Apply restartable sequences.
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
			// Linux writes the CPU on every preemption; we only do so if
			// it has changed, which means we may delay delivery of SIGSEGV
			// if rseqAddr/oldRSeqCPUAddr is invalid.
			cpu := int32(hostcpu.GetCPU())
			if t.rseqCPU != cpu {
				t.rseqCPU = cpu
				if err := t.rseqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
				if err := t.oldRSeqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
			}
		}
		t.rseqInterrupt()
	}
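
	// (The copy-outs above update the cpu_id fields of the application's
	// rseq areas; see include/uapi/linux/rseq.h. Restartable-sequence
	// critical sections read those fields to detect preemption and
	// migration.)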

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by
	// PTRACE_SETREGS whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP
	// is used (this includes our ptrace platform, by the way), so we should
	// only clear the single-step flag if we're responsible for setting it.
	// (clearSinglestep is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if
	// we single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets
	// their own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

	region := trace.StartRegion(t.traceContext, runRegion)
	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
	region.End()

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignalCPUID:
		if err := app.handleCPUIDInstruction(t); err == nil {
			// Resume execution.
			return (*runApp)(nil)
		}

		// The instruction at the given RIP was not a CPUID, so we fall
		// through to the default signal delivery behavior below.
		fallthrough

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a
		// synchronous signal (SIGSEGV, SIGBUS, etc.), it should be sent to
		// the application thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this
		// wasn't an application-generated signal and we should continue
		// execution normally.
		if at.Any() {
			region := trace.StartRegion(t.traceContext, faultRegion)
			addr := usermem.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
			region.End()
			if err == nil {
				// The fault was handled appropriately. We can resume
				// running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need to emulate?
			//
			// Note that we don't track vsyscalls as part of a specific
			// trace region. This is because regions don't stack, and the
			// actual system call will count as a region. We should be
			// able to easily identify vsyscalls by having a
			// <fault><syscall> pair.
			if at.Execute {
				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}
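
			// (For reference: on Linux/amd64 the vsyscall page sits at
			// fixed addresses, 0xffffffffff600000 for gettimeofday,
			// +0x400 for time, and +0x800 for getcpu; those are the
			// addresses LookupEmulate is expected to match.)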

			// Faults are common; log only at debug level.
			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// If the fault failed with a BusError, convert the signal
			// from SIGSEGV to SIGBUS. All other info bits (address, etc.)
			// stay the same.
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal
			// is legitimate and force it (work around the signal being
			// ignored or blocked) like Linux does. Conveniently, this is
			// even the correct behavior for SIGTRAP from single-stepping.
			t.forceSignal(sig, false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt; there's not much we can do.
			// We've already paid a decent cost by intercepting the signal,
			// so at this point we simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that rseq critical sections are interrupted and
		// per-thread CPU values are updated before the next
		// platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
		return (*runExit)(nil)
	}
}

// assertTaskGoroutine panics if the caller is not running on t's task
// goroutine.
func (t *Task) assertTaskGoroutine() {
	if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
	}
}
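
// A typical use, sketched with a hypothetical method: code that touches
// state private to the task goroutine asserts on entry instead of locking.
//
//	func (t *Task) touchTaskLocalState() { // hypothetical
//		t.assertTaskGoroutine()
//		// ... safe to access task-goroutine-local state ...
//	}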

// GoroutineID returns the ID of t's task goroutine.
func (t *Task) GoroutineID() int64 {
	return atomic.LoadInt64(&t.goid)
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}

// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	atomic.AddUint64(&t.yieldCount, 1)
	runtime.Gosched()
}