// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// We unconditionally report a single NUMA node. This also means that our
// "nodemask_t" is a single unsigned long (uint64).
const (
	maxNodes        = 1
	allowedNodemask = (1 << maxNodes) - 1
)

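// copyInNodemask copies in the nodemask at addr, checks that no bit outside
// allowedNodemask is set, and returns the mask's first unsigned long. As an
// illustration (values hypothetical): maxnode == 65 reads exactly one
// unsigned long, so a mask of 0x1 (node 0) is returned as-is, while 0x2
// (node 1) yields EINVAL.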
func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
	// "nodemask points to a bit mask of node IDs that contains up to maxnode
	// bits. The bit mask size is rounded to the next multiple of
	// sizeof(unsigned long), but the kernel will use bits only up to maxnode.
	// A NULL value of nodemask or a maxnode value of zero specifies the empty
	// set of nodes. If the value of maxnode is zero, the nodemask argument is
	// ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
	// because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
	// maxnode-1, not maxnode, as the number of bits.
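	// Illustratively: maxnode == 1 denotes the empty set (bits == 0), and it
	// takes maxnode == 65, not 64, to read a full unsigned long (bits == 64).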
	bits := maxnode - 1
	if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
		return 0, syserror.EINVAL
	}
	if bits == 0 {
		return 0, nil
	}
	// Copy in the whole nodemask.
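	// numUint64 is bits rounded up to a whole number of unsigned longs;
	// e.g. bits == 65 reads two.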
	numUint64 := (bits + 63) / 64
	buf := t.CopyScratchBuffer(int(numUint64) * 8)
	if _, err := t.CopyInBytes(addr, buf); err != nil {
		return 0, err
	}
	val := usermem.ByteOrder.Uint64(buf)
	// Check that only allowed bits in the first unsigned long in the nodemask
	// are set.
	if val&^allowedNodemask != 0 {
		return 0, syserror.EINVAL
	}
	// Check that all remaining bits in the nodemask are 0.
	for i := 8; i < len(buf); i++ {
		if buf[i] != 0 {
			return 0, syserror.EINVAL
		}
	}
	return val, nil
}

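// copyOutNodemask writes val as the first unsigned long of the nodemask at
// addr and zeroes the rest of the mask. As an illustration (values
// hypothetical): maxnode == 129, i.e. bits == 128, stores val in the first
// unsigned long and zeroes the second.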
func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
	// mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
	// bits.
	bits := maxnode - 1
	if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
		return syserror.EINVAL
	}
	if bits == 0 {
		return nil
	}
	// Copy out the first unsigned long in the nodemask.
	buf := t.CopyScratchBuffer(8)
	usermem.ByteOrder.PutUint64(buf, val)
	if _, err := t.CopyOutBytes(addr, buf); err != nil {
		return err
	}
	// Zero out remaining unsigned longs in the nodemask.
	if bits > 64 {
		remAddr, ok := addr.AddLength(8)
		if !ok {
			return syserror.EFAULT
		}
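		// The first unsigned long was written above, so ceil(bits/64) - 1
		// unsigned longs remain; e.g. bits == 128 gives remUint64 == 1.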
		remUint64 := (bits - 1) / 64
		if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
			AddressSpaceActive: true,
		}); err != nil {
			return err
		}
	}
	return nil
}

// GetMempolicy implements the syscall get_mempolicy(2).
func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	mode := args[0].Pointer()
	nodemask := args[1].Pointer()
	maxnode := args[2].Uint()
	addr := args[3].Pointer()
	flags := args[4].Uint()

	if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
		return 0, nil, syserror.EINVAL
	}
	nodeFlag := flags&linux.MPOL_F_NODE != 0
	addrFlag := flags&linux.MPOL_F_ADDR != 0
	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0

	// "EINVAL: The value specified by maxnode is less than the number of node
	// IDs supported by the system." - get_mempolicy(2)
	if nodemask != 0 && maxnode < maxNodes {
		return 0, nil, syserror.EINVAL
	}

	// "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
	// ignored and the set of nodes (memories) that the thread is allowed to
	// specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
	// absence of any mode flags) is returned in nodemask."
	if memsAllowed {
		// "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
		// MPOL_F_ADDR or MPOL_F_NODE."
		if nodeFlag || addrFlag {
			return 0, nil, syserror.EINVAL
		}
		if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
			return 0, nil, err
		}
		return 0, nil, nil
	}

	// "If flags specifies MPOL_F_ADDR, then information is returned about the
	// policy governing the memory address given in addr. ... If the mode
	// argument is not NULL, then get_mempolicy() will store the policy mode
	// and any optional mode flags of the requested NUMA policy in the location
	// pointed to by this argument. If nodemask is not NULL, then the nodemask
	// associated with the policy will be stored in the location pointed to by
	// this argument."
	if addrFlag {
		policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
		if err != nil {
			return 0, nil, err
		}
		if nodeFlag {
			// "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
			// get_mempolicy() will return the node ID of the node on which the
			// address addr is allocated into the location pointed to by mode.
			// If no page has yet been allocated for the specified address,
			// get_mempolicy() will allocate a page as if the thread had
			// performed a read (load) access to that address, and return the
			// ID of the node where that page was allocated."
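			// Reading a byte at addr forces the page to be allocated, per
			// the above. With a single node, the node ID is always 0, which
			// is also the value of linux.MPOL_DEFAULT.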
			buf := t.CopyScratchBuffer(1)
			_, err := t.CopyInBytes(addr, buf)
			if err != nil {
				return 0, nil, err
			}
			policy = linux.MPOL_DEFAULT // maxNodes == 1
		}
		if mode != 0 {
			if _, err := policy.CopyOut(t, mode); err != nil {
				return 0, nil, err
			}
		}
		if nodemask != 0 {
			if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
				return 0, nil, err
			}
		}
		return 0, nil, nil
	}

	// "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
	// not specify MPOL_F_ADDR and addr is not NULL." This is partially
	// inaccurate: if flags specifies MPOL_F_ADDR,
	// mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
	// just (usually) fail to find a VMA at address 0 and return EFAULT.
	if addr != 0 {
		return 0, nil, syserror.EINVAL
	}

	// "If flags is specified as 0, then information about the calling thread's
	// default policy (as set by set_mempolicy(2)) is returned, in the buffers
	// pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
	// not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
	// then get_mempolicy() will return in the location pointed to by a
	// non-NULL mode argument, the node ID of the next node that will be used
	// for interleaving of internal kernel pages allocated on behalf of the
	// thread."
	policy, nodemaskVal := t.NumaPolicy()
	if nodeFlag {
		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
			return 0, nil, syserror.EINVAL
		}
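		// The "next interleave node" can only be node 0. linux.MPOL_DEFAULT
		// (0) doubles as that node ID below.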
		policy = linux.MPOL_DEFAULT // maxNodes == 1
	}
	if mode != 0 {
		if _, err := policy.CopyOut(t, mode); err != nil {
			return 0, nil, err
		}
	}
	if nodemask != 0 {
		if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
			return 0, nil, err
		}
	}
	return 0, nil, nil
}

// SetMempolicy implements the syscall set_mempolicy(2).
func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	modeWithFlags := linux.NumaPolicy(args[0].Int())
	nodemask := args[1].Pointer()
	maxnode := args[2].Uint()

	modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
	if err != nil {
		return 0, nil, err
	}

	t.SetNumaPolicy(modeWithFlags, nodemaskVal)
	return 0, nil, nil
}

// Mbind implements the syscall mbind(2).
func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].Uint64()
	mode := linux.NumaPolicy(args[2].Int())
	nodemask := args[3].Pointer()
	maxnode := args[4].Uint()
	flags := args[5].Uint()

	if flags&^linux.MPOL_MF_VALID != 0 {
		return 0, nil, syserror.EINVAL
	}
	// "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
	// privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
	if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
		return 0, nil, syserror.EPERM
	}

	mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
	if err != nil {
		return 0, nil, err
	}

	// Since we claim to have only a single node, all MPOL_MF_* flags can be
	// ignored: every page is necessarily already on that node.
	err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
	return 0, nil, err
}

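// copyInMempolicyNodemask parses and validates the mode, mode flags, and
// nodemask arguments shared by set_mempolicy(2) and mbind(2), returning the
// effective mode (with flags) and the first unsigned long of the nodemask.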
func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
	flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
	mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
	if flags == linux.MPOL_MODE_FLAGS {
		// Can't specify both mode flags simultaneously.
		return 0, 0, syserror.EINVAL
	}
	if mode < 0 || mode >= linux.MPOL_MAX {
		// Must specify a valid mode.
		return 0, 0, syserror.EINVAL
	}

	var nodemaskVal uint64
	if nodemask != 0 {
		var err error
		nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
		if err != nil {
			return 0, 0, err
		}
	}

	switch mode {
	case linux.MPOL_DEFAULT:
		// "nodemask must be specified as NULL." - set_mempolicy(2). This is
		// inaccurate; Linux allows a nodemask to be specified, as long as it
		// is empty.
		if nodemaskVal != 0 {
			return 0, 0, syserror.EINVAL
		}
	case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
		// These require a non-empty nodemask.
		if nodemaskVal == 0 {
			return 0, 0, syserror.EINVAL
		}
	case linux.MPOL_PREFERRED:
		// This permits an empty nodemask, as long as no flags are set.
		if nodemaskVal == 0 && flags != 0 {
			return 0, 0, syserror.EINVAL
		}
	case linux.MPOL_LOCAL:
		// This requires an empty nodemask and no flags set ...
		if nodemaskVal != 0 || flags != 0 {
			return 0, 0, syserror.EINVAL
		}
		// ... and is implemented as MPOL_PREFERRED.
		mode = linux.MPOL_PREFERRED
	default:
		// Unknown mode, which we should have rejected above.
		panic(fmt.Sprintf("unknown mode: %v", mode))
	}

	return mode | flags, nodemaskVal, nil
}