blob: 364f30ca2a84b8150a92387c5109f2de5c7f85ff [file] [log] [blame]
// Copyright 2019 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package starlark
// This file defines a simple execution-time profiler for Starlark.
// It measures the wall time spent executing Starlark code, and emits a
// gzipped protocol message in pprof format (github.com/google/pprof).
//
// When profiling is enabled, the interpreter calls the profiler to
// indicate the start and end of each "span" or time interval. A leaf
// function (whether Go or Starlark) has a single span. A function that
// calls another function has spans for each interval in which it is the
// top of the stack. (A LOAD instruction also ends a span.)
//
// At the start of a span, the interpreter records the current time in
// the thread's topmost frame. At the end of the span, it obtains the
// time again and subtracts the span start time. The difference is added
// to an accumulator variable in the thread. If the accumulator exceeds
// some fixed quantum (10ms, say), the profiler records the current call
// stack and sends it to the profiler goroutine, along with the number
// of quanta, which are subtracted. For example, if the accumulator
// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
// which exceeds 10ms. The profiler records a stack with the value 20ms
// (2 quanta), and the accumulator is left with 8ms.
//
// The profiler goroutine converts the stacks into the pprof format and
// emits a gzip-compressed protocol message to the designated output
// file. We use a hand-written streaming proto encoder to avoid
// dependencies on pprof and proto, and to avoid the need to
// materialize the profile data structure in memory.
//
// A limitation of this profiler is that it measures wall time, which
// does not necessarily correspond to CPU time. A CPU profiler requires
// that only running (not runnable) threads are sampled; this is
// commonly achieved by having the kernel deliver a (PROF) signal to an
// arbitrary running thread, through setitimer(2). The CPU profiler in the
// Go runtime uses this mechanism, but it is not possible for a Go
// application to register a SIGPROF handler, nor is it possible for a
// Go handler for some other signal to read the stack pointer of
// the interrupted thread.
//
// Two caveats:
// (1) it is tempting to send the leaf Frame directly to the profiler
// goroutine instead of making a copy of the stack, since a Frame is a
// spaghetti stack--a linked list. However, as soon as execution
// resumes, the stack's Frame.pc values may be mutated, so Frames are
// not safe to share with the asynchronous profiler goroutine.
// (2) it is tempting to use Callables as keys in a map when tabulating
// the pprof protocols's Function entities. However, we cannot assume
// that Callables are valid map keys, and furthermore we must not
// pin function values in memory indefinitely as this may cause lambda
// values to keep their free variables live much longer than necessary.
// TODO(adonovan):
// - make Start/Stop fully thread-safe.
// - fix the pc hack.
// - experiment with other values of quantum.
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/binary"
"fmt"
"io"
"log"
"reflect"
"sync/atomic"
"time"
"unsafe"
"go.starlark.net/syntax"
)
// StartProfile enables time profiling of all Starlark threads,
// and writes a profile in pprof format to w.
// It must be followed by a call to StopProfiler to stop
// the profiler and finalize the profile.
//
// StartProfile returns an error if profiling was already enabled.
//
// StartProfile must not be called concurrently with Starlark execution.
func StartProfile(w io.Writer) error {
if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
return fmt.Errorf("profiler already running")
}
// TODO(adonovan): make the API fully concurrency-safe.
// The main challenge is racy reads/writes of profiler.events,
// and of send/close races on the channel it refers to.
// It's easy to solve them with a mutex but harder to do
// it efficiently.
profiler.events = make(chan *profEvent, 1)
profiler.done = make(chan error)
go profile(w)
return nil
}
// StopProfile stops the profiler started by a prior call to
// StartProfile and finalizes the profile. It returns an error if the
// profile could not be completed.
//
// StopProfile must not be called concurrently with Starlark execution.
func StopProfile() error {
// Terminate the profiler goroutine and get its result.
close(profiler.events)
err := <-profiler.done
profiler.done = nil
profiler.events = nil
atomic.StoreUint32(&profiler.on, 0)
return err
}
// globals
var profiler struct {
on uint32 // nonzero => profiler running
events chan *profEvent // profile events from interpreter threads
done chan error // indicates profiler goroutine is ready
}
func (thread *Thread) beginProfSpan() {
if profiler.events == nil {
return // profiling not enabled
}
thread.frameAt(0).spanStart = nanotime()
}
// TODO(adonovan): experiment with smaller values,
// which trade space and time for greater precision.
const quantum = 10 * time.Millisecond
func (thread *Thread) endProfSpan() {
if profiler.events == nil {
return // profiling not enabled
}
// Add the span to the thread's accumulator.
thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
if thread.proftime < quantum {
return
}
// Only record complete quanta.
n := thread.proftime / quantum
thread.proftime -= n * quantum
// Copy the stack.
// (We can't save thread.frame because its pc will change.)
ev := &profEvent{
thread: thread,
time: n * quantum,
}
ev.stack = ev.stackSpace[:0]
for i := range thread.stack {
fr := thread.frameAt(i)
ev.stack = append(ev.stack, profFrame{
pos: fr.Position(),
fn: fr.Callable(),
pc: fr.pc,
})
}
profiler.events <- ev
}
type profEvent struct {
thread *Thread // currently unused
time time.Duration
stack []profFrame
stackSpace [8]profFrame // initial space for stack
}
type profFrame struct {
fn Callable // don't hold this live for too long (prevents GC of lambdas)
pc uint32 // program counter (Starlark frames only)
pos syntax.Position // position of pc within this frame
}
// profile is the profiler goroutine.
// It runs until StopProfiler is called.
func profile(w io.Writer) {
// Field numbers from pprof protocol.
// See https://github.com/google/pprof/blob/master/proto/profile.proto
const (
Profile_sample_type = 1 // repeated ValueType
Profile_sample = 2 // repeated Sample
Profile_mapping = 3 // repeated Mapping
Profile_location = 4 // repeated Location
Profile_function = 5 // repeated Function
Profile_string_table = 6 // repeated string
Profile_time_nanos = 9 // int64
Profile_duration_nanos = 10 // int64
Profile_period_type = 11 // ValueType
Profile_period = 12 // int64
ValueType_type = 1 // int64
ValueType_unit = 2 // int64
Sample_location_id = 1 // repeated uint64
Sample_value = 2 // repeated int64
Sample_label = 3 // repeated Label
Label_key = 1 // int64
Label_str = 2 // int64
Label_num = 3 // int64
Label_num_unit = 4 // int64
Location_id = 1 // uint64
Location_mapping_id = 2 // uint64
Location_address = 3 // uint64
Location_line = 4 // repeated Line
Line_function_id = 1 // uint64
Line_line = 2 // int64
Function_id = 1 // uint64
Function_name = 2 // int64
Function_system_name = 3 // int64
Function_filename = 4 // int64
Function_start_line = 5 // int64
)
bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
gz := gzip.NewWriter(bufw)
enc := protoEncoder{w: gz}
// strings
stringIndex := make(map[string]int64)
str := func(s string) int64 {
i, ok := stringIndex[s]
if !ok {
i = int64(len(stringIndex))
enc.string(Profile_string_table, s)
stringIndex[s] = i
}
return i
}
str("") // entry 0
// functions
//
// function returns the ID of a Callable for use in Line.FunctionId.
// The ID is the same as the function's logical address,
// which is supplied by the caller to avoid the need to recompute it.
functionId := make(map[uintptr]uint64)
function := func(fn Callable, addr uintptr) uint64 {
id, ok := functionId[addr]
if !ok {
id = uint64(addr)
var pos syntax.Position
if fn, ok := fn.(callableWithPosition); ok {
pos = fn.Position()
}
name := fn.Name()
if name == "<toplevel>" {
name = pos.Filename()
}
nameIndex := str(name)
fun := new(bytes.Buffer)
funenc := protoEncoder{w: fun}
funenc.uint(Function_id, id)
funenc.int(Function_name, nameIndex)
funenc.int(Function_system_name, nameIndex)
funenc.int(Function_filename, str(pos.Filename()))
funenc.int(Function_start_line, int64(pos.Line))
enc.bytes(Profile_function, fun.Bytes())
functionId[addr] = id
}
return id
}
// locations
//
// location returns the ID of the location denoted by fr.
// For Starlark frames, this is the Frame pc.
locationId := make(map[uintptr]uint64)
location := func(fr profFrame) uint64 {
fnAddr := profFuncAddr(fr.fn)
// For Starlark functions, the frame position
// represents the current PC value.
// Mix it into the low bits of the address.
// This is super hacky and may result in collisions
// in large functions or if functions are numerous.
// TODO(adonovan): fix: try making this cleaner by treating
// each bytecode segment as a Profile.Mapping.
pcAddr := fnAddr
if _, ok := fr.fn.(*Function); ok {
pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
}
id, ok := locationId[pcAddr]
if !ok {
id = uint64(pcAddr)
line := new(bytes.Buffer)
lineenc := protoEncoder{w: line}
lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
lineenc.int(Line_line, int64(fr.pos.Line))
loc := new(bytes.Buffer)
locenc := protoEncoder{w: loc}
locenc.uint(Location_id, id)
locenc.uint(Location_address, uint64(pcAddr))
locenc.bytes(Location_line, line.Bytes())
enc.bytes(Profile_location, loc.Bytes())
locationId[pcAddr] = id
}
return id
}
wallNanos := new(bytes.Buffer)
wnenc := protoEncoder{w: wallNanos}
wnenc.int(ValueType_type, str("wall"))
wnenc.int(ValueType_unit, str("nanoseconds"))
// informational fields of Profile
enc.bytes(Profile_sample_type, wallNanos.Bytes())
enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period
enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period
enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile
startNano := nanotime()
// Read profile events from the channel
// until it is closed by StopProfiler.
for e := range profiler.events {
sample := new(bytes.Buffer)
sampleenc := protoEncoder{w: sample}
sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
for _, fr := range e.stack {
sampleenc.uint(Sample_location_id, location(fr))
}
enc.bytes(Profile_sample, sample.Bytes())
}
endNano := nanotime()
enc.int(Profile_duration_nanos, endNano-startNano)
err := gz.Close() // Close reports any prior write error
if flushErr := bufw.Flush(); err == nil {
err = flushErr
}
profiler.done <- err
}
// nanotime returns the time in nanoseconds since epoch.
// It is implemented by runtime.nanotime using the linkname hack;
// runtime.nanotime is defined for all OSs/ARCHS and uses the
// monotonic system clock, which there is no portable way to access.
// Should that function ever go away, these alternatives exist:
//
// // POSIX only. REALTIME not MONOTONIC. 17ns.
// var tv syscall.Timeval
// syscall.Gettimeofday(&tv) // can't fail
// return tv.Nano()
//
// // Portable. REALTIME not MONOTONIC. 46ns.
// return time.Now().Nanoseconds()
//
// // POSIX only. Adds a dependency.
// import "golang.org/x/sys/unix"
// var ts unix.Timespec
// unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
// return unix.TimespecToNsec(ts)
//
//go:linkname nanotime runtime.nanotime
func nanotime() int64
// profFuncAddr returns the canonical "address"
// of a Callable for use by the profiler.
func profFuncAddr(fn Callable) uintptr {
switch fn := fn.(type) {
case *Builtin:
return reflect.ValueOf(fn.fn).Pointer()
case *Function:
return uintptr(unsafe.Pointer(fn.funcode))
}
// User-defined callable types are typically of
// kind pointer-to-struct. Handle them specially.
if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
return v.Pointer()
}
// Address zero is reserved by the protocol.
// Use 1 for callables we don't recognize.
log.Printf("Starlark profiler: no address for Callable %T", fn)
return 1
}
// We encode the protocol message by hand to avoid making
// the interpreter depend on both github.com/google/pprof
// and github.com/golang/protobuf.
//
// This also avoids the need to materialize a protocol message object
// tree of unbounded size and serialize it all at the end.
// The pprof format appears to have been designed to
// permit streaming implementations such as this one.
//
// See https://developers.google.com/protocol-buffers/docs/encoding.
type protoEncoder struct {
w io.Writer // *bytes.Buffer or *gzip.Writer
tmp [binary.MaxVarintLen64]byte
}
func (e *protoEncoder) uvarint(x uint64) {
n := binary.PutUvarint(e.tmp[:], x)
e.w.Write(e.tmp[:n])
}
func (e *protoEncoder) tag(field, wire uint) {
e.uvarint(uint64(field<<3 | wire))
}
func (e *protoEncoder) string(field uint, s string) {
e.tag(field, 2) // length-delimited
e.uvarint(uint64(len(s)))
io.WriteString(e.w, s)
}
func (e *protoEncoder) bytes(field uint, b []byte) {
e.tag(field, 2) // length-delimited
e.uvarint(uint64(len(b)))
e.w.Write(b)
}
func (e *protoEncoder) uint(field uint, x uint64) {
e.tag(field, 0) // varint
e.uvarint(x)
}
func (e *protoEncoder) int(field uint, x int64) {
e.tag(field, 0) // varint
e.uvarint(uint64(x))
}