Commit 238fc344 by Ian Lance Taylor

runtime: copy cpuprof code from Go 1.7 runtime

    
    This replaces runtime/cpuprof.goc with go/runtime/cpuprof.go and adjusts
    the supporting code in runtime/proc.c.
    
    This adds another case where the compiler needs to avoid heap allocation
    in the runtime package: when evaluating a method expression into a
    closure.  Implementing this required moving the relevant code from
    do_get_backend to do_flatten, so that I could easily add a temporary
    variable.  Doing that let me get rid of Bound_method_expression::do_lower.
    
    Reviewed-on: https://go-review.googlesource.com/31050

From-SVN: r241163
parent 6d59425d
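For context, the "method expression evaluated into a closure" that the commit message refers to is a Go method value: taking x.M without calling it. The frontend lowers it to a small struct holding the receiver plus a thunk (see Bound_method_expression::do_flatten in the diff below); inside the runtime package that object must now be stack-allocated rather than heap-allocated. A minimal illustration of a method value, not part of this commit:

package main

import "fmt"

type counter struct{ n int }

func (c *counter) inc() { c.n++ }

func main() {
	c := &counter{}
	f := c.inc // method value: the receiver c is captured, much like a closure
	f()
	f()
	fmt.Println(c.n) // prints 2
}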
-e3913d96fb024b916c87a4dc01f413523467ead9
+5f043fc2bf0f92a84a1f7da57acd79a61c9d2592
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
@@ -3623,6 +3623,8 @@ Unsafe_type_conversion_expression::do_get_backend(Translate_context* context)
              || et->map_type() != NULL
              || et->channel_type() != NULL
              || et->is_nil_type());
+  else if (t->function_type() != NULL)
+    go_assert(et->points_to() != NULL);
   else
     go_unreachable();
@@ -6482,34 +6484,6 @@ Bound_method_expression::do_traverse(Traverse* traverse)
   return Expression::traverse(&this->expr_, traverse);
 }
 
-// Lower the expression.  If this is a method value rather than being
-// called, and the method is accessed via a pointer, we may need to
-// add nil checks.  Introduce a temporary variable so that those nil
-// checks do not cause multiple evaluation.
-
-Expression*
-Bound_method_expression::do_lower(Gogo*, Named_object*,
-                                  Statement_inserter* inserter, int)
-{
-  // For simplicity we use a temporary for every call to an embedded
-  // method, even though some of them might be pure value methods and
-  // not require a temporary.
-  if (this->expr_->var_expression() == NULL
-      && this->expr_->temporary_reference_expression() == NULL
-      && this->expr_->set_and_use_temporary_expression() == NULL
-      && (this->method_->field_indexes() != NULL
-          || (this->method_->is_value_method()
-              && this->expr_->type()->points_to() != NULL)))
-    {
-      Temporary_statement* temp =
-        Statement::make_temporary(this->expr_->type(), NULL, this->location());
-      inserter->insert(temp);
-      this->expr_ = Expression::make_set_and_use_temporary(temp, this->expr_,
-                                                           this->location());
-    }
-  return this;
-}
-
 // Return the type of a bound method expression.  The type of this
 // object is simply the type of the method with no receiver.
@@ -6724,32 +6698,43 @@ bme_check_nil(const Method::Field_indexes* field_indexes, Location loc,
   return cond;
 }
 
-// Get the backend representation for a method value.
+// Flatten a method value into a struct with nil checks.  We can't do
+// this in the lowering phase, because if the method value is called
+// directly we don't need a thunk.  That case will have been handled
+// by Call_expression::do_lower, so if we get here then we do need a
+// thunk.
 
-Bexpression*
-Bound_method_expression::do_get_backend(Translate_context* context)
+Expression*
+Bound_method_expression::do_flatten(Gogo* gogo, Named_object*,
+                                    Statement_inserter* inserter)
 {
-  Named_object* thunk = Bound_method_expression::create_thunk(context->gogo(),
+  Location loc = this->location();
+
+  Named_object* thunk = Bound_method_expression::create_thunk(gogo,
                                                               this->method_,
                                                               this->function_);
   if (thunk->is_erroneous())
     {
       go_assert(saw_errors());
-      return context->backend()->error_expression();
+      return Expression::make_error(loc);
     }
 
-  // FIXME: We should lower this earlier, but we can't lower it in the
-  // lowering pass because at that point we don't know whether we need
-  // to create the thunk or not.  If the expression is called, we
-  // don't need the thunk.
-
-  Location loc = this->location();
+  // Force the expression into a variable.  This is only necessary if
+  // we are going to do nil checks below, but it's easy enough to
+  // always do it.
+  Expression* expr = this->expr_;
+  if (!expr->is_variable())
+    {
+      Temporary_statement* etemp = Statement::make_temporary(NULL, expr, loc);
+      inserter->insert(etemp);
+      expr = Expression::make_temporary_reference(etemp, loc);
+    }
 
   // If the method expects a value, and we have a pointer, we need to
   // dereference the pointer.
 
   Named_object* fn = this->method_->named_object();
-  Function_type* fntype;
+  Function_type *fntype;
   if (fn->is_function())
     fntype = fn->func_value()->type();
   else if (fn->is_function_declaration())
@@ -6757,7 +6742,7 @@ Bound_method_expression::do_get_backend(Translate_context* context)
   else
     go_unreachable();
 
-  Expression* val = this->expr_;
+  Expression* val = expr;
   if (fntype->receiver()->type()->points_to() == NULL
       && val->type()->points_to() != NULL)
     val = Expression::make_unary(OPERATOR_MULT, val, loc);
@@ -6781,17 +6766,28 @@ Bound_method_expression::do_get_backend(Translate_context* context)
   vals->push_back(val);
 
   Expression* ret = Expression::make_struct_composite_literal(st, vals, loc);
-  ret = Expression::make_heap_expression(ret, loc);
 
-  // See whether the expression or any embedded pointers are nil.
+  if (!gogo->compiling_runtime() || gogo->package_name() != "runtime")
+    ret = Expression::make_heap_expression(ret, loc);
+  else
+    {
+      // When compiling the runtime, method closures do not escape.
+      // When escape analysis becomes the default, and applies to
+      // method closures, this should be changed to make it an error
+      // if a method closure escapes.
+      Temporary_statement* ctemp = Statement::make_temporary(st, ret, loc);
+      inserter->insert(ctemp);
+      ret = Expression::make_temporary_reference(ctemp, loc);
+      ret = Expression::make_unary(OPERATOR_AND, ret, loc);
+      ret->unary_expression()->set_does_not_escape();
+    }
+
+  // If necessary, check whether the expression or any embedded
+  // pointers are nil.
 
   Expression* nil_check = NULL;
-  Expression* expr = this->expr_;
   if (this->method_->field_indexes() != NULL)
    {
-      // Note that we are evaluating this->expr_ twice, but that is OK
-      // because in the lowering pass we forced it into a temporary
-      // variable.
      Expression* ref = expr;
      nil_check = bme_check_nil(this->method_->field_indexes(), loc, &ref);
      expr = ref;
@@ -6808,19 +6804,20 @@ Bound_method_expression::do_get_backend(Translate_context* context)
      nil_check = Expression::make_binary(OPERATOR_OROR, nil_check, n, loc);
    }
 
-  Bexpression* bme = ret->get_backend(context);
   if (nil_check != NULL)
    {
-      Gogo* gogo = context->gogo();
-      Bexpression* crash =
-        gogo->runtime_error(RUNTIME_ERROR_NIL_DEREFERENCE,
-                            loc)->get_backend(context);
-      Btype* btype = ret->type()->get_backend(gogo);
-      Bexpression* bcheck = nil_check->get_backend(context);
-      bme = gogo->backend()->conditional_expression(btype, bcheck, crash,
-                                                    bme, loc);
+      Expression* crash = gogo->runtime_error(RUNTIME_ERROR_NIL_DEREFERENCE,
+                                              loc);
+      // Fix the type of the conditional expression by pretending to
+      // evaluate to RET either way through the conditional.
+      crash = Expression::make_compound(crash, ret, loc);
+      ret = Expression::make_conditional(nil_check, crash, ret, loc);
    }
-  return bme;
+
+  // RET is a pointer to a struct, but we want a function type.
+  ret = Expression::make_unsafe_cast(this->type(), ret, loc);
+
+  return ret;
 }
 
 // Dump ast representation of a bound method expression.
......
@@ -2888,7 +2888,7 @@ class Bound_method_expression : public Expression
   do_traverse(Traverse*);
 
   Expression*
-  do_lower(Gogo*, Named_object*, Statement_inserter*, int);
+  do_flatten(Gogo*, Named_object*, Statement_inserter*);
 
   Type*
   do_type();
@@ -2907,7 +2907,8 @@
   }
 
   Bexpression*
-  do_get_backend(Translate_context*);
+  do_get_backend(Translate_context*)
+  { go_unreachable(); }
 
   void
   do_dump_expression(Ast_dump_context*) const;
......
@@ -512,7 +512,6 @@ runtime_files = \
 	$(runtime_thread_files) \
 	runtime/yield.c \
 	$(rtems_task_variable_add_file) \
-	cpuprof.c \
 	go-iface.c \
 	lfstack.c \
 	malloc.c \
......
@@ -261,9 +261,9 @@ am__objects_6 = go-append.lo go-assert.lo go-assert-interface.lo \
 	mcentral.lo $(am__objects_1) mfixalloc.lo mgc0.lo mheap.lo \
 	msize.lo $(am__objects_2) panic.lo parfor.lo print.lo proc.lo \
 	runtime.lo signal_unix.lo thread.lo $(am__objects_3) yield.lo \
-	$(am__objects_4) cpuprof.lo go-iface.lo lfstack.lo malloc.lo \
-	mprof.lo netpoll.lo rdebug.lo reflect.lo runtime1.lo \
-	sigqueue.lo time.lo $(am__objects_5)
+	$(am__objects_4) go-iface.lo lfstack.lo malloc.lo mprof.lo \
+	netpoll.lo rdebug.lo reflect.lo runtime1.lo sigqueue.lo \
+	time.lo $(am__objects_5)
 am_libgo_llgo_la_OBJECTS = $(am__objects_6)
 libgo_llgo_la_OBJECTS = $(am_libgo_llgo_la_OBJECTS)
 libgo_llgo_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -911,7 +911,6 @@ runtime_files = \
 	$(runtime_thread_files) \
 	runtime/yield.c \
 	$(rtems_task_variable_add_file) \
-	cpuprof.c \
 	go-iface.c \
 	lfstack.c \
 	malloc.c \
@@ -1547,7 +1546,6 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cpuprof.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env_posix.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-bsd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-irix.Plo@am__quote@
......
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// CPU profiling.
// Based on algorithms and data structures used in
// http://code.google.com/p/google-perftools/.
//
// The main difference between this code and the google-perftools
// code is that this code is written to allow copying the profile data
// to an arbitrary io.Writer, while the google-perftools code always
// writes to an operating system file.
//
// The signal handler for the profiling clock tick adds a new stack trace
// to a hash table tracking counts for recent traces. Most clock ticks
// hit in the cache. In the event of a cache miss, an entry must be
// evicted from the hash table, copied to a log that will eventually be
// written as profile data. The google-perftools code flushed the
// log itself during the signal handler. This code cannot do that, because
// the io.Writer might block or need system calls or locks that are not
// safe to use from within the signal handler. Instead, we split the log
// into two halves and let the signal handler fill one half while a goroutine
// is writing out the other half. When the signal handler fills its half, it
// offers to swap with the goroutine. If the writer is not done with its half,
// we lose the stack trace for this clock tick (and record that loss).
// The goroutine interacts with the signal handler by calling getprofile() to
// get the next log piece to write, implicitly handing back the last log
// piece it obtained.
//
// The state of this dance between the signal handler and the goroutine
// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine
// is not using either log half and is waiting (or will soon be waiting) for
// a new piece by calling notesleep(&p.wait). If the signal handler
// changes handoff from 0 to non-zero, it must call notewakeup(&p.wait)
// to wake the goroutine. The value indicates the number of entries in the
// log half being handed off. The goroutine leaves the non-zero value in
// place until it has finished processing the log half and then flips the number
// back to zero. Setting the high bit in handoff means that the profiling is over,
// and the goroutine is now in charge of flushing the data left in the hash table
// to the log and returning that data.
//
// The handoff field is manipulated using atomic operations.
// For the most part, the manipulation of handoff is orderly: if handoff == 0
// then the signal handler owns it and can change it to non-zero.
// If handoff != 0 then the goroutine owns it and can change it to zero.
// If that were the end of the story then we would not need to manipulate
// handoff using atomic operations. The operations are needed, however,
// in order to let the log closer set the high bit to indicate "EOF" safely
// in the situation when normally the goroutine "owns" handoff.
package runtime
import (
"runtime/internal/atomic"
"unsafe"
)
const (
numBuckets = 1 << 10
logSize = 1 << 17
assoc = 4
maxCPUProfStack = 64
)
type cpuprofEntry struct {
count uintptr
depth int
stack [maxCPUProfStack]uintptr
}
type cpuProfile struct {
on bool // profiling is on
wait note // goroutine waits here
count uintptr // tick count
evicts uintptr // eviction count
lost uintptr // lost ticks that need to be logged
// Active recent stack traces.
hash [numBuckets]struct {
entry [assoc]cpuprofEntry
}
// Log of traces evicted from hash.
// Signal handler has filled log[toggle][:nlog].
// Goroutine is writing log[1-toggle][:handoff].
log [2][logSize / 2]uintptr
nlog int
toggle int32
handoff uint32
// Writer state.
// Writer maintains its own toggle to avoid races
// looking at signal handler's toggle.
wtoggle uint32
wholding bool // holding & need to release a log half
flushing bool // flushing hash table - profile is over
eodSent bool // special end-of-data record sent; => flushing
}
var (
cpuprofLock mutex
cpuprof *cpuProfile
eod = [3]uintptr{0, 1, 0}
)
func setcpuprofilerate(hz int32) {
systemstack(func() {
setcpuprofilerate_m(hz)
})
}
// lostProfileData is a no-op function used in profiles
// to mark the number of profiling stack traces that were
// discarded due to slow data writers.
func lostProfileData() {}
// SetCPUProfileRate sets the CPU profiling rate to hz samples per second.
// If hz <= 0, SetCPUProfileRate turns off profiling.
// If the profiler is on, the rate cannot be changed without first turning it off.
//
// Most clients should use the runtime/pprof package or
// the testing package's -test.cpuprofile flag instead of calling
// SetCPUProfileRate directly.
func SetCPUProfileRate(hz int) {
// Clamp hz to something reasonable.
if hz < 0 {
hz = 0
}
if hz > 1000000 {
hz = 1000000
}
lock(&cpuprofLock)
if hz > 0 {
if cpuprof == nil {
cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys))
if cpuprof == nil {
print("runtime: cpu profiling cannot allocate memory\n")
unlock(&cpuprofLock)
return
}
}
if cpuprof.on || cpuprof.handoff != 0 {
print("runtime: cannot set cpu profile rate until previous profile has finished.\n")
unlock(&cpuprofLock)
return
}
cpuprof.on = true
// pprof binary header format.
// https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119
p := &cpuprof.log[0]
p[0] = 0 // count for header
p[1] = 3 // depth for header
p[2] = 0 // version number
p[3] = uintptr(1e6 / hz) // period (microseconds)
p[4] = 0
cpuprof.nlog = 5
cpuprof.toggle = 0
cpuprof.wholding = false
cpuprof.wtoggle = 0
cpuprof.flushing = false
cpuprof.eodSent = false
noteclear(&cpuprof.wait)
setcpuprofilerate(int32(hz))
} else if cpuprof != nil && cpuprof.on {
setcpuprofilerate(0)
cpuprof.on = false
// Now add is not running anymore, and getprofile owns the entire log.
// Set the high bit in cpuprof.handoff to tell getprofile.
for {
n := cpuprof.handoff
if n&0x80000000 != 0 {
print("runtime: setcpuprofile(off) twice\n")
}
if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
if n == 0 {
// we did the transition from 0 -> nonzero so we wake getprofile
notewakeup(&cpuprof.wait)
}
break
}
}
}
unlock(&cpuprofLock)
}
// add adds the stack trace to the profile.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
// of stack. It is allowed to call evict.
//go:nowritebarrierrec
func (p *cpuProfile) add(pc []uintptr) {
p.addWithFlushlog(pc, p.flushlog)
}
// addWithFlushlog implements add and addNonGo.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
// of stack. It may be called by a signal handler with no g or m.
// It is allowed to call evict, passing the flushlog parameter.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) {
if len(pc) > maxCPUProfStack {
pc = pc[:maxCPUProfStack]
}
// Compute hash.
h := uintptr(0)
for _, x := range pc {
h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
h += x * 41
}
p.count++
// Add to entry count if already present in table.
b := &p.hash[h%numBuckets]
Assoc:
for i := range b.entry {
e := &b.entry[i]
if e.depth != len(pc) {
continue
}
for j := range pc {
if e.stack[j] != pc[j] {
continue Assoc
}
}
e.count++
return
}
// Evict entry with smallest count.
var e *cpuprofEntry
for i := range b.entry {
if e == nil || b.entry[i].count < e.count {
e = &b.entry[i]
}
}
if e.count > 0 {
if !p.evict(e, flushlog) {
// Could not evict entry. Record lost stack.
p.lost++
return
}
p.evicts++
}
// Reuse the newly evicted entry.
e.depth = len(pc)
e.count = 1
copy(e.stack[:], pc)
}
// evict copies the given entry's data into the log, so that
// the entry can be reused. evict is called from add, which
// is called from the profiling signal handler, so it must not
// allocate memory or block, and it may be called with no g or m.
// It is safe to call flushlog. evict returns true if the entry was
// copied to the log, false if there was no room available.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool {
d := e.depth
nslot := d + 2
log := &p.log[p.toggle]
if p.nlog+nslot > len(log) {
if !flushlog() {
return false
}
log = &p.log[p.toggle]
}
q := p.nlog
log[q] = e.count
q++
log[q] = uintptr(d)
q++
copy(log[q:], e.stack[:d])
q += d
p.nlog = q
e.count = 0
return true
}
// flushlog tries to flush the current log and switch to the other one.
// flushlog is called from evict, called from add, called from the signal handler,
// so it cannot allocate memory or block. It can try to swap logs with
// the writing goroutine, as explained in the comment at the top of this file.
//go:nowritebarrierrec
func (p *cpuProfile) flushlog() bool {
if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
return false
}
notewakeup(&p.wait)
p.toggle = 1 - p.toggle
log := &p.log[p.toggle]
q := 0
if p.lost > 0 {
lostPC := funcPC(lostProfileData)
log[0] = p.lost
log[1] = 1
log[2] = lostPC
q = 3
p.lost = 0
}
p.nlog = q
return true
}
// addNonGo is like add, but runs on a non-Go thread.
// It can't do anything that might need a g or an m.
// With this entry point, we don't try to flush the log when evicting an
// old entry. Instead, we just drop the stack trace if we're out of space.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) addNonGo(pc []uintptr) {
p.addWithFlushlog(pc, func() bool { return false })
}
// getprofile blocks until the next block of profiling data is available
// and returns it as a []byte. It is called from the writing goroutine.
func (p *cpuProfile) getprofile() []byte {
if p == nil {
return nil
}
if p.wholding {
// Release previous log to signal handling side.
// Loop because we are racing against SetCPUProfileRate(0).
for {
n := p.handoff
if n == 0 {
print("runtime: phase error during cpu profile handoff\n")
return nil
}
if n&0x80000000 != 0 {
p.wtoggle = 1 - p.wtoggle
p.wholding = false
p.flushing = true
goto Flush
}
if atomic.Cas(&p.handoff, n, 0) {
break
}
}
p.wtoggle = 1 - p.wtoggle
p.wholding = false
}
if p.flushing {
goto Flush
}
if !p.on && p.handoff == 0 {
return nil
}
// Wait for new log.
notetsleepg(&p.wait, -1)
noteclear(&p.wait)
switch n := p.handoff; {
case n == 0:
print("runtime: phase error during cpu profile wait\n")
return nil
case n == 0x80000000:
p.flushing = true
goto Flush
default:
n &^= 0x80000000
// Return new log to caller.
p.wholding = true
return uintptrBytes(p.log[p.wtoggle][:n])
}
// In flush mode.
// Add is no longer being called. We own the log.
// Also, p.handoff is non-zero, so flushlog will return false.
// Evict the hash table into the log and return it.
Flush:
for i := range p.hash {
b := &p.hash[i]
for j := range b.entry {
e := &b.entry[j]
if e.count > 0 && !p.evict(e, p.flushlog) {
// Filled the log. Stop the loop and return what we've got.
break Flush
}
}
}
// Return pending log data.
if p.nlog > 0 {
// Note that we're using toggle now, not wtoggle,
// because we're working on the log directly.
n := p.nlog
p.nlog = 0
return uintptrBytes(p.log[p.toggle][:n])
}
// Made it through the table without finding anything to log.
if !p.eodSent {
// We may not have space to append this to the partial log buf,
// so we always return a new slice for the end-of-data marker.
p.eodSent = true
return uintptrBytes(eod[:])
}
// Finally done. Clean up and return nil.
p.flushing = false
if !atomic.Cas(&p.handoff, p.handoff, 0) {
print("runtime: profile flush racing with something\n")
}
return nil
}
func uintptrBytes(p []uintptr) (ret []byte) {
pp := (*slice)(unsafe.Pointer(&p))
rp := (*slice)(unsafe.Pointer(&ret))
rp.array = pp.array
rp.len = pp.len * int(unsafe.Sizeof(p[0]))
rp.cap = rp.len
return
}
// CPUProfile returns the next chunk of binary CPU profiling stack trace data,
// blocking until data is available. If profiling is turned off and all the profile
// data accumulated while it was on has been returned, CPUProfile returns nil.
// The caller must save the returned data before calling CPUProfile again.
//
// Most clients should use the runtime/pprof package or
// the testing package's -test.cpuprofile flag instead of calling
// CPUProfile directly.
func CPUProfile() []byte {
return cpuprof.getprofile()
}
//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime_pprof.runtime_cyclesPerSecond
func runtime_pprof_runtime_cyclesPerSecond() int64 {
return tickspersecond()
}
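The chunked SetCPUProfileRate/CPUProfile API above is normally driven by runtime/pprof. As an illustrative sketch (not part of this commit), a writer goroutine that drains log halves while the signal handler fills them might look like this; the output path, sampling rate, and sleep stand in for real application choices:

package main

import (
	"os"
	"runtime"
	"time"
)

func main() {
	// Hypothetical output file; any io.Writer destination would do.
	f, err := os.Create("cpu.prof")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	done := make(chan struct{})
	runtime.SetCPUProfileRate(100) // start sampling at 100 Hz

	// Drain log halves while the profiler fills them.
	go func() {
		for {
			data := runtime.CPUProfile() // blocks until a chunk is ready
			if data == nil {             // nil: profiling stopped and fully flushed
				close(done)
				return
			}
			f.Write(data)
		}
	}()

	time.Sleep(2 * time.Second) // stand-in for the workload being profiled

	runtime.SetCPUProfileRate(0) // stop; hash table and end-of-data marker get flushed
	<-done
}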
@@ -415,3 +415,16 @@ func startTheWorld() {
 func getMstats() *mstats {
 	return &memstats
 }
+
+// Temporary for gccgo until we port proc.go.
+func setcpuprofilerate_m(hz int32)
+
+// Temporary for gccgo until we port mem_GOOS.go.
+func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer
+
+// Temporary for gccgo until we port proc.go, so that the C signal
+// handler can call into cpuprof.
+//go:linkname cpuprofAdd runtime.cpuprofAdd
+func cpuprofAdd(stk []uintptr) {
+	cpuprof.add(stk)
+}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// CPU profiling.
// Based on algorithms and data structures used in
// http://code.google.com/p/google-perftools/.
//
// The main difference between this code and the google-perftools
// code is that this code is written to allow copying the profile data
// to an arbitrary io.Writer, while the google-perftools code always
// writes to an operating system file.
//
// The signal handler for the profiling clock tick adds a new stack trace
// to a hash table tracking counts for recent traces. Most clock ticks
// hit in the cache. In the event of a cache miss, an entry must be
// evicted from the hash table, copied to a log that will eventually be
// written as profile data. The google-perftools code flushed the
// log itself during the signal handler. This code cannot do that, because
// the io.Writer might block or need system calls or locks that are not
// safe to use from within the signal handler. Instead, we split the log
// into two halves and let the signal handler fill one half while a goroutine
// is writing out the other half. When the signal handler fills its half, it
// offers to swap with the goroutine. If the writer is not done with its half,
// we lose the stack trace for this clock tick (and record that loss).
// The goroutine interacts with the signal handler by calling getprofile() to
// get the next log piece to write, implicitly handing back the last log
// piece it obtained.
//
// The state of this dance between the signal handler and the goroutine
// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine
// is not using either log half and is waiting (or will soon be waiting) for
// a new piece by calling notesleep(&p->wait). If the signal handler
// changes handoff from 0 to non-zero, it must call notewakeup(&p->wait)
// to wake the goroutine. The value indicates the number of entries in the
// log half being handed off. The goroutine leaves the non-zero value in
// place until it has finished processing the log half and then flips the number
// back to zero. Setting the high bit in handoff means that the profiling is over,
// and the goroutine is now in charge of flushing the data left in the hash table
// to the log and returning that data.
//
// The handoff field is manipulated using atomic operations.
// For the most part, the manipulation of handoff is orderly: if handoff == 0
// then the signal handler owns it and can change it to non-zero.
// If handoff != 0 then the goroutine owns it and can change it to zero.
// If that were the end of the story then we would not need to manipulate
// handoff using atomic operations. The operations are needed, however,
// in order to let the log closer set the high bit to indicate "EOF" safely
// in the situation when normally the goroutine "owns" handoff.
package runtime
#include "runtime.h"
#include "arch.h"
#include "malloc.h"
#include "array.h"
typedef struct __go_open_array Slice;
#define array __values
#define len __count
#define cap __capacity
enum
{
HashSize = 1<<10,
LogSize = 1<<17,
Assoc = 4,
MaxStack = 64,
};
typedef struct Profile Profile;
typedef struct Bucket Bucket;
typedef struct Entry Entry;
struct Entry {
uintptr count;
uintptr depth;
uintptr stack[MaxStack];
};
struct Bucket {
Entry entry[Assoc];
};
struct Profile {
bool on; // profiling is on
Note wait; // goroutine waits here
uintptr count; // tick count
uintptr evicts; // eviction count
uintptr lost; // lost ticks that need to be logged
// Active recent stack traces.
Bucket hash[HashSize];
// Log of traces evicted from hash.
// Signal handler has filled log[toggle][:nlog].
// Goroutine is writing log[1-toggle][:handoff].
uintptr log[2][LogSize/2];
uintptr nlog;
int32 toggle;
uint32 handoff;
// Writer state.
// Writer maintains its own toggle to avoid races
// looking at signal handler's toggle.
uint32 wtoggle;
bool wholding; // holding & need to release a log half
bool flushing; // flushing hash table - profile is over
bool eod_sent; // special end-of-data record sent; => flushing
};
static Lock lk;
static Profile *prof;
static void tick(uintptr*, int32);
static void add(Profile*, uintptr*, int32);
static bool evict(Profile*, Entry*);
static bool flushlog(Profile*);
static uintptr eod[3] = {0, 1, 0};
// LostProfileData is a no-op function used in profiles
// to mark the number of profiling stack traces that were
// discarded due to slow data writers.
static void
LostProfileData(void)
{
}
extern void runtime_SetCPUProfileRate(intgo)
__asm__ (GOSYM_PREFIX "runtime.SetCPUProfileRate");
// SetCPUProfileRate sets the CPU profiling rate.
// The user documentation is in debug.go.
void
runtime_SetCPUProfileRate(intgo hz)
{
uintptr *p;
uintptr n;
// Clamp hz to something reasonable.
if(hz < 0)
hz = 0;
if(hz > 1000000)
hz = 1000000;
runtime_lock(&lk);
if(hz > 0) {
if(prof == nil) {
prof = runtime_SysAlloc(sizeof *prof, &mstats()->other_sys);
if(prof == nil) {
runtime_printf("runtime: cpu profiling cannot allocate memory\n");
runtime_unlock(&lk);
return;
}
}
if(prof->on || prof->handoff != 0) {
runtime_printf("runtime: cannot set cpu profile rate until previous profile has finished.\n");
runtime_unlock(&lk);
return;
}
prof->on = true;
p = prof->log[0];
// pprof binary header format.
// http://code.google.com/p/google-perftools/source/browse/trunk/src/profiledata.cc#117
*p++ = 0; // count for header
*p++ = 3; // depth for header
*p++ = 0; // version number
*p++ = 1000000 / hz; // period (microseconds)
*p++ = 0;
prof->nlog = p - prof->log[0];
prof->toggle = 0;
prof->wholding = false;
prof->wtoggle = 0;
prof->flushing = false;
prof->eod_sent = false;
runtime_noteclear(&prof->wait);
runtime_setcpuprofilerate(tick, hz);
} else if(prof != nil && prof->on) {
runtime_setcpuprofilerate(nil, 0);
prof->on = false;
// Now add is not running anymore, and getprofile owns the entire log.
// Set the high bit in prof->handoff to tell getprofile.
for(;;) {
n = prof->handoff;
if(n&0x80000000)
runtime_printf("runtime: setcpuprofile(off) twice");
if(runtime_cas(&prof->handoff, n, n|0x80000000))
break;
}
if(n == 0) {
// we did the transition from 0 -> nonzero so we wake getprofile
runtime_notewakeup(&prof->wait);
}
}
runtime_unlock(&lk);
}
static void
tick(uintptr *pc, int32 n)
{
add(prof, pc, n);
}
// add adds the stack trace to the profile.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
// of stack. It is allowed to call evict.
static void
add(Profile *p, uintptr *pc, int32 n)
{
int32 i, j;
uintptr h, x;
Bucket *b;
Entry *e;
if(n > MaxStack)
n = MaxStack;
// Compute hash.
h = 0;
for(i=0; i<n; i++) {
h = h<<8 | (h>>(8*(sizeof(h)-1)));
x = pc[i];
h += x*31 + x*7 + x*3;
}
p->count++;
// Add to entry count if already present in table.
b = &p->hash[h%HashSize];
for(i=0; i<Assoc; i++) {
e = &b->entry[i];
if(e->depth != (uintptr)n)
continue;
for(j=0; j<n; j++)
if(e->stack[j] != pc[j])
goto ContinueAssoc;
e->count++;
return;
ContinueAssoc:;
}
// Evict entry with smallest count.
e = &b->entry[0];
for(i=1; i<Assoc; i++)
if(b->entry[i].count < e->count)
e = &b->entry[i];
if(e->count > 0) {
if(!evict(p, e)) {
// Could not evict entry. Record lost stack.
p->lost++;
return;
}
p->evicts++;
}
// Reuse the newly evicted entry.
e->depth = n;
e->count = 1;
for(i=0; i<n; i++)
e->stack[i] = pc[i];
}
// evict copies the given entry's data into the log, so that
// the entry can be reused. evict is called from add, which
// is called from the profiling signal handler, so it must not
// allocate memory or block. It is safe to call flushLog.
// evict returns true if the entry was copied to the log,
// false if there was no room available.
static bool
evict(Profile *p, Entry *e)
{
int32 i, d, nslot;
uintptr *log, *q;
d = e->depth;
nslot = d+2;
log = p->log[p->toggle];
if(p->nlog+nslot > nelem(p->log[0])) {
if(!flushlog(p))
return false;
log = p->log[p->toggle];
}
q = log+p->nlog;
*q++ = e->count;
*q++ = d;
for(i=0; i<d; i++)
*q++ = e->stack[i];
p->nlog = q - log;
e->count = 0;
return true;
}
// flushlog tries to flush the current log and switch to the other one.
// flushlog is called from evict, called from add, called from the signal handler,
// so it cannot allocate memory or block. It can try to swap logs with
// the writing goroutine, as explained in the comment at the top of this file.
static bool
flushlog(Profile *p)
{
uintptr *log, *q;
if(!runtime_cas(&p->handoff, 0, p->nlog))
return false;
runtime_notewakeup(&p->wait);
p->toggle = 1 - p->toggle;
log = p->log[p->toggle];
q = log;
if(p->lost > 0) {
*q++ = p->lost;
*q++ = 1;
*q++ = (uintptr)LostProfileData;
p->lost = 0;
}
p->nlog = q - log;
return true;
}
// getprofile blocks until the next block of profiling data is available
// and returns it as a []byte. It is called from the writing goroutine.
Slice
getprofile(Profile *p)
{
uint32 i, j, n;
Slice ret;
Bucket *b;
Entry *e;
ret.array = nil;
ret.len = 0;
ret.cap = 0;
if(p == nil)
return ret;
if(p->wholding) {
// Release previous log to signal handling side.
// Loop because we are racing against SetCPUProfileRate(0).
for(;;) {
n = p->handoff;
if(n == 0) {
runtime_printf("runtime: phase error during cpu profile handoff\n");
return ret;
}
if(n & 0x80000000) {
p->wtoggle = 1 - p->wtoggle;
p->wholding = false;
p->flushing = true;
goto flush;
}
if(runtime_cas(&p->handoff, n, 0))
break;
}
p->wtoggle = 1 - p->wtoggle;
p->wholding = false;
}
if(p->flushing)
goto flush;
if(!p->on && p->handoff == 0)
return ret;
// Wait for new log.
runtime_notetsleepg(&p->wait, -1);
runtime_noteclear(&p->wait);
n = p->handoff;
if(n == 0) {
runtime_printf("runtime: phase error during cpu profile wait\n");
return ret;
}
if(n == 0x80000000) {
p->flushing = true;
goto flush;
}
n &= ~0x80000000;
// Return new log to caller.
p->wholding = true;
ret.array = (byte*)p->log[p->wtoggle];
ret.len = n*sizeof(uintptr);
ret.cap = ret.len;
return ret;
flush:
// In flush mode.
// Add is no longer being called. We own the log.
// Also, p->handoff is non-zero, so flushlog will return false.
// Evict the hash table into the log and return it.
for(i=0; i<HashSize; i++) {
b = &p->hash[i];
for(j=0; j<Assoc; j++) {
e = &b->entry[j];
if(e->count > 0 && !evict(p, e)) {
// Filled the log. Stop the loop and return what we've got.
goto breakflush;
}
}
}
breakflush:
// Return pending log data.
if(p->nlog > 0) {
// Note that we're using toggle now, not wtoggle,
// because we're working on the log directly.
ret.array = (byte*)p->log[p->toggle];
ret.len = p->nlog*sizeof(uintptr);
ret.cap = ret.len;
p->nlog = 0;
return ret;
}
// Made it through the table without finding anything to log.
if(!p->eod_sent) {
// We may not have space to append this to the partial log buf,
// so we always return a new slice for the end-of-data marker.
p->eod_sent = true;
ret.array = (byte*)eod;
ret.len = sizeof eod;
ret.cap = ret.len;
return ret;
}
// Finally done. Clean up and return nil.
p->flushing = false;
if(!runtime_cas(&p->handoff, p->handoff, 0))
runtime_printf("runtime: profile flush racing with something\n");
return ret; // set to nil at top of function
}
// CPUProfile returns the next cpu profile block as a []byte.
// The user documentation is in debug.go.
func CPUProfile() (ret Slice) {
ret = getprofile(prof);
}
@@ -156,6 +156,8 @@ runtime_sighandler (int sig, Siginfo *info,
 #ifdef SIGPROF
   if (sig == SIGPROF)
     {
+      /* FIXME: Handle m == NULL by calling something like gc's
+         sigprofNonGo.  */
      if (m != NULL && gp != m->g0 && gp != m->gsignal)
        runtime_sigprof ();
      return;
......
@@ -184,7 +184,8 @@ enum
 // SysFault marks a (already SysAlloc'd) region to fault
 // if accessed. Used only for debugging the runtime.
 
-void*	runtime_SysAlloc(uintptr nbytes, uint64 *stat);
+void*	runtime_SysAlloc(uintptr nbytes, uint64 *stat)
+  __asm__ (GOSYM_PREFIX "runtime.sysAlloc");
 void	runtime_SysFree(void *v, uintptr nbytes, uint64 *stat);
 void	runtime_SysUnused(void *v, uintptr nbytes);
 void	runtime_SysUsed(void *v, uintptr nbytes);
......
@@ -2686,11 +2686,8 @@ runtime_mcount(void)
 }
 
 static struct {
-	Lock;
-	void (*fn)(uintptr*, int32);
+	uint32 lock;
 	int32 hz;
-	uintptr pcbuf[TracebackMaxFrames];
-	Location locbuf[TracebackMaxFrames];
 } prof;
 
 static void System(void) {}
@@ -2703,8 +2700,11 @@ runtime_sigprof()
 	M *mp = g->m;
 	int32 n, i;
 	bool traceback;
+	uintptr pcbuf[TracebackMaxFrames];
+	Location locbuf[TracebackMaxFrames];
+	Slice stk;
 
-	if(prof.fn == nil || prof.hz == 0)
+	if(prof.hz == 0)
 		return;
 
 	if(mp == nil)
@@ -2718,12 +2718,6 @@ runtime_sigprof()
 	if(mp->mcache == nil)
 		traceback = false;
 
-	runtime_lock(&prof);
-	if(prof.fn == nil) {
-		runtime_unlock(&prof);
-		mp->mallocing--;
-		return;
-	}
 	n = 0;
 
 	if(runtime_atomicload(&runtime_in_callers) > 0) {
@@ -2735,34 +2729,44 @@ runtime_sigprof()
 	}
 
 	if(traceback) {
-		n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
+		n = runtime_callers(0, locbuf, nelem(locbuf), false);
 		for(i = 0; i < n; i++)
-			prof.pcbuf[i] = prof.locbuf[i].pc;
+			pcbuf[i] = locbuf[i].pc;
 	}
 	if(!traceback || n <= 0) {
 		n = 2;
-		prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
+		pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
 		if(mp->gcing || mp->helpgc)
-			prof.pcbuf[1] = (uintptr)GC;
+			pcbuf[1] = (uintptr)GC;
 		else
-			prof.pcbuf[1] = (uintptr)System;
+			pcbuf[1] = (uintptr)System;
 	}
-	prof.fn(prof.pcbuf, n);
-	runtime_unlock(&prof);
+
+	if (prof.hz != 0) {
+		stk.__values = &pcbuf[0];
+		stk.__count = n;
+		stk.__capacity = n;
+
+		// Simple cas-lock to coordinate with setcpuprofilerate.
+		while (!runtime_cas(&prof.lock, 0, 1)) {
+			runtime_osyield();
+		}
+		if (prof.hz != 0) {
+			runtime_cpuprofAdd(stk);
+		}
+		runtime_atomicstore(&prof.lock, 0);
+	}
+
 	mp->mallocing--;
 }
 
 // Arrange to call fn with a traceback hz times a second.
 void
-runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
+runtime_setcpuprofilerate_m(int32 hz)
 {
 	// Force sane arguments.
 	if(hz < 0)
 		hz = 0;
-	if(hz == 0)
-		fn = nil;
-	if(fn == nil)
-		hz = 0;
 
 	// Disable preemption, otherwise we can be rescheduled to another thread
 	// that has profiling enabled.
@@ -2773,10 +2777,12 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
 	// it would deadlock.
 	runtime_resetcpuprofiler(0);
 
-	runtime_lock(&prof);
-	prof.fn = fn;
+	while (!runtime_cas(&prof.lock, 0, 1)) {
+		runtime_osyield();
+	}
 	prof.hz = hz;
-	runtime_unlock(&prof);
+	runtime_atomicstore(&prof.lock, 0);
+
 	runtime_lock(&runtime_sched);
 	runtime_sched.profilehz = hz;
 	runtime_unlock(&runtime_sched);
......
@@ -417,7 +417,10 @@ void runtime_freezetheworld(void);
 void	runtime_unwindstack(G*, byte*);
 void	runtime_sigprof();
 void	runtime_resetcpuprofiler(int32);
-void	runtime_setcpuprofilerate(void(*)(uintptr*, int32), int32);
+void	runtime_setcpuprofilerate_m(int32)
+  __asm__ (GOSYM_PREFIX "runtime.setcpuprofilerate_m");
+void	runtime_cpuprofAdd(Slice)
+  __asm__ (GOSYM_PREFIX "runtime.cpuprofAdd");
 void	runtime_usleep(uint32)
   __asm__ (GOSYM_PREFIX "runtime.usleep");
 int64	runtime_cputicks(void)
......
@@ -55,10 +55,6 @@ func getgoroot() (out String) {
 	out = runtime_getenv("GOROOT");
 }
 
-func runtime_pprof.runtime_cyclesPerSecond() (res int64) {
-	res = runtime_tickspersecond();
-}
-
 func sync.runtime_procPin() (p int) {
 	M *mp;
......