// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <trace-engine/handler.h>

#include <atomic>
#include <stdio.h>
#include <string.h>
#include <utility>

#include <zircon/assert.h>

#include <fbl/auto_lock.h>
#include <fbl/mutex.h>
#include <fbl/vector.h>
#include <lib/async/cpp/task.h>
#include <lib/async/cpp/wait.h>
#include <lib/zx/event.h>
#include <trace-engine/instrumentation.h>

#include "context_impl.h"

namespace {

// Amount of time to allow for other threads to release their references
// to the trace buffer during shutdown.  See point of use for details.
constexpr zx::duration kSynchronousShutdownTimeout = zx::msec(1000);

// Trace engine lock.
// See rules below for how this is used.
fbl::Mutex g_engine_mutex;

// Trace instrumentation state.
// Rules:
//   - can only be modified while holding g_engine_mutex
//   - can be read atomically at any time
std::atomic<int> g_state{TRACE_STOPPED};

// Trace disposition.
// This is the status that will be reported to the trace handler when the
// trace finishes.
// Rules:
//   - can only be accessed or modified while holding g_engine_mutex
zx_status_t g_disposition __TA_GUARDED(g_engine_mutex) {ZX_OK};

// Trace asynchronous dispatcher.
// Rules:
//   - can only be modified while holding g_engine_mutex and engine is stopped
//   - can be read outside the lock only while the engine is not stopped
async_dispatcher_t* g_dispatcher{nullptr};

// Trace handler.
// Rules:
//   - can only be modified while holding g_engine_mutex and engine is stopped
//   - can be read outside the lock only while the engine is not stopped
trace_handler_t* g_handler{nullptr};

// Trace observer table.
// Rules:
//   - can only be accessed or modified while holding g_engine_mutex
struct Observer {
    // The event handle that we notify the observer through.
    zx_handle_t event;
    // Set to true when the engine starts, to indicate that we are waiting for
    // this observer to call us back, via |trace_notify_observer_updated()|,
    // to report that it has started. Once it does call us back, this is reset
    // to false.
    bool awaiting_update_after_start;
};
fbl::Vector<Observer> g_observers __TA_GUARDED(g_engine_mutex);

// Trace context reference count.
// This functions as a non-exclusive lock for the engine's trace context.
// Rules:
//   - acquiring a reference acts as an ACQUIRE fence
//   - releasing a reference acts as a RELEASE fence
//   - always 0 when engine stopped
//   - transition from 0 to non-zero only happens when engine is started
//   - the engine stops when the reference count goes to 0
//     (in other words, holding a context reference prevents the engine from stopping)
//
// There are two separate counters here that collectively provide the full
// count: buffer acquisitions and prolonged acquisitions. Buffer acquisitions
// are for the purpose of writing to the trace buffer. Prolonged acquisitions
// are for things like ad hoc trace providers that want to maintain a
// reference to the context for the duration of the trace.
// Buffer acquisitions increment/decrement the count by
// |kBufferCounterIncrement|. Prolonged acquisitions increment/decrement the
// count by |kProlongedCounterIncrement|.
// To maintain the property that the full count only transitions from 0 to 1
// when the engine is started, |kProlongedCounterIncrement| == 1.
std::atomic_uint32_t g_context_refs{0u};

// The uint32_t context ref count is split this way:
// |31 ... 8| = buffer acquisition count
// |7 ... 0| = prolonged acquisition count
// There are generally only a handful of prolonged acquisitions. The code will
// assert-fail if there are more. This allows for 2^24 buffer acquisitions,
// which is basically 2^24 threads. The values are also chosen so that the
// full count is easily interpreted when printed in hex.
constexpr uint32_t kProlongedCounterShift = 0;
constexpr uint32_t kProlongedCounterIncrement = 1 << kProlongedCounterShift;
constexpr uint32_t kMaxProlongedCounter = 127;
constexpr uint32_t kProlongedCounterMask = 0xff;
constexpr uint32_t kBufferCounterShift = 8;
constexpr uint32_t kBufferCounterIncrement = 1 << kBufferCounterShift;
constexpr uint32_t kBufferCounterMask = 0xffffff00;
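
// As a readability aid, an illustrative decode of the split (the raw value
// 0x302 is chosen purely for illustration): the high 24 bits hold the buffer
// acquisition count and the low 8 bits hold the prolonged acquisition count.
static_assert(((0x302u & kBufferCounterMask) >> kBufferCounterShift) == 3u);
static_assert(((0x302u & kProlongedCounterMask) >> kProlongedCounterShift) == 2u);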

// Trace context.
// Rules:
//   - can only be modified while holding g_engine_mutex and engine is stopped
//   - can be accessed outside the lock while holding a context reference
trace_context_t* g_context{nullptr};

// Event for tracking:
// - when all observers have started
//   (SIGNAL_ALL_OBSERVERS_STARTED)
// - when the trace context reference count has dropped to zero
//   (SIGNAL_CONTEXT_RELEASED)
// Rules:
//   - can only be modified while holding g_engine_mutex and engine is stopped
//   - can be read outside the lock while the engine is not stopped
zx::event g_event;
constexpr zx_signals_t SIGNAL_ALL_OBSERVERS_STARTED = ZX_USER_SIGNAL_0;
constexpr zx_signals_t SIGNAL_CONTEXT_RELEASED = ZX_USER_SIGNAL_1;

// Asynchronous operations posted to the asynchronous dispatcher while the
// engine is running.  Use of these structures is guarded by the engine lock.
async_wait_t g_event_wait;

inline uint32_t get_prolonged_context_refs(uint32_t raw) {
    return (raw & kProlongedCounterMask) >> kProlongedCounterShift;
}

inline uint32_t get_buffer_context_refs(uint32_t raw) {
    return (raw & kBufferCounterMask) >> kBufferCounterShift;
}

void handle_event(async_dispatcher_t* dispatcher, async_wait_t* wait,
                  zx_status_t status, const zx_packet_signal_t* signal);

// must hold g_engine_mutex
inline void update_disposition_locked(zx_status_t disposition) __TA_REQUIRES(g_engine_mutex) {
    if (g_disposition == ZX_OK)
        g_disposition = disposition;
}

void notify_observers_locked() __TA_REQUIRES(g_engine_mutex) {
    for (auto& observer : g_observers) {
        zx_status_t status = zx_object_signal(observer.event, 0u, ZX_EVENT_SIGNALED);
        ZX_DEBUG_ASSERT(status == ZX_OK);
    }
}

void notify_engine_all_observers_started_if_needed_locked() __TA_REQUIRES(g_engine_mutex) {
    for (auto& item : g_observers) {
        if (item.awaiting_update_after_start)
            return;
    }
    g_event.signal(0u, SIGNAL_ALL_OBSERVERS_STARTED);
}

// Table of per-call-site cached category enabled/disabled flags.
// This is done by chaining all the
// |trace_acquire_context_for_category_cached()| call sites at runtime,
// and recording the enabled/disabled flag alongside the chain pointer.
//
// Operation:
// 1. When tracing starts each value is zero (kSiteStateUnknown).
//    The value is generally a static local at the call site.
//    Note that any call sites that got cached while tracing was off are all
//    reset to zero.
// 2. When a TRACE_*() macro is called, it calls
//    trace_acquire_context_for_category_cached().
// 3. If the DISABLED bit is set, skip, we're done.
// 4. Call trace_acquire_context_for_category().
// 5. If the ENABLED bit is set, return, we're done.
// 6. Insert the call site at the head of the chain with the
//    enabled/disabled bits set appropriately.
// 7. When tracing stops, empty the list. This includes resetting all chained
//    values to "unknown". We know they're actually disabled, but the important
//    part here is to flush the cache. A minor improvement would be to keep
//    the current list.
//    This is done both when the state transitions to STOPPING and again when
//    the state transitions to STOPPED.
// 8. When tracing starts again, reset all chained values to "unknown" and
//    flush the cache.
//
// The trick is doing this in as lock-free a way as possible.
// Atomics are used for accessing the static local at the call site, and when
// the list needs to be traversed it is first atomically unchained from the
// main list and then operated on.
// Generally there aren't that many call sites, and we only need to traverse
// the list at trace start/stop time; so using a list isn't that much of a
// performance issue.

using trace_site_atomic_state_t = std::atomic<trace_site_state_t>;

// A sentinel is used so that there is no ambiguity between a null value
// being the end of the chain and a null value being the initial value of
// a chain slot.
trace_site_t g_site_cache_sentinel{};

std::atomic<trace_site_t*> g_site_cache{&g_site_cache_sentinel};

// Extra bits that are combined with the chain pointer to provide
// the full state.
constexpr trace_site_state_t kSiteStateUnknown = 0u;
constexpr trace_site_state_t kSiteStateDisabled = 1u;
constexpr trace_site_state_t kSiteStateEnabled = 2u;
constexpr trace_site_state_t kSiteStateFlagsMask = 3u;
// We don't export this value to the API; the API just says these values
// must be initialized to zero.
static_assert(kSiteStateUnknown == 0u);
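
// Illustrative layout of a cached site's state word on a 64-bit target (this
// packing is implied by the helpers below; it relies on trace_site_t being at
// least 4-byte aligned so the low two bits of the successor pointer are free):
// |63 ... 2| = pointer to the next trace_site_t in the chain (0 if not chained)
// |1 ... 0| = kSiteState* flags for this call site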

// For clarity when reading the source.
using trace_site_flags_t = trace_site_state_t;

trace_site_state_t get_trace_site_raw_successor(trace_site_state_t state) {
    return state & ~kSiteStateFlagsMask;
}

trace_site_t* get_trace_site_successor(trace_site_state_t state) {
    return reinterpret_cast<trace_site_t*>(get_trace_site_raw_successor(state));
}

trace_site_flags_t get_trace_site_flags(trace_site_state_t state) {
    return state & kSiteStateFlagsMask;
}

trace_site_atomic_state_t* get_trace_site_state_as_atomic(trace_site_t* site) {
    return reinterpret_cast<trace_site_atomic_state_t*>(&site->state);
}

trace_site_state_t make_trace_site_state(trace_site_state_t successor,
                                         trace_site_flags_t flags) {
    return successor | flags;
}

trace_site_state_t make_trace_site_state(trace_site_t* successor,
                                         trace_site_flags_t flags) {
    return reinterpret_cast<trace_site_state_t>(successor) | flags;
}

trace_site_t* unchain_site_cache() {
    trace_site_t* empty_cache = &g_site_cache_sentinel;
    return g_site_cache.exchange(empty_cache, std::memory_order_relaxed);
}

void flush_site_cache() {
    // Atomically swap in an empty cache with the current one.
    trace_site_t* chain_head = unchain_site_cache();

    trace_site_t* chain = chain_head;
    while (chain != &g_site_cache_sentinel) {
        trace_site_atomic_state_t* state_ptr = get_trace_site_state_as_atomic(chain);
        trace_site_state_t curr_state = state_ptr->load(std::memory_order_relaxed);
        trace_site_state_t new_state = kSiteStateUnknown;
        state_ptr->store(new_state, std::memory_order_relaxed);
        chain = get_trace_site_successor(curr_state);
    }
}

// Update the state at |*site|.
// Note that multiple threads may race here for the same site.
void add_to_site_cache(trace_site_t* site, trace_site_state_t current_state, bool enabled) {
    trace_site_atomic_state_t* state_ptr = get_trace_site_state_as_atomic(site);

    // Even when tracing is on, generally only a subset of categories is
    // traced, so the test uses "unlikely".
    trace_site_flags_t new_flags;
    if (unlikely(enabled)) {
        new_flags = kSiteStateEnabled;
    } else {
        new_flags = kSiteStateDisabled;
    }

    // At this point the recorded flags are zero. If we're the first to set
    // them then we're good to add our entry to the cache (if not already in
    // the cache). Otherwise punt. Note that this first setting of the flags
    // won't be the last if we need to also chain this entry into the cache.
    ZX_DEBUG_ASSERT(get_trace_site_flags(current_state) == kSiteStateUnknown);

    trace_site_state_t new_state =
        make_trace_site_state(get_trace_site_raw_successor(current_state),
                              new_flags);
    // If someone else changed our state, punt. This can happen when another
    // thread is tracing and gets there first.
    if (unlikely(!state_ptr->compare_exchange_strong(current_state, new_state,
                                                     std::memory_order_acquire,
                                                     std::memory_order_relaxed))) {
        return;
    }

    if (get_trace_site_raw_successor(new_state)) {
        // Already in chain.
        return;
    }

    // Add to chain.
    trace_site_t* old_cache_ptr =
        g_site_cache.load(std::memory_order_relaxed);
    new_state = make_trace_site_state(old_cache_ptr, new_flags);
    state_ptr->store(new_state, std::memory_order_relaxed);

    // Atomically update both:
    // - |g_site_cache| to point to |new_cache_ptr| (which is our entry)
    // - |*state_ptr| (our entry) to point to the old |g_site_cache|
    // This works because until our entry is live only its flag values
    // matter to other threads. See the discussion in |trace_stop_engine()|.
    trace_site_t* new_cache_ptr = site;
    while (!g_site_cache.compare_exchange_weak(
               old_cache_ptr, new_cache_ptr,
               std::memory_order_relaxed,
               std::memory_order_relaxed)) {
        // Someone else updated |g_site_cache|. Reset our chain pointer
        // and try again.
        new_state = make_trace_site_state(old_cache_ptr, new_flags);
        state_ptr->store(new_state, std::memory_order_relaxed);
    }
}

} // namespace

/*** Trace engine functions ***/

// thread-safe
EXPORT_NO_DDK zx_status_t trace_start_engine(
        async_dispatcher_t* dispatcher, trace_handler_t* handler,
        trace_buffering_mode_t buffering_mode,
        void* buffer, size_t buffer_num_bytes) {
    ZX_DEBUG_ASSERT(dispatcher);
    ZX_DEBUG_ASSERT(handler);
    ZX_DEBUG_ASSERT(buffer);

    switch (buffering_mode) {
    case TRACE_BUFFERING_MODE_ONESHOT:
    case TRACE_BUFFERING_MODE_CIRCULAR:
    case TRACE_BUFFERING_MODE_STREAMING:
        break;
    default:
        return ZX_ERR_INVALID_ARGS;
    }

    // The buffer size must be a multiple of 4096 (simplifies buffer size
    // calcs).
    if ((buffer_num_bytes & 0xfff) != 0) {
        return ZX_ERR_INVALID_ARGS;
    }
    if (buffer_num_bytes < trace_context::min_buffer_size() ||
        buffer_num_bytes > trace_context::max_buffer_size()) {
        return ZX_ERR_INVALID_ARGS;
    }

    fbl::AutoLock lock(&g_engine_mutex);

    // We must have fully stopped a prior tracing session before starting a new one.
    if (g_state.load(std::memory_order_relaxed) != TRACE_STOPPED)
        return ZX_ERR_BAD_STATE;
    ZX_DEBUG_ASSERT(g_context_refs.load(std::memory_order_relaxed) == 0u);

    zx::event event;
    zx_status_t status = zx::event::create(0u, &event);
    if (status != ZX_OK)
        return status;

    // Schedule a waiter for |event|.
    g_event_wait = {
        .state = {ASYNC_STATE_INIT},
        .handler = &handle_event,
        .object = event.get(),
        .trigger = (SIGNAL_ALL_OBSERVERS_STARTED |
                    SIGNAL_CONTEXT_RELEASED)};
    status = async_begin_wait(dispatcher, &g_event_wait);
    if (status != ZX_OK)
        return status;

    // Initialize the trace engine state and context.
    g_state.store(TRACE_STARTED, std::memory_order_relaxed);
    g_dispatcher = dispatcher;
    g_handler = handler;
    g_disposition = ZX_OK;
    g_context = new trace_context(buffer, buffer_num_bytes, buffering_mode, handler);
    g_event = std::move(event);

    g_context->InitBufferHeader();

    // Write the trace initialization record before allowing clients to get in
    // and write their own trace records.
    trace_context_write_initialization_record(g_context, zx_ticks_per_second());

    // After this point clients can acquire references to the trace context.
    g_context_refs.store(kProlongedCounterIncrement, std::memory_order_release);

    // Flush the call-site cache.
    // Do this after clients can acquire the trace context so that any cached
    // values that got recorded prior to this are reset, and any new values
    // from this point on will see that tracing is on.
    flush_site_cache();

    // Notify observers that the state changed.
    if (g_observers.is_empty()) {
        g_event.signal(0u, SIGNAL_ALL_OBSERVERS_STARTED);
    } else {
        for (auto& observer : g_observers)
            observer.awaiting_update_after_start = true;
        notify_observers_locked();
    }

    return ZX_OK;
}
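
// A minimal usage sketch (illustrative only, not part of this translation
// unit's API surface): a trace provider typically owns the handler and the
// buffer, and the names used here (|my_handler|, |my_buffer|) are
// hypothetical.
//
//   // |my_buffer_size| must be a multiple of 4096 bytes and within
//   // [trace_context::min_buffer_size(), trace_context::max_buffer_size()].
//   zx_status_t status = trace_start_engine(dispatcher, &my_handler,
//                                           TRACE_BUFFERING_MODE_ONESHOT,
//                                           my_buffer, my_buffer_size);
//   ...
//   // Later, to begin shutting the trace down:
//   trace_stop_engine(ZX_OK);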

// thread-safe
EXPORT_NO_DDK zx_status_t trace_stop_engine(zx_status_t disposition) {
    fbl::AutoLock lock(&g_engine_mutex);

    // We must have an active trace in order to stop it.
    int state = g_state.load(std::memory_order_relaxed);
    if (state == TRACE_STOPPED)
        return ZX_ERR_BAD_STATE;

    update_disposition_locked(disposition);
    if (state == TRACE_STOPPING)
        return ZX_OK; // already stopping

    ZX_DEBUG_ASSERT(state == TRACE_STARTED);
    ZX_DEBUG_ASSERT(g_context_refs.load(std::memory_order_relaxed) != 0u);

    // Begin stopping the trace.
    g_state.store(TRACE_STOPPING, std::memory_order_relaxed);

    // Flush the call-site cache.
    // Do this after tracing is marked as stopping so that any cached
    // values that got recorded prior to this are reset, and any new
    // values from this point on will see that tracing is stopping.
    // It's still possible that a cached value could be in the process of
    // being recorded as being enabled. So we might reset the site's state
    // and then it gets subsequently marked as enabled by another thread.
    // This is perhaps clumsy but ok: If the site got marked as enabled then a
    // trace context was acquired and engine state cannot change to STOPPED
    // until that context is released, after which we will reset the state back
    // to disabled.
    flush_site_cache();

    // Notify observers that the state changed.
    notify_observers_locked();

    // Release the trace engine's own reference to the trace context.
    // |handle_context_released()| will be called asynchronously when the last
    // reference is released.
    trace_release_prolonged_context(reinterpret_cast<trace_prolonged_context_t*>(g_context));

    return ZX_OK;
}

// This is an internal function, only called from context.cpp.
// thread-safe
bool trace_engine_is_buffer_context_released() {
    return (g_context_refs.load(std::memory_order_relaxed) &
            kBufferCounterMask) == 0;
}

// This is an internal function, only called from context.cpp.
// thread-safe
void trace_engine_request_save_buffer(uint32_t wrapped_count,
                                      uint64_t durable_data_end) {
    // Handle the request on the engine's async loop. This may get called
    // while servicing a client trace request, and we don't want to handle it
    // there.
    async::PostTask(g_dispatcher, [wrapped_count, durable_data_end] () {
        auto context = trace_acquire_prolonged_context();
        if (context) {
            auto tcontext = reinterpret_cast<trace_context_t*>(context);
            tcontext->HandleSaveRollingBufferRequest(wrapped_count, durable_data_end);
            trace_release_prolonged_context(context);
        }
    });
}

// This is called by the handler after it has saved a buffer.
// |wrapped_count| and |durable_data_end| are the values that were passed to
// it, and are passed back to us for sanity checking purposes.
// thread-safe
EXPORT_NO_DDK zx_status_t trace_engine_mark_buffer_saved(
    uint32_t wrapped_count, uint64_t durable_data_end) {
    auto context = trace_acquire_prolonged_context();

    // No point in updating if there's no active trace.
    if (!context) {
        return ZX_ERR_BAD_STATE;
    }

    // Do this now, instead of as a separate iteration on the async loop.
    // The concern is that we want to update buffer state ASAP to reduce the
    // window where records might be dropped because the buffer is full.
    auto tcontext = reinterpret_cast<trace_context_t*>(context);
    tcontext->MarkRollingBufferSaved(wrapped_count, durable_data_end);

    trace_release_prolonged_context(context);
    return ZX_OK;
}
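
// For orientation, a sketch of the streaming-mode flow implied by the two
// functions above (the handler-side details live outside this file):
//   1. context.cpp notices a rolling buffer needs saving and calls
//      |trace_engine_request_save_buffer()|.
//   2. The request is handled on the engine's async loop, which forwards it
//      to the handler via |HandleSaveRollingBufferRequest()|.
//   3. After the handler has saved the buffer (possibly asynchronously), it
//      calls |trace_engine_mark_buffer_saved()| with the same |wrapped_count|
//      and |durable_data_end| it was given, letting the engine reuse that
//      buffer.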

namespace {

void handle_all_observers_started() {
    // TODO(TO-530): Allow indicating an observer failed to start.

    // Clear the signal, otherwise we'll keep getting called.
    g_event.signal(SIGNAL_ALL_OBSERVERS_STARTED, 0u);

    // Note: There's no race in the use of |g_handler| here. If it is going to
    // be set to NULL, that will happen later (|handle_context_released()| is
    // called by |handle_event()| after we are).
    if (g_handler) {
        g_handler->ops->trace_started(g_handler);
    }
}

void handle_context_released(async_dispatcher_t* dispatcher) {
    // All ready to clean up.
    // Grab the mutex while modifying shared state.
    zx_status_t disposition;
    trace_handler_t* handler;
    size_t buffer_bytes_written;
    {
        fbl::AutoLock lock(&g_engine_mutex);

        ZX_DEBUG_ASSERT(g_state.load(std::memory_order_relaxed) == TRACE_STOPPING);
        ZX_DEBUG_ASSERT(g_context_refs.load(std::memory_order_relaxed) == 0u);
        ZX_DEBUG_ASSERT(g_context != nullptr);

        // Update final buffer state.
        g_context->UpdateBufferHeaderAfterStopped();

        // Get final disposition.
        if (g_context->WasRecordDropped())
            update_disposition_locked(ZX_ERR_NO_MEMORY);
        disposition = g_disposition;
        handler = g_handler;
        buffer_bytes_written = (g_context->RollingBytesAllocated() +
                                g_context->DurableBytesAllocated());

        // Tidy up.
        g_dispatcher = nullptr;
        g_handler = nullptr;
        g_disposition = ZX_OK;
        g_event.reset();
        delete g_context;
        g_context = nullptr;

        // After this point, it's possible for the engine to be restarted
        // (once we release the lock).
        g_state.store(TRACE_STOPPED, std::memory_order_relaxed);

        // Flush the call-site cache.
        // Do this after tracing is marked as stopped so that any cached
        // values that got recorded prior to this are reset, and any new
        // values from this point on will see that tracing is stopped.
        // Any call sites already chained are ok; the concern is with the
        // timing of call sites about to be added to the chain. We're ok
        // here because at this point it's impossible to acquire a trace
        // context, and therefore it's impossible for a category to be
        // cached as enabled.
        flush_site_cache();

        // Notify observers that the state changed.
        notify_observers_locked();
    }

    // Notify the handler about the final disposition.
    handler->ops->trace_stopped(handler, dispatcher, disposition, buffer_bytes_written);
}

// Handles the case where the asynchronous dispatcher has encountered an error
// and will no longer be servicing the wait callback.  Consequently, this is
// our last chance to stop the engine and wait for all contexts to be released.
void handle_hard_shutdown(async_dispatcher_t* dispatcher) {
    // Stop the engine, in case it hasn't noticed yet.
    trace_stop_engine(ZX_ERR_CANCELED);

    // There may still be outstanding references to the trace context.
    // We don't know when or whether they will be cleared, but we can't
    // complete shutdown until they are gone since there might still be live
    // pointers into the trace buffer, so allow a brief timeout.  If the
    // release event hasn't been signaled by then, declare the trace engine
    // dead in the water to prevent dangling pointers.  This situation should
    // be very rare as it only occurs when the asynchronous dispatcher is
    // shutting down, typically just prior to process exit.
    auto status = g_event.wait_one(
        SIGNAL_CONTEXT_RELEASED,
        zx::deadline_after(kSynchronousShutdownTimeout),
        nullptr);
    if (status == ZX_OK) {
        handle_context_released(dispatcher);
        return;
    }

    // Uh oh.
    auto context_refs = g_context_refs.load(std::memory_order_relaxed);
    fprintf(stderr,
            "TraceEngine: Timed out waiting for %u buffer, %u prolonged trace context\n"
            "references (raw 0x%x) to be released after %lu ns\n"
            "while the asynchronous dispatcher was shutting down.\n"
            "Tracing will no longer be available in this process.",
            get_buffer_context_refs(context_refs),
            get_prolonged_context_refs(context_refs),
            context_refs,
            kSynchronousShutdownTimeout.get());
}

void handle_event(async_dispatcher_t* dispatcher, async_wait_t* wait,
                  zx_status_t status, const zx_packet_signal_t* signal) {
    // Note: This function may get all signals at the same time.

    if (status == ZX_OK) {
        if (signal->observed & SIGNAL_ALL_OBSERVERS_STARTED) {
            handle_all_observers_started();
        }
        if (signal->observed & SIGNAL_CONTEXT_RELEASED) {
            handle_context_released(dispatcher);
            return; // trace engine is completely stopped now
        }
        status = async_begin_wait(dispatcher, &g_event_wait);
    }

    if (status != ZX_OK) {
        handle_hard_shutdown(dispatcher);
    }
}

} // namespace

/*** Trace instrumentation functions ***/

// thread-safe, lock-free
EXPORT trace_state_t trace_state() {
    return static_cast<trace_state_t>(g_state.load(std::memory_order_relaxed));
}

// thread-safe
EXPORT bool trace_is_category_enabled(const char* category_literal) {
    trace_context_t* context = trace_acquire_context();
    if (likely(!context))
        return false;
    bool result = trace_context_is_category_enabled(context, category_literal);
    trace_release_context(context);
    return result;
}

// thread-safe, fail-fast, lock-free
EXPORT trace_context_t* trace_acquire_context() {
    // Fail fast: Check whether we could possibly write into the trace buffer.
    // The count must be at least 1 to indicate that the buffer is initialized.
    // This is marked likely because tracing is usually disabled and we want
    // to return as quickly as possible from this function.
    uint32_t count = g_context_refs.load(std::memory_order_relaxed);
    if (likely(count == 0u))
        return nullptr;

    // Attempt to increment the reference count.
    // This also acts as a fence for future access to buffer state variables.
    //
    // Note the ACQUIRE fence here since the trace context may have changed
    // from the perspective of this thread.
    while (!g_context_refs.compare_exchange_weak(count,
                                                 count + kBufferCounterIncrement,
                                                 std::memory_order_acquire,
                                                 std::memory_order_relaxed)) {
        if (unlikely(count == 0u))
            return nullptr;
    }
    return g_context;
}

// thread-safe, fail-fast, lock-free
EXPORT trace_context_t* trace_acquire_context_for_category(
        const char* category_literal, trace_string_ref_t* out_ref) {
    // This is marked likely because tracing is usually disabled and we want
    // to return as quickly as possible from this function.
    trace_context_t* context = trace_acquire_context();
    if (likely(!context))
        return nullptr;

    if (!trace_context_register_category_literal(context, category_literal, out_ref)) {
        trace_release_context(context);
        return nullptr;
    }

    return context;
}
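
// Illustrative call pattern (a sketch of what instrumentation macros do; the
// category string and the record-writing step are placeholders, and the
// trace_context_write_*() functions are declared in <trace-engine/context.h>):
//
//   trace_string_ref_t category_ref;
//   trace_context_t* context =
//       trace_acquire_context_for_category("example:category", &category_ref);
//   if (context) {
//       // ... write one or more records via trace_context_write_*() ...
//       trace_release_context(context);
//   }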

// TODO(PT-84): This function is split out from
// |trace_acquire_context_for_category_cached()| because gcc doesn't
// optimize the prologue as well as it could: It creates the stack frame
// for the entire function prior to the "is disabled?" early-exit test.
// Clang does fine, but for now to achieve optimum performance for the common
// case of tracing off, regardless of compiler, we employ this workaround.
// Both gcc and clang do the expected tail-call optimization, so all this
// costs is an extra branch when tracing is on.
//
// |current_state| is appended as an argument, violating the convention to
// put output parameters last, in order to minimize the changes in the
// caller's tail call.
static __NO_INLINE trace_context_t* trace_acquire_context_for_category_cached_worker(
    const char* category_literal, trace_site_t* site,
    trace_string_ref_t* out_ref, trace_site_state_t current_state) {
    trace_context_t* context =
        trace_acquire_context_for_category(category_literal, out_ref);

    if (likely((current_state & kSiteStateFlagsMask) != kSiteStateUnknown)) {
        return context;
    }

    // First time through for this trace run. Note that multiple threads may
    // get to this point for the same call-site.
    add_to_site_cache(site, current_state, context != nullptr);

    return context;
}

// thread-safe, fail-fast, lock-free
EXPORT trace_context_t* trace_acquire_context_for_category_cached(
    const char* category_literal, trace_site_t* site,
    trace_string_ref_t* out_ref) {
    trace_site_atomic_state_t* state_ptr = get_trace_site_state_as_atomic(site);

    trace_site_state_t current_state =
        state_ptr->load(std::memory_order_relaxed);
    if (likely(current_state & kSiteStateDisabled)) {
        return nullptr;
    }

    return trace_acquire_context_for_category_cached_worker(
        category_literal, site, out_ref, current_state);
}

// thread-safe
EXPORT zx_status_t trace_engine_flush_category_cache(void) {
    fbl::AutoLock lock(&g_engine_mutex);

    if (g_state.load(std::memory_order_relaxed) != TRACE_STOPPED)
        return ZX_ERR_BAD_STATE;

    // Empty the site cache. The next time the app tries to emit a trace event
    // the call site will be re-added to the cache, but that's ok.
    flush_site_cache();

    return ZX_OK;
}

// thread-safe, never-fail, lock-free
EXPORT void trace_release_context(trace_context_t* context) {
    ZX_DEBUG_ASSERT(context == g_context);
    ZX_DEBUG_ASSERT(get_buffer_context_refs(g_context_refs.load(std::memory_order_relaxed)) != 0u);

    // Note the RELEASE fence here since the trace context and trace buffer
    // contents may have changed from the perspective of other threads.
    auto previous = g_context_refs.fetch_sub(kBufferCounterIncrement,
                                             std::memory_order_release);
    if (unlikely(previous == kBufferCounterIncrement)) {
        // Notify the engine that the last reference was released.
        zx_status_t status = g_event.signal(0u, SIGNAL_CONTEXT_RELEASED);
        ZX_DEBUG_ASSERT(status == ZX_OK);
    }
}

// thread-safe, fail-fast, lock-free
EXPORT_NO_DDK trace_prolonged_context_t* trace_acquire_prolonged_context() {
    // There's no need for extreme efficiency here, but for consistency with
    // |trace_acquire_context()| we copy what it does.
    uint32_t count = g_context_refs.load(std::memory_order_relaxed);
    if (likely(count == 0u))
        return nullptr;

    // Attempt to increment the reference count.
    // This also acts as a fence for future access to buffer state variables.
    //
    // Note the ACQUIRE fence here since the trace context may have changed
    // from the perspective of this thread.
    while (!g_context_refs.compare_exchange_weak(count,
                                                 count + kProlongedCounterIncrement,
                                                 std::memory_order_acquire,
                                                 std::memory_order_relaxed)) {
        if (unlikely(count == 0u))
            return nullptr;
    }
    ZX_DEBUG_ASSERT(get_prolonged_context_refs(g_context_refs.load(std::memory_order_relaxed)) <=
                    kMaxProlongedCounter);
    return reinterpret_cast<trace_prolonged_context_t*>(g_context);
}

// thread-safe, never-fail, lock-free
EXPORT_NO_DDK void trace_release_prolonged_context(trace_prolonged_context_t* context) {
    auto tcontext = reinterpret_cast<trace_context_t*>(context);
    ZX_DEBUG_ASSERT(tcontext == g_context);
    ZX_DEBUG_ASSERT(get_prolonged_context_refs(g_context_refs.load(std::memory_order_relaxed)) != 0u);

    // Note the RELEASE fence here since the trace context and trace buffer
    // contents may have changed from the perspective of other threads.
    auto previous = g_context_refs.fetch_sub(kProlongedCounterIncrement,
                                             std::memory_order_release);
    if (unlikely(previous == kProlongedCounterIncrement)) {
        // Notify the engine that the last reference was released.
        zx_status_t status = g_event.signal(0u, SIGNAL_CONTEXT_RELEASED);
        ZX_DEBUG_ASSERT(status == ZX_OK);
    }
}

/*** Asynchronous observers ***/

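// A sketch of how an observer uses this API (illustrative only; error
// handling is omitted and the waiting mechanics are up to the caller):
//
//   zx::event observer_event;
//   zx::event::create(0u, &observer_event);
//   trace_register_observer(observer_event.get());
//   // ... wait for ZX_EVENT_SIGNALED, clear it, then inspect trace_state();
//   // after reacting to TRACE_STARTED, acknowledge with:
//   trace_notify_observer_updated(observer_event.get());
//   // ... and eventually:
//   trace_unregister_observer(observer_event.get());
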
EXPORT zx_status_t trace_register_observer(zx_handle_t event) {
    fbl::AutoLock lock(&g_engine_mutex);

    for (const auto& item : g_observers) {
        if (item.event == event)
            return ZX_ERR_INVALID_ARGS;
    }

    g_observers.push_back(Observer{event, false});
    return ZX_OK;
}

EXPORT zx_status_t trace_unregister_observer(zx_handle_t event) {
    fbl::AutoLock lock(&g_engine_mutex);

    for (size_t i = 0; i < g_observers.size(); i++) {
        if (g_observers[i].event == event) {
            bool awaited = g_observers[i].awaiting_update_after_start;
            g_observers.erase(i);
            if (awaited) {
                notify_engine_all_observers_started_if_needed_locked();
            }
            return ZX_OK;
        }
    }
    return ZX_ERR_NOT_FOUND;
}

EXPORT void trace_notify_observer_updated(zx_handle_t event) {
    fbl::AutoLock lock(&g_engine_mutex);

    for (auto& item : g_observers) {
        if (item.event == event) {
            if (item.awaiting_update_after_start) {
                item.awaiting_update_after_start = false;
                notify_engine_all_observers_started_if_needed_locked();
            }
            return;
        }
    }
}