/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PROFILER_INSTRUMENTATION_H_
#define RUY_RUY_PROFILER_INSTRUMENTATION_H_

#ifdef RUY_PROFILER
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
#endif

namespace ruy {
namespace profiler {

#ifdef RUY_PROFILER

30 // A label is how a code scope is annotated to appear in profiles.
31 // The stacks that are sampled by the profiler are stacks of such labels.
32 // A label consists of a literal string, plus optional integer arguments.
33 class Label {
34  public:
Label()35   Label() {}
36   template <typename... Args>
Label(Args...args)37   explicit Label(Args... args) {
38     Set(args...);
39   }
Set(const char * format)40   void Set(const char* format) {
41     format_ = format;
42     args_count_ = 0;
43   }
44   template <typename... Args>
Set(const char * format,Args...args)45   void Set(const char* format, Args... args) {
46     format_ = format;
47     args_count_ = sizeof...(args);
48     SetArgs(0, args...);
49   }
50 
51   void operator=(const Label& other);
52 
53   bool operator==(const Label& other) const;
54 
55   std::string Formatted() const;
format()56   const char* format() const { return format_; }
57 
58  private:
SetArgs(int position,int arg0)59   void SetArgs(int position, int arg0) { args_[position] = arg0; }
60 
61   template <typename... Args>
SetArgs(int position,int arg0,Args...args)62   void SetArgs(int position, int arg0, Args... args) {
63     SetArgs(position, arg0);
64     SetArgs(position + 1, args...);
65   }
66 
67   static constexpr int kMaxArgs = 4;
68   const char* format_ = nullptr;
69   int args_count_ = 0;
70   int args_[kMaxArgs];
71 };
73 namespace detail {
74 
75 // Forward-declaration, see class ThreadStack below.
76 class ThreadStack;
77 
78 bool& GlobalIsProfilerRunning();
79 
80 // Returns the global vector of pointers to all stacks, there being one stack
81 // per thread executing instrumented code.
82 std::vector<ThreadStack*>* GlobalAllThreadStacks();
83 
84 // Returns the mutex to be locked around any access to GlobalAllThreadStacks().
85 std::mutex* GlobalsMutex();
86 
87 // Returns the thread-local stack, specific to the current thread.
88 ThreadStack* ThreadLocalThreadStack();
89 
90 // This 'stack' is what may be more appropriately called a 'pseudostack':
91 // It contains Label entries that are 'manually' entered by instrumentation
92 // code. It's unrelated to real call stacks.
93 struct Stack {
94   std::uint32_t id = 0;
95   static constexpr int kMaxSize = 64;
96   int size = 0;
97   Label labels[kMaxSize];
98 };
99 
100 // Returns the buffer byte size required by CopyToSample.
101 int GetBufferSize(const Stack& stack);
102 
103 // Copies this Stack into a byte buffer, called a 'sample'.
104 void CopyToBuffer(const Stack& stack, char* dst);
105 
106 // Populates this Stack from an existing sample buffer, typically
107 // produced by CopyToSample.
108 void ReadFromBuffer(const char* src, Stack* stack);
109 
110 // ThreadStack is meant to be used as a thread-local singleton, assigning to
111 // each thread a Stack object holding its pseudo-stack of profile labels,
112 // plus a mutex allowing to synchronize accesses to this pseudo-stack between
113 // this thread and a possible profiler thread sampling it.
114 class ThreadStack {
115  public:
116   ThreadStack();
117   ~ThreadStack();
118 
stack()119   const Stack& stack() const { return stack_; }
120 
121   // Returns the mutex to lock around any access to this stack. Each stack is
122   // accessed by potentially two threads: the thread that it belongs to
123   // (which calls Push and Pop) and the profiler thread during profiling
124   // (which calls CopyToSample).
Mutex()125   std::mutex& Mutex() const { return mutex_; }
126 
127   // Pushes a new label on the top of this Stack.
128   template <typename... Args>
Push(Args...args)129   void Push(Args... args) {
130     // This mutex locking is needed to guard against race conditions as both
131     // the current thread and the profiler thread may be concurrently accessing
132     // this stack. In addition to that, this mutex locking also serves the other
133     // purpose of acting as a barrier (of compiler code reordering, of runtime
134     // CPU instruction reordering, and of memory access reordering), which
135     // gives a measure of correctness to this profiler. The downside is some
136     // latency. As this lock will be uncontended most of the times, the cost
137     // should be roughly that of an sequentially-consistent atomic access,
138     // comparable to an access to the level of CPU data cache that is shared
139     // among all cores, typically 60 cycles on current ARM CPUs, plus side
140     // effects from barrier instructions.
141     std::lock_guard<std::mutex> lock(mutex_);
142     // Avoid overrunning the stack, even in 'release' builds. This profiling
143     // instrumentation code should not ship in release builds anyway, the
144     // overhead of this check is negligible, and overrunning a stack array would
145     // be bad.
146     if (stack_.size >= Stack::kMaxSize) {
147       abort();
148     }
149     stack_.labels[stack_.size++].Set(args...);
150   }
151 
152   // Pops the top-most label from this Stack.
Pop()153   void Pop() {
154     // See the comment in Push about this lock. While it would be tempting to
155     // try to remove this lock and just atomically decrement size_ with a
156     // store-release, that would not necessarily be a substitute for all of the
157     // purposes that this lock serves, or if it was done carefully to serve all
158     // of the same purposes, then that wouldn't be faster than this (mostly
159     // uncontended) lock.
160     std::lock_guard<std::mutex> lock(mutex_);
161     stack_.size--;
162   }
163 
164  private:
165   mutable std::mutex mutex_;
166   Stack stack_;
167 };
168 
169 }  // namespace detail
171 // RAII user-facing way to construct Labels associated with their life scope
172 // and get them pushed to / popped from the current thread stack.
173 class ScopeLabel {
174  public:
175   template <typename... Args>
ScopeLabel(Args...args)176   ScopeLabel(Args... args) : thread_stack_(detail::ThreadLocalThreadStack()) {
177     thread_stack_->Push(args...);
178   }
179 
~ScopeLabel()180   ~ScopeLabel() { thread_stack_->Pop(); }
181 
182  private:
183   detail::ThreadStack* thread_stack_;
184 };

#else  // no RUY_PROFILER

188 class ScopeLabel {
189  public:
190   template <typename... Args>
191   explicit ScopeLabel(Args...) {}
192 
193   // This destructor is needed to consistently silence clang's -Wunused-variable
194   // which seems to trigger semi-randomly.
195   ~ScopeLabel() {}
196 };

#endif

}  // namespace profiler
}  // namespace ruy

#endif  // RUY_RUY_PROFILER_INSTRUMENTATION_H_
