/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/ctx.h"

#include <cstdlib>
#include <functional>
#include <string>

#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
#include "ruy/trace.h"

namespace ruy {

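// Ctx's state lives in the derived CtxImpl class (see ctx_impl.h). These two
// accessors perform the downcasts through which the methods below reach that
// state.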
const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}

void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to default behavior using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set the enabled paths. Ensure that non-arch paths are always
    // enabled (needed for fallbacks).
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

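// Returns the value of the environment variable `name` parsed as a
// hexadecimal integer, or 0 if the variable is not set. Used below to let
// users override the enabled paths via the RUY_PATHS environment variable.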
int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input `paths_to_detect` value
// are also left not set in the return value.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result`, if reported as supported
  // by the `is_supported` predicate.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for NEON to be unavailable, but we've
  // always chosen to simply crash on such devices. We could reevaluate that,
  // but actually supporting non-NEON devices would also require dealing with
  // compiler-generated NEON code. That would mean removing -mfpu=neon from
  // ruy_copts, using that flag only in select NEON translation units, and
  // implementing have_built_path_for_neon, similar to the x86 SIMD paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection. However, unlike the x86 SIMD
  // paths, it does not require a have_built_path_for check because we
  // unconditionally build it at the moment. That is largely because we have
  // had to machine-encode the dotprod instructions, so we don't actually rely
  // on toolchain support for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // x86 SIMD paths currently require both runtime detection and detection of
  // whether we're building the path at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}

}  // namespace

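// Returns the set of Paths enabled at runtime, resolving it on first use:
// a value previously set by SetRuntimeEnabledPaths wins; otherwise the
// RUY_PATHS environment variable (a hex bitfield of Path values) is honored;
// otherwise runtime CPU detection is performed. The result is cached in
// runtime_enabled_paths_.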
Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has
  // been performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // The user may have set paths explicitly in the RUY_PATHS env var.
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, use runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}

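// Selects the path to use: the most significant bit set in
// `compiled_paths & GetRuntimeEnabledPaths()`, i.e. the best-optimized path
// that is both compiled in and enabled at runtime. Also records the choice
// as last_used_path_.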
Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}

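// Grows the per-thread resources vector so that it holds at least
// `thread_count` entries. Existing entries are never destroyed here, so the
// vector only ever grows.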
void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

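// Returns the TuningResolver for the given thread. Requires a prior call to
// EnsureThreadSpecificResources with a sufficient thread count.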
TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

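// Returns the Allocator for the given thread, with the same precondition as
// GetThreadSpecificTuningResolver.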
Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

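// Lazily creates the main (non-thread-specific) allocator on first use.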
Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

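// Lazily creates the cache of prepacked matrices on first use.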
PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

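// Resolves the Tuning to use on the main thread (thread index 0), taking any
// explicit_tuning() setting into account.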
Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy