1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "ruy/prepacked_cache.h"
17 
18 #include "ruy/mat.h"
19 #include "ruy/profiler/instrumentation.h"
20 #include "ruy/system_aligned_alloc.h"
21 
22 namespace ruy {
23 
24 namespace {
25 
26 // Allocates the `data` and `sums` buffers, and sets the corresponding
27 // pointer fields, in a PEMat whose other fields, particularly `layout`
28 // and the runtime data types, are already populated.
AllocateBuffers(PEMat * packed_matrix)29 std::ptrdiff_t AllocateBuffers(PEMat* packed_matrix) {
30   const std::ptrdiff_t data_bytes = DataBytes(*packed_matrix);
31   packed_matrix->data = detail::SystemAlignedAlloc(data_bytes);
32   std::ptrdiff_t sums_bytes = 0;
33   if (!packed_matrix->sums_type.is_floating_point) {
34     // Integer quantized matrices also need the `sums` buffer.
35     sums_bytes = SumsBytes(*packed_matrix);
36     packed_matrix->sums = detail::SystemAlignedAlloc(sums_bytes);
37   }
38   return data_bytes + sums_bytes;
39 }
40 
41 // Frees the `data` and `sums` buffers held by a PEMat.
FreeBuffers(const PEMat & packed_matrix)42 void FreeBuffers(const PEMat& packed_matrix) {
43   detail::SystemAlignedFree(packed_matrix.data);
44   detail::SystemAlignedFree(packed_matrix.sums);
45 }
46 
47 }  // end anonymous namespace
48 
operator ()(const PrepackedCache::Key & key) const49 std::size_t PrepackedCache::KeyHash::operator()(
50     const PrepackedCache::Key& key) const {
51   std::size_t src_data_hash = reinterpret_cast<std::size_t>(key.src_data);
52   // Naive hash of the layout. Based on some heuristic reasoning, not any
53   // benchmarking.
54   // A choice of hash function here is just an optimization matter
55   // anyway, since a hash collision only results in some Key::operator== calls
56   // to disambiguate, and even just returning src_data_hash, ignoring the layout
57   // altogether, would probably be good enough, as the case of multiple entries
58   // with the same data pointer will be uncommon.
59   // Here we multiply-add the layout fields using some small constant prime
60   // numbers as multipliers. The conventional approach of xor-ing bit-rotations
61   // would result in some hash collisions because these values are typically
62   // small positive integers, so bit-rotations are essentially bit-shifts,
63   // and powers of two are common.
64   std::size_t packed_layout_hash =
65       static_cast<int>(key.packed_layout.order) +
66       static_cast<int>(key.packed_layout.kernel.order) * 2 +
67       key.packed_layout.stride * 3 + key.packed_layout.kernel.rows * 5 +
68       key.packed_layout.kernel.cols * 7 + key.packed_layout.rows * 11 +
69       key.packed_layout.cols * 13;
70   return src_data_hash ^ packed_layout_hash;
71 }
72 
~PrepackedCache()73 PrepackedCache::~PrepackedCache() {
74   for (const auto& pair : cache_) {
75     FreeBuffers(pair.second.packed_matrix);
76   }
77 }
78 
Get(const void * src_data,PEMat * packed_matrix)79 PrepackedCache::Action PrepackedCache::Get(const void* src_data,
80                                            PEMat* packed_matrix) {
81   // Construct a Key and look up the cache.
82   Key key;
83   key.src_data = src_data;
84   key.packed_layout = packed_matrix->layout;
85   key.zero_point = packed_matrix->zero_point;
86   const auto& itr = cache_.find(key);
87 
88   if (itr != cache_.end()) {
89     // Found existing entry. Update its timestamp and return it.
90     itr->second.timestamp = timestamp_++;
91     *packed_matrix = itr->second.packed_matrix;
92     return Action::kGotExistingEntry;
93   }
94 
95   // No existing entry found. Allocate new buffers now and insert in the cache.
96   const std::ptrdiff_t new_bytes = AllocateBuffers(packed_matrix);
97   EjectUntilRoomFor(new_bytes);
98   Entry entry{*packed_matrix, timestamp_++};
99   cache_.emplace(key, entry);
100   buffers_bytes_ += new_bytes;
101   return Action::kInsertedNewEntry;
102 }
103 
EjectUntilRoomFor(std::ptrdiff_t new_bytes)104 void PrepackedCache::EjectUntilRoomFor(std::ptrdiff_t new_bytes) {
105   profiler::ScopeLabel label("PrepackedCacheEjection");
106   // While we are above the threshold of ejection, eject the LRU entry.
107   while (!cache_.empty() && buffers_bytes_ + new_bytes > max_buffers_bytes_) {
108     EjectOne();
109   }
110 }
111 
EjectOne()112 void PrepackedCache::EjectOne() {
113   auto oldest = cache_.begin();
114   Timestamp oldest_timestamp = oldest->second.timestamp;
115   {
116     for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) {
117       if (itr->second.timestamp < oldest_timestamp) {
118         oldest = itr;
119         oldest_timestamp = itr->second.timestamp;
120       }
121     }
122   }
123   const PEMat& packed_matrix = oldest->second.packed_matrix;
124   buffers_bytes_ -= DataBytes(packed_matrix) + SumsBytes(packed_matrix);
125   FreeBuffers(packed_matrix);
126   cache_.erase(oldest);
127 }
128 
129 }  // namespace ruy
130