1 /* Copyright 2019 Google LLC. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef RUY_RUY_PREPACKED_CACHE_H_ 17 #define RUY_RUY_PREPACKED_CACHE_H_ 18 19 #include <cstddef> 20 #include <cstdint> 21 #include <unordered_map> 22 23 #include "ruy/mat.h" 24 25 namespace ruy { 26 27 // "Low effort" Least Recently Used Cache for Prepacked Matrices 28 // A cache mechanism for prepacked matrices that ejects oldest entries. 29 // The implementation is "low effort" in the following ways: 30 // - we just linearly search for the oldest entry when doing an ejection 31 // - the ejection policy is very simple: if the new size would be above the 32 // . threshold, we will eject entries until the size is below the threshold. 33 // Current use cases (RNNs with GEMV operations) indicate that ejection is rare 34 // and memory constraints are tight, so we devote no additional storage to the 35 // LRU mechanism and accept O(n) search to eject oldest entry. In practice, 36 // the number of total entries has not been shown to be large. 37 // 38 // An instance of PrepackedCache is always owned by a Context. Just like 39 // Context, no "thread safety" consideration is applicable to this class: 40 // no two threads may simulaneously be accessing the same instance. 41 class PrepackedCache final { 42 public: 43 enum class Action { kGotExistingEntry, kInsertedNewEntry }; 44 45 static constexpr int kDefaultMaxBuffersBytes = 1 << 28; 46 47 // A key in this key-value cache. Equality of keys implies interchangeable 48 // packed matrices, so we must be careful to make this Key type specific 49 // enough, and its equality comparison operator strict enough. 50 // 51 // These keys need to be used before the packed matrix buffers are allocated 52 // (since they are used to determine whether to allocate a new buffer). 53 // So they instead use the source matrix's data pointer. On the other hand, 54 // the packed matrix's layout structure is already available by the time we 55 // need to handle Keys, and that's fortunate because it is more specific 56 // than the source matrix's layout: it also encodes details about the kernel's 57 // small-scale block layout. In the past, we made the kernel function pointer 58 // part of the cache key, but all that is relevant here is the packed layout. 59 // 60 // The only other field that needs to be involved is the zero_point, for 61 // quantized matrices, although it seems far-fetched that the same matrix 62 // data would be reused with different zero_point values. 63 // 64 // The data types (PEMat::data_type and PEMat::sums_type) are omitted based on 65 // the "strict aliasing" model: each memory location should contain data of 66 // only one type. This could be relaxed in the future, by adding data types 67 // to this Key type, if a use case arises. 68 struct Key { 69 // The source matrix's data pointer. 70 const void* src_data; 71 // The packed matrix's layout, see PEMat::layout. 72 PMatLayout packed_layout; 73 // The packed matrix's zero point (for integer-quantized matrices only). 74 std::int32_t zero_point; 75 }; 76 77 friend bool operator==(const Key& a, const Key& b) { 78 return a.src_data == b.src_data && a.packed_layout == b.packed_layout && 79 a.zero_point == b.zero_point; 80 } 81 82 struct KeyHash { 83 std::size_t operator()(const Key&) const; 84 }; 85 86 // A dummy timestamp to associate to each entry (see struct Entry) to 87 // determine which entry is "least recently used" when ejecting entries. 88 // This is just an integer counter, not related to physical time. 89 // It does not need to be atomic because only one thread uses an instance 90 // of PrepackedCache at a time (see class comment). 91 using Timestamp = std::uint64_t; 92 93 struct Entry { 94 PEMat packed_matrix; 95 Timestamp timestamp; 96 }; 97 98 explicit PrepackedCache(int max_buffers_bytes = kDefaultMaxBuffersBytes) max_buffers_bytes_(max_buffers_bytes)99 : max_buffers_bytes_(max_buffers_bytes) {} 100 101 ~PrepackedCache(); 102 103 // Returns the total size in bytes of buffers held in this cache. BuffersBytes()104 std::ptrdiff_t BuffersBytes() const { return buffers_bytes_; } 105 106 // Returns the number of packed matrices held in this cache. MatrixCount()107 int MatrixCount() const { return cache_.size(); } 108 109 // This is the method by which new matrices are cached, and existing cache 110 // entries are queried. 111 // `src_data` is the source matrix data pointer. 112 // `packed_matrix` is a packed matrix structure where all fields have already 113 // been populated, except for the `data` and `sums` pointers which have not 114 // yet been allocated. 115 // 116 // This method: 117 // 1. Queries the cache for an entry matching the given `src_data` pointer and 118 // the relevant fields of `packed_matrix`, particularly its `layout`. 119 // 2. If a matching cache entry does not exist, it is created and inserted 120 // into the cache, and its `data` and `sums` buffers are allocated. 121 // 3. The `packed_matrix` has its `data` and `sums` pointers set to point 122 // to the allocated buffers. 123 // 4. The cache entry's timestamp is updated so it's the most recently used 124 // entry. 125 // 5. The return value is Action::kInsertedNewEntry if at step 2 a new 126 // entry was created. Otherwise it is Action::kGotExistingEntry. 127 Action Get(const void* src_data, PEMat* packed_matrix); 128 129 private: 130 void EjectOne(); 131 void EjectUntilRoomFor(std::ptrdiff_t new_bytes); 132 133 std::unordered_map<Key, Entry, KeyHash> cache_; 134 const std::ptrdiff_t max_buffers_bytes_; 135 std::ptrdiff_t buffers_bytes_ = 0; 136 Timestamp timestamp_ = 0; 137 }; 138 139 } // namespace ruy 140 141 #endif // RUY_RUY_PREPACKED_CACHE_H_ 142