/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PREPACKED_CACHE_H_
#define RUY_RUY_PREPACKED_CACHE_H_

#include <cstddef>
#include <cstdint>
#include <unordered_map>

#include "ruy/mat.h"

namespace ruy {

27 // "Low effort" Least Recently Used Cache for Prepacked Matrices
28 // A cache mechanism for prepacked matrices that ejects oldest entries.
29 // The implementation is "low effort" in the following ways:
30 //  - we just linearly search for the oldest entry when doing an ejection
31 //  - the ejection policy is very simple: if the new size would be above the
32 // .  threshold, we will eject entries until the size is below the threshold.
33 // Current use cases (RNNs with GEMV operations) indicate that ejection is rare
34 // and memory constraints are tight, so we devote no additional storage to the
35 // LRU mechanism and accept O(n) search to eject oldest entry. In practice,
36 // the number of total entries has not been shown to be large.
37 //
38 // An instance of PrepackedCache is always owned by a Context. Just like
39 // Context, no "thread safety" consideration is applicable to this class:
40 // no two threads may simulaneously be accessing the same instance.
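//
// To make the ejection policy above concrete, here is a minimal sketch of
// what "eject entries until the size is below the threshold" can look like,
// using the members declared below (EjectOne, cache_, buffers_bytes_,
// max_buffers_bytes_). This is for illustration only; it is not necessarily
// the exact out-of-line implementation:
//
//   void PrepackedCache::EjectUntilRoomFor(std::ptrdiff_t new_bytes) {
//     // Evict the least recently used entry (linear scan over timestamps)
//     // until the new buffer would fit within the byte budget.
//     while (!cache_.empty() &&
//            buffers_bytes_ + new_bytes > max_buffers_bytes_) {
//       EjectOne();
//     }
//   }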
class PrepackedCache final {
 public:
  enum class Action { kGotExistingEntry, kInsertedNewEntry };

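  // Default value for the constructor's max_buffers_bytes argument:
  // 1 << 28 bytes = 256 MiB of cached packed buffers.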
  static constexpr int kDefaultMaxBuffersBytes = 1 << 28;

  // A key in this key-value cache. Equality of keys implies interchangeable
  // packed matrices, so we must be careful to make this Key type specific
  // enough, and its equality comparison operator strict enough.
  //
  // These keys need to be used before the packed matrix buffers are allocated
  // (since they are used to determine whether to allocate a new buffer).
  // So they instead use the source matrix's data pointer. On the other hand,
  // the packed matrix's layout structure is already available by the time we
  // need to handle Keys, and that's fortunate because it is more specific
  // than the source matrix's layout: it also encodes details about the kernel's
  // small-scale block layout. In the past, we made the kernel function pointer
  // part of the cache key, but all that is relevant here is the packed layout.
  //
  // The only other field that needs to be involved is the zero_point, for
  // quantized matrices, although it seems far-fetched that the same matrix
  // data would be reused with different zero_point values.
  //
  // The data types (PEMat::data_type and PEMat::sums_type) are omitted based on
  // the "strict aliasing" model: each memory location should contain data of
  // only one type. This could be relaxed in the future, by adding data types
  // to this Key type, if a use case arises.
  struct Key {
    // The source matrix's data pointer.
    const void* src_data;
    // The packed matrix's layout, see PEMat::layout.
    PMatLayout packed_layout;
    // The packed matrix's zero point (for integer-quantized matrices only).
    std::int32_t zero_point;
  };

  friend bool operator==(const Key& a, const Key& b) {
    return a.src_data == b.src_data && a.packed_layout == b.packed_layout &&
           a.zero_point == b.zero_point;
  }

  struct KeyHash {
    std::size_t operator()(const Key&) const;
  };
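
  // KeyHash::operator() is only declared here and defined out of line. For
  // illustration only, a minimal sketch of such a hash could combine the three
  // Key fields, assuming a hypothetical helper HashPMatLayout that hashes the
  // packed layout (not part of this header):
  //
  //   std::size_t operator()(const Key& key) const {
  //     std::size_t h = std::hash<const void*>()(key.src_data);
  //     h = h * 31 + HashPMatLayout(key.packed_layout);  // hypothetical helper
  //     h = h * 31 + std::hash<std::int32_t>()(key.zero_point);
  //     return h;
  //   }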

  // A dummy timestamp associated with each entry (see struct Entry), used to
  // determine which entry is "least recently used" when ejecting entries.
  // This is just an integer counter, not related to physical time.
  // It does not need to be atomic because only one thread uses an instance
  // of PrepackedCache at a time (see class comment).
  using Timestamp = std::uint64_t;

  struct Entry {
    PEMat packed_matrix;
    Timestamp timestamp;
  };

  explicit PrepackedCache(int max_buffers_bytes = kDefaultMaxBuffersBytes)
      : max_buffers_bytes_(max_buffers_bytes) {}

  ~PrepackedCache();

  // Returns the total size in bytes of buffers held in this cache.
  std::ptrdiff_t BuffersBytes() const { return buffers_bytes_; }

  // Returns the number of packed matrices held in this cache.
  int MatrixCount() const { return cache_.size(); }

  // This is the method by which new matrices are cached, and existing cache
  // entries are queried.
  // `src_data` is the source matrix data pointer.
  // `packed_matrix` is a packed matrix structure where all fields have already
  // been populated, except for the `data` and `sums` pointers which have not
  // yet been allocated.
  //
  // This method:
  // 1. Queries the cache for an entry matching the given `src_data` pointer and
  //    the relevant fields of `packed_matrix`, particularly its `layout`.
  // 2. If a matching cache entry does not exist, it is created and inserted
  //    into the cache, and its `data` and `sums` buffers are allocated.
  // 3. The `packed_matrix` has its `data` and `sums` pointers set to point
  //    to the allocated buffers.
  // 4. The cache entry's timestamp is updated so it's the most recently used
  //    entry.
  // 5. The return value is Action::kInsertedNewEntry if at step 2 a new
  //    entry was created. Otherwise it is Action::kGotExistingEntry.
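  //
  // Example usage (illustrative sketch only; `cache`, `src` and
  // `packed_matrix` are hypothetical local names, and the code that fills in
  // the PEMat fields and performs the actual packing is elided):
  //
  //   PrepackedCache cache;
  //   PEMat packed_matrix;  // all fields set except the data/sums pointers
  //   if (cache.Get(src.data, &packed_matrix) ==
  //       Action::kInsertedNewEntry) {
  //     // Fresh buffers were allocated: pack the source matrix into them.
  //   }
  //   // Either way, packed_matrix now points to buffers owned by the cache.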
127   Action Get(const void* src_data, PEMat* packed_matrix);

 private:
  void EjectOne();
  void EjectUntilRoomFor(std::ptrdiff_t new_bytes);

  std::unordered_map<Key, Entry, KeyHash> cache_;
  const std::ptrdiff_t max_buffers_bytes_;
  std::ptrdiff_t buffers_bytes_ = 0;
  Timestamp timestamp_ = 0;
};

}  // namespace ruy

#endif  // RUY_RUY_PREPACKED_CACHE_H_