1 /*
2  * Copyright (c) 2008-2012 Travis Geiselbrecht
3  *
4  * Use of this source code is governed by a MIT-style
5  * license that can be found in the LICENSE file or at
6  * https://opensource.org/licenses/MIT
7  */
8 #include <sys/types.h>
9 #include <stdio.h>
10 #include <rand.h>
11 #include <lk/err.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <app/tests.h>
15 #include <kernel/thread.h>
16 #include <kernel/mutex.h>
17 #include <kernel/semaphore.h>
18 #include <kernel/event.h>
19 #include <platform.h>
20 
21 const size_t BUFSIZE = (1024*1024);
22 const uint ITER = 1024;
23 
bench_set_overhead(void)24 __NO_INLINE static void bench_set_overhead(void) {
25     uint32_t *buf = malloc(BUFSIZE);
26     if (!buf) {
27         printf("failed to allocate buffer\n");
28         return;
29     }
30 
31     ulong count = arch_cycle_count();
32     for (uint i = 0; i < ITER; i++) {
33         __asm__ volatile("");
34     }
35     count = arch_cycle_count() - count;
36 
37     printf("took %lu cycles overhead to loop %u times\n", count, ITER);
38 
39     free(buf);
40 }
41 
bench_memset(void)42 __NO_INLINE static void bench_memset(void) {
43     void *buf = malloc(BUFSIZE);
44     if (!buf) {
45         printf("failed to allocate buffer\n");
46         return;
47     }
48 
49     ulong count = arch_cycle_count();
50     for (uint i = 0; i < ITER; i++) {
51         memset(buf, 0, BUFSIZE);
52     }
53     count = arch_cycle_count() - count;
54 
55     size_t total_bytes = BUFSIZE * ITER;
56     double bytes_cycle = total_bytes / (double)count;
57     printf("took %lu cycles to memset a buffer of size %zu %d times (%zu bytes), %f bytes/cycle\n",
58            count, BUFSIZE, ITER, total_bytes, bytes_cycle);
59 
60     free(buf);
61 }
62 
63 #define bench_cset(type) \
64 __NO_INLINE static void bench_cset_##type(void) \
65 { \
66     type *buf = malloc(BUFSIZE); \
67     if (!buf) { \
68         printf("failed to allocate buffer\n"); \
69         return; \
70     } \
71  \
72     ulong count = arch_cycle_count(); \
73     for (uint i = 0; i < ITER; i++) { \
74         for (uint j = 0; j < BUFSIZE / sizeof(*buf); j++) { \
75             buf[j] = 0; \
76         } \
77     } \
78     count = arch_cycle_count() - count; \
79  \
80     size_t total_bytes = BUFSIZE * ITER; \
81     double bytes_cycle = total_bytes / (double)count; \
82     printf("took %lu cycles to manually clear a buffer using wordsize %zu of size %zu %u times (%zu bytes), %f bytes/cycle\n", \
83            count, sizeof(*buf), BUFSIZE, ITER, total_bytes, bytes_cycle); \
84  \
85     free(buf); \
86 }
87 
88 bench_cset(uint8_t)
bench_cset(uint16_t)89 bench_cset(uint16_t)
90 bench_cset(uint32_t)
91 bench_cset(uint64_t)
92 
93 __NO_INLINE static void bench_cset_wide(void) {
94     uint32_t *buf = malloc(BUFSIZE);
95     if (!buf) {
96         printf("failed to allocate buffer\n");
97         return;
98     }
99 
100     ulong count = arch_cycle_count();
101     for (uint i = 0; i < ITER; i++) {
102         for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
103             buf[j*8] = 0;
104             buf[j*8+1] = 0;
105             buf[j*8+2] = 0;
106             buf[j*8+3] = 0;
107             buf[j*8+4] = 0;
108             buf[j*8+5] = 0;
109             buf[j*8+6] = 0;
110             buf[j*8+7] = 0;
111         }
112     }
113     count = arch_cycle_count() - count;
114 
115     size_t total_bytes = BUFSIZE * ITER;
116     double bytes_cycle = total_bytes / (double)count;
117     printf("took %lu cycles to manually clear a buffer of size %zu %d times 8 words at a time (%zu bytes), %f bytes/cycle\n",
118            count, BUFSIZE, ITER, total_bytes, bytes_cycle);
119 
120     free(buf);
121 }
122 
bench_memcpy(void)123 __NO_INLINE static void bench_memcpy(void) {
124     uint8_t *buf = malloc(BUFSIZE);
125     if (!buf) {
126         printf("failed to allocate buffer\n");
127         return;
128     }
129 
130     ulong count = arch_cycle_count();
131     for (uint i = 0; i < ITER; i++) {
132         memcpy(buf, buf + BUFSIZE / 2, BUFSIZE / 2);
133     }
134     count = arch_cycle_count() - count;
135 
136     size_t total_bytes = (BUFSIZE / 2) * ITER;
137     double bytes_cycle = total_bytes / (double)count;
138     printf("took %lu cycles to memcpy a buffer of size %zu %d times (%zu source bytes), %f source bytes/cycle\n",
139            count, BUFSIZE / 2, ITER, total_bytes, bytes_cycle);
140 
141     free(buf);
142 }
143 
144 #if ARCH_ARM
arm_bench_cset_stm(void)145 __NO_INLINE static void arm_bench_cset_stm(void) {
146     uint32_t *buf = malloc(BUFSIZE);
147     if (!buf) {
148         printf("failed to allocate buffer\n");
149         return;
150     }
151 
152     ulong count = arch_cycle_count();
153     for (uint i = 0; i < ITER; i++) {
154         for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
155             __asm__ volatile(
156                 "stm    %0, {r0-r7};"
157                 :: "r" (&buf[j*8])
158             );
159         }
160     }
161     count = arch_cycle_count() - count;
162 
163     size_t total_bytes = BUFSIZE * ITER;
164     double bytes_cycle = total_bytes / (float)count;
165     printf("took %lu cycles to manually clear a buffer of size %zu %d times 8 words at a time using stm (%zu bytes), %f bytes/cycle\n",
166            count, BUFSIZE, ITER, total_bytes, bytes_cycle);
167 
168     free(buf);
169 }
170 
171 #if       (__CORTEX_M >= 0x03)
arm_bench_multi_issue(void)172 __NO_INLINE static void arm_bench_multi_issue(void) {
173     ulong cycles;
174     uint32_t a = 0, b = 0, c = 0, d = 0, e = 0, f = 0, g = 0, h = 0;
175 #define ITER 1000000
176     uint count = ITER;
177     cycles = arch_cycle_count();
178     while (count--) {
179         asm volatile ("");
180         asm volatile ("add %0, %0, %0" : "=r" (a) : "r" (a));
181         asm volatile ("add %0, %0, %0" : "=r" (b) : "r" (b));
182         asm volatile ("and %0, %0, %0" : "=r" (c) : "r" (c));
183         asm volatile ("mov %0, %0" : "=r" (d) : "r" (d));
184         asm volatile ("orr %0, %0, %0" : "=r" (e) : "r" (e));
185         asm volatile ("add %0, %0, %0" : "=r" (f) : "r" (f));
186         asm volatile ("and %0, %0, %0" : "=r" (g) : "r" (g));
187         asm volatile ("mov %0, %0" : "=r" (h) : "r" (h));
188     }
189     cycles = arch_cycle_count() - cycles;
190 
191     double cycles_iter = (float)cycles / ITER;
192     printf("took %lu cycles to issue 8 integer ops (%f cycles/iteration)\n", cycles, cycles_iter);
193 #undef ITER
194 }
195 #endif // __CORTEX_M
196 #endif // ARCH_ARM
197 
198 #if WITH_LIB_LIBM
199 #include <math.h>
200 
bench_sincos(void)201 __NO_INLINE static void bench_sincos(void) {
202     printf("touching the floating point unit\n");
203     __UNUSED volatile double _hole = sin(0);
204     volatile float input_f = 1234567.0f;
205     volatile double input_d = 1234567.0;
206 
207     ulong count = arch_cycle_count();
208     __UNUSED volatile double d = sin(input_d);
209     count = arch_cycle_count() - count;
210     printf("took %lu cycles for sin()\n", count);
211 
212     count = arch_cycle_count();
213     d = cos(input_d);
214     count = arch_cycle_count() - count;
215     printf("took %lu cycles for cos()\n", count);
216 
217     count = arch_cycle_count();
218     __UNUSED volatile float f = sinf(input_f);
219     count = arch_cycle_count() - count;
220     printf("took %lu cycles for sinf()\n", count);
221 
222     count = arch_cycle_count();
223     f = cosf(input_f);
224     count = arch_cycle_count() - count;
225     printf("took %lu cycles for cosf()\n", count);
226 
227     count = arch_cycle_count();
228     d = sqrt(input_d);
229     count = arch_cycle_count() - count;
230     printf("took %lu cycles for sqrt()\n", count);
231 
232     count = arch_cycle_count();
233     f = sqrtf(input_f);
234     count = arch_cycle_count() - count;
235     printf("took %lu cycles for sqrtf()\n", count);
236 }
237 
238 #endif // WITH_LIB_LIBM
239 
benchmarks(int argc,const console_cmd_args * argv)240 int benchmarks(int argc, const console_cmd_args *argv) {
241     bench_set_overhead();
242     bench_memset();
243     bench_memcpy();
244 
245     bench_cset_uint8_t();
246     bench_cset_uint16_t();
247     bench_cset_uint32_t();
248     bench_cset_uint64_t();
249     bench_cset_wide();
250 
251 #if ARCH_ARM
252     arm_bench_cset_stm();
253 
254 #if       (__CORTEX_M >= 0x03)
255     arm_bench_multi_issue();
256 #endif
257 #endif
258 #if WITH_LIB_LIBM
259     bench_sincos();
260 #endif
261 
262     return NO_ERROR;
263 }
264 
265