// SPDX-License-Identifier: GPL-2.0-only
/*
 * Benchmark module for page_pool.
 *
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/interrupt.h>
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <net/page_pool/helpers.h>

#include "time_bench.h"

static int verbose = 1;
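/* Size of the page_pool's internal ptr_ring, i.e. max pages the pool holds */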
#define MY_POOL_SIZE 1024

/* Makes tests selectable. Useful for perf-record to analyze a single test.
 * Hint: Bash shells support writing binary numbers like: $((2#101010))
 *
 * # modprobe bench_page_pool_simple run_flags=$((2#100))
 */
static unsigned long run_flags = 0xFFFFFFFF;
module_param(run_flags, ulong, 0);
MODULE_PARM_DESC(run_flags, "Limit which bench tests get run");

/* Each enum value is the bit position in run_flags that enables that test */
enum benchmark_bit {
	bit_run_bench_baseline,
	bit_run_bench_no_softirq01,
	bit_run_bench_no_softirq02,
	bit_run_bench_no_softirq03,
};

#define bit(b)		(1 << (b))
#define enabled(b)	((run_flags & (bit(b))))
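
/* Example: run_flags=$((2#0001)) runs only the baseline tests, while
 * run_flags=$((2#0010)) selects only the (no-softirq) fast-path test.
 */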

/* Note: time_bench is limited to U32_MAX nr of loops */
static unsigned long loops = 10000000;
module_param(loops, ulong, 0);
MODULE_PARM_DESC(loops, "Number of loops each bench test will run");

/* When timing at the nanosec level, we need to know the overhead
 * introduced by the for loop itself
 */
static int time_bench_for_loop(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	int i;

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		loops_cnt++;
		barrier(); /* prevent compiler from optimizing out the loop */
	}
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}

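/* Measure the cost of a single atomic increment; page refcounts are atomic
 * operations, so this gives a baseline for the cheapest possible refcount op.
 */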
static int time_bench_atomic_inc(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	atomic_t cnt;
	int i;

	atomic_set(&cnt, 0);

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		atomic_inc(&cnt);
		barrier(); /* prevent compiler from optimizing out the loop */
	}
	loops_cnt = atomic_read(&cnt);
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}

/* The ptr_ring in page_pool uses a spinlock. We need to know the minimum
 * overhead of taking+releasing a spinlock, to know the cycles that can be saved
 * by e.g. amortizing this via bulking.
 */
static int time_bench_lock(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	spinlock_t lock;
	int i;

	spin_lock_init(&lock);

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		spin_lock(&lock);
		loops_cnt++;
		barrier(); /* prevent compiler from optimizing out the loop */
		spin_unlock(&lock);
	}
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}

/* Helper for filling some pages into the ptr_ring */
static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
{
	/* GFP_ATOMIC needed when run under softirq */
	gfp_t gfp_mask = GFP_ATOMIC;
	struct page **array;
	int i;

	array = kcalloc(elems, sizeof(struct page *), gfp_mask);
	if (!array)
		return;

	for (i = 0; i < elems; i++)
		array[i] = page_pool_alloc_pages(pp, gfp_mask);
	for (i = 0; i < elems; i++)
		page_pool_put_page(pp, array[i], -1, false);

	kfree(array);
}

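/* Return paths exercised by time_bench_page_pool():
 *  type_fast_path      - direct recycle into the pool's lockless alloc cache
 *  type_ptr_ring       - normal recycle path via the pool's ptr_ring
 *  type_page_allocator - no recycling; page is returned to the page allocator
 */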
enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };

/* Depends on the compiler optimizing this function, so that the unused
 * 'type' branches (including BUILD_BUG()) are eliminated.
 */
static int time_bench_page_pool(struct time_bench_record *rec, void *data,
				enum test_type type, const char *func)
{
	uint64_t loops_cnt = 0;
	gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */
	int i, err;

	struct page_pool *pp;
	struct page *page;

	struct page_pool_params pp_params = {
		.order = 0,
		.flags = 0,
		.pool_size = MY_POOL_SIZE,
		.nid = NUMA_NO_NODE,
		.dev = NULL, /* Only used for DMA mapping */
		.dma_dir = DMA_BIDIRECTIONAL,
	};

	pp = page_pool_create(&pp_params);
	if (IS_ERR(pp)) {
		err = PTR_ERR(pp);
		pr_warn("%s: Error(%d) creating page_pool\n", func, err);
		return 0;
	}
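	/* Pre-fill the pool's ptr_ring, so allocations in the benchmark loop
	 * below can be served by recycled pages rather than the page allocator.
	 */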
	pp_fill_ptr_ring(pp, 64);

	if (in_serving_softirq())
		pr_warn("%s(): in_serving_softirq fast-path\n", func);
	else
		pr_warn("%s(): Cannot use page_pool fast-path\n", func);

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		/* Common fast-path alloc that depends on in_serving_softirq() */
		page = page_pool_alloc_pages(pp, gfp_mask);
		if (!page)
			break;
		loops_cnt++;
		barrier(); /* prevent compiler from optimizing out the loop */

		/* The benchmark's purpose is to test the different return paths.
		 * The compiler should inline and optimize the other branches out.
		 */
		if (type == type_fast_path) {
			/* Fast-path recycling, e.g. the XDP_DROP use-case */
			page_pool_recycle_direct(pp, page);

		} else if (type == type_ptr_ring) {
			/* Normal return path */
			page_pool_put_page(pp, page, -1, false);

		} else if (type == type_page_allocator) {
			/* Test the case where pages are not recycled, but
			 * instead returned to the system page allocator
			 */
			get_page(page); /* elevate refcount to prevent recycling */
			page_pool_put_page(pp, page, -1, false);
			put_page(page);
		} else {
			BUILD_BUG();
		}
	}
	time_bench_stop(rec, loops_cnt);
	page_pool_destroy(pp);
	return loops_cnt;
}

static int time_bench_page_pool01_fast_path(struct time_bench_record *rec,
					    void *data)
{
	return time_bench_page_pool(rec, data, type_fast_path, __func__);
}

static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec,
					   void *data)
{
	return time_bench_page_pool(rec, data, type_ptr_ring, __func__);
}

static int time_bench_page_pool03_slow(struct time_bench_record *rec,
				       void *data)
{
	return time_bench_page_pool(rec, data, type_page_allocator, __func__);
}

static int run_benchmark_tests(void)
{
	uint32_t nr_loops = loops;

	/* Baseline tests */
	if (enabled(bit_run_bench_baseline)) {
		time_bench_loop(nr_loops * 10, 0, "for_loop", NULL,
				time_bench_for_loop);
		time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL,
				time_bench_atomic_inc);
		time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock);
	}

	/* These tests cannot activate the page_pool fast-path code path, as
	 * the module runs in no-softirq context
	 */
	if (enabled(bit_run_bench_no_softirq01))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL,
				time_bench_page_pool01_fast_path);
	if (enabled(bit_run_bench_no_softirq02))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL,
				time_bench_page_pool02_ptr_ring);
	if (enabled(bit_run_bench_no_softirq03))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL,
				time_bench_page_pool03_slow);

	return 0;
}
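
/* Example usage (sketch): load the module and read the results from the
 * kernel log; the exact reporting format is defined by time_bench.h.
 *
 *   modprobe bench_page_pool_simple loops=10000000 run_flags=$((2#0001))
 *   dmesg | tail
 */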

static int __init bench_page_pool_simple_module_init(void)
{
	if (verbose)
		pr_info("Loaded\n");

	if (loops > U32_MAX) {
		pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n", loops,
		       U32_MAX);
		return -ECHRNG;
	}

	run_benchmark_tests();

	return 0;
}
module_init(bench_page_pool_simple_module_init);

static void __exit bench_page_pool_simple_module_exit(void)
{
	if (verbose)
		pr_info("Unloaded\n");
}
module_exit(bench_page_pool_simple_module_exit);

MODULE_DESCRIPTION("Benchmark of page_pool simple cases");
MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@brouer.com>");
MODULE_LICENSE("GPL");