#ifndef _X86_BITOPS_H
#define _X86_BITOPS_H

/*
 * Copyright 1992, Linus Torvalds.
 */

#include <asm/alternative.h>
#include <asm/asm_defns.h>
#include <asm/cpufeatureset.h>

/*
 * We specify the memory operand as both input and output because the memory
 * operand is both read from and written to. Since the operand is in fact a
 * word array, we also specify "memory" in the clobbers list to indicate that
 * words other than the one directly addressed by the memory operand may be
 * modified.
 */

#define ADDR (*(volatile int *) addr)
#define CONST_ADDR (*(const volatile int *) addr)
/**
 * set_bit - Atomically set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * This function is atomic and may not be reordered.  See __set_bit()
 * if you do not require the atomic guarantees.
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static inline void set_bit(int nr, volatile void *addr)
{
    asm volatile ( "lock btsl %1,%0"
                   : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define set_bit(nr, addr) ({                            \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    set_bit(nr, addr);                                  \
})

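/*
 * Illustrative sketch only (not part of the API above): using the atomic
 * helpers to publish and retire events in a shared bitmap.  The names
 * pending_events, mark_pending() and clear_pending() are hypothetical, and
 * clear_bit() is the atomic helper defined further down in this header.
 *
 *     static unsigned long pending_events[4];        (256 bits)
 *
 *     void mark_pending(unsigned int nr)
 *     {
 *         set_bit(nr, pending_events);               (atomic RMW)
 *     }
 *
 *     void clear_pending(unsigned int nr)
 *     {
 *         clear_bit(nr, pending_events);
 *     }
 */
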
/**
 * __set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Unlike set_bit(), this function is non-atomic and may be reordered.
 * If it's called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
 */
static inline void variable_set_bit(int nr, void *addr)
{
    asm volatile ( "btsl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
static inline void constant_set_bit(int nr, void *addr)
{
    ((unsigned int *)addr)[nr >> 5] |= (1u << (nr & 31));
}
#define __set_bit(nr, addr) ({                          \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    __builtin_constant_p(nr) ?                          \
        constant_set_bit(nr, addr) :                    \
        variable_set_bit(nr, addr);                     \
})

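/*
 * Illustrative sketch only: the non-atomic __set_bit()/__clear_bit()
 * variants are suitable when the caller already has exclusive access to the
 * bitmap, e.g. one that is still local or is protected by a lock the caller
 * holds.  init_mask() and mask are hypothetical names.
 *
 *     void init_mask(unsigned long *mask, unsigned int first,
 *                    unsigned int last)
 *     {
 *         unsigned int i;
 *
 *         for ( i = first; i <= last; i++ )
 *             __set_bit(i, mask);                    (no lock prefix needed)
 *     }
 */
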
/**
 * clear_bit - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * clear_bit() is atomic and may not be reordered.
 */
static inline void clear_bit(int nr, volatile void *addr)
{
    asm volatile ( "lock btrl %1,%0"
                   : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define clear_bit(nr, addr) ({                          \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    clear_bit(nr, addr);                                \
})

/**
 * __clear_bit - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * Unlike clear_bit(), this function is non-atomic and may be reordered.
 * If it's called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
 */
static inline void variable_clear_bit(int nr, void *addr)
{
    asm volatile ( "btrl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
static inline void constant_clear_bit(int nr, void *addr)
{
    ((unsigned int *)addr)[nr >> 5] &= ~(1u << (nr & 31));
}
#define __clear_bit(nr, addr) ({                        \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    __builtin_constant_p(nr) ?                          \
        constant_clear_bit(nr, addr) :                  \
        variable_clear_bit(nr, addr);                   \
})

/**
 * __change_bit - Toggle a bit in memory
 * @nr: the bit to toggle
 * @addr: the address to start counting from
 *
 * Unlike change_bit(), this function is non-atomic and may be reordered.
 * If it's called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
 */
static inline void variable_change_bit(int nr, void *addr)
{
    asm volatile ( "btcl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
static inline void constant_change_bit(int nr, void *addr)
{
    ((unsigned int *)addr)[nr >> 5] ^= (1u << (nr & 31));
}
#define __change_bit(nr, addr) ({                       \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    __builtin_constant_p(nr) ?                          \
        constant_change_bit(nr, addr) :                 \
        variable_change_bit(nr, addr);                  \
})

/**
 * change_bit - Toggle a bit in memory
 * @nr: Bit to toggle
 * @addr: Address to start counting from
 *
 * change_bit() is atomic and may not be reordered.
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static inline void change_bit(int nr, volatile void *addr)
{
    asm volatile ( "lock btcl %1,%0"
                    : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define change_bit(nr, addr) ({                         \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    change_bit(nr, addr);                               \
})

/**
 * test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is atomic and cannot be reordered.
 * It also implies a memory barrier.
 */
static inline int test_and_set_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "lock btsl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (ADDR) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define test_and_set_bit(nr, addr) ({                   \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    test_and_set_bit(nr, addr);                         \
})

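/*
 * Illustrative sketch only: test_and_set_bit() as a once-only guard.  Of all
 * racing callers, exactly one sees the bit transition from 0 to 1 and does
 * the work.  once_flags, SETUP_DONE_BIT and do_setup() are hypothetical.
 *
 *     static unsigned long once_flags;
 *
 *     if ( !test_and_set_bit(SETUP_DONE_BIT, &once_flags) )
 *         do_setup();
 */
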
/**
 * arch__test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is non-atomic and can be reordered.
 * If two instances of this operation race, one can appear to succeed
 * but actually fail.  You must protect multiple accesses with a lock.
 */
static inline int arch__test_and_set_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "btsl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (*(int *)addr) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define arch__test_and_set_bit arch__test_and_set_bit

/**
 * test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * This operation is atomic and cannot be reordered.
 * It also implies a memory barrier.
 */
static inline int test_and_clear_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "lock btrl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (ADDR) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define test_and_clear_bit(nr, addr) ({                 \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    test_and_clear_bit(nr, addr);                       \
})

/**
 * arch__test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * This operation is non-atomic and can be reordered.
 * If two instances of this operation race, one can appear to succeed
 * but actually fail.  You must protect multiple accesses with a lock.
 */
static inline int arch__test_and_clear_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "btrl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (*(int *)addr) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define arch__test_and_clear_bit arch__test_and_clear_bit

/* WARNING: non-atomic and it can be reordered! */
static inline int arch__test_and_change_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "btcl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (*(int *)addr) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define arch__test_and_change_bit arch__test_and_change_bit

/**
 * test_and_change_bit - Toggle a bit and return its old value
 * @nr: Bit to toggle
 * @addr: Address to count from
 *
 * This operation is atomic and cannot be reordered.
 * It also implies a memory barrier.
 */
static inline int test_and_change_bit(int nr, volatile void *addr)
{
    int oldbit;

    asm volatile ( "lock btcl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit),
                     [addr] "+m" (ADDR) : [nr] "Ir" (nr) : "memory" );

    return oldbit;
}
#define test_and_change_bit(nr, addr) ({                \
    if ( bitop_bad_size(addr) ) __bitop_bad_size();     \
    test_and_change_bit(nr, addr);                      \
})

static inline int variable_test_bit(int nr, const volatile void *addr)
{
    int oldbit;

    asm volatile ( "btl %[nr], %[addr]\n\t"
                   ASM_FLAG_OUT(, "sbbl %[old], %[old]\n\t")
                   : [old] ASM_FLAG_OUT("=@ccc", "=r") (oldbit)
                   : [addr] "m" (CONST_ADDR), [nr] "Ir" (nr) : "memory" );

    return oldbit;
}

#define arch_test_bit(nr, addr) ({                      \
    __builtin_constant_p(nr) ?                          \
        generic_test_bit(nr, addr) :                    \
        variable_test_bit(nr, addr);                    \
})

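/*
 * Illustrative sketch only, assuming the common bitops header's test_bit()
 * wrapper dispatches to arch_test_bit() as usual: inspecting a bit without
 * modifying the bitmap.  pending_events and handle_event() are hypothetical.
 *
 *     if ( test_bit(nr, pending_events) )
 *         handle_event(nr);
 */
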
extern unsigned int __find_first_bit(
    const unsigned long *addr, unsigned int size);
extern unsigned int __find_next_bit(
    const unsigned long *addr, unsigned int size, unsigned int offset);
extern unsigned int __find_first_zero_bit(
    const unsigned long *addr, unsigned int size);
extern unsigned int __find_next_zero_bit(
    const unsigned long *addr, unsigned int size, unsigned int offset);

static always_inline unsigned int __scanbit(unsigned long val, unsigned int max)
{
    if ( __builtin_constant_p(max) && max == BITS_PER_LONG )
        alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
                       "rep; bsf %[in],%[out]",
                       X86_FEATURE_BMI1,
                       [out] "=&r" (val),
                       [in] "r" (val), [max] "r" (max));
    else
        asm ( "bsf %1,%0 ; cmovz %2,%k0"
              : "=&r" (val) : "r" (val), "r" (max) );
    return (unsigned int)val;
}

/**
 * find_first_bit - find the first set bit in a memory region
 * @addr: The address to start the search at
 * @size: The maximum size to search
 *
 * Returns the bit-number of the first set bit, not the number of the byte
 * containing a bit.
 */
#define find_first_bit(addr, size) find_next_bit(addr, size, 0)

/**
 * find_next_bit - find the next set bit in a memory region
 * @addr: The address to base the search on
 * @size: The maximum size to search
 * @off: The bit number to start searching at
 */
#define find_next_bit(addr, size, off) ({                                   \
    unsigned int r__;                                                       \
    const unsigned long *a__ = (addr);                                      \
    unsigned int s__ = (size);                                              \
    unsigned int o__ = (off);                                               \
    if ( o__ >= s__ )                                                       \
        r__ = s__;                                                          \
    else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG )          \
        r__ = o__ + __scanbit(*(const unsigned long *)(a__) >> o__, s__);   \
    else if ( __builtin_constant_p(off) && !o__ )                           \
        r__ = __find_first_bit(a__, s__);                                   \
    else                                                                    \
        r__ = __find_next_bit(a__, s__, o__);                               \
    r__;                                                                    \
})

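/*
 * Illustrative sketch only: the usual pattern for iterating over all set
 * bits.  Both helpers return a value no smaller than @size when no (further)
 * set bit exists, so the "bit < nbits" check terminates the loop.  map,
 * nbits and process_bit() are hypothetical names.
 *
 *     unsigned int bit;
 *
 *     for ( bit = find_first_bit(map, nbits);
 *           bit < nbits;
 *           bit = find_next_bit(map, nbits, bit + 1) )
 *         process_bit(bit);
 */
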
/**
 * find_first_zero_bit - find the first zero bit in a memory region
 * @addr: The address to start the search at
 * @size: The maximum size to search
 *
 * Returns the bit-number of the first zero bit, not the number of the byte
 * containing a bit.
 */
#define find_first_zero_bit(addr, size) find_next_zero_bit(addr, size, 0)

/**
 * find_next_zero_bit - find the next zero bit in a memory region
 * @addr: The address to base the search on
 * @size: The maximum size to search
 * @off: The bit number to start searching at
 */
#define find_next_zero_bit(addr, size, off) ({                              \
    unsigned int r__;                                                       \
    const unsigned long *a__ = (addr);                                      \
    unsigned int s__ = (size);                                              \
    unsigned int o__ = (off);                                               \
    if ( o__ >= s__ )                                                       \
        r__ = s__;                                                          \
    else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG )          \
        r__ = o__ + __scanbit(~*(const unsigned long *)(a__) >> o__, s__);  \
    else if ( __builtin_constant_p(off) && !o__ )                           \
        r__ = __find_first_zero_bit(a__, s__);                              \
    else                                                                    \
        r__ = __find_next_zero_bit(a__, s__, o__);                          \
    r__;                                                                    \
})

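/*
 * Illustrative sketch only: allocating a free index from a usage bitmap.
 * The caller is assumed to hold a lock covering the bitmap, hence the
 * non-atomic __set_bit().  used, nbits and idx are hypothetical names.
 *
 *     idx = find_first_zero_bit(used, nbits);
 *     if ( idx >= nbits )
 *         return -ENOSPC;
 *     __set_bit(idx, used);
 */
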
static always_inline unsigned int arch_ffs(unsigned int x)
{
    unsigned int r;

    if ( __builtin_constant_p(x > 0) && x > 0 )
    {
        /*
         * A common code pattern is:
         *
         *     while ( bits )
         *     {
         *         bit = ffs(bits);
         *         ...
         *
         * and the optimiser really can work with the knowledge of x being
         * non-zero without knowing its exact value, in which case we don't
         * need to compensate for BSF's corner cases.  Otherwise...
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    }
    else
    {
        /*
         * ... the AMD manual states that BSF won't modify the destination
         * register if x=0.  The Intel manual states that the result is
         * undefined, but the architects have said that the register is
         * written back with its old value (zero extended as normal).
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );
    }

    return r + 1;
}
#define arch_ffs arch_ffs

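/*
 * Illustrative examples of the 1-based ffs() convention implemented above
 * (0 means "no bit set"):
 *
 *     arch_ffs(0)    == 0
 *     arch_ffs(1)    == 1    (lowest set bit is bit 0)
 *     arch_ffs(0x18) == 4    (lowest set bit is bit 3)
 */
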
static always_inline unsigned int arch_ffsl(unsigned long x)
{
    unsigned int r;

    /* See arch_ffs() for safety discussions. */
    if ( __builtin_constant_p(x > 0) && x > 0 )
        asm ( "bsf %[val], %q[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    else
        asm ( "bsf %[val], %q[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );

    return r + 1;
}
#define arch_ffsl arch_ffsl

static always_inline unsigned int arch_fls(unsigned int x)
{
    unsigned int r;

    /* See arch_ffs() for safety discussions. */
    if ( __builtin_constant_p(x > 0) && x > 0 )
        asm ( "bsr %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    else
        asm ( "bsr %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );

    return r + 1;
}
#define arch_fls arch_fls

static always_inline unsigned int arch_flsl(unsigned long x)
{
    unsigned int r;

    /* See arch_ffs() for safety discussions. */
    if ( __builtin_constant_p(x > 0) && x > 0 )
        asm ( "bsr %[val], %q[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    else
        asm ( "bsr %[val], %q[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );

    return r + 1;
}
#define arch_flsl arch_flsl

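/*
 * Illustrative examples of the 1-based fls() convention (0 means "no bit
 * set"); for non-zero x, fls(x) is also the number of bits needed to
 * represent x:
 *
 *     arch_fls(0)    == 0
 *     arch_fls(1)    == 1
 *     arch_fls(0x18) == 5    (highest set bit is bit 4)
 */
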
unsigned int arch_generic_hweightl(unsigned long x);

static always_inline unsigned int arch_hweightl(unsigned long x)
{
    unsigned int r;

    /*
     * arch_generic_hweightl() is written in ASM in order to preserve all
     * registers, as the compiler can't see the call.
     *
     * This limits the POPCNT instruction to using the same ABI as a function
     * call (input in %rdi, output in %eax) but that's fine.
     *
     * On Intel CPUs prior to Cannon Lake, the POPCNT instruction has a false
     * input dependency on its destination register (errata HSD146, SKL029
     * amongst others), impacting loops such as bitmap_weight().  Insert an
     * XOR to manually break the dependency.
     */
    alternative_io("call arch_generic_hweightl",
                   "xor %k[res], %k[res]\n\t"
                   "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
                   ASM_OUTPUT2([res] "=&a" (r) ASM_CALL_CONSTRAINT),
                   [val] "D" (x));

    return r;
}
#define arch_hweightl arch_hweightl

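/*
 * Illustrative examples only: arch_hweightl() returns the population count
 * (number of set bits) of its argument, which is what higher-level helpers
 * such as bitmap_weight() sum per word.
 *
 *     arch_hweightl(0)         == 0
 *     arch_hweightl(0xff)      == 8
 *     arch_hweightl(1UL << 63) == 1
 */
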
#endif /* _X86_BITOPS_H */