/* Taken from Linux arch/arm */
#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64

#include <asm/system.h>
#include <xen/types.h>

/*
 * The semantics of do_div() are:
 *
 * uint32_t do_div(uint64_t *n, uint32_t base)
 * {
 *     uint32_t remainder = *n % base;
 *     *n = *n / base;
 *     return remainder;
 * }
 *
 * In other words, a 64-bit dividend with a 32-bit divisor producing
 * a 64-bit result and a 32-bit remainder. To accomplish this optimally
 * we call a special __do_div64 helper with a completely non-standard
 * calling convention for arguments and results (beware).
 */

#if BITS_PER_LONG == 64

# define do_div(n,base) ({ \
    uint32_t __base = (base); \
    uint32_t __rem; \
    __rem = ((uint64_t)(n)) % __base; \
    (n) = ((uint64_t)(n)) / __base; \
    __rem; \
})

#elif BITS_PER_LONG == 32

#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif

#define __do_div_asm(n, base) \
({ \
    register unsigned int __base asm("r4") = base; \
    register unsigned long long __n asm("r0") = n; \
    register unsigned long long __res asm("r2"); \
    register unsigned int __rem asm(__xh); \
    asm( __asmeq("%0", __xh) \
         __asmeq("%1", "r2") \
         __asmeq("%2", "r0") \
         __asmeq("%3", "r4") \
         "bl __do_div64" \
         : "=r" (__rem), "=r" (__res) \
         : "r" (__n), "r" (__base) \
         : "ip", "lr", "cc"); \
    n = __res; \
    __rem; \
})

#if __GNUC__ < 4

/*
 * gcc versions earlier than 4.0 are simply too problematic for the
 * optimized implementation below. First, there is gcc PR 15089, which
 * tends to trigger on more complex constructs; spurious .global __udivsi3
 * directives are inserted even if none of those symbols are referenced in
 * the generated code; and those gcc versions are not able to do constant
 * propagation on long long values anyway.
 */
#define do_div(n, base) __do_div_asm(n, base)

#elif __GNUC__ >= 4

#include <asm/bug.h>

/*
 * If the divisor happens to be constant, we determine the appropriate
 * inverse at compile time to turn the division into a few inline
 * multiplications instead, which is much faster. This is done only when
 * compiling for ARMv4 or higher (we need umull/umlal) and when the gcc
 * version is sufficiently recent to perform proper long long constant
 * propagation. (It is unfortunate that gcc doesn't perform all this
 * internally.)
 */
#define do_div(n, base) \
({ \
    unsigned int __r, __b = (base); \
    if (!__builtin_constant_p(__b) || __b == 0) { \
        /* non-constant divisor (or zero): slow path */ \
        __r = __do_div_asm(n, __b); \
    } else if ((__b & (__b - 1)) == 0) { \
        /* Trivial: __b is constant and a power of 2 */ \
        /* gcc does the right thing with this code. */ \
        __r = n; \
        __r &= (__b - 1); \
        n /= __b; \
    } else { \
        /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
        /* We rely on the fact that most of this code gets */ \
        /* optimized away at compile time due to constant */ \
        /* propagation and only a couple of inline assembly */ \
        /* instructions should remain. Better avoid any */ \
        /* code construct that might prevent that. */ \
        unsigned long long __res, __x, __t, __m, __n = n; \
        unsigned int __c, __p, __z = 0; \
        /* preserve low part of n for remainder computation */ \
        __r = __n; \
        /* determine number of bits to represent __b */ \
        __p = 1 << __div64_fls(__b); \
        /* compute __m = ((__p << 64) + __b - 1) / __b */ \
        __m = (~0ULL / __b) * __p; \
        __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
        /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
        __x = ~0ULL / __b * __b - 1; \
        __res = (__m & 0xffffffff) * (__x & 0xffffffff); \
        __res >>= 32; \
        __res += (__m & 0xffffffff) * (__x >> 32); \
        __t = __res; \
        __res += (__x & 0xffffffff) * (__m >> 32); \
        __t = (__res < __t) ? (1ULL << 32) : 0; \
        __res = (__res >> 32) + __t; \
        __res += (__m >> 32) * (__x >> 32); \
        __res /= __p; \
        /* Now sanitize and optimize what we've got. */ \
        if (~0ULL % (__b / (__b & -__b)) == 0) { \
            /* those cases can be simplified with: */ \
            __n /= (__b & -__b); \
            __m = ~0ULL / (__b / (__b & -__b)); \
            __p = 1; \
            __c = 1; \
        } else if (__res != __x / __b) { \
            /* We can't get away without a correction */ \
            /* to compensate for bit truncation errors. */ \
            /* To avoid it we'd need an additional bit */ \
            /* to represent __m which would overflow it. */ \
            /* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
            __c = 1; \
            /* Compute __m = (__p << 64) / __b */ \
            __m = (~0ULL / __b) * __p; \
            __m += ((~0ULL % __b + 1) * __p) / __b; \
        } else { \
            /* Reduce __m/__p, and try to clear bit 31 */ \
            /* of __m when possible; otherwise that'll */ \
            /* need extra overflow handling later. */ \
            unsigned int __bits = -(__m & -__m); \
            __bits |= __m >> 32; \
            __bits = (~__bits) << 1; \
            /* If __bits == 0 then setting bit 31 is */ \
            /* unavoidable. Simply apply the maximum */ \
            /* possible reduction in that case. */ \
            /* Otherwise the MSB of __bits indicates the */ \
            /* best reduction we should apply. */ \
            if (!__bits) { \
                __p /= (__m & -__m); \
                __m /= (__m & -__m); \
            } else { \
                __p >>= __div64_fls(__bits); \
                __m >>= __div64_fls(__bits); \
            } \
            /* No correction needed. */ \
            __c = 0; \
        } \
        /* Now we have a combination of 2 conditions: */ \
        /* 1) whether or not we need a correction (__c), and */ \
        /* 2) whether or not there might be an overflow in */ \
        /*    the cross product (__m & ((1<<63) | (1<<31))) */ \
        /* Select the best insn combination to perform the */ \
        /* actual __m * __n / (__p << 64) operation. */ \
        if (!__c) { \
            asm ( "umull %Q0, %R0, %1, %Q2\n\t" \
                  "mov %Q0, #0" \
                  : "=&r" (__res) \
                  : "r" (__m), "r" (__n) \
                  : "cc" ); \
        } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
            __res = __m; \
            asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
                  "mov %Q0, #0" \
                  : "+&r" (__res) \
                  : "r" (__m), "r" (__n) \
                  : "cc" ); \
        } else { \
            asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
                  "cmn %Q0, %Q1\n\t" \
                  "adcs %R0, %R0, %R1\n\t" \
                  "adc %Q0, %3, #0" \
                  : "=&r" (__res) \
                  : "r" (__m), "r" (__n), "r" (__z) \
                  : "cc" ); \
        } \
        if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
            asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
                  "umlal %R0, %Q0, %Q1, %R2\n\t" \
                  "mov %R0, #0\n\t" \
                  "umlal %Q0, %R0, %R1, %R2" \
                  : "+&r" (__res) \
                  : "r" (__m), "r" (__n) \
                  : "cc" ); \
        } else { \
            asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
                  "umlal %R0, %1, %Q2, %R3\n\t" \
                  "mov %R0, #0\n\t" \
                  "adds %Q0, %1, %Q0\n\t" \
                  "adc %R0, %R0, #0\n\t" \
                  "umlal %Q0, %R0, %R2, %R3" \
                  : "+&r" (__res), "+&r" (__z) \
                  : "r" (__m), "r" (__n) \
                  : "cc" ); \
        } \
        __res /= __p; \
        /* The remainder can be computed with 32-bit regs */ \
        /* only, and gcc is good at that. */ \
        { \
            unsigned int __res0 = __res; \
            unsigned int __b0 = __b; \
            __r -= __res0 * __b0; \
        } \
        /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
        n = __res; \
    } \
    __r; \
})

/* our own fls implementation to make sure constant propagation is fine */
#define __div64_fls(bits) \
({ \
    unsigned int __left = (bits), __nr = 0; \
    if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
    if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
    if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
    if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
    if (__left & 0x00000002) __nr += 1; \
    __nr; \
})

#endif /* GCC version */

#endif /* BITS_PER_LONG */

#endif
/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */
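
/*
 * Illustrative usage sketch, following the style of the semantics comment
 * above. The caller code and variable names (ns, rem) are hypothetical and
 * not part of this header; do_div() takes the 64-bit dividend as an lvalue,
 * replaces it with the quotient, and returns the 32-bit remainder:
 *
 *     uint64_t ns = 2700000000ULL;             // 64-bit dividend
 *     uint32_t rem = do_div(ns, 1000000000U);  // divide by a 32-bit base
 *     // now ns == 2 (quotient) and rem == 700000000 (remainder)
 *
 * With a constant, non-power-of-2 base such as 1000000000U on a 32-bit
 * build, the reciprocal-multiplication path above is expected to apply;
 * a variable base falls back to __do_div_asm() and the __do_div64 helper.
 */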