diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/math/div64.c | 124 |
1 file changed, 85 insertions, 39 deletions
diff --git a/lib/math/div64.c b/lib/math/div64.c index bb57a48ce36a..d1e92ea24fce 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -190,7 +190,6 @@ EXPORT_SYMBOL(iter_div_u64_rem); #define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c) #if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64) - static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) { /* native 64x64=128 bits multiplication */ @@ -199,9 +198,7 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) *p_lo = prod; return prod >> 64; } - #else - static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) { /* perform a 64x64=128 bits multiplication in 32bit chunks */ @@ -216,12 +213,37 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) *p_lo = (y << 32) + (u32)x; return add_u64_u32(z, y >> 32); } +#endif + +#ifndef BITS_PER_ITER +#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16) +#endif + +#if BITS_PER_ITER == 32 +#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c) +#define add_u64_long(a, b) ((a) + (b)) +#else +#undef BITS_PER_ITER +#define BITS_PER_ITER 16 +static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) +{ + u64 n_lo = mul_add(a, b, c); + u64 n_med = mul_add(a >> 32, b, c >> 32); + + n_med = add_u64_u32(n_med, n_lo >> 32); + *p_lo = n_med << 32 | (u32)n_lo; + return n_med >> 32; +} +#define add_u64_long(a, b) add_u64_u32(a, b) #endif u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) { - u64 n_lo, n_hi; + unsigned long d_msig, q_digit; + unsigned int reps, d_z_hi; + u64 quotient, n_lo, n_hi; + u32 overflow; n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c); @@ -240,46 +262,70 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) return ~0ULL; } - int shift = __builtin_ctzll(d); - - /* try reducing the fraction in case the dividend becomes <= 64 bits */ - if ((n_hi >> shift) == 0) { - u64 n = shift ? 
(n_lo >> shift) | (n_hi << (64 - shift)) : n_lo; - - return div64_u64(n, d >> shift); - /* - * The remainder value if needed would be: - * res = div64_u64_rem(n, d >> shift, &rem); - * rem = (rem << shift) + (n_lo - (n << shift)); - */ + /* Left align the divisor, shifting the dividend to match */ + d_z_hi = __builtin_clzll(d); + if (d_z_hi) { + d <<= d_z_hi; + n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); + n_lo <<= d_z_hi; } - /* Do the full 128 by 64 bits division */ - - shift = __builtin_clzll(d); - d <<= shift; - - int p = 64 + shift; - u64 res = 0; - bool carry; + reps = 64 / BITS_PER_ITER; + /* Optimise loop count for small dividends */ + if (!(u32)(n_hi >> 32)) { + reps -= 32 / BITS_PER_ITER; + n_hi = n_hi << 32 | n_lo >> 32; + n_lo <<= 32; + } +#if BITS_PER_ITER == 16 + if (!(u32)(n_hi >> 48)) { + reps--; + n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); + n_lo <<= 16; + } +#endif - do { - carry = n_hi >> 63; - shift = carry ? 1 : __builtin_clzll(n_hi); - if (p < shift) - break; - p -= shift; - n_hi <<= shift; - n_hi |= n_lo >> (64 - shift); - n_lo <<= shift; - if (carry || (n_hi >= d)) { - n_hi -= d; - res |= 1ULL << p; + /* Invert the dividend so we can use add instead of subtract. */ + n_lo = ~n_lo; + n_hi = ~n_hi; + + /* + * Get the most significant BITS_PER_ITER bits of the divisor. + * This is used to get a low 'guestimate' of the quotient digit. + */ + d_msig = (d >> (64 - BITS_PER_ITER)) + 1; + + /* + * Now do a 'long division' with BITS_PER_ITER bit 'digits'. + * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits. + * The worst case is dividing ~0 by 0x8000 which requires two subtracts. 
+ */ + quotient = 0; + while (reps--) { + q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; + /* Shift 'n' left to align with the product q_digit * d */ + overflow = n_hi >> (64 - BITS_PER_ITER); + n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); + n_lo <<= BITS_PER_ITER; + /* Add product to negated divisor */ + overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); + /* Adjust for the q_digit 'guestimate' being low */ + while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) { + q_digit++; + n_hi += d; + overflow += n_hi < d; } - } while (n_hi); - /* The remainder value if needed would be n_hi << p */ + quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); + } - return res; + /* + * The above only ensures the remainder doesn't overflow, + * it can still be possible to add (aka subtract) another copy + * of the divisor. + */ + if ((n_hi + d) > n_hi) + quotient++; + return quotient; } #if !defined(test_mul_u64_add_u64_div_u64) EXPORT_SYMBOL(mul_u64_add_u64_div_u64); |
