/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RECIPROCAL_DIV_H
#define _LINUX_RECIPROCAL_DIV_H
#include <linux/types.h>
/*
* This algorithm is based on the paper "Division by Invariant
* Integers Using Multiplication" by Torbjörn Granlund and Peter
* L. Montgomery.
*
* The assembler implementation from Agner Fog, which this code is
* based on, can be found here:
* http://www.agner.org/optimize/asmlib.zip
*
* This optimization for A/B is helpful if the divisor B is mostly
* runtime invariant. The reciprocal of B is calculated in the
* slow-path with reciprocal_value(). The fast-path can then just use
* a much faster multiplication operation with a variable dividend A
* to calculate the division A/B.
*/
/*
 * Precomputed reciprocal of a divisor, as consumed by reciprocal_divide():
 * @m:   32-bit multiplier; the dividend is multiplied by it and the high
 *       32 bits of the 64-bit product are kept.
 * @sh1: right shift applied to (dividend - high product word).
 * @sh2: final right shift that yields the quotient.
 */
struct reciprocal_value {
	u32 m;
	u8 sh1;
	u8 sh2;
};
/* "reciprocal_value" and "reciprocal_divide" together implement the basic
* version of the algorithm described in Figure 4.1 of the paper.
*/
/*
 * Slow path: precompute the reciprocal of the divisor @d. The returned value
 * is what reciprocal_divide() takes as its second argument for each later
 * division by @d. Only the declaration lives in this header.
 */
struct reciprocal_value reciprocal_value(u32 d);
/*
 * Fast path: compute the quotient of @a by the divisor whose reciprocal @R
 * was obtained from reciprocal_value(), using a widening multiplication,
 * add/subtract and shifts instead of a division.
 */
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
{
	/* Widen first so the full 64-bit product a * R.m is available. */
	u64 product = (u64)a * R.m;
	/* Only the high 32 bits of the product are used. */
	u32 hi = (u32)(product >> 32);
	/* hi <= a because R.m is below 2^32, so the subtraction cannot wrap. */
	u32 adjust = (a - hi) >> R.sh1;

	return (hi + adjust) >> R.sh2;
}
/*
 * Result of reciprocal_value_adv(). The pseudo code in the comment following
 * this structure shows how each field is consumed:
 * @m:         32-bit multiplier.
 * @sh:        right shift applied after the multiplication.
 * @exp:       exponent used by the power-of-two shortcut (result = n >> exp).
 * @is_wide_m: selects the longer subtract/shift/add sequence around the
 *             multiplication; presumably set when the full multiplier does
 *             not fit in 32 bits - confirm against the implementation.
 */
struct reciprocal_value_adv {
	u32 m;
	u8 sh;
	u8 exp;
	bool is_wide_m;
};
/* "reciprocal_value_adv" implements the advanced version of the algorithm
* described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
* ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
* exception case could be easily handled before calling "reciprocal_value_adv".
*
* The advanced version requires more complex calculation to get the reciprocal
* multiplier and other control variables, but then could reduce the required
* emulation operations.
*
* It makes no sense to use this advanced version for host divide emulation:
* the extra complexity of calculating the multiplier etc. could completely
* negate our saving on emulation operations.
*
* However, it makes sense to use it for JIT divide code generation, for which
* we are willing to trade performance of the host for that of the JITed code.
* As shown by the following pseudo code, the required emulation operations
* could go down from 6 (the basic version) to 3 or 4.
*
* To use the result of "reciprocal_value_adv", suppose we want to calculate
* n/d, the pseudo C code will be:
*
* struct reciprocal_value_adv rvalue;
* u8 pre_shift, exp;
*
* // handle exception case.
* if (d >= (1U << 31)) {
* result = n >= d;
* return;
* }
*
* rvalue = reciprocal_value_adv(d, 32);
* exp = rvalue.exp;
* if (rvalue.is_wide_m && !(d & 1)) {
* // floor(log2(d & (2^32 -d)))
* pre_shift = fls(d & -d) - 1;
* rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
* } else {
* pre_shift = 0;
* }
*
* // code generation starts.
* if (d == 1U << exp) {
* result = n >> exp;
* } else if (rvalue.is_wide_m) {
* // pre_shift must be zero when reached here.
* t = ((u64)n * rvalue.m) >> 32;
* result = n - t;
* result >>= 1;
* result += t;
* result >>= rvalue.sh - 1;
* } else {
* result = n;
* if (pre_shift)
* result >>= pre_shift;
* result = ((u64)result * rvalue.m) >> 32;
* result >>= rvalue.sh;
* }
*/
/*
 * @d:    divisor; divisors above (1U << 31) are not supported and must be
 *        handled by the caller before calling, as described above.
 * @prec: precision in bits; the pseudo code above passes 32, or
 *        32 - pre_shift when the divisor has been shifted right by pre_shift
 *        (presumably the number of significant dividend bits - confirm
 *        against the implementation).
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
#endif /* _LINUX_RECIPROCAL_DIV_H */