From 8cb87c0407cb55277d8b9aa50f0e29201b90a88d Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 17:01:46 -0500
Subject: [PATCH 01/14] nouveau/nvkm/subdev/clk/gk20a.c: fix wrong do_div()
 usage

do_div() must only be used with a u64 dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c
index 254094ab7fb8..5da2aa8cc333 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c
@@ -141,9 +141,8 @@ gk20a_pllg_calc_rate(struct gk20a_clk *clk)
 
 	rate = clk->parent_rate * clk->n;
 	divider = clk->m * pl_to_div[clk->pl];
-	do_div(rate, divider);
 
-	return rate / 2;
+	return rate / divider / 2;
 }
 
 static int

From 6a0078c8520697ab3e32180ede2df35e7630d94b Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 19:46:23 -0500
Subject: [PATCH 02/14] imx/clk-pllv1: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/clk/imx/clk-pllv1.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/clk/imx/clk-pllv1.c b/drivers/clk/imx/clk-pllv1.c
index 8564e4342c7d..82fe3662b5f6 100644
--- a/drivers/clk/imx/clk-pllv1.c
+++ b/drivers/clk/imx/clk-pllv1.c
@@ -52,7 +52,7 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw,
 		unsigned long parent_rate)
 {
 	struct clk_pllv1 *pll = to_clk_pllv1(hw);
-	long long ll;
+	unsigned long long ull;
 	int mfn_abs;
 	unsigned int mfi, mfn, mfd, pd;
 	u32 reg;
@@ -94,16 +94,16 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw,
 	rate = parent_rate * 2;
 	rate /= pd + 1;
 
-	ll = (unsigned long long)rate * mfn_abs;
+	ull = (unsigned long long)rate * mfn_abs;
 
-	do_div(ll, mfd + 1);
+	do_div(ull, mfd + 1);
 
 	if (mfn_is_negative(pll, mfn))
-		ll = -ll;
+		ull = (rate * mfi) - ull;
+	else
+		ull = (rate * mfi) + ull;
 
-	ll = (rate * mfi) + ll;
-
-	return ll;
+	return ull;
 }
 
 static struct clk_ops clk_pllv1_ops = {

From 4471f9a4db2d8dfdc79618502fafb1172ab40f05 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 20:01:40 -0500
Subject: [PATCH 03/14] imx/clk-pllv2: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/clk/imx/clk-pllv2.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/clk/imx/clk-pllv2.c b/drivers/clk/imx/clk-pllv2.c
index b18f875eac6a..4aeda56ce372 100644
--- a/drivers/clk/imx/clk-pllv2.c
+++ b/drivers/clk/imx/clk-pllv2.c
@@ -79,7 +79,7 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate,
 {
 	long mfi, mfn, mfd, pdf, ref_clk;
 	unsigned long dbl;
-	s64 temp;
+	u64 temp;
 
 	dbl = dp_ctl & MXC_PLL_DP_CTL_DPDCK0_2_EN;
 
@@ -98,8 +98,9 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate,
 	temp = (u64) ref_clk * abs(mfn);
 	do_div(temp, mfd + 1);
 	if (mfn < 0)
-		temp = -temp;
-	temp = (ref_clk * mfi) + temp;
+		temp = (ref_clk * mfi) - temp;
+	else
+		temp = (ref_clk * mfi) + temp;
 
 	return temp;
 }
@@ -126,7 +127,7 @@ static int __clk_pllv2_set_rate(unsigned long rate, unsigned long parent_rate,
 {
 	u32 reg;
 	long mfi, pdf, mfn, mfd = 999999;
-	s64 temp64;
+	u64 temp64;
 	unsigned long quad_parent_rate;
 
 	quad_parent_rate = 4 * parent_rate;

From 3ed9c824375b07a2acd6919d0fe7aa71650c7f69 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 21:49:15 -0500
Subject: [PATCH 04/14] tegra/clk-divider: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/clk/tegra/clk-divider.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/tegra/clk-divider.c b/drivers/clk/tegra/clk-divider.c
index 48c83efda4cf..16e0aee14773 100644
--- a/drivers/clk/tegra/clk-divider.c
+++ b/drivers/clk/tegra/clk-divider.c
@@ -32,7 +32,7 @@
 static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate,
 		   unsigned long parent_rate)
 {
-	s64 divider_ux1 = parent_rate;
+	u64 divider_ux1 = parent_rate;
 	u8 flags = divider->flags;
 	int mul;
 
@@ -54,7 +54,7 @@ static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate,
 
 	divider_ux1 -= mul;
 
-	if (divider_ux1 < 0)
+	if ((s64)divider_ux1 < 0)
 		return 0;
 
 	if (divider_ux1 > get_max_div(divider))

From e55791883f0a18734ac3d32eddf297b1610e248d Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 23:09:58 -0500
Subject: [PATCH 05/14] ti/clkt_dpll: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/clk/ti/clkt_dpll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c
index 9023ca9caf84..b5cc6f66ae5d 100644
--- a/drivers/clk/ti/clkt_dpll.c
+++ b/drivers/clk/ti/clkt_dpll.c
@@ -240,7 +240,7 @@ u8 omap2_init_dpll_parent(struct clk_hw *hw)
  */
 unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk)
 {
-	long long dpll_clk;
+	u64 dpll_clk;
 	u32 dpll_mult, dpll_div, v;
 	struct dpll_data *dd;
 
@@ -262,7 +262,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk)
 	dpll_div = v & dd->div1_mask;
 	dpll_div >>= __ffs(dd->div1_mask);
 
-	dpll_clk = (long long)clk_get_rate(dd->clk_ref) * dpll_mult;
+	dpll_clk = (u64)clk_get_rate(dd->clk_ref) * dpll_mult;
 	do_div(dpll_clk, dpll_div + 1);
 
 	return dpll_clk;

From 3db14288bcc1ef8b53c1ceef35d998530b630657 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 23:17:11 -0500
Subject: [PATCH 06/14] ti/fapll: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/clk/ti/fapll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/ti/fapll.c b/drivers/clk/ti/fapll.c
index f4b2e9888bdf..66a0d0ed8b55 100644
--- a/drivers/clk/ti/fapll.c
+++ b/drivers/clk/ti/fapll.c
@@ -168,7 +168,7 @@ static unsigned long ti_fapll_recalc_rate(struct clk_hw *hw,
 {
 	struct fapll_data *fd = to_fapll(hw);
 	u32 fapll_n, fapll_p, v;
-	long long rate;
+	u64 rate;
 
 	if (ti_fapll_clock_is_bypass(fd))
 		return parent_rate;
@@ -314,7 +314,7 @@ static unsigned long ti_fapll_synth_recalc_rate(struct clk_hw *hw,
 {
 	struct fapll_synth *synth = to_synth(hw);
 	u32 synth_div_m;
-	long long rate;
+	u64 rate;
 
 	/* The audio_pll_clk1 is hardwired to produce 32.768KiHz clock */
 	if (!synth->div)

From 8d43b49e7e0070f96ac46d30659a336c0224fa0b Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 17:01:46 -0500
Subject: [PATCH 07/14] hid-sensor-hub.c: fix wrong do_div() usage

do_div() must only be used with a u64 dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/hid/hid-sensor-hub.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c
index 92870cdb52d9..8efaa88329aa 100644
--- a/drivers/hid/hid-sensor-hub.c
+++ b/drivers/hid/hid-sensor-hub.c
@@ -218,7 +218,8 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
 		goto done_proc;
 	}
 
-	remaining_bytes = do_div(buffer_size, sizeof(__s32));
+	remaining_bytes = buffer_size % sizeof(__s32);
+	buffer_size = buffer_size / sizeof(__s32);
 	if (buffer_size) {
 		for (i = 0; i < buffer_size; ++i) {
 			hid_set_field(report->field[field_index], i,

From c24ca5be763a402c7fce0bd073c17de901ad5e38 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 23:09:58 -0500
Subject: [PATCH 08/14] drm/mgag200/mgag200_mode.c: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/gpu/drm/mgag200/mgag200_mode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c
index c99d3fe12881..1d4c480f5c9c 100644
--- a/drivers/gpu/drm/mgag200/mgag200_mode.c
+++ b/drivers/gpu/drm/mgag200/mgag200_mode.c
@@ -1564,7 +1564,7 @@ static uint32_t mga_vga_calculate_mode_bandwidth(struct drm_display_mode *mode,
 							int bits_per_pixel)
 {
 	uint32_t total_area, divisor;
-	int64_t active_area, pixels_per_second, bandwidth;
+	uint64_t active_area, pixels_per_second, bandwidth;
 	uint64_t bytes_per_pixel = (bits_per_pixel + 7) / 8;
 
 	divisor = 1024;

From 1c07db46511f0d2335d3b32008f644164071d13e Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Tue, 3 Nov 2015 23:09:58 -0500
Subject: [PATCH 09/14] mtd/sm_ftl.c: fix wrong do_div() usage

do_div() is meant to be used with an unsigned dividend.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 drivers/mtd/sm_ftl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index c23184a47fc4..b096f8bb05ba 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -206,9 +206,10 @@ static loff_t sm_mkoffset(struct sm_ftl *ftl, int zone, int block, int boffset)
 }
 
 /* Breaks offset into parts */
-static void sm_break_offset(struct sm_ftl *ftl, loff_t offset,
+static void sm_break_offset(struct sm_ftl *ftl, loff_t loffset,
 			    int *zone, int *block, int *boffset)
 {
+	u64 offset = loffset;
 	*boffset = do_div(offset, ftl->block_size);
 	*block = do_div(offset, ftl->max_lba);
 	*zone = offset >= ftl->zone_count ? -1 : offset;

From 911918aa7ef6f868c135505b0015e42072c54682 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Mon, 2 Nov 2015 14:55:05 -0500
Subject: [PATCH 10/14] div64.h: optimize do_div() for power-of-two constant
 divisors

Let's perform the obvious mask and shift operation in this case.

On 32-bit targets, gcc is able to do the same thing with a constant
divisor that happens to be a power of two i.e. it turns the division
into an inline shift, but it doesn't hurt to be explicit.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 include/asm-generic/div64.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 8f4e3193342e..5d974683193a 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -32,6 +32,8 @@
 
 #elif BITS_PER_LONG == 32
 
+#include <linux/log2.h>
+
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 
 /* The unnecessary pointer compare is there
@@ -41,7 +43,11 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 	uint32_t __base = (base);			\
 	uint32_t __rem;					\
 	(void)(((typeof((n)) *)0) == ((uint64_t *)0));	\
-	if (likely(((n) >> 32) == 0)) {			\
+	if (__builtin_constant_p(__base) &&		\
+	    is_power_of_2(__base)) {			\
+		__rem = (n) & (__base - 1);		\
+		(n) >>= ilog2(__base);			\
+	} else if (likely(((n) >> 32) == 0)) {		\
 		__rem = (uint32_t)(n) % __base;		\
 		(n) = (uint32_t)(n) / __base;		\
 	} else 						\

From 461a5e51060c93f5844113f4be9dba513cc92830 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 30 Oct 2015 15:36:39 -0400
Subject: [PATCH 11/14] do_div(): generic optimization for constant divisor on
 32-bit machines

64-by-32-bit divisions are prominent in the kernel, even on 32-bit
machines.  Luckily, many of them use a constant divisor that allows
for a much faster multiplication by the divisor's reciprocal.

The compiler already performs this optimization when compiling a 32-by-32
division with a constant divisor. Unfortunately, on 32-bit machines, gcc
does not optimize 64-by-32 divisions in that case, except for constant
divisors that happen to be a power of 2.

Let's avoid the slow path whenever the divisor is constant by manually
computing the reciprocal ourselves and performing the multiplication
inline.  In most cases, this improves performance of 64-by-32 divisions
by about two orders of magnitude compared to the __div64_32() fallback,
especially on architectures lacking a native div instruction.

The algorithm used here comes from the existing ARM code.

The __div64_const32_is_OK macro can be predefined by architectures to
disable this optimization in some cases. For example, some ancient gcc
version on ARM would crash with an ICE when fed this code.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Alexey Brodkin <abrodkin@synopsys.com>
---
 include/asm-generic/div64.h | 147 ++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 5d974683193a..5a1bf1aff502 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -4,6 +4,9 @@
  * Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
  * Based on former asm-ppc/div64.h and asm-m68knommu/div64.h
  *
+ * Optimization for constant divisors on 32-bit machines:
+ * Copyright (C) 2006-2015 Nicolas Pitre
+ *
  * The semantics of do_div() are:
  *
  * uint32_t do_div(uint64_t *n, uint32_t base)
@@ -34,6 +37,142 @@
 
 #include <linux/log2.h>
 
+/*
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications which ought to be much faster. And yet only if compiling
+ * with a sufficiently recent gcc version to perform proper 64-bit constant
+ * propagation.
+ *
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ */
+
+#ifndef __div64_const32_is_OK
+#define __div64_const32_is_OK (__GNUC__ >= 4)
+#endif
+
+#define __div64_const32(n, ___b)					\
+({									\
+	/*								\
+	 * Multiplication by reciprocal of b: n / b = n * (p / b) / p	\
+	 *								\
+	 * We rely on the fact that most of this code gets optimized	\
+	 * away at compile time due to constant propagation and only	\
+	 * a few multiplication instructions should remain.		\
+	 * Hence this monstrous macro (static inline doesn't always	\
+	 * do the trick here).						\
+	 */								\
+	uint64_t ___res, ___x, ___t, ___m, ___n = (n);			\
+	uint32_t ___p, ___bias, ___m_lo, ___m_hi, ___n_lo, ___n_hi;	\
+									\
+	/* determine MSB of b */					\
+	___p = 1 << ilog2(___b);					\
+									\
+	/* compute m = ((p << 64) + b - 1) / b */			\
+	___m = (~0ULL / ___b) * ___p;					\
+	___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b;	\
+									\
+	/* one less than the dividend with highest result */		\
+	___x = ~0ULL / ___b * ___b - 1;					\
+									\
+	/* test our ___m with res = m * x / (p << 64) */		\
+	___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32;	\
+	___t = ___res += (___m & 0xffffffff) * (___x >> 32);		\
+	___res += (___x & 0xffffffff) * (___m >> 32);			\
+	___t = (___res < ___t) ? (1ULL << 32) : 0;			\
+	___res = (___res >> 32) + ___t;					\
+	___res += (___m >> 32) * (___x >> 32);				\
+	___res /= ___p;							\
+									\
+	/* Now sanitize and optimize what we've got. */			\
+	if (~0ULL % (___b / (___b & -___b)) == 0) {			\
+		/* special case, can be simplified to ... */		\
+		___n /= (___b & -___b);					\
+		___m = ~0ULL / (___b / (___b & -___b));			\
+		___p = 1;						\
+		___bias = 1;						\
+	} else if (___res != ___x / ___b) {				\
+		/*							\
+		 * We can't get away without a bias to compensate	\
+		 * for bit truncation errors.  To avoid it we'd need an	\
+		 * additional bit to represent m which would overflow	\
+		 * a 64-bit variable.					\
+		 *							\
+		 * Instead we do m = p / b and n / b = (n * m + m) / p.	\
+		 */							\
+		___bias = 1;						\
+		/* Compute m = (p << 64) / b */				\
+		___m = (~0ULL / ___b) * ___p;				\
+		___m += ((~0ULL % ___b + 1) * ___p) / ___b;		\
+	} else {							\
+		/*							\
+		 * Reduce m / p, and try to clear bit 31 of m when	\
+		 * possible, otherwise that'll need extra overflow	\
+		 * handling later.					\
+		 */							\
+		uint32_t ___bits = -(___m & -___m);			\
+		___bits |= ___m >> 32;					\
+		___bits = (~___bits) << 1;				\
+		/*							\
+		 * If ___bits == 0 then setting bit 31 is  unavoidable.	\
+		 * Simply apply the maximum possible reduction in that	\
+		 * case. Otherwise the MSB of ___bits indicates the	\
+		 * best reduction we should apply.			\
+		 */							\
+		if (!___bits) {						\
+			___p /= (___m & -___m);				\
+			___m /= (___m & -___m);				\
+		} else {						\
+			___p >>= ilog2(___bits);			\
+			___m >>= ilog2(___bits);			\
+		}							\
+		/* No bias needed. */					\
+		___bias = 0;						\
+	}								\
+									\
+	/*								\
+	 * Now we have a combination of 2 conditions:			\
+	 *								\
+	 * 1) whether or not we need to apply a bias, and		\
+	 *								\
+	 * 2) whether or not there might be an overflow in the cross	\
+	 *    product determined by (___m & ((1 << 63) | (1 << 31))).	\
+	 *								\
+	 * Select the best way to do (m_bias + m * n) / (p << 64).	\
+	 * From now on there will be actual runtime code generated.	\
+	 */								\
+									\
+	___m_lo = ___m;							\
+	___m_hi = ___m >> 32;						\
+	___n_lo = ___n;							\
+	___n_hi = ___n >> 32;						\
+									\
+	if (!___bias) {							\
+		___res = ((uint64_t)___m_lo * ___n_lo) >> 32;		\
+	} else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {		\
+		___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32;	\
+	} else {							\
+		___res = ___m + (uint64_t)___m_lo * ___n_lo;		\
+		___t = (___res < ___m) ? (1ULL << 32) : 0;		\
+		___res = (___res >> 32) + ___t;				\
+	}								\
+									\
+	if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {			\
+		___res += (uint64_t)___m_lo * ___n_hi;			\
+		___res += (uint64_t)___m_hi * ___n_lo;			\
+		___res >>= 32;						\
+	} else {							\
+		___t = ___res += (uint64_t)___m_lo * ___n_hi;		\
+		___res += (uint64_t)___m_hi * ___n_lo;			\
+		___t = (___res < ___t) ? (1ULL << 32) : 0;		\
+		___res = (___res >> 32) + ___t;				\
+	}								\
+									\
+	___res += (uint64_t)___m_hi * ___n_hi;				\
+									\
+	___res /= ___p;							\
+})
+
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 
 /* The unnecessary pointer compare is there
@@ -47,6 +186,14 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 	    is_power_of_2(__base)) {			\
 		__rem = (n) & (__base - 1);		\
 		(n) >>= ilog2(__base);			\
+	} else if (__div64_const32_is_OK &&		\
+		   __builtin_constant_p(__base) &&	\
+		   __base != 0) {			\
+		uint32_t __res_lo, __n_lo = (n);	\
+		(n) = __div64_const32(n, __base);	\
+		/* the remainder can be computed with 32-bit regs */ \
+		__res_lo = (n);				\
+		__rem = __n_lo - __res_lo * __base;	\
 	} else if (likely(((n) >> 32) == 0)) {		\
 		__rem = (uint32_t)(n) % __base;		\
 		(n) = (uint32_t)(n) / __base;		\

From f682b27c57aec2f0ca8927f9bb7c267c6165ad5a Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 30 Oct 2015 17:54:56 -0400
Subject: [PATCH 12/14] __div64_const32(): abstract out the actual 128-bit
 cross product code

The default C implementation for the 128-bit cross product is abstracted
into the __arch_xprod_64() macro that can be overridden to let
architectures provide their own assembly optimized implementation.

There are many advantages to an assembly version for this operation.
Carry bit handling becomes trivial, and 32-bit shifts may be achieved
simply by inverting register pairs on some architectures.  This has the
potential to be quite faster and use much fewer instructions.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 include/asm-generic/div64.h | 81 +++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 30 deletions(-)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 5a1bf1aff502..408856a9aba1 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -63,7 +63,7 @@
 	 * do the trick here).						\
 	 */								\
 	uint64_t ___res, ___x, ___t, ___m, ___n = (n);			\
-	uint32_t ___p, ___bias, ___m_lo, ___m_hi, ___n_lo, ___n_hi;	\
+	uint32_t ___p, ___bias;						\
 									\
 	/* determine MSB of b */					\
 	___p = 1 << ilog2(___b);					\
@@ -138,41 +138,62 @@
 	 * 2) whether or not there might be an overflow in the cross	\
 	 *    product determined by (___m & ((1 << 63) | (1 << 31))).	\
 	 *								\
-	 * Select the best way to do (m_bias + m * n) / (p << 64).	\
+	 * Select the best way to do (m_bias + m * n) / (1 << 64).	\
 	 * From now on there will be actual runtime code generated.	\
 	 */								\
-									\
-	___m_lo = ___m;							\
-	___m_hi = ___m >> 32;						\
-	___n_lo = ___n;							\
-	___n_hi = ___n >> 32;						\
-									\
-	if (!___bias) {							\
-		___res = ((uint64_t)___m_lo * ___n_lo) >> 32;		\
-	} else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {		\
-		___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32;	\
-	} else {							\
-		___res = ___m + (uint64_t)___m_lo * ___n_lo;		\
-		___t = (___res < ___m) ? (1ULL << 32) : 0;		\
-		___res = (___res >> 32) + ___t;				\
-	}								\
-									\
-	if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {			\
-		___res += (uint64_t)___m_lo * ___n_hi;			\
-		___res += (uint64_t)___m_hi * ___n_lo;			\
-		___res >>= 32;						\
-	} else {							\
-		___t = ___res += (uint64_t)___m_lo * ___n_hi;		\
-		___res += (uint64_t)___m_hi * ___n_lo;			\
-		___t = (___res < ___t) ? (1ULL << 32) : 0;		\
-		___res = (___res >> 32) + ___t;				\
-	}								\
-									\
-	___res += (uint64_t)___m_hi * ___n_hi;				\
+	___res = __arch_xprod_64(___m, ___n, ___bias);			\
 									\
 	___res /= ___p;							\
 })
 
+#ifndef __arch_xprod_64
+/*
+ * Default C implementation for __arch_xprod_64()
+ *
+ * Prototype: uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
+ * Semantic:  retval = ((bias ? m : 0) + m * n) >> 64
+ *
+ * The product is a 128-bit value, scaled down to 64 bits.
+ * Assuming constant propagation to optimize away unused conditional code.
+ * Architectures may provide their own optimized assembly implementation.
+ */
+static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
+{
+	uint32_t m_lo = m;
+	uint32_t m_hi = m >> 32;
+	uint32_t n_lo = n;
+	uint32_t n_hi = n >> 32;
+	uint64_t res, tmp;
+
+	if (!bias) {
+		res = ((uint64_t)m_lo * n_lo) >> 32;
+	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		/* there can't be any overflow here */
+		res = (m + (uint64_t)m_lo * n_lo) >> 32;
+	} else {
+		res = m + (uint64_t)m_lo * n_lo;
+		tmp = (res < m) ? (1ULL << 32) : 0;
+		res = (res >> 32) + tmp;
+	}
+
+	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		/* there can't be any overflow here */
+		res += (uint64_t)m_lo * n_hi;
+		res += (uint64_t)m_hi * n_lo;
+		res >>= 32;
+	} else {
+		tmp = res += (uint64_t)m_lo * n_hi;
+		res += (uint64_t)m_hi * n_lo;
+		tmp = (res < tmp) ? (1ULL << 32) : 0;
+		res = (res >> 32) + tmp;
+	}
+
+	res += (uint64_t)m_hi * n_hi;
+
+	return res;
+}
+#endif
+
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 
 /* The unnecessary pointer compare is there

From dce1eb93b19b2a1a441708f51c97c4a554054d00 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Mon, 2 Nov 2015 13:02:46 -0500
Subject: [PATCH 13/14] __div64_32(): make it overridable at compile time

Some architectures may want to override the default implementation
at compile time to do things inline.  For example, ARM uses a
non-standard calling convention for better efficiency in this case.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 include/asm-generic/div64.h | 2 ++
 lib/div64.c                 | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 408856a9aba1..163f77999ea4 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -194,7 +194,9 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 }
 #endif
 
+#ifndef __div64_32
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+#endif
 
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
diff --git a/lib/div64.c b/lib/div64.c
index 62a698a432bc..7f345259c32f 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -13,7 +13,8 @@
  *
  * Code generated for this function might be very inefficient
  * for some CPUs. __div64_32() can be overridden by linking arch-specific
- * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S.
+ * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S
+ * or by defining a preprocessor macro in arch/include/asm/div64.h.
  */
 
 #include <linux/export.h>
@@ -23,6 +24,7 @@
 /* Not needed on 64bit architectures */
 #if BITS_PER_LONG == 32
 
+#ifndef __div64_32
 uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
 {
 	uint64_t rem = *n;
@@ -55,8 +57,8 @@ uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
 	*n = res;
 	return rem;
 }
-
 EXPORT_SYMBOL(__div64_32);
+#endif
 
 #ifndef div_s64_rem
 s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)

From 040b323b5012b5503561ec7fe15cccd6a4bcaec2 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Mon, 2 Nov 2015 14:20:41 -0500
Subject: [PATCH 14/14] ARM: asm/div64.h: adjust to generic codde

Now that the constant divisor optimization is made generic, adapt the
ARM case to it.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/include/asm/div64.h | 279 ++++++++++++-----------------------
 1 file changed, 91 insertions(+), 188 deletions(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 662c7bd06108..e1f07764b0d6 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -5,9 +5,9 @@
 #include <asm/compiler.h>
 
 /*
- * The semantics of do_div() are:
+ * The semantics of __div64_32() are:
  *
- * uint32_t do_div(uint64_t *n, uint32_t base)
+ * uint32_t __div64_32(uint64_t *n, uint32_t base)
  * {
  * 	uint32_t remainder = *n % base;
  * 	*n = *n / base;
@@ -16,8 +16,9 @@
  *
  * In other words, a 64-bit dividend with a 32-bit divisor producing
  * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
- * we call a special __do_div64 helper with completely non standard
- * calling convention for arguments and results (beware).
+ * we override the generic version in lib/div64.c to call our __do_div64
+ * assembly implementation with completely non standard calling convention
+ * for arguments and results (beware).
  */
 
 #ifdef __ARMEB__
@@ -28,199 +29,101 @@
 #define __xh "r1"
 #endif
 
-#define __do_div_asm(n, base)					\
-({								\
-	register unsigned int __base      asm("r4") = base;	\
-	register unsigned long long __n   asm("r0") = n;	\
-	register unsigned long long __res asm("r2");		\
-	register unsigned int __rem       asm(__xh);		\
-	asm(	__asmeq("%0", __xh)				\
-		__asmeq("%1", "r2")				\
-		__asmeq("%2", "r0")				\
-		__asmeq("%3", "r4")				\
-		"bl	__do_div64"				\
-		: "=r" (__rem), "=r" (__res)			\
-		: "r" (__n), "r" (__base)			\
-		: "ip", "lr", "cc");				\
-	n = __res;						\
-	__rem;							\
-})
+static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
+{
+	register unsigned int __base      asm("r4") = base;
+	register unsigned long long __n   asm("r0") = *n;
+	register unsigned long long __res asm("r2");
+	register unsigned int __rem       asm(__xh);
+	asm(	__asmeq("%0", __xh)
+		__asmeq("%1", "r2")
+		__asmeq("%2", "r0")
+		__asmeq("%3", "r4")
+		"bl	__do_div64"
+		: "=r" (__rem), "=r" (__res)
+		: "r" (__n), "r" (__base)
+		: "ip", "lr", "cc");
+	*n = __res;
+	return __rem;
+}
+#define __div64_32 __div64_32
 
-#if __GNUC__ < 4 || !defined(CONFIG_AEABI)
+#if !defined(CONFIG_AEABI)
+
+/*
+ * In OABI configurations, some uses of the do_div function
+ * cause gcc to run out of registers. To work around that,
+ * we can force the use of the out-of-line version for
+ * configurations that build a OABI kernel.
+ */
+#define do_div(n, base) __div64_32(&(n), base)
+
+#else
 
 /*
  * gcc versions earlier than 4.0 are simply too problematic for the
- * optimized implementation below. First there is gcc PR 15089 that
- * tend to trig on more complex constructs, spurious .global __udivsi3
- * are inserted even if none of those symbols are referenced in the
- * generated code, and those gcc versions are not able to do constant
- * propagation on long long values anyway.
+ * __div64_const32() code in asm-generic/div64.h. First there is
+ * gcc PR 15089 that tend to trig on more complex constructs, spurious
+ * .global __udivsi3 are inserted even if none of those symbols are
+ * referenced in the generated code, and those gcc versions are not able
+ * to do constant propagation on long long values anyway.
  */
-#define do_div(n, base) __do_div_asm(n, base)
 
-#elif __GNUC__ >= 4
+#define __div64_const32_is_OK (__GNUC__ >= 4)
 
-#include <asm/bug.h>
+static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+{
+	unsigned long long res;
+	unsigned int tmp = 0;
 
-/*
- * If the divisor happens to be constant, we determine the appropriate
- * inverse at compile time to turn the division into a few inline
- * multiplications instead which is much faster. And yet only if compiling
- * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
- * sufficiently recent to perform proper long long constant propagation.
- * (It is unfortunate that gcc doesn't perform all this internally.)
- */
-#define do_div(n, base)							\
-({									\
-	unsigned int __r, __b = (base);					\
-	if (!__builtin_constant_p(__b) || __b == 0 ||			\
-	    (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) {	\
-		/* non-constant divisor (or zero): slow path */		\
-		__r = __do_div_asm(n, __b);				\
-	} else if ((__b & (__b - 1)) == 0) {				\
-		/* Trivial: __b is constant and a power of 2 */		\
-		/* gcc does the right thing with this code.  */		\
-		__r = n;						\
-		__r &= (__b - 1);					\
-		n /= __b;						\
-	} else {							\
-		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
-		/* We rely on the fact that most of this code gets   */	\
-		/* optimized away at compile time due to constant    */	\
-		/* propagation and only a couple inline assembly     */	\
-		/* instructions should remain. Better avoid any      */	\
-		/* code construct that might prevent that.           */	\
-		unsigned long long __res, __x, __t, __m, __n = n;	\
-		unsigned int __c, __p, __z = 0;				\
-		/* preserve low part of n for reminder computation */	\
-		__r = __n;						\
-		/* determine number of bits to represent __b */		\
-		__p = 1 << __div64_fls(__b);				\
-		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
-		__m = (~0ULL / __b) * __p;				\
-		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
-		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
-		__x = ~0ULL / __b * __b - 1;				\
-		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
-		__res >>= 32;						\
-		__res += (__m & 0xffffffff) * (__x >> 32);		\
-		__t = __res;						\
-		__res += (__x & 0xffffffff) * (__m >> 32);		\
-		__t = (__res < __t) ? (1ULL << 32) : 0;			\
-		__res = (__res >> 32) + __t;				\
-		__res += (__m >> 32) * (__x >> 32);			\
-		__res /= __p;						\
-		/* Now sanitize and optimize what we've got. */		\
-		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
-			/* those cases can be simplified with: */	\
-			__n /= (__b & -__b);				\
-			__m = ~0ULL / (__b / (__b & -__b));		\
-			__p = 1;					\
-			__c = 1;					\
-		} else if (__res != __x / __b) {			\
-			/* We can't get away without a correction    */	\
-			/* to compensate for bit truncation errors.  */	\
-			/* To avoid it we'd need an additional bit   */	\
-			/* to represent __m which would overflow it. */	\
-			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
-			__c = 1;					\
-			/* Compute __m = (__p << 64) / __b */		\
-			__m = (~0ULL / __b) * __p;			\
-			__m += ((~0ULL % __b + 1) * __p) / __b;		\
-		} else {						\
-			/* Reduce __m/__p, and try to clear bit 31   */	\
-			/* of __m when possible otherwise that'll    */	\
-			/* need extra overflow handling later.       */	\
-			unsigned int __bits = -(__m & -__m);		\
-			__bits |= __m >> 32;				\
-			__bits = (~__bits) << 1;			\
-			/* If __bits == 0 then setting bit 31 is     */	\
-			/* unavoidable.  Simply apply the maximum    */	\
-			/* possible reduction in that case.          */	\
-			/* Otherwise the MSB of __bits indicates the */	\
-			/* best reduction we should apply.           */	\
-			if (!__bits) {					\
-				__p /= (__m & -__m);			\
-				__m /= (__m & -__m);			\
-			} else {					\
-				__p >>= __div64_fls(__bits);		\
-				__m >>= __div64_fls(__bits);		\
-			}						\
-			/* No correction needed. */			\
-			__c = 0;					\
-		}							\
-		/* Now we have a combination of 2 conditions:        */	\
-		/* 1) whether or not we need a correction (__c), and */	\
-		/* 2) whether or not there might be an overflow in   */	\
-		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
-		/* Select the best insn combination to perform the   */	\
-		/* actual __m * __n / (__p << 64) operation.         */	\
-		if (!__c) {						\
-			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"mov	%Q0, #0"			\
-				: "=&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
-			__res = __m;					\
-			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"mov	%Q0, #0"			\
-				: "+&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else {						\
-			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"cmn	%Q0, %Q1\n\t"			\
-				"adcs	%R0, %R0, %R1\n\t"		\
-				"adc	%Q0, %3, #0"			\
-				: "=&r" (__res)				\
-				: "r" (__m), "r" (__n), "r" (__z)	\
-				: "cc" );				\
-		}							\
-		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
-			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"		\
-				"umlal	%R0, %Q0, %Q1, %R2\n\t"		\
-				"mov	%R0, #0\n\t"			\
-				"umlal	%Q0, %R0, %R1, %R2"		\
-				: "+&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else {						\
-			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"		\
-				"umlal	%R0, %1, %Q2, %R3\n\t"		\
-				"mov	%R0, #0\n\t"			\
-				"adds	%Q0, %1, %Q0\n\t"		\
-				"adc	%R0, %R0, #0\n\t"		\
-				"umlal	%Q0, %R0, %R2, %R3"		\
-				: "+&r" (__res), "+&r" (__z)		\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		}							\
-		__res /= __p;						\
-		/* The reminder can be computed with 32-bit regs     */	\
-		/* only, and gcc is good at that.                    */	\
-		{							\
-			unsigned int __res0 = __res;			\
-			unsigned int __b0 = __b;			\
-			__r -= __res0 * __b0;				\
-		}							\
-		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
-		n = __res;						\
-	}								\
-	__r;								\
-})
+	if (!bias) {
+		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
+			"mov	%Q0, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		res = m;
+		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
+			"mov	%Q0, #0"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
+			"cmn	%Q0, %Q1\n\t"
+			"adcs	%R0, %R0, %R1\n\t"
+			"adc	%Q0, %3, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n), "r" (tmp)
+			: "cc");
+	}
 
-/* our own fls implementation to make sure constant propagation is fine */
-#define __div64_fls(bits)						\
-({									\
-	unsigned int __left = (bits), __nr = 0;				\
-	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
-	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
-	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
-	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
-	if (__left & 0x00000002) __nr +=  1;				\
-	__nr;								\
-})
+	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
+			"umlal	%R0, %Q0, %Q1, %R2\n\t"
+			"mov	%R0, #0\n\t"
+			"umlal	%Q0, %R0, %R1, %R2"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
+			"umlal	%R0, %1, %Q2, %R3\n\t"
+			"mov	%R0, #0\n\t"
+			"adds	%Q0, %1, %Q0\n\t"
+			"adc	%R0, %R0, #0\n\t"
+			"umlal	%Q0, %R0, %R2, %R3"
+			: "+&r" (res), "+&r" (tmp)
+			: "r" (m), "r" (n)
+			: "cc");
+	}
+
+	return res;
+}
+#define __arch_xprod_64 __arch_xprod_64
+
+#include <asm-generic/div64.h>
 
 #endif