sparc32: Kill off software 32-bit multiply/divide routines.

For the explicit calls to .udiv/.umul in assembler, I made a mechanical (read as: safe) transformation. I didn't attempt to make any simplifications. In particular, __ndelay and __udelay can be simplified significantly. Some of the %y reads are unnecessary and these routines have no need any longer for allocating a register window, they can be leaf functions. Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-15 11:23:01 -07:00 · 2012-05-15 11:23:01 -07:00 · 1b35a57b1c
parent 2119ff6d2b
commit 1b35a57b1c
17 changed files with 24 additions and 2129 deletions
--- a/arch/sparc/kernel/entry.S
+++ b/arch/sparc/kernel/entry.S
@ -1161,11 +1161,13 @@ fpload:
 	.globl	__ndelay
 __ndelay:
 	save	%sp, -STACKFRAME_SZ, %sp
-	mov	%i0, %o0
-	call	.umul			! round multiplier up so large ns ok
-	 mov	0x1ae, %o1		! 2**32 / (1 000 000 000 / HZ)
-	call	.umul
-	 mov	%i1, %o1		! udelay_val
+	mov	%i0, %o0		! round multiplier up so large ns ok
+	mov	0x1ae, %o1		! 2**32 / (1 000 000 000 / HZ)
+	umul	%o0, %o1, %o0
+	rd	%y, %o1
+	mov	%i1, %o1		! udelay_val
+	umul	%o0, %o1, %o0
+	rd	%y, %o1
 	ba	delay_continue
 	 mov	%o1, %o0		! >>32 later for better resolution

@ -1174,18 +1176,21 @@ __udelay:
 	save	%sp, -STACKFRAME_SZ, %sp
 	mov	%i0, %o0
 	sethi	%hi(0x10c7), %o1	! round multiplier up so large us ok
-	call	.umul
-	 or	%o1, %lo(0x10c7), %o1	! 2**32 / 1 000 000
-	call	.umul
-	 mov	%i1, %o1		! udelay_val
+	or	%o1, %lo(0x10c7), %o1	! 2**32 / 1 000 000
+	umul	%o0, %o1, %o0
+	rd	%y, %o1
+	mov	%i1, %o1		! udelay_val
+	umul	%o0, %o1, %o0
+	rd	%y, %o1
 	sethi	%hi(0x028f4b62), %l0	! Add in rounding constant * 2**32,
 	or	%g0, %lo(0x028f4b62), %l0
 	addcc	%o0, %l0, %o0		! 2**32 * 0.009 999
 	bcs,a	3f
 	 add	%o1, 0x01, %o1
 3:
-	call	.umul
-	 mov	HZ, %o0			! >>32 earlier for wider range
+	mov	HZ, %o0			! >>32 earlier for wider range
+	umul	%o0, %o1, %o0
+	rd	%y, %o1

 delay_continue:
 	cmp	%o0, 0x0
--- a/arch/sparc/kernel/head_32.S
+++ b/arch/sparc/kernel/head_32.S
@ -746,51 +746,6 @@ sun4d_init:
 	/* Fall through to sun4m_init */

 sun4m_init:
-
-#define PATCH_IT(dst, src)	\
-	set	(dst), %g5;	\
-	set	(src), %g4;	\
-	ld	[%g4], %g3;	\
-	st	%g3, [%g5];	\
-	ld	[%g4+0x4], %g3;	\
-	st	%g3, [%g5+0x4];
-
-	/* Signed multiply. */
-	PATCH_IT(.mul, .mul_patch)
-	PATCH_IT(.mul+0x08, .mul_patch+0x08)
-
-	/* Signed remainder. */
-	PATCH_IT(.rem, .rem_patch)
-	PATCH_IT(.rem+0x08, .rem_patch+0x08)
-	PATCH_IT(.rem+0x10, .rem_patch+0x10)
-	PATCH_IT(.rem+0x18, .rem_patch+0x18)
-	PATCH_IT(.rem+0x20, .rem_patch+0x20)
-	PATCH_IT(.rem+0x28, .rem_patch+0x28)
-
-	/* Signed division. */
-	PATCH_IT(.div, .div_patch)
-	PATCH_IT(.div+0x08, .div_patch+0x08)
-	PATCH_IT(.div+0x10, .div_patch+0x10)
-	PATCH_IT(.div+0x18, .div_patch+0x18)
-	PATCH_IT(.div+0x20, .div_patch+0x20)
-
-	/* Unsigned multiply. */
-	PATCH_IT(.umul, .umul_patch)
-	PATCH_IT(.umul+0x08, .umul_patch+0x08)
-
-	/* Unsigned remainder. */
-	PATCH_IT(.urem, .urem_patch)
-	PATCH_IT(.urem+0x08, .urem_patch+0x08)
-	PATCH_IT(.urem+0x10, .urem_patch+0x10)
-	PATCH_IT(.urem+0x18, .urem_patch+0x18)
-
-	/* Unsigned division. */
-	PATCH_IT(.udiv, .udiv_patch)
-	PATCH_IT(.udiv+0x08, .udiv_patch+0x08)
-	PATCH_IT(.udiv+0x10, .udiv_patch+0x10)
-
-#undef PATCH_IT
-
 /* Ok, the PROM could have done funny things and apple cider could still
 * be sitting in the fault status/address registers.  Read them all to
 * clear them so we don't get magic faults later on.
--- a/arch/sparc/kernel/kernel.h
+++ b/arch/sparc/kernel/kernel.h
@ -32,9 +32,6 @@ extern void cpu_probe(void);
 /* traps_32.c */
 extern void handle_hw_divzero(struct pt_regs *regs, unsigned long pc,
                              unsigned long npc, unsigned long psr);
-/* muldiv.c */
-extern int do_user_muldiv (struct pt_regs *, unsigned long);
-
 /* irq_32.c */
 extern struct irqaction static_irqaction[];
 extern int static_irq_count;
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@ -32,26 +32,11 @@ static void *module_map(unsigned long size)
 				GFP_KERNEL, PAGE_KERNEL, -1,
 				__builtin_return_address(0));
 }
-
-static char *dot2underscore(char *name)
-{
-	return name;
-}
 #else
 static void *module_map(unsigned long size)
 {
 	return vmalloc(size);
 }
-
-/* Replace references to .func with _Func */
-static char *dot2underscore(char *name)
-{
-	if (name[0] == '.') {
-		name[0] = '_';
-                name[1] = toupper(name[1]);
-	}
-	return name;
-}
 #endif /* CONFIG_SPARC64 */

 void *module_alloc(unsigned long size)
@ -93,12 +78,8 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,

 	for (i = 1; i < sechdrs[symidx].sh_size / sizeof(Elf_Sym); i++) {
 		if (sym[i].st_shndx == SHN_UNDEF) {
-			if (ELF_ST_TYPE(sym[i].st_info) == STT_REGISTER) {
+			if (ELF_ST_TYPE(sym[i].st_info) == STT_REGISTER)
 				sym[i].st_shndx = SHN_ABS;
-			} else {
-				char *name = strtab + sym[i].st_name;
-				dot2underscore(name);
-			}
 		}
 	}
 	return 0;
--- a/arch/sparc/kernel/muldiv.c
+++ b/arch/sparc/kernel/muldiv.c
@ -1,238 +0,0 @@
-/*
- * muldiv.c: Hardware multiply/division illegal instruction trap
- *		for sun4c/sun4 (which do not have those instructions)
- *
- * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
- *
- * 2004-12-25	Krzysztof Helt (krzysztof.h1@wp.pl) 
- *		- fixed registers constrains in inline assembly declarations
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <asm/ptrace.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-
-#include "kernel.h"
-
-/* #define DEBUG_MULDIV */
-
-static inline int has_imm13(int insn)
-{
-	return (insn & 0x2000);
-}
-
-static inline int is_foocc(int insn)
-{
-	return (insn & 0x800000);
-}
-
-static inline int sign_extend_imm13(int imm)
-{
-	return imm << 19 >> 19;
-}
-
-static inline void advance(struct pt_regs *regs)
-{
-	regs->pc   = regs->npc;
-	regs->npc += 4;
-}
-
-static inline void maybe_flush_windows(unsigned int rs1, unsigned int rs2,
-				       unsigned int rd)
-{
-	if(rs2 >= 16 || rs1 >= 16 || rd >= 16) {
-		/* Wheee... */
-		__asm__ __volatile__("save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "save %sp, -0x40, %sp\n\t"
-				     "restore; restore; restore; restore;\n\t"
-				     "restore; restore; restore;\n\t");
-	}
-}
-
-#define fetch_reg(reg, regs) ({						\
-	struct reg_window32 __user *win;					\
-	register unsigned long ret;					\
-									\
-	if (!(reg)) ret = 0;						\
-	else if ((reg) < 16) {						\
-		ret = regs->u_regs[(reg)];				\
-	} else {							\
-		/* Ho hum, the slightly complicated case. */		\
-		win = (struct reg_window32 __user *)regs->u_regs[UREG_FP];\
-		if (get_user (ret, &win->locals[(reg) - 16])) return -1;\
-	}								\
-	ret;								\
-})
-
-static inline int
-store_reg(unsigned int result, unsigned int reg, struct pt_regs *regs)
-{
-	struct reg_window32 __user *win;
-
-	if (!reg)
-		return 0;
-	if (reg < 16) {
-		regs->u_regs[reg] = result;
-		return 0;
-	} else {
-		/* need to use put_user() in this case: */
-		win = (struct reg_window32 __user *) regs->u_regs[UREG_FP];
-		return (put_user(result, &win->locals[reg - 16]));
-	}
-}
-
-/* Should return 0 if mul/div emulation succeeded and SIGILL should
- * not be issued.
- */
-int do_user_muldiv(struct pt_regs *regs, unsigned long pc)
-{
-	unsigned int insn;
-	int inst;
-	unsigned int rs1, rs2, rdv;
-
-	if (!pc)
-		return -1; /* This happens to often, I think */
-	if (get_user (insn, (unsigned int __user *)pc))
-		return -1;
-	if ((insn & 0xc1400000) != 0x80400000)
-		return -1;
-	inst = ((insn >> 19) & 0xf);
-	if ((inst & 0xe) != 10 && (inst & 0xe) != 14)
-		return -1;
-
-	/* Now we know we have to do something with umul, smul, udiv or sdiv */
-	rs1 = (insn >> 14) & 0x1f;
-	rs2 = insn & 0x1f;
-	rdv = (insn >> 25) & 0x1f;
-	if (has_imm13(insn)) {
-		maybe_flush_windows(rs1, 0, rdv);
-		rs2 = sign_extend_imm13(insn);
-	} else {
-		maybe_flush_windows(rs1, rs2, rdv);
-		rs2 = fetch_reg(rs2, regs);
-	}
-	rs1 = fetch_reg(rs1, regs);
-	switch (inst) {
-	case 10: /* umul */
-#ifdef DEBUG_MULDIV	
-		printk ("unsigned muldiv: 0x%x * 0x%x = ", rs1, rs2);
-#endif		
-		__asm__ __volatile__ ("\n\t"
-			"mov	%0, %%o0\n\t"
-			"call	.umul\n\t"
-			" mov	%1, %%o1\n\t"
-			"mov	%%o0, %0\n\t"
-			"mov	%%o1, %1\n\t"
-			: "=r" (rs1), "=r" (rs2)
-		        : "0" (rs1), "1" (rs2)
-			: "o0", "o1", "o2", "o3", "o4", "o5", "o7", "cc");
-#ifdef DEBUG_MULDIV
-		printk ("0x%x%08x\n", rs2, rs1);
-#endif
-		if (store_reg(rs1, rdv, regs))
-			return -1;
-		regs->y = rs2;
-		break;
-	case 11: /* smul */
-#ifdef DEBUG_MULDIV
-		printk ("signed muldiv: 0x%x * 0x%x = ", rs1, rs2);
-#endif
-		__asm__ __volatile__ ("\n\t"
-			"mov	%0, %%o0\n\t"
-			"call	.mul\n\t"
-			" mov	%1, %%o1\n\t"
-			"mov	%%o0, %0\n\t"
-			"mov	%%o1, %1\n\t"
-			: "=r" (rs1), "=r" (rs2)
-		        : "0" (rs1), "1" (rs2)
-			: "o0", "o1", "o2", "o3", "o4", "o5", "o7", "cc");
-#ifdef DEBUG_MULDIV
-		printk ("0x%x%08x\n", rs2, rs1);
-#endif
-		if (store_reg(rs1, rdv, regs))
-			return -1;
-		regs->y = rs2;
-		break;
-	case 14: /* udiv */
-#ifdef DEBUG_MULDIV
-		printk ("unsigned muldiv: 0x%x%08x / 0x%x = ", regs->y, rs1, rs2);
-#endif
-		if (!rs2) {
-#ifdef DEBUG_MULDIV
-			printk ("DIVISION BY ZERO\n");
-#endif
-			handle_hw_divzero (regs, pc, regs->npc, regs->psr);
-			return 0;
-		}
-		__asm__ __volatile__ ("\n\t"
-			"mov	%2, %%o0\n\t"
-			"mov	%0, %%o1\n\t"
-			"mov	%%g0, %%o2\n\t"
-			"call	__udivdi3\n\t"
-			" mov	%1, %%o3\n\t"
-			"mov	%%o1, %0\n\t"
-			"mov	%%o0, %1\n\t"
-			: "=r" (rs1), "=r" (rs2)
-			: "r" (regs->y), "0" (rs1), "1" (rs2)
-			: "o0", "o1", "o2", "o3", "o4", "o5", "o7",
-			  "g1", "g2", "g3", "cc");
-#ifdef DEBUG_MULDIV
-		printk ("0x%x\n", rs1);
-#endif
-		if (store_reg(rs1, rdv, regs))
-			return -1;
-		break;
-	case 15: /* sdiv */
-#ifdef DEBUG_MULDIV
-		printk ("signed muldiv: 0x%x%08x / 0x%x = ", regs->y, rs1, rs2);
-#endif
-		if (!rs2) {
-#ifdef DEBUG_MULDIV
-			printk ("DIVISION BY ZERO\n");
-#endif
-			handle_hw_divzero (regs, pc, regs->npc, regs->psr);
-			return 0;
-		}
-		__asm__ __volatile__ ("\n\t"
-			"mov	%2, %%o0\n\t"
-			"mov	%0, %%o1\n\t"
-			"mov	%%g0, %%o2\n\t"
-			"call	__divdi3\n\t"
-			" mov	%1, %%o3\n\t"
-			"mov	%%o1, %0\n\t"
-			"mov	%%o0, %1\n\t"
-			: "=r" (rs1), "=r" (rs2)
-			: "r" (regs->y), "0" (rs1), "1" (rs2)
-			: "o0", "o1", "o2", "o3", "o4", "o5", "o7",
-			  "g1", "g2", "g3", "cc");
-#ifdef DEBUG_MULDIV
-		printk ("0x%x\n", rs1);
-#endif
-		if (store_reg(rs1, rdv, regs))
-			return -1;
-		break;
-	}
-	if (is_foocc (insn)) {
-		regs->psr &= ~PSR_ICC;
-		if ((inst & 0xe) == 14) {
-			/* ?div */
-			if (rs2) regs->psr |= PSR_V;
-		}
-		if (!rs1) regs->psr |= PSR_Z;
-		if (((int)rs1) < 0) regs->psr |= PSR_N;
-#ifdef DEBUG_MULDIV
-		printk ("psr muldiv: %08x\n", regs->psr);
-#endif
-	}
-	advance(regs);
-	return 0;
-}
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@ -120,8 +120,6 @@ void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned lon
 	printk("Ill instr. at pc=%08lx instruction is %08lx\n",
 	       regs->pc, *(unsigned long *)regs->pc);
 #endif
-	if (!do_user_muldiv (regs, pc))
-		return;

 	info.si_signo = SIGILL;
 	info.si_errno = 0;
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@ -4,7 +4,7 @@
 asflags-y := -ansi -DST_DIV0=0x02
 ccflags-y := -Werror

-lib-$(CONFIG_SPARC32) += mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o
+lib-$(CONFIG_SPARC32) += ashrdi3.o
 lib-$(CONFIG_SPARC32) += memcpy.o memset.o
 lib-y                 += strlen.o
 lib-y                 += checksum_$(BITS).o
--- a/arch/sparc/lib/divdi3.S
+++ b/arch/sparc/lib/divdi3.S
@ -19,7 +19,6 @@ Boston, MA 02111-1307, USA.  */

 	.text
 	.align 4
-	.global .udiv
 	.globl __divdi3
 __divdi3:
 	save %sp,-104,%sp
@ -83,8 +82,9 @@ __divdi3:
 	bne .LL85
 	mov %i0,%o2
 	mov 1,%o0
-	call .udiv,0
 	mov 0,%o1
+	wr %g0, 0, %y
+	udiv %o0, %o1, %o0
 	mov %o0,%o4
 	mov %i0,%o2
 .LL85:
--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@ -61,16 +61,6 @@ extern void ___rw_read_try(void);
 extern void ___rw_read_exit(void);
 extern void ___rw_write_enter(void);

-/* Alias functions whose names begin with "." and export the aliases.
- * The module references will be fixed up by module_frob_arch_sections.
- */
-extern int _Div(int, int);
-extern int _Mul(int, int);
-extern int _Rem(int, int);
-extern unsigned _Udiv(unsigned, unsigned);
-extern unsigned _Umul(unsigned, unsigned);
-extern unsigned _Urem(unsigned, unsigned);
-
 /* Networking helper routines. */
 EXPORT_SYMBOL(__csum_partial_copy_sparc_generic);

@ -95,13 +85,6 @@ EXPORT_SYMBOL(__ashldi3);
 EXPORT_SYMBOL(__lshrdi3);
 EXPORT_SYMBOL(__muldi3);
 EXPORT_SYMBOL(__divdi3);
-
-EXPORT_SYMBOL(_Rem);
-EXPORT_SYMBOL(_Urem);
-EXPORT_SYMBOL(_Mul);
-EXPORT_SYMBOL(_Umul);
-EXPORT_SYMBOL(_Div);
-EXPORT_SYMBOL(_Udiv);
 #endif

 /*
--- a/arch/sparc/lib/mul.S
+++ b/arch/sparc/lib/mul.S
@ -1,137 +0,0 @@
-/*
- * mul.S:       This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-/*
- * Signed multiply, from Appendix E of the Sparc Version 8
- * Architecture Manual.
- */
-
-/*
- * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
- * the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies.
- */
-
-	.globl .mul
-	.globl _Mul
-.mul:
-_Mul:	/* needed for export */
-	mov	%o0, %y		! multiplier -> Y
-	andncc	%o0, 0xfff, %g0	! test bits 12..31
-	be	Lmul_shortway	! if zero, can do it the short way
-	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
-
-	/*
-	 * Long multiply.  32 steps, followed by a final shift step.
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %o1, %o4	! 13
-	mulscc	%o4, %o1, %o4	! 14
-	mulscc	%o4, %o1, %o4	! 15
-	mulscc	%o4, %o1, %o4	! 16
-	mulscc	%o4, %o1, %o4	! 17
-	mulscc	%o4, %o1, %o4	! 18
-	mulscc	%o4, %o1, %o4	! 19
-	mulscc	%o4, %o1, %o4	! 20
-	mulscc	%o4, %o1, %o4	! 21
-	mulscc	%o4, %o1, %o4	! 22
-	mulscc	%o4, %o1, %o4	! 23
-	mulscc	%o4, %o1, %o4	! 24
-	mulscc	%o4, %o1, %o4	! 25
-	mulscc	%o4, %o1, %o4	! 26
-	mulscc	%o4, %o1, %o4	! 27
-	mulscc	%o4, %o1, %o4	! 28
-	mulscc	%o4, %o1, %o4	! 29
-	mulscc	%o4, %o1, %o4	! 30
-	mulscc	%o4, %o1, %o4	! 31
-	mulscc	%o4, %o1, %o4	! 32
-	mulscc	%o4, %g0, %o4	! final shift
-
-	! If %o0 was negative, the result is
-	!	(%o0 * %o1) + (%o1 << 32))
-	! We fix that here.
-
-#if 0
-	tst	%o0
-	bge	1f
-	 rd	%y, %o0
-
-	! %o0 was indeed negative; fix upper 32 bits of result by subtracting 
-	! %o1 (i.e., return %o4 - %o1 in %o1).
-	retl
-	 sub	%o4, %o1, %o1
-
-1:
-	retl
-	 mov	%o4, %o1
-#else
-	/* Faster code adapted from tege@sics.se's code for umul.S.  */
-	sra	%o0, 31, %o2	! make mask from sign bit
-	and	%o1, %o2, %o2	! %o2 = 0 or %o1, depending on sign of %o0
-	rd	%y, %o0		! get lower half of product
-	retl
-	 sub	%o4, %o2, %o1	! subtract compensation 
-				!  and put upper half in place
-#endif
-
-Lmul_shortway:
-	/*
-	 * Short multiply.  12 steps, followed by a final shift step.
-	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
-	 * but there is no problem with %o0 being negative (unlike above).
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %g0, %o4	! final shift
-
-	/*
-	 *  %o4 has 20 of the bits that should be in the low part of the
-	 * result; %y has the bottom 12 (as %y's top 12).  That is:
-	 *
-	 *	  %o4		    %y
-	 * +----------------+----------------+
-	 * | -12- |   -20-  | -12- |   -20-  |
-	 * +------(---------+------)---------+
-	 *  --hi-- ----low-part----
-	 *
-	 * The upper 12 bits of %o4 should be sign-extended to form the
-	 * high part of the product (i.e., highpart = %o4 >> 20).
-	 */
-
-	rd	%y, %o5
-	sll	%o4, 12, %o0	! shift middle bits left 12
-	srl	%o5, 20, %o5	! shift low bits right 20, zero fill at left
-	or	%o5, %o0, %o0	! construct low part of result
-	retl
-	 sra	%o4, 20, %o1	! ... and extract high part of result
-
-	.globl	.mul_patch
-.mul_patch:
-	smul	%o0, %o1, %o0
-	retl
-	 rd	%y, %o1
-	nop
--- a/arch/sparc/lib/muldi3.S
+++ b/arch/sparc/lib/muldi3.S
@ -63,12 +63,12 @@ __muldi3:
 	rd  %y, %o1
 	mov  %o1, %l3
 	mov  %i1, %o0
-	call  .umul
 	mov  %i2, %o1
+	umul %o0, %o1, %o0
 	mov  %o0, %l0
 	mov  %i0, %o0
-	call  .umul
 	mov  %i3, %o1
+	umul %o0, %o1, %o0
 	add  %l0, %o0, %l0
 	mov  %l2, %i0
 	add  %l2, %l0, %i0
--- a/arch/sparc/lib/rem.S
+++ b/arch/sparc/lib/rem.S
@ -1,384 +0,0 @@
-/*
- * rem.S:       This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-
-/* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .rem	name of function to generate
- *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- *  true		true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-	.globl .rem
-	.globl _Rem
-.rem:
-_Rem:	/* needed for export */
-	! compute sign of result; if neither is negative, no problem
-	orcc	%o1, %o0, %g0	! either negative?
-	bge	2f			! no, go do the divide
-	 mov	%o0, %g2	! compute sign in any case
-
-	tst	%o1
-	bge	1f
-	 tst	%o0
-	! %o1 is definitely negative; %o0 might also be negative
-	bge	2f			! if %o0 not negative...
-	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
-1:	! %o0 is negative, %o1 is nonnegative
-	sub	%g0, %o0, %o0	! make %o0 nonnegative
-2:
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	 mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	ST_DIV0
-		retl
-		 clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	Lgot_result		! (and algorithm fails otherwise)
-	 clr	%o2
-
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-
-	cmp	%o3, %g1
-	blu	Lnot_really_big
-	 clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		 mov	1, %g7
-
-		sll	%o5, 4, %o5
-
-		b	1b
-		 add	%o4, 1, %o4
-
-	! Now compute %g7.
-	2:
-		addcc	%o5, %o5, %o5
-
-		bcc	Lnot_too_big
-		 add	%g7, 1, %g7
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-
-		b	Ldo_single_div
-		 sub	%g7, 1, %g7
-
-	Lnot_too_big:
-	3:
-		cmp	%o5, %o3
-		blu	2b
-		 nop
-
-		be	Ldo_single_div
-		 nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g7
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	Ldo_single_div:
-		subcc	%g7, 1, %g7
-		bl	Lend_regular_divide
-		 nop
-
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-
-		b	Lend_single_divloop
-		 nop
-	Lsingle_divloop:
-		sll	%o2, 1, %o2
-
-		bl	1f
-		 srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-
-		b	2f
-		 add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	Lend_single_divloop:
-		subcc	%g7, 1, %g7
-		bge	Lsingle_divloop
-		 tst	%o3
-
-		b,a	Lend_regular_divide
-
-Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	 addcc	%o4, 1, %o4
-	be	Lgot_result
-	 sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	L.1.16
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	L.2.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	L.3.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	L.4.23
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-
-	b	9f
-	 add	%o2, (7*2+1), %o2
-	
-L.4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2-1), %o2
-	
-L.3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	L.4.21
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2+1), %o2
-	
-L.4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2-1), %o2
-	
-L.2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	L.3.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	L.4.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2+1), %o2
-
-L.4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2-1), %o2
-
-L.3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	L.4.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2+1), %o2
-
-L.4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2-1), %o2
-
-L.1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	L.2.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	L.3.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	L.4.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2+1), %o2
-
-L.4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2-1), %o2
-
-L.3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	L.4.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2+1), %o2
-
-L.4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2-1), %o2
-
-L.2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	L.3.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	L.4.11
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2+1), %o2
-
-L.4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2-1), %o2
-
-
-L.3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	L.4.9
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2+1), %o2
-
-L.4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2-1), %o2
-
-	9:
-Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	Ldivloop
-	 tst	%o3
-
-	bl,a	Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	add	%o3, %o1, %o3
-
-Lgot_result:
-	! check to see if answer should be < 0
-	tst	%g2
-	bl,a	1f
-	 sub %g0, %o3, %o3
-1:
-	retl
-	 mov %o3, %o0
-
-	.globl	.rem_patch
-.rem_patch:
-	sra	%o0, 0x1f, %o4
-	wr	%o4, 0x0, %y
-	nop
-	nop
-	nop
-	sdivcc	%o0, %o1, %o2
-	bvs,a	1f
-	 xnor	%o2, %g0, %o2
-1:	smul	%o2, %o1, %o2
-	retl
-	 sub	%o0, %o2, %o0
-	nop
--- a/arch/sparc/lib/sdiv.S
+++ b/arch/sparc/lib/sdiv.S
@ -1,381 +0,0 @@
-/*
- * sdiv.S:      This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-
-/* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .div	name of function to generate
- *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
- *  true		true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-	.globl .div
-	.globl _Div
-.div:
-_Div:	/* needed for export */
-	! compute sign of result; if neither is negative, no problem
-	orcc	%o1, %o0, %g0	! either negative?
-	bge	2f			! no, go do the divide
-	 xor	%o1, %o0, %g2	! compute sign in any case
-
-	tst	%o1
-	bge	1f
-	 tst	%o0
-	! %o1 is definitely negative; %o0 might also be negative
-	bge	2f			! if %o0 not negative...
-	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
-1:	! %o0 is negative, %o1 is nonnegative
-	sub	%g0, %o0, %o0	! make %o0 nonnegative
-2:
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	 mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	ST_DIV0
-		retl
-		 clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	Lgot_result		! (and algorithm fails otherwise)
-	 clr	%o2
-
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-
-	cmp	%o3, %g1
-	blu	Lnot_really_big
-	 clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		 mov	1, %g7
-
-		sll	%o5, 4, %o5
-
-		b	1b
-		 add	%o4, 1, %o4
-
-	! Now compute %g7.
-	2:
-		addcc	%o5, %o5, %o5
-		bcc	Lnot_too_big
-		 add	%g7, 1, %g7
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-
-		b	Ldo_single_div
-		 sub	%g7, 1, %g7
-
-	Lnot_too_big:
-	3:
-		cmp	%o5, %o3
-		blu	2b
-		 nop
-
-		be	Ldo_single_div
-		 nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g7
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	Ldo_single_div:
-		subcc	%g7, 1, %g7
-		bl	Lend_regular_divide
-		 nop
-
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-
-		b	Lend_single_divloop
-		 nop
-	Lsingle_divloop:
-		sll	%o2, 1, %o2
-
-		bl	1f
-		 srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-
-		b	2f
-		 add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	Lend_single_divloop:
-		subcc	%g7, 1, %g7
-		bge	Lsingle_divloop
-		 tst	%o3
-
-		b,a	Lend_regular_divide
-
-Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	 addcc	%o4, 1, %o4
-
-	be	Lgot_result
-	 sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	L.1.16
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	L.2.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	L.3.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	L.4.23
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2+1), %o2
-
-L.4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2-1), %o2
-
-L.3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	L.4.21
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2+1), %o2
-
-L.4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2-1), %o2
-
-L.2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	L.3.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	L.4.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2+1), %o2
-
-L.4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2-1), %o2
-	
-	
-L.3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	L.4.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2+1), %o2
-
-L.4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2-1), %o2
-
-L.1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	L.2.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	L.3.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	L.4.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2+1), %o2
-
-L.4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2-1), %o2
-
-L.3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	L.4.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2+1), %o2
-
-L.4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2-1), %o2
-
-L.2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	L.3.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	L.4.11
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2+1), %o2
-
-L.4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2-1), %o2
-
-L.3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	L.4.9
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2+1), %o2
-
-L.4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2-1), %o2
-
-	9:
-Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	Ldivloop
-	 tst	%o3
-
-	bl,a	Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	sub	%o2, 1, %o2
-
-Lgot_result:
-	! check to see if answer should be < 0
-	tst	%g2
-	bl,a	1f
-	 sub %g0, %o2, %o2
-1:
-	retl
-	 mov %o2, %o0
-
-	.globl	.div_patch
-.div_patch:
-	sra	%o0, 0x1f, %o2
-	wr	%o2, 0x0, %y
-	nop
-	nop
-	nop
-	sdivcc	%o0, %o1, %o0
-	bvs,a	1f
-	 xnor	%o0, %g0, %o0
-1:	retl
-	 nop
--- a/arch/sparc/lib/udiv.S
+++ b/arch/sparc/lib/udiv.S
@ -1,357 +0,0 @@
-/*
- * udiv.S:      This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-
-/* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .udiv	name of function to generate
- *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
- *  false		false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-	.globl .udiv
-	.globl _Udiv
-.udiv:
-_Udiv:	/* needed for export */
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	 mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	ST_DIV0
-		retl
-		 clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	Lgot_result		! (and algorithm fails otherwise)
-	 clr	%o2
-
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-
-	cmp	%o3, %g1
-	blu	Lnot_really_big
-	 clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		 mov	1, %g7
-
-		sll	%o5, 4, %o5
-
-		b	1b
-		 add	%o4, 1, %o4
-
-	! Now compute %g7.
-	2:
-		addcc	%o5, %o5, %o5
-		bcc	Lnot_too_big
-		 add	%g7, 1, %g7
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-
-		b	Ldo_single_div
-		 sub	%g7, 1, %g7
-
-	Lnot_too_big:
-	3:
-		cmp	%o5, %o3
-		blu	2b
-		 nop
-
-		be	Ldo_single_div
-		 nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g7
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	Ldo_single_div:
-		subcc	%g7, 1, %g7
-		bl	Lend_regular_divide
-		 nop
-
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-
-		b	Lend_single_divloop
-		 nop
-	Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		 srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		 add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	Lend_single_divloop:
-		subcc	%g7, 1, %g7
-		bge	Lsingle_divloop
-		 tst	%o3
-
-		b,a	Lend_regular_divide
-
-Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-
-	cmp	%o5, %o3
-	bleu	1b
-	 addcc	%o4, 1, %o4
-
-	be	Lgot_result
-	 sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	L.1.16
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	L.2.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	L.3.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	L.4.23
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2+1), %o2
-
-L.4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2-1), %o2
-
-L.3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	L.4.21
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2+1), %o2
-
-L.4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2-1), %o2
-
-L.2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	L.3.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	L.4.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2+1), %o2
-
-L.4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2-1), %o2
-
-L.3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	L.4.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2+1), %o2
-
-L.4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2-1), %o2
-
-L.1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	L.2.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	L.3.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	L.4.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2+1), %o2
-
-L.4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2-1), %o2
-
-L.3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	L.4.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2+1), %o2
-
-L.4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2-1), %o2
-
-L.2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	L.3.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	L.4.11
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2+1), %o2
-
-L.4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2-1), %o2
-
-L.3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	L.4.9
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2+1), %o2
-
-L.4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2-1), %o2
-
-	9:
-Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	Ldivloop
-	 tst	%o3
-
-	bl,a	Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	sub	%o2, 1, %o2
-
-Lgot_result:
-
-	retl
-	 mov %o2, %o0
-
-	.globl	.udiv_patch
-.udiv_patch:
-	wr	%g0, 0x0, %y
-	nop
-	nop
-	retl
-	 udiv	%o0, %o1, %o0
-	nop
--- a/arch/sparc/lib/udivdi3.S
+++ b/arch/sparc/lib/udivdi3.S
@ -60,8 +60,9 @@ __udivdi3:
 	bne .LL77
 	mov %i0,%o2
 	mov 1,%o0
-	call .udiv,0
 	mov 0,%o1
+	wr %g0, 0, %y
+	udiv %o0, %o1, %o0
 	mov %o0,%o3
 	mov %i0,%o2
 .LL77:
--- a/arch/sparc/lib/umul.S
+++ b/arch/sparc/lib/umul.S
@ -1,171 +0,0 @@
-/*
- * umul.S:      This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-
-/*
- * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
- * upper 32 bits of the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies.  Short
- * multiplies require 25 instruction cycles, and long ones require
- * 45 instruction cycles.
- *
- * On return, overflow has occurred (%o1 is not zero) if and only if
- * the Z condition code is clear, allowing, e.g., the following:
- *
- *	call	.umul
- *	nop
- *	bnz	overflow	(or tnz)
- */
-
-	.globl .umul
-	.globl _Umul
-.umul:
-_Umul:	/* needed for export */
-	or	%o0, %o1, %o4
-	mov	%o0, %y		! multiplier -> Y
-
-	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
-	be	Lmul_shortway	! if zero, can do it the short way
-	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
-
-	/*
-	 * Long multiply.  32 steps, followed by a final shift step.
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %o1, %o4	! 13
-	mulscc	%o4, %o1, %o4	! 14
-	mulscc	%o4, %o1, %o4	! 15
-	mulscc	%o4, %o1, %o4	! 16
-	mulscc	%o4, %o1, %o4	! 17
-	mulscc	%o4, %o1, %o4	! 18
-	mulscc	%o4, %o1, %o4	! 19
-	mulscc	%o4, %o1, %o4	! 20
-	mulscc	%o4, %o1, %o4	! 21
-	mulscc	%o4, %o1, %o4	! 22
-	mulscc	%o4, %o1, %o4	! 23
-	mulscc	%o4, %o1, %o4	! 24
-	mulscc	%o4, %o1, %o4	! 25
-	mulscc	%o4, %o1, %o4	! 26
-	mulscc	%o4, %o1, %o4	! 27
-	mulscc	%o4, %o1, %o4	! 28
-	mulscc	%o4, %o1, %o4	! 29
-	mulscc	%o4, %o1, %o4	! 30
-	mulscc	%o4, %o1, %o4	! 31
-	mulscc	%o4, %o1, %o4	! 32
-	mulscc	%o4, %g0, %o4	! final shift
-
-
-	/*
-	 * Normally, with the shift-and-add approach, if both numbers are
-	 * positive you get the correct result.  With 32-bit two's-complement
-	 * numbers, -x is represented as
-	 *
-	 *		  x		    32
-	 *	( 2  -  ------ ) mod 2  *  2
-	 *		   32
-	 *		  2
-	 *
-	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
-	 * we can treat this as if the radix point were just to the left
-	 * of the sign bit (multiply by 2^32), and get
-	 *
-	 *	-x  =  (2 - x) mod 2
-	 *
-	 * Then, ignoring the `mod 2's for convenience:
-	 *
-	 *   x *  y	= xy
-	 *  -x *  y	= 2y - xy
-	 *   x * -y	= 2x - xy
-	 *  -x * -y	= 4 - 2x - 2y + xy
-	 *
-	 * For signed multiplies, we subtract (x << 32) from the partial
-	 * product to fix this problem for negative multipliers (see mul.s).
-	 * Because of the way the shift into the partial product is calculated
-	 * (N xor V), this term is automatically removed for the multiplicand,
-	 * so we don't have to adjust.
-	 *
-	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
-	 * and the correction is wrong.  So for unsigned multiplies where the
-	 * high order bit is one, we end up with xy - (y << 32).  To fix it
-	 * we add y << 32.
-	 */
-#if 0
-	tst	%o1
-	bl,a	1f		! if %o1 < 0 (high order bit = 1),
-	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
-
-1:
-	rd	%y, %o0		! get lower half of product
-	retl
-	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
-#else
-	/* Faster code from tege@sics.se.  */
-	sra	%o1, 31, %o2	! make mask from sign bit
-	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
-	rd	%y, %o0		! get lower half of product
-	retl
-	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place
-#endif
-
-Lmul_shortway:
-	/*
-	 * Short multiply.  12 steps, followed by a final shift step.
-	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
-	 * but there is no problem with %o0 being negative (unlike above),
-	 * and overflow is impossible (the answer is at most 24 bits long).
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %g0, %o4	! final shift
-
-	/*
-	 * %o4 has 20 of the bits that should be in the result; %y has
-	 * the bottom 12 (as %y's top 12).  That is:
-	 *
-	 *	  %o4		    %y
-	 * +----------------+----------------+
-	 * | -12- |   -20-  | -12- |   -20-  |
-	 * +------(---------+------)---------+
-	 *	   -----result-----
-	 *
-	 * The 12 bits of %o4 left of the `result' area are all zero;
-	 * in fact, all top 20 bits of %o4 are zero.
-	 */
-
-	rd	%y, %o5
-	sll	%o4, 12, %o0	! shift middle bits left 12
-	srl	%o5, 20, %o5	! shift low bits right 20
-	or	%o5, %o0, %o0
-	retl
-	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
-
-	.globl	.umul_patch
-.umul_patch:
-	umul	%o0, %o1, %o0
-	retl
-	 rd	%y, %o1
-	nop
--- a/arch/sparc/lib/urem.S
+++ b/arch/sparc/lib/urem.S
@ -1,357 +0,0 @@
-/*
- * urem.S:      This routine was taken from glibc-1.09 and is covered
- *              by the GNU Library General Public License Version 2.
- */
-
-/* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .urem	name of function to generate
- *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- *  false		false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-	.globl .urem
-	.globl _Urem
-.urem:
-_Urem:	/* needed for export */
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	 mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	ST_DIV0
-		retl
-		 clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	Lgot_result		! (and algorithm fails otherwise)
-	 clr	%o2
-
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-
-	cmp	%o3, %g1
-	blu	Lnot_really_big
-	 clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		 mov	1, %g7
-
-		sll	%o5, 4, %o5
-
-		b	1b
-		 add	%o4, 1, %o4
-
-	! Now compute %g7.
-	2:
-		addcc	%o5, %o5, %o5
-		bcc	Lnot_too_big
-		 add	%g7, 1, %g7
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-
-		b	Ldo_single_div
-		 sub	%g7, 1, %g7
-
-	Lnot_too_big:
-	3:
-		cmp	%o5, %o3
-		blu	2b
-		 nop
-
-		be	Ldo_single_div
-		 nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g7
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	Ldo_single_div:
-		subcc	%g7, 1, %g7
-		bl	Lend_regular_divide
-		 nop
-
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-
-		b	Lend_single_divloop
-		 nop
-	Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		 srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		 add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	Lend_single_divloop:
-		subcc	%g7, 1, %g7
-		bge	Lsingle_divloop
-		 tst	%o3
-
-		b,a	Lend_regular_divide
-
-Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-
-	cmp	%o5, %o3
-	bleu	1b
-	 addcc	%o4, 1, %o4
-
-	be	Lgot_result
-	 sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	L.1.16
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	L.2.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	L.3.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	L.4.23
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2+1), %o2
-
-L.4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (7*2-1), %o2
-
-L.3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	L.4.21
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2+1), %o2
-
-L.4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (5*2-1), %o2
-
-L.2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	L.3.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	L.4.19
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2+1), %o2
-
-L.4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (3*2-1), %o2
-
-L.3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	L.4.17
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2+1), %o2
-	
-L.4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (1*2-1), %o2
-
-L.1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	L.2.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	L.3.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	L.4.15
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2+1), %o2
-
-L.4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-1*2-1), %o2
-
-L.3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	L.4.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2+1), %o2
-
-L.4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-3*2-1), %o2
-
-L.2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	L.3.13
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	L.4.11
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2+1), %o2
-	
-L.4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-5*2-1), %o2
-
-L.3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	L.4.9
-	 srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2+1), %o2
-
-L.4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-	b	9f
-	 add	%o2, (-7*2-1), %o2
-
-	9:
-Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	Ldivloop
-	 tst	%o3
-
-	bl,a	Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	add	%o3, %o1, %o3
-
-Lgot_result:
-
-	retl
-	 mov %o3, %o0
-
-	.globl	.urem_patch
-.urem_patch:
-	wr	%g0, 0x0, %y
-	nop
-	nop
-	nop
-	udiv	%o0, %o1, %o2
-	umul	%o2, %o1, %o2
-	retl
-	 sub	%o0, %o2, %o0