powerpc: Fix endian issues in VMX copy loops

Fix the permute loops for little endian.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
Anton Blanchard 2013-09-23 12:04:35 +10:00 committed by Benjamin Herrenschmidt
parent 8b5ede69d2
commit 32ee1e188e
2 changed files with 63 additions and 46 deletions

View file

@@ -19,6 +19,14 @@
*/ */
#include <asm/ppc_asm.h> #include <asm/ppc_asm.h>
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
.macro err1 .macro err1
100: 100:
.section __ex_table,"a" .section __ex_table,"a"
@@ -552,13 +560,13 @@ err3; stw r7,4(r3)
li r10,32 li r10,32
li r11,48 li r11,48
lvsl vr16,0,r4 /* Setup permute control vector */ LVS(vr16,0,r4) /* Setup permute control vector */
err3; lvx vr0,0,r4 err3; lvx vr0,0,r4
addi r4,r4,16 addi r4,r4,16
bf cr7*4+3,5f bf cr7*4+3,5f
err3; lvx vr1,r0,r4 err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16 addi r4,r4,16
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
addi r3,r3,16 addi r3,r3,16
@@ -566,9 +574,9 @@ err3; stvx vr8,r0,r3
5: bf cr7*4+2,6f 5: bf cr7*4+2,6f
err3; lvx vr1,r0,r4 err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9 err3; lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16 VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32 addi r4,r4,32
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9 err3; stvx vr9,r3,r9
@@ -576,13 +584,13 @@ err3; stvx vr9,r3,r9
6: bf cr7*4+1,7f 6: bf cr7*4+1,7f
err3; lvx vr3,r0,r4 err3; lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16 VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9 err3; lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16 VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10 err3; lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16 VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11 err3; lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16 VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64 addi r4,r4,64
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9 err3; stvx vr9,r3,r9
@@ -611,21 +619,21 @@ err3; stvx vr11,r3,r11
.align 5 .align 5
8: 8:
err4; lvx vr7,r0,r4 err4; lvx vr7,r0,r4
vperm vr8,vr0,vr7,vr16 VPERM(vr8,vr0,vr7,vr16)
err4; lvx vr6,r4,r9 err4; lvx vr6,r4,r9
vperm vr9,vr7,vr6,vr16 VPERM(vr9,vr7,vr6,vr16)
err4; lvx vr5,r4,r10 err4; lvx vr5,r4,r10
vperm vr10,vr6,vr5,vr16 VPERM(vr10,vr6,vr5,vr16)
err4; lvx vr4,r4,r11 err4; lvx vr4,r4,r11
vperm vr11,vr5,vr4,vr16 VPERM(vr11,vr5,vr4,vr16)
err4; lvx vr3,r4,r12 err4; lvx vr3,r4,r12
vperm vr12,vr4,vr3,vr16 VPERM(vr12,vr4,vr3,vr16)
err4; lvx vr2,r4,r14 err4; lvx vr2,r4,r14
vperm vr13,vr3,vr2,vr16 VPERM(vr13,vr3,vr2,vr16)
err4; lvx vr1,r4,r15 err4; lvx vr1,r4,r15
vperm vr14,vr2,vr1,vr16 VPERM(vr14,vr2,vr1,vr16)
err4; lvx vr0,r4,r16 err4; lvx vr0,r4,r16
vperm vr15,vr1,vr0,vr16 VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128 addi r4,r4,128
err4; stvx vr8,r0,r3 err4; stvx vr8,r0,r3
err4; stvx vr9,r3,r9 err4; stvx vr9,r3,r9
@@ -649,13 +657,13 @@ err4; stvx vr15,r3,r16
bf cr7*4+1,9f bf cr7*4+1,9f
err3; lvx vr3,r0,r4 err3; lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16 VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9 err3; lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16 VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10 err3; lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16 VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11 err3; lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16 VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64 addi r4,r4,64
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9 err3; stvx vr9,r3,r9
@@ -665,9 +673,9 @@ err3; stvx vr11,r3,r11
9: bf cr7*4+2,10f 9: bf cr7*4+2,10f
err3; lvx vr1,r0,r4 err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9 err3; lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16 VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32 addi r4,r4,32
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9 err3; stvx vr9,r3,r9
@@ -675,7 +683,7 @@ err3; stvx vr9,r3,r9
10: bf cr7*4+3,11f 10: bf cr7*4+3,11f
err3; lvx vr1,r0,r4 err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16 addi r4,r4,16
err3; stvx vr8,r0,r3 err3; stvx vr8,r0,r3
addi r3,r3,16 addi r3,r3,16

View file

@@ -20,6 +20,15 @@
#include <asm/ppc_asm.h> #include <asm/ppc_asm.h>
_GLOBAL(memcpy_power7) _GLOBAL(memcpy_power7)
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
#ifdef CONFIG_ALTIVEC #ifdef CONFIG_ALTIVEC
cmpldi r5,16 cmpldi r5,16
cmpldi cr1,r5,4096 cmpldi cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
li r10,32 li r10,32
li r11,48 li r11,48
lvsl vr16,0,r4 /* Setup permute control vector */ LVS(vr16,0,r4) /* Setup permute control vector */
lvx vr0,0,r4 lvx vr0,0,r4
addi r4,r4,16 addi r4,r4,16
bf cr7*4+3,5f bf cr7*4+3,5f
lvx vr1,r0,r4 lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16 addi r4,r4,16
stvx vr8,r0,r3 stvx vr8,r0,r3
addi r3,r3,16 addi r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
5: bf cr7*4+2,6f 5: bf cr7*4+2,6f
lvx vr1,r0,r4 lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9 lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16 VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32 addi r4,r4,32
stvx vr8,r0,r3 stvx vr8,r0,r3
stvx vr9,r3,r9 stvx vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
6: bf cr7*4+1,7f 6: bf cr7*4+1,7f
lvx vr3,r0,r4 lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16 VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9 lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16 VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10 lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16 VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11 lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16 VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64 addi r4,r4,64
stvx vr8,r0,r3 stvx vr8,r0,r3
stvx vr9,r3,r9 stvx vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
.align 5 .align 5
8: 8:
lvx vr7,r0,r4 lvx vr7,r0,r4
vperm vr8,vr0,vr7,vr16 VPERM(vr8,vr0,vr7,vr16)
lvx vr6,r4,r9 lvx vr6,r4,r9
vperm vr9,vr7,vr6,vr16 VPERM(vr9,vr7,vr6,vr16)
lvx vr5,r4,r10 lvx vr5,r4,r10
vperm vr10,vr6,vr5,vr16 VPERM(vr10,vr6,vr5,vr16)
lvx vr4,r4,r11 lvx vr4,r4,r11
vperm vr11,vr5,vr4,vr16 VPERM(vr11,vr5,vr4,vr16)
lvx vr3,r4,r12 lvx vr3,r4,r12
vperm vr12,vr4,vr3,vr16 VPERM(vr12,vr4,vr3,vr16)
lvx vr2,r4,r14 lvx vr2,r4,r14
vperm vr13,vr3,vr2,vr16 VPERM(vr13,vr3,vr2,vr16)
lvx vr1,r4,r15 lvx vr1,r4,r15
vperm vr14,vr2,vr1,vr16 VPERM(vr14,vr2,vr1,vr16)
lvx vr0,r4,r16 lvx vr0,r4,r16
vperm vr15,vr1,vr0,vr16 VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128 addi r4,r4,128
stvx vr8,r0,r3 stvx vr8,r0,r3
stvx vr9,r3,r9 stvx vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
bf cr7*4+1,9f bf cr7*4+1,9f
lvx vr3,r0,r4 lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16 VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9 lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16 VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10 lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16 VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11 lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16 VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64 addi r4,r4,64
stvx vr8,r0,r3 stvx vr8,r0,r3
stvx vr9,r3,r9 stvx vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
9: bf cr7*4+2,10f 9: bf cr7*4+2,10f
lvx vr1,r0,r4 lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9 lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16 VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32 addi r4,r4,32
stvx vr8,r0,r3 stvx vr8,r0,r3
stvx vr9,r3,r9 stvx vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
10: bf cr7*4+3,11f 10: bf cr7*4+3,11f
lvx vr1,r0,r4 lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16 VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16 addi r4,r4,16
stvx vr8,r0,r3 stvx vr8,r0,r3
addi r3,r3,16 addi r3,r3,16