Commit 0d008882 by Oleg Endo

re PR target/6526 ([SH4] sdivsi3_i4 can clobber xd0/xd2)

	PR target/6526
	* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
	other than FPSCR.PR and FPSCR.SZ.  Add SH4A implementation.

	PR target/6526
	* gcc.target/sh/pr6526.c: New.

From-SVN: r199873
parent 3e56ed50
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
PR target/6526
* gcc.target/sh/pr6526.c: New.
2013-06-09 Jakub Jelinek <jakub@redhat.com> 2013-06-09 Jakub Jelinek <jakub@redhat.com>
PR target/57568 PR target/57568
......
/* Check that the XF registers are not clobbered by an integer division
that is done using double precision FPU division. */
/* { dg-do run { target "sh*-*-*" } } */
/* { dg-options "-O1 -mdiv=call-fp" } */
/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */
#include <assert.h>
#include <stdlib.h>
extern void __set_fpscr (int);
/* Load the float stored at *F into the register that the caller sees as xf0.
   The first frchg flips the FPU register bank selection (FPSCR.FR) so that
   this register is temporarily addressable as fr0; the second frchg flips
   the selection back, leaving the caller's FPSCR.FR setting as it was.  */
void
write_xf0 (float* f)
{
/* The "memory" clobber is there because the asm reads *F through the
   pointer operand, which the compiler cannot see from the constraints.  */
__asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
}
/* Store the register that the caller sees as xf0 to the float at *F.
   As in write_xf0, the register is reached by bracketing a plain fr0 access
   with a pair of frchg instructions, so the caller's FPSCR.FR setting is
   the same on exit as on entry.  */
void
read_xf0 (float* f)
{
/* The "memory" clobber is there because the asm writes *F through the
   pointer operand, which the compiler cannot see from the constraints.  */
__asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
}
/* Signed 32 bit division of A by B.  The function is kept out of line so
   that the division is carried out at run time on the values passed in,
   rather than being folded at the call site.  */
int __attribute__ ((noinline))
test_00 (int a, int b)
{
  const int quotient = a / b;

  return quotient;
}
/* Unsigned 32 bit division of A by B.  The function is kept out of line so
   that the division is carried out at run time on the values passed in,
   rather than being folded at the call site.  */
unsigned int __attribute__ ((noinline))
test_01 (unsigned a, unsigned b)
{
  const unsigned int quotient = a / b;

  return quotient;
}
/* Consume X and always yield 0.  Because the function is not inlined, a
   caller that passes a computed value here still has to compute it, while
   the value returned is 0 regardless of X.  */
int __attribute__ ((noinline))
test_02 (int x)
{
  const int keep_no_bits = 0;

  return x & keep_no_bits;
}
/* Test driver.  For a signed and then an unsigned division, a known value
   is placed in xf0, the division is performed, and xf0 is read back and
   checked to be unchanged.  Returns 0 when both checks pass; a failed check
   terminates the program through assert.  */
int
main (void)
{
float test_value;
/* Accumulates the quotients so that the division results are used.  */
int r = 0;
/* Set FPSCR.FR to 1. */
__set_fpscr (0x200000);
/* Signed division: xf0 must still hold 123 afterwards.  */
test_value = 123;
write_xf0 (&test_value);
r += test_00 (40, 4);
read_xf0 (&test_value);
assert (test_value == 123);
/* Unsigned division: xf0 must still hold 321 afterwards.  */
test_value = 321;
write_xf0 (&test_value);
r += test_01 (50, 5);
read_xf0 (&test_value);
assert (test_value == 321);
/* test_02 yields 0 for any argument, so the exit status is 0 while R is
   still consumed.  */
return test_02 (r);
}
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
PR target/6526
* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation.
2013-06-08 Walter Lee <walt@tilera.com> 2013-06-08 Walter Lee <walt@tilera.com>
* config/tilepro/atomic.h: Don't include stdint.h or features.h. * config/tilepro/atomic.h: Don't include stdint.h or features.h.
......
...@@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd ...@@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd
ENDFUNC(GLOBAL(mulsi3)) ENDFUNC(GLOBAL(mulsi3))
#endif #endif
#endif /* ! __SH5__ */ #endif /* ! __SH5__ */
/*------------------------------------------------------------------------------
32 bit signed integer division that uses FPU double precision division. */
#ifdef L_sdivsi3_i4 #ifdef L_sdivsi3_i4
.title "SH DIVIDE" .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#if defined (__SH4__) || defined (__SH2A__) #if defined (__SH4__) || defined (__SH2A__)
!! args in r4 and r5, result in fpul, clobber dr0, dr2 /* This variant is used when FPSCR.PR = 1 (double precision) is the default
setting.
Args in r4 and r5, result in fpul, clobber dr0, dr2. */
.global GLOBAL(sdivsi3_i4) .global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
...@@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4): ...@@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4):
ftrc dr0,fpul ftrc dr0,fpul
ENDFUNC(GLOBAL(sdivsi3_i4)) ENDFUNC(GLOBAL(sdivsi3_i4))
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 /* This variant is used when FPSCR.PR = 0 (single precision) is the default
setting.
Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
For this to work, we must temporarily switch the FPU to double precision,
but we had better not touch FPSCR.FR. See PR 6526. */
#if ! __SH5__ || __SH5__ == 32 #if ! __SH5__ || __SH5__ == 32
#if __SH5__ #if __SH5__
...@@ -1031,10 +1042,26 @@ GLOBAL(sdivsi3_i4): ...@@ -1031,10 +1042,26 @@ GLOBAL(sdivsi3_i4):
.global GLOBAL(sdivsi3_i4) .global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4): GLOBAL(sdivsi3_i4):
sts.l fpscr,@-r15
mov #8,r2 #ifndef __SH4A__
swap.w r2,r2 mov.l r3,@-r15
sts fpscr,r2
mov #8,r3
swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit)
or r2,r3
lds r3,fpscr // Set FPSCR.PR = 1.
lds r4,fpul
float fpul,dr0
lds r5,fpul
float fpul,dr2
fdiv dr2,dr0
ftrc dr0,fpul
lds r2,fpscr lds r2,fpscr
rts
mov.l @r15+,r3
#else
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */
fpchg
lds r4,fpul lds r4,fpul
float fpul,dr0 float fpul,dr0
lds r5,fpul lds r5,fpul
...@@ -1042,13 +1069,16 @@ GLOBAL(sdivsi3_i4): ...@@ -1042,13 +1069,16 @@ GLOBAL(sdivsi3_i4):
fdiv dr2,dr0 fdiv dr2,dr0
ftrc dr0,fpul ftrc dr0,fpul
rts rts
lds.l @r15+,fpscr fpchg
#endif /* __SH4A__ */
ENDFUNC(GLOBAL(sdivsi3_i4)) ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */ #endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ || __SH2A__ */ #endif /* ! __SH4__ || __SH2A__ */
#endif #endif /* L_sdivsi3_i4 */
//------------------------------------------------------------------------------
#ifdef L_sdivsi3 #ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
sh2e/sh3e code. */ sh2e/sh3e code. */
...@@ -1368,21 +1398,26 @@ div0: rts ...@@ -1368,21 +1398,26 @@ div0: rts
ENDFUNC(GLOBAL(sdivsi3)) ENDFUNC(GLOBAL(sdivsi3))
#endif /* ! __SHMEDIA__ */ #endif /* ! __SHMEDIA__ */
#endif #endif /* L_sdivsi3 */
#ifdef L_udivsi3_i4
/*------------------------------------------------------------------------------
32 bit unsigned integer division that uses FPU double precision division. */
#ifdef L_udivsi3_i4
.title "SH DIVIDE" .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#if defined (__SH4__) || defined (__SH2A__) #if defined (__SH4__) || defined (__SH2A__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4, /* This variant is used when FPSCR.PR = 1 (double precision) is the default
!! and t bit setting.
Args in r4 and r5, result in fpul,
clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */
.global GLOBAL(udivsi3_i4) .global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4)) HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4): GLOBAL(udivsi3_i4):
mov #1,r1 mov #1,r1
cmp/hi r1,r5 cmp/hi r1,r5
bf trivial bf/s trivial
rotr r1 rotr r1
xor r1,r4 xor r1,r4
lds r4,fpul lds r4,fpul
...@@ -1409,12 +1444,13 @@ trivial: ...@@ -1409,12 +1444,13 @@ trivial:
.align 2 .align 2
#ifdef FMOVD_WORKS #ifdef FMOVD_WORKS
.align 3 ! make double below 8 byte aligned. .align 3 // Make the double below 8 byte aligned.
#endif #endif
L1: L1:
.double 2147483648 .double 2147483648
ENDFUNC(GLOBAL(udivsi3_i4)) ENDFUNC(GLOBAL(udivsi3_i4))
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__) #elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
#if ! __SH5__ || __SH5__ == 32 #if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 !! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
...@@ -1436,21 +1472,33 @@ GLOBAL(udivsi3_i4): ...@@ -1436,21 +1472,33 @@ GLOBAL(udivsi3_i4):
ENDFUNC(GLOBAL(udivsi3_i4)) ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */ #endif /* ! __SH5__ || __SH5__ == 32 */
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 /* This variant is used when FPSCR.PR = 0 (single precision) is the default
setting.
Args in r4 and r5, result in fpul,
clobber r0, r1, r4, r5, dr0, dr2, dr4.
For this to work, we must temporarily switch the FPU to double precision,
but we had better not touch FPSCR.FR. See PR 6526. */
.global GLOBAL(udivsi3_i4) .global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4)) HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4): GLOBAL(udivsi3_i4):
#ifndef __SH4A__
mov #1,r1 mov #1,r1
cmp/hi r1,r5 cmp/hi r1,r5
bf trivial bf/s trivial
rotr r1 // r1 = 1 << 31
sts.l fpscr,@-r15 sts.l fpscr,@-r15
mova L1,r0
lds.l @r0+,fpscr
rotr r1
xor r1,r4 xor r1,r4
mov.l @(0,r15),r0
xor r1,r5
mov.l L2,r1
lds r4,fpul lds r4,fpul
or r0,r1
mova L1,r0
lds r1,fpscr
#ifdef FMOVD_WORKS #ifdef FMOVD_WORKS
fmov.d @r0+,dr4 fmov.d @r0+,dr4
#else #else
...@@ -1458,7 +1506,6 @@ GLOBAL(udivsi3_i4): ...@@ -1458,7 +1506,6 @@ GLOBAL(udivsi3_i4):
fmov.s @r0,DR41 fmov.s @r0,DR41
#endif #endif
float fpul,dr0 float fpul,dr0
xor r1,r5
lds r5,fpul lds r5,fpul
float fpul,dr2 float fpul,dr2
fadd dr4,dr0 fadd dr4,dr0
...@@ -1469,24 +1516,62 @@ GLOBAL(udivsi3_i4): ...@@ -1469,24 +1516,62 @@ GLOBAL(udivsi3_i4):
lds.l @r15+,fpscr lds.l @r15+,fpscr
#ifdef FMOVD_WORKS #ifdef FMOVD_WORKS
.align 3 ! make double below 8 byte aligned. .align 3 // Make the double below 8 byte aligned.
#endif #endif
trivial: trivial:
rts rts
lds r4,fpul lds r4,fpul
.align 2 .align 2
L1: L2:
#ifndef FMOVD_WORKS #ifdef FMOVD_WORKS
.long 0x80000 .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1
#else #else
.long 0x180000 .long 0x80000 // FPSCR.PR = 1
#endif #endif
L1:
.double 2147483648 .double 2147483648
#else
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
Although fmov.d usually works on SH4A, using it here would require either
two additional fschg instructions or an FPSCR push + pop. It's not worth
the effort for loading only one double constant. */
mov #1,r1
cmp/hi r1,r5
bf/s trivial
rotr r1 // r1 = 1 << 31
fpchg
mova L1,r0
xor r1,r4
fmov.s @r0+,DR40
lds r4,fpul
fmov.s @r0,DR41
xor r1,r5
float fpul,dr0
lds r5,fpul
float fpul,dr2
fadd dr4,dr0
fadd dr4,dr2
fdiv dr2,dr0
ftrc dr0,fpul
rts
fpchg
trivial:
rts
lds r4,fpul
.align 2
L1:
.double 2147483648
#endif /* __SH4A__ */
ENDFUNC(GLOBAL(udivsi3_i4)) ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */ #endif /* ! __SH4__ */
#endif #endif /* L_udivsi3_i4 */
#ifdef L_udivsi3 #ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment