Commit 43c371e8 by Tom de Vries Committed by Tom de Vries

Add extra initialization of broadcasted condition variables

2017-07-11  Tom de Vries  <tom@codesourcery.com>

	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
	(bb_first_real_insn): New function.
	(nvptx_single): Add extra initialization of broadcasted condition
	variables.

From-SVN: r250129
parent 3d36348a
2017-07-11 Tom de Vries <tom@codesourcery.com>
* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
(bb_first_real_insn): New function.
(nvptx_single): Add extra initialization of broadcasted condition
variables.
2017-07-11 Nathan Sidwell <nathan@acm.org> 2017-07-11 Nathan Sidwell <nathan@acm.org>
* dwarf2out.c (gen_member_die): Remove useless check for anon ctors. * dwarf2out.c (gen_member_die): Remove useless check for anon ctors.
......
...@@ -74,6 +74,8 @@ ...@@ -74,6 +74,8 @@
/* This file should be included last. */ /* This file should be included last. */
#include "target-def.h" #include "target-def.h"
/* When nonzero, compile in the workaround for a suspected PTX JIT compiler
   bug: nvptx_single emits an extra initialization of the broadcasted
   condition variable (see the comment in nvptx_single for details). */
#define WORKAROUND_PTXJIT_BUG 1
/* The various PTX memory areas an object might reside in. */ /* The various PTX memory areas an object might reside in. */
enum nvptx_data_area enum nvptx_data_area
{ {
...@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after) ...@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after)
return gen_nvptx_barsync (GEN_INT (after)); return gen_nvptx_barsync (GEN_INT (after));
} }
#if WORKAROUND_PTXJIT_BUG
/* Return the first insn in BB for which INSN_P holds, i.e. the first real
   insn.  Return NULL if BB contains no such insn.  */
static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *cursor;
  rtx_insn *first_real = NULL;

  /* Walk the insns of BB in order and stop at the first real one.  */
  FOR_BB_INSNS (bb, cursor)
    {
      if (INSN_P (cursor))
	{
	  first_real = cursor;
	  break;
	}
    }

  return first_real;
}
#endif
/* Single neutering according to MASK. FROM is the incoming block and /* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at TO is the outgoing block. These may be the same block. Insert at
start of FROM: start of FROM:
...@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) ...@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{ {
/* Vector mode only, do a shuffle. */ /* Vector mode only, do a shuffle. */
#if WORKAROUND_PTXJIT_BUG
/* The branch condition %rcond is propagated like this:
{
.reg .u32 %x;
mov.u32 %x,%tid.x;
setp.ne.u32 %rnotvzero,%x,0;
}
@%rnotvzero bra Lskip;
setp.<op>.<type> %rcond,op1,op2;
Lskip:
selp.u32 %rcondu32,1,0,%rcond;
shfl.idx.b32 %rcondu32,%rcondu32,0,31;
setp.ne.u32 %rcond,%rcondu32,0;
There seems to be a bug in the ptx JIT compiler (observed at driver
version 381.22, at -O1 and higher for sm_61), that drops the shfl
unless %rcond is initialized to something before 'bra Lskip'. The
bug is not observed with ptxas from cuda 8.0.61.
It is true that the code is non-trivial: at Lskip, %rcond is
uninitialized in threads 1-31, and after the selp the same holds
for %rcondu32. But shfl propagates the defined value in thread 0
to threads 1-31, so after the shfl %rcondu32 is defined in threads
0-31, and after the setp.ne %rcond is defined in threads 0-31.
There is nothing in the PTX spec to suggest that this is wrong, or
to explain why the extra initialization is needed. So, we classify
it as a JIT bug, and the extra initialization as workaround. */
emit_insn_before (gen_movbi (pvar, const0_rtx),
bb_first_real_insn (from));
#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail); emit_insn_before (nvptx_gen_vcast (pvar), tail);
} }
else else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment