Commit 43c371e8 by Tom de Vries Committed by Tom de Vries

Add extra initialization of broadcasted condition variables

2017-07-11  Tom de Vries  <tom@codesourcery.com>

	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
	(bb_first_real_insn): New function.
	(nvptx_single): Add extra initialization of broadcasted condition
	variables.

From-SVN: r250129
parent 3d36348a
2017-07-11 Tom de Vries <tom@codesourcery.com>
* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
(bb_first_real_insn): New function.
(nvptx_single): Add extra initialization of broadcasted condition
variables.
2017-07-11 Nathan Sidwell <nathan@acm.org>
* dwarf2out.c (gen_member_die): Remove useless check for anon ctors.
......
......@@ -74,6 +74,8 @@
/* This file should be included last. */
#include "target-def.h"
/* When nonzero, compile in the workaround for a bug observed in the PTX
   JIT compiler: an extra initialization of broadcasted condition
   variables is emitted (see nvptx_single). */
#define WORKAROUND_PTXJIT_BUG 1
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
......@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after)
return gen_nvptx_barsync (GEN_INT (after));
}
#if WORKAROUND_PTXJIT_BUG
/* Walk the insns of basic block BB and return the first one for which
   INSN_P holds.  If BB contains no such insn, return a null pointer.  */
static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *first_real = 0;
  rtx_insn *cur;

  /* Stop at the first real insn; a block without any leaves FIRST_REAL
     null.  */
  FOR_BB_INSNS (bb, cur)
    {
      if (INSN_P (cur))
	{
	  first_real = cur;
	  break;
	}
    }

  return first_real;
}
#endif
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
......@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
#if WORKAROUND_PTXJIT_BUG
/* The branch condition %rcond is propagated like this:
{
.reg .u32 %x;
mov.u32 %x,%tid.x;
setp.ne.u32 %rnotvzero,%x,0;
}
@%rnotvzero bra Lskip;
setp.<op>.<type> %rcond,op1,op2;
Lskip:
selp.u32 %rcondu32,1,0,%rcond;
shfl.idx.b32 %rcondu32,%rcondu32,0,31;
setp.ne.u32 %rcond,%rcondu32,0;
There seems to be a bug in the ptx JIT compiler (observed at driver
version 381.22, at -O1 and higher for sm_61), that drops the shfl
unless %rcond is initialized to something before 'bra Lskip'. The
bug is not observed with ptxas from cuda 8.0.61.
It is true that the code is non-trivial: at Lskip, %rcond is
uninitialized in threads 1-31, and after the selp the same holds
for %rcondu32. But shfl propagates the defined value in thread 0
to threads 1-31, so after the shfl %rcondu32 is defined in threads
0-31, and after the setp.ne %rcond is defined in threads 0-31.
There is nothing in the PTX spec to suggest that this is wrong, or
to explain why the extra initialization is needed. So, we classify
it as a JIT bug, and the extra initialization as workaround. */
emit_insn_before (gen_movbi (pvar, const0_rtx),
bb_first_real_insn (from));
#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment