tree-vectorizer.c 82 KB
Newer Older
1
/* Loop Vectorization
2
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 4 5 6 7 8
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
9
Software Foundation; either version 3, or (at your option) any later
10 11 12 13 14 15 16 17
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
18 19
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58

/* Loop Vectorization Pass.

   This pass tries to vectorize loops. This first implementation focuses on
   simple inner-most loops, with no conditional control flow, and a set of
   simple operations which vector form can be expressed using existing
   tree codes (PLUS, MULT etc).

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N];  short b[N]; short c[N];   int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

	The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

	Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs"). These two types of data require different handling both 
   during analysis and transformation. The types of data-refs that the 
59 60 61
   vectorizer currently supports are ARRAY_REFS which base is an array DECL 
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a  simple (consecutive) access pattern.
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92

   Analysis phase:
   ===============
	The driver for the analysis phase is vect_analyze_loop_nest().
   It applies a set of analyses, some of which rely on the scalar evolution 
   analyzer (scev) developed by Sebastian Pop.

	During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the 
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
	The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized. It insert the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct 
   attached to S). This pointer will be used for the vectorization of following
   stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

	For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:	b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
   resulting sequence would be:

   VS1: vb = px[i];
   S1:	b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

	Operands that are not SSA_NAMEs, are data-refs that appear in 
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
	Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can 
   support different sizes of vectors, for now will need to specify one value 
   for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.

	Since we only vectorize operations which vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
114
   machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
Dorit Nuzman committed
138
#include "recog.h"
139
#include "optabs.h"
140
#include "params.h"
141
#include "toplev.h"
142 143 144
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
145
#include "input.h"
146 147
#include "tree-vectorizer.h"
#include "tree-pass.h"
148 149 150 151 152 153

/*************************************************************************
  Simple Loop Peeling Utilities
 *************************************************************************/
static void slpeel_update_phis_for_duplicate_loop 
  (struct loop *, struct loop *, bool after);
154 155 156 157
static void slpeel_update_phi_nodes_for_guard1 
  (edge, struct loop *, bool, basic_block *, bitmap *); 
static void slpeel_update_phi_nodes_for_guard2 
  (edge, struct loop *, bool, basic_block *);
158
static edge slpeel_add_loop_guard (basic_block, tree, basic_block, basic_block);
159

160 161 162
static void rename_use_op (use_operand_p);
static void rename_variables_in_bb (basic_block);
static void rename_variables_in_loop (struct loop *);
163

164
/*************************************************************************
165
  General Vectorization Utilities
166
 *************************************************************************/
167
static void vect_set_dump_settings (void);
168 169 170 171

/* vect_dump will be set to stderr or dump_file if exist.  */
FILE *vect_dump;

172 173 174
/* vect_verbosity_level set to an invalid value 
   to mark that it's uninitialized.  */
enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
175

176 177
/* Loop location.  */
static LOC vect_loop_location;
178 179

/* Bitmap of virtual variables to be renamed.  */
Diego Novillo committed
180
bitmap vect_memsyms_to_rename;
181

182 183 184 185 186
/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/
187 188 189 190 191 192 193


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
Diego Novillo committed
194
  tree new_name;
195 196 197 198

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

Diego Novillo committed
199
  new_name = get_current_def (USE_FROM_PTR (op_p));
200 201

  /* Something defined outside of the loop.  */
Diego Novillo committed
202
  if (!new_name)
203 204 205 206
    return;

  /* An ordinary ssa name defined in the loop.  */

Diego Novillo committed
207
  SET_USE (op_p, new_name);
208 209 210 211 212 213 214 215 216 217 218
}


/* Renames the variables in basic block BB.  */

static void
rename_variables_in_bb (basic_block bb)
{
  tree phi;
  block_stmt_iterator bsi;
  tree stmt;
219 220
  use_operand_p use_p;
  ssa_op_iter iter;
221 222
  edge e;
  edge_iterator ei;
223
  struct loop *loop = bb->loop_father;
224 225 226 227

  for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
    {
      stmt = bsi_stmt (bsi);
Diego Novillo committed
228
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
229
	rename_use_op (use_p);
230 231 232
    }

  FOR_EACH_EDGE (e, ei, bb->succs)
233 234 235 236 237 238
    {
      if (!flow_bb_inside_loop_p (loop, e->dest))
	continue;
      for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
    }
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
}


/* Renames variables in new generated LOOP.  */

static void
rename_variables_in_loop (struct loop *loop)
{
  unsigned i;
  basic_block *bbs;

  bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
    rename_variables_in_bb (bbs[i]);

  free (bbs);
}


259
/* Update the PHI nodes of NEW_LOOP.
260

261 262 263 264
   NEW_LOOP is a duplicate of ORIG_LOOP.
   AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
   AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
   executes before it.  */
265 266

static void
267
slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
268
				       struct loop *new_loop, bool after)
269
{
Diego Novillo committed
270
  tree new_ssa_name;
271 272 273 274
  tree phi_new, phi_orig;
  tree def;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
275
  edge new_loop_exit_e = single_exit (new_loop);
276 277 278 279 280 281 282 283 284 285 286
  edge new_loop_entry_e = loop_preheader_edge (new_loop);
  edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);

  /*
     step 1. For each loop-header-phi:
             Add the first phi argument for the phi in NEW_LOOP
            (the one associated with the entry of NEW_LOOP)

     step 2. For each loop-header-phi:
             Add the second phi argument for the phi in NEW_LOOP
            (the one associated with the latch of NEW_LOOP)
287

288
     step 3. Update the phis in the successor block of NEW_LOOP.
289

290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
        case 1: NEW_LOOP was placed before ORIG_LOOP:
                The successor block of NEW_LOOP is the header of ORIG_LOOP.
                Updating the phis in the successor block can therefore be done
                along with the scanning of the loop header phis, because the
                header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
                phi nodes, organized in the same order.

        case 2: NEW_LOOP was placed after ORIG_LOOP:
                The successor block of NEW_LOOP is the original exit block of 
                ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
                We postpone updating these phis to a later stage (when
                loop guards are added).
   */


  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */
307 308

  for (phi_new = phi_nodes (new_loop->header),
309 310 311
       phi_orig = phi_nodes (orig_loop->header);
       phi_new && phi_orig;
       phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
312
    {
313 314
      /* step 1.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
315
      add_phi_arg (phi_new, def, new_loop_entry_e);
316

317 318
      /* step 2.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
319
      if (TREE_CODE (def) != SSA_NAME)
320
        continue;
321

Diego Novillo committed
322 323
      new_ssa_name = get_current_def (def);
      if (!new_ssa_name)
324 325 326 327 328
	{
	  /* This only happens if there are no definitions
	     inside the loop. use the phi_result in this case.  */
	  new_ssa_name = PHI_RESULT (phi_new);
	}
329 330

      /* An ordinary ssa name defined in the loop.  */
331
      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
332

333 334 335 336 337
      /* step 3 (case 1).  */
      if (!after)
        {
          gcc_assert (new_loop_exit_e == orig_entry_e);
          SET_PHI_ARG_DEF (phi_orig,
338
                           new_loop_exit_e->dest_idx,
339 340
                           new_ssa_name);
        }
341 342 343 344 345 346
    }
}


/* Update PHI nodes for a guard of the LOOP.

347 348 349 350 351
   Input:
   - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
        controls whether LOOP is to be executed.  GUARD_EDGE is the edge that
        originates from the guard-bb, skips LOOP and reaches the (unique) exit
        bb of LOOP.  This loop-exit-bb is an empty bb with one successor.
352 353
        We denote this bb NEW_MERGE_BB because before the guard code was added
        it had a single predecessor (the LOOP header), and now it became a merge
354 355
        point of two paths - the path that ends with the LOOP exit-edge, and
        the path that ends with GUARD_EDGE.
356 357
   - NEW_EXIT_BB: New basic block that is added by this function between LOOP
        and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
358

359 360 361 362 363 364
   ===> The CFG before the guard-code was added:
        LOOP_header_bb:
          loop_body
          if (exit_loop) goto update_bb
          else           goto LOOP_header_bb
        update_bb:
365

366 367 368 369
   ==> The CFG after the guard-code was added:
        guard_bb:
          if (LOOP_guard_condition) goto new_merge_bb
          else                      goto LOOP_header_bb
370
        LOOP_header_bb:
371 372 373 374 375
          loop_body
          if (exit_loop_condition) goto new_merge_bb
          else                     goto LOOP_header_bb
        new_merge_bb:
          goto update_bb
376 377
        update_bb:

378 379 380 381
   ==> The CFG after this function:
        guard_bb:
          if (LOOP_guard_condition) goto new_merge_bb
          else                      goto LOOP_header_bb
382
        LOOP_header_bb:
383 384 385 386
          loop_body
          if (exit_loop_condition) goto new_exit_bb
          else                     goto LOOP_header_bb
        new_exit_bb:
387 388 389 390
        new_merge_bb:
          goto update_bb
        update_bb:

391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
   This function:
   1. creates and updates the relevant phi nodes to account for the new
      incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
      1.1. Create phi nodes at NEW_MERGE_BB.
      1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
           UPDATE_BB).  UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
   2. preserves loop-closed-ssa-form by creating the required phi nodes
      at the exit of LOOP (i.e, in NEW_EXIT_BB).

   There are two flavors to this function:

   slpeel_update_phi_nodes_for_guard1:
     Here the guard controls whether we enter or skip LOOP, where LOOP is a
     prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
     for variables that have phis in the loop header.

   slpeel_update_phi_nodes_for_guard2:
     Here the guard controls whether we enter or skip LOOP, where LOOP is an
     epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
     for variables that have phis in the loop exit.

   I.E., the overall structure is:

        loop1_preheader_bb:
                guard1 (goto loop1/merg1_bb)
        loop1
        loop1_exit_bb:
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
        loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
   loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
   that have phis in loop1->header).

   slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
   loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
   that have phis in next_bb). It also adds some of these phis to
   loop1_exit_bb.

   slpeel_update_phi_nodes_for_guard1 is always called before
   slpeel_update_phi_nodes_for_guard2. They are both needed in order
   to create correct data-flow and loop-closed-ssa-form.

   Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
   that change between iterations of a loop (and therefore have a phi-node
   at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
   phis for variables that are used out of the loop (and therefore have 
   loop-closed exit phis). Some variables may be both updated between 
   iterations and used after the loop. This is why in loop1_exit_bb we
   may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
   and exit phis (created by slpeel_update_phi_nodes_for_guard2).

   - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
     an original loop. i.e., we have:

           orig_loop
           guard_bb (goto LOOP/new_merge)
           new_loop <-- LOOP
           new_exit
           new_merge
           next_bb

     If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
     have:

           new_loop
           guard_bb (goto LOOP/new_merge)
           orig_loop <-- LOOP
           new_exit
           new_merge
           next_bb

Diego Novillo committed
467 468 469
     The SSA names defined in the original loop have a current
     reaching definition that that records the corresponding new
     ssa-name used in the new duplicated loop copy.
470
  */
471

472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
/* Function slpeel_update_phi_nodes_for_guard1
   
   Input:
   - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
   - DEFS - a bitmap of ssa names to mark new names for which we recorded
            information. 
   
   In the context of the overall structure, we have:

        loop1_preheader_bb: 
                guard1 (goto loop1/merg1_bb)
LOOP->  loop1
        loop1_exit_bb:
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
        loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   For each name updated between loop iterations (i.e - for each name that has
   an entry (loop-header) phi in LOOP) we create a new phi in:
   1. merge1_bb (to account for the edge from guard1)
   2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
*/

498
static void
499 500 501
slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb,
                                    bitmap *defs)
502
{
503 504
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
505 506
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
507
  edge e = EDGE_SUCC (new_merge_bb, 0);
508
  basic_block update_bb = e->dest;
509 510 511
  basic_block orig_bb = loop->header;
  edge new_exit_e;
  tree current_new_name;
512
  tree name;
513 514

  /* Create new bb between loop and new_merge_bb.  */
515
  *new_exit_bb = split_edge (single_exit (loop));
516 517

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
518 519 520 521 522

  for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
       orig_phi && update_phi;
       orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
    {
523 524 525 526 527 528 529
      /* Virtual phi; Mark it for renaming. We actually want to call
	 mar_sym_for_renaming, but since all ssa renaming datastructures
	 are going to be freed before we get to call ssa_upate, we just
	 record this name for now in a bitmap, and will mark it for
	 renaming later.  */
      name = PHI_RESULT (orig_phi);
      if (!is_gimple_reg (SSA_NAME_VAR (name)))
Diego Novillo committed
530
        bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
531

532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554
      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP. Set the two phi args in NEW_PHI for these edges:  */
      loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
      guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));

      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
                  || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

Diego Novillo committed
555 556 557
      if (!is_gimple_reg (PHI_RESULT (orig_phi)))
	continue;

558 559 560 561 562
      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
563
      add_phi_arg (new_phi, loop_arg, single_exit (loop));
564 565 566 567 568

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));

Diego Novillo committed
569
      /* 2.4. Record the newly created name with set_current_def.
570
         We want to find a name such that
Diego Novillo committed
571 572 573
                name = get_current_def (orig_loop_name)
         and to set its current definition as follows:
                set_current_def (name, new_phi_name)
574 575 576 577

         If LOOP is a new loop then loop_arg is already the name we're
         looking for. If LOOP is the original loop, then loop_arg is
         the orig_loop_name and the relevant name is recorded in its
Diego Novillo committed
578
         current reaching definition.  */
579 580 581 582
      if (is_new_loop)
        current_new_name = loop_arg;
      else
        {
Diego Novillo committed
583
          current_new_name = get_current_def (loop_arg);
584 585 586 587 588 589
	  /* current_def is not available only if the variable does not
	     change inside the loop, in which case we also don't care
	     about recording a current_def for it because we won't be
	     trying to create loop-exit-phis for it.  */
	  if (!current_new_name)
	    continue;
590
        }
Diego Novillo committed
591
      gcc_assert (get_current_def (current_new_name) == NULL_TREE);
592

Diego Novillo committed
593
      set_current_def (current_new_name, PHI_RESULT (new_phi));
594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
      bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
    }

  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
}


/* Function slpeel_update_phi_nodes_for_guard2

   Input:
   - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.

   In the context of the overall structure, we have:

        loop1_preheader_bb: 
                guard1 (goto loop1/merg1_bb)
        loop1
        loop1_exit_bb: 
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
LOOP->  loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   For each name used out side the loop (i.e - for each name that has an exit
   phi in next_bb) we create a new phi in:
   1. merge2_bb (to account for the edge from guard_bb) 
   2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
   3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
      if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
*/

static void
slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb)
{
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  edge new_exit_e;
Diego Novillo committed
638
  tree orig_def, orig_def_new_name;
639 640 641 642
  tree new_name, new_name2;
  tree arg;

  /* Create new bb between loop and new_merge_bb.  */
643
  *new_exit_bb = split_edge (single_exit (loop));
644 645 646 647 648 649 650 651

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  for (update_phi = phi_nodes (update_bb); update_phi; 
       update_phi = PHI_CHAIN (update_phi))
    {
      orig_phi = update_phi;
      orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
652 653 654 655
      /* This loop-closed-phi actually doesn't represent a use
         out of the loop - the phi arg is a constant.  */ 
      if (TREE_CODE (orig_def) != SSA_NAME)
        continue;
Diego Novillo committed
656
      orig_def_new_name = get_current_def (orig_def);
657 658 659 660 661
      arg = NULL_TREE;

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
662 663
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);
664

665
      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
Diego Novillo committed
666
            of LOOP. Set the two PHI args in NEW_PHI for these edges:  */
667 668
      new_name = orig_def;
      new_name2 = NULL_TREE;
Diego Novillo committed
669
      if (orig_def_new_name)
670
        {
Diego Novillo committed
671 672 673 674 675 676
          new_name = orig_def_new_name;
	  /* Some variables have both loop-entry-phis and loop-exit-phis.
	     Such variables were given yet newer names by phis placed in
	     guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
	     new_name2 = get_current_def (get_current_def (orig_name)).  */
          new_name2 = get_current_def (new_name);
677
        }
678 679
  
      if (is_new_loop)
680
        {
681 682
          guard_arg = orig_def;
          loop_arg = new_name;
683
        }
684 685 686 687 688 689 690 691 692
      else
        {
          guard_arg = new_name;
          loop_arg = orig_def;
        }
      if (new_name2)
        guard_arg = new_name2;
  
      add_phi_arg (new_phi, loop_arg, new_exit_e);
693
      add_phi_arg (new_phi, guard_arg, guard_edge);
694

695 696
      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
697
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
698 699 700 701 702 703 704 705 706 707
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
708
      add_phi_arg (new_phi, loop_arg, single_exit (loop));
709 710 711 712 713 714 715 716

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));


      /** 3. Handle loop-closed-ssa-form phis for first loop  **/

Diego Novillo committed
717 718 719 720
      /* 3.1. Find the relevant names that need an exit-phi in
	 GUARD_BB, i.e. names for which
	 slpeel_update_phi_nodes_for_guard1 had not already created a
	 phi node. This is the case for names that are used outside
721
	 the loop (and therefore need an exit phi) but are not updated
Diego Novillo committed
722 723 724 725 726 727 728 729 730 731 732
	 across loop iterations (and therefore don't have a
	 loop-header-phi).

	 slpeel_update_phi_nodes_for_guard1 is responsible for
	 creating loop-exit phis in GUARD_BB for names that have a
	 loop-header-phi.  When such a phi is created we also record
	 the new name in its current definition.  If this new name
	 exists, then guard_arg was set to this new name (see 1.2
	 above).  Therefore, if guard_arg is not this new name, this
	 is an indication that an exit-phi in GUARD_BB was not yet
	 created, so we take care of it here.  */
733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748
      if (guard_arg == new_name2)
	continue;
      arg = guard_arg;

      /* 3.2. Generate new phi node in GUARD_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 guard_edge->src);

      /* 3.3. GUARD_BB has one incoming edge:  */
      gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
      add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));

      /* 3.4. Update phi in successor of GUARD_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
                                                                == guard_arg);
      SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
749
    }
750

751
  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
752 753 754 755
}


/* Make the LOOP iterate NITERS times. This is done by adding a new IV
756 757 758
   that starts at zero, increases by one and its limit is NITERS.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
759

760
void
761
slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
762 763 764
{
  tree indx_before_incr, indx_after_incr, cond_stmt, cond;
  tree orig_cond;
765
  edge exit_edge = single_exit (loop);
766 767 768
  block_stmt_iterator loop_cond_bsi;
  block_stmt_iterator incr_bsi;
  bool insert_after;
769 770
  tree init = build_int_cst (TREE_TYPE (niters), 0);
  tree step = build_int_cst (TREE_TYPE (niters), 1);
771
  LOC loop_loc;
772 773 774

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
775 776 777
  loop_cond_bsi = bsi_for_stmt (orig_cond);

  standard_iv_increment_position (loop, &incr_bsi, &insert_after);
778
  create_iv (init, step, NULL_TREE, loop,
779
             &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
780 781

  if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop.  */
782
    cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
783
  else /* 'then' edge loops back.  */
784
    cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
785

786
  cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
787
		      NULL_TREE, NULL_TREE);
788
  bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
789 790

  /* Remove old loop exit test:  */
791
  bsi_remove (&loop_cond_bsi, true);
792

793
  loop_loc = find_loop_location (loop);
794 795 796 797 798 799 800
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (loop_loc != UNKNOWN_LOC)
        fprintf (dump_file, "\nloop at %s:%d: ",
                 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
      print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
    }
801 802

  loop->nb_iterations = niters;
803 804 805 806 807 808 809
}


/* Given LOOP this function generates a new copy of it and puts it 
   on E which is either the entry or exit of LOOP.  */

static struct loop *
810
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
811 812 813 814 815 816 817
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest; 
  tree phi, phi_arg;
818
  edge exit, new_exit;
819

820
  at_exit = (e == single_exit (loop)); 
821
  if (!at_exit && e != loop_preheader_edge (loop))
822
    return NULL;
823 824 825 826 827 828 829 830 831 832 833

  bbs = get_loop_body (loop);

  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (bbs, loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
834
  new_loop = duplicate_loop (loop, loop_outer (loop));
835 836 837 838 839 840
  if (!new_loop)
    {
      free (bbs);
      return NULL;
    }

841
  exit_dest = single_exit (loop)->dest;
842 843 844 845
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, 
					  exit_dest) == loop->header ? 
		 true : false);

846
  new_bbs = XNEWVEC (basic_block, loop->num_nodes);
847

848
  exit = single_exit (loop);
849
  copy_bbs (bbs, loop->num_nodes, new_bbs,
850
	    &exit, 1, &new_exit, NULL,
851
	    e->src);
852 853 854

  /* Duplicating phi args at exit bbs as coming 
     also from exit of duplicated loop.  */
855
  for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
856
    {
857
      phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
858 859 860 861 862 863 864 865 866
      if (phi_arg)
	{
	  edge new_loop_exit_edge;

	  if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
	  else
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
  
867
	  add_phi_arg (phi, phi_arg, new_loop_exit_edge);	
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895
	}
    }    
   
  if (at_exit) /* Add the loop copy at exit.  */
    {
      redirect_edge_and_branch_force (e, new_loop->header);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
      if (was_imm_dom)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
    }
  else /* Add the copy at entry.  */
    {
      edge new_exit_e;
      edge entry_e = loop_preheader_edge (loop);
      basic_block preheader = entry_e->src;
           
      if (!flow_bb_inside_loop_p (new_loop, 
				  EDGE_SUCC (new_loop->header, 0)->dest))
        new_exit_e = EDGE_SUCC (new_loop->header, 0);
      else
	new_exit_e = EDGE_SUCC (new_loop->header, 1); 

      redirect_edge_and_branch_force (new_exit_e, loop->header);
      set_immediate_dominator (CDI_DOMINATORS, loop->header,
			       new_exit_e->src);

      /* We have to add phi args to the loop->header here as coming 
	 from new_exit_e edge.  */
896
      for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
897 898 899
	{
	  phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
	  if (phi_arg)
900
	    add_phi_arg (phi, phi_arg, new_exit_e);	
901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919
	}    

      redirect_edge_and_branch_force (entry_e, new_loop->header);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
    }

  free (new_bbs);
  free (bbs);

  return new_loop;
}


/* Given the condition statement COND, put it as the last statement
   of GUARD_BB; EXIT_BB is the basic block to skip the loop;
   Assumes that this is the single exit of the guarded loop.  
   Returns the skip edge.  */

static edge
920 921
slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
		        basic_block dom_bb)
922 923 924
{
  block_stmt_iterator bsi;
  edge new_e, enter_e;
925
  tree cond_stmt;
926

927
  enter_e = EDGE_SUCC (guard_bb, 0);
928 929 930 931
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  bsi = bsi_last (guard_bb);

932
  cond_stmt = build3 (COND_EXPR, void_type_node, cond,
933
		      NULL_TREE, NULL_TREE);
934
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
935
  /* Add new edge to connect guard block to the merge/loop-exit block.  */
936
  new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
937
  set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
938 939 940 941
  return new_e;
}


942 943 944 945 946 947 948
/* This function verifies that the following restrictions apply to LOOP:
   (1) it is innermost
   (2) it consists of exactly 2 basic blocks - header, and an empty latch.
   (3) it is single entry, single exit
   (4) its exit condition is the last stmt in the header
   (5) E is the entry/exit edge of LOOP.
 */
949

950
bool
951
slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
952
{
953
  edge exit_e = single_exit (loop);
954
  edge entry_e = loop_preheader_edge (loop);
955 956
  tree orig_cond = get_loop_exit_condition (loop);
  block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
957

Diego Novillo committed
958
  if (need_ssa_update_p ())
959
    return false;
960

961 962 963
  if (loop->inner
      /* All loops have an outer scope; the only case loop->outer is NULL is for
         the function itself.  */
964
      || !loop_outer (loop)
965 966
      || loop->num_nodes != 2
      || !empty_block_p (loop->latch)
967
      || !single_exit (loop)
968 969 970 971
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
      || (e != exit_e && e != entry_e))
    return false;
972 973 974 975

  return true;
}

976
#ifdef ENABLE_CHECKING
977
void
978 979 980
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
981
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
982
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
983 984 985 986 987 988 989 990
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;

  /* A guard that controls whether the second_loop is to be executed or skipped
     is placed in first_loop->exit.  first_loopt->exit therefore has two
     successors - one is the preheader of second_loop, and the other is a bb
     after second_loop.
   */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
991
   
992 993
  /* 1. Verify that one of the successors of first_loopt->exit is the preheader
        of second_loop.  */
994
   
995
  /* The preheader of new_loop is expected to have two predecessors:
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
     first_loop->exit and the block that precedes first_loop.  */

  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2 
              && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
               || (EDGE_PRED (loop2_entry_bb, 1)->src ==  loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
  
  /* Verify that the other successor of first_loopt->exit is after the
     second_loop.  */
  /* TODO */
}
1008
#endif
1009

1010
/* Function slpeel_tree_peel_loop_to_edge.
1011

1012 1013 1014 1015
   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP. After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1016

1017 1018 1019 1020 1021 1022 1023 1024 1025
   Input:
   - LOOP: the loop to be peeled.
   - E: the exit or entry edge of LOOP.
        If it is the entry edge, we peel the first iterations of LOOP. In this
        case first-loop is LOOP, and second-loop is the newly created loop.
        If it is the exit edge, we peel the last iterations of LOOP. In this
        case, first-loop is the newly created loop, and second-loop is LOOP.
   - NITERS: the number of iterations that LOOP iterates.
   - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1026
   - UPDATE_FIRST_LOOP_COUNT:  specified whether this function is responsible
1027 1028
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
1029
        (this can be useful if we don't want new stmts added to first-loop).
1030

1031 1032
   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
1033
   to perform the transformation.
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045

   The function generates two if-then-else guards: one before the first loop,
   and the other before the second loop:
   The first guard is:
     if (FIRST_NITERS == 0) then skip the first loop,
     and go directly to the second loop.
   The second guard is:
     if (FIRST_NITERS == NITERS) then skip the second loop.

   FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
   FORNOW the resulting code will not be in loop-closed-ssa form.
*/
1046 1047

struct loop*
1048
slpeel_tree_peel_loop_to_edge (struct loop *loop, 
1049
			       edge e, tree first_niters, 
1050 1051
			       tree niters, bool update_first_loop_count,
			       unsigned int th)
1052 1053 1054 1055 1056
{
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
  tree pre_condition;
  bitmap definitions;
1057 1058 1059
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
  basic_block bb_between_loops;
1060
  basic_block new_exit_bb;
1061
  edge exit_e = single_exit (loop);
1062
  LOC loop_loc;
1063
  
1064 1065
  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;
1066 1067
  
  /* We have to initialize cfg_hooks. Then, when calling
1068
   cfg_hooks->split_edge, the function tree_split_edge 
1069
   is actually called and, when calling cfg_hooks->duplicate_block,
1070 1071 1072
   the function tree_duplicate_bb is called.  */
  tree_register_cfg_hooks ();

1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087

  /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
        Resulting CFG would be:

        first_loop:
        do {
        } while ...

        second_loop:
        do {
        } while ...

        orig_exit_bb:
   */
  
1088
  if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1089
    {
1090
      loop_loc = find_loop_location (loop);
1091 1092 1093 1094 1095 1096 1097
      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (loop_loc != UNKNOWN_LOC)
            fprintf (dump_file, "\n%s:%d: note: ",
                     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
          fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
        }
1098 1099
      return NULL;
    }
1100
  
1101 1102
  if (e == exit_e)
    {
1103
      /* NEW_LOOP was placed after LOOP.  */
1104 1105 1106
      first_loop = loop;
      second_loop = new_loop;
    }
1107
  else
1108
    {
1109
      /* NEW_LOOP was placed before LOOP.  */
1110 1111 1112 1113
      first_loop = new_loop;
      second_loop = loop;
    }

Diego Novillo committed
1114
  definitions = ssa_names_to_replace ();
1115 1116
  slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
  rename_variables_in_loop (new_loop);
1117 1118


1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130
  /* 2. Add the guard that controls whether the first loop is executed.
        Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_before_second_loop:
1131

1132 1133 1134
        second_loop:
        do {
        } while ...
1135

1136 1137
        orig_exit_bb:
   */
1138

1139
  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1140
  bb_before_second_loop = split_edge (single_exit (first_loop));
1141 1142

  pre_condition =
1143
    fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
1144 1145
	build_int_cst (TREE_TYPE (first_niters), th));

1146 1147
  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                  bb_before_second_loop, bb_before_first_loop);
1148 1149 1150
  slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
				      first_loop == new_loop,
				      &new_exit_bb, &definitions);
1151 1152 1153 1154


  /* 3. Add the guard that controls whether the second loop is executed.
        Resulting CFG would be:
1155

1156 1157 1158
        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
                               GOTO first-loop
1159

1160 1161 1162
        first_loop:
        do {
        } while ...
1163

1164 1165 1166
        bb_between_loops:
        if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
                                    GOTO bb_before_second_loop
1167

1168
        bb_before_second_loop:
1169

1170 1171 1172
        second_loop:
        do {
        } while ...
1173

1174
        bb_after_second_loop:
1175

1176 1177 1178
        orig_exit_bb:
   */

1179
  bb_between_loops = new_exit_bb;
1180
  bb_after_second_loop = split_edge (single_exit (second_loop));
1181

1182
  pre_condition = 
1183
	fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1184 1185
  skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
                                  bb_after_second_loop, bb_before_first_loop);
1186 1187
  slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
                                     second_loop == new_loop, &new_exit_bb);
1188

1189 1190 1191 1192
  /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
   */
  if (update_first_loop_count)
    slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1193

1194
  BITMAP_FREE (definitions);
Diego Novillo committed
1195
  delete_update_ssa ();
1196

1197 1198 1199
  return new_loop;
}

1200 1201 1202 1203 1204 1205 1206
/* Function vect_get_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if succeed and NULL if not.  */

1207
LOC
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
find_loop_location (struct loop *loop)
{
  tree node = NULL_TREE;
  basic_block bb;
  block_stmt_iterator si;

  if (!loop)
    return UNKNOWN_LOC;

  node = get_loop_exit_condition (loop);

1219
  if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233
      && EXPR_FILENAME (node) && EXPR_LINENO (node))
    return EXPR_LOC (node);

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location */

  if (!loop->header)
    return UNKNOWN_LOC;

  bb = loop->header;

  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
    {
      node = bsi_stmt (si);
1234
      if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1235 1236 1237 1238 1239 1240 1241
        return EXPR_LOC (node);
    }

  return UNKNOWN_LOC;
}


1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287
/*************************************************************************
  Vectorization Debug Information.
 *************************************************************************/

/* Function vect_set_verbosity_level.

   Called from toplev.c upon detection of the
   -ftree-vectorizer-verbose=N option.  */

void
vect_set_verbosity_level (const char *val)
{
   unsigned int vl;

   vl = atoi (val);
   if (vl < MAX_VERBOSITY_LEVEL)
     vect_verbosity_level = vl;
   else
     vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
}


/* Function vect_set_dump_settings.

   Fix the verbosity level of the vectorizer if the
   requested level was not set explicitly using the flag
   -ftree-vectorizer-verbose=N.
   Decide where to print the debugging information (dump_file/stderr).
   If the user defined the verbosity level, but there is no dump file,
   print to stderr, otherwise print to the dump file.  */

static void
vect_set_dump_settings (void)
{
  vect_dump = dump_file;

  /* Check if the verbosity level was defined by the user:  */
  if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
    {
      /* If there is no dump file, print to stderr.  */
      if (!dump_file)
        vect_dump = stderr;
      return;
    }

  /* User didn't specify verbosity level:  */
1288
  if (dump_file && (dump_flags & TDF_DETAILS))
1289
    vect_verbosity_level = REPORT_DETAILS;
1290
  else if (dump_file && (dump_flags & TDF_STATS))
1291 1292 1293
    vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
  else
    vect_verbosity_level = REPORT_NONE;
1294 1295

  gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1296 1297 1298 1299 1300 1301 1302
}


/* Function debug_loop_details.

   For vectorization debug dumps.  */

1303
bool
1304
vect_print_dump_info (enum verbosity_levels vl)
1305 1306 1307 1308
{
  if (vl > vect_verbosity_level)
    return false;

1309 1310 1311
  if (!current_function_decl || !vect_dump)
    return false;

1312
  if (vect_loop_location == UNKNOWN_LOC)
1313
    fprintf (vect_dump, "\n%s:%d: note: ",
Mike Stump committed
1314 1315
	     DECL_SOURCE_FILE (current_function_decl),
	     DECL_SOURCE_LINE (current_function_decl));
1316
  else
1317 1318
    fprintf (vect_dump, "\n%s:%d: note: ", 
	     LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1319 1320 1321 1322 1323

  return true;
}


1324 1325 1326 1327
/*************************************************************************
  Vectorization Utilities.
 *************************************************************************/

1328 1329 1330 1331 1332
/* Function new_stmt_vec_info.

   Create and initialize a new stmt_vec_info struct for STMT.  */

stmt_vec_info
1333
new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
1334 1335 1336 1337 1338 1339
{
  stmt_vec_info res;
  res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));

  STMT_VINFO_TYPE (res) = undef_vec_info_type;
  STMT_VINFO_STMT (res) = stmt;
1340
  STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
Dorit Nuzman committed
1341 1342
  STMT_VINFO_RELEVANT (res) = 0;
  STMT_VINFO_LIVE_P (res) = false;
1343 1344
  STMT_VINFO_VECTYPE (res) = NULL;
  STMT_VINFO_VEC_STMT (res) = NULL;
1345 1346
  STMT_VINFO_IN_PATTERN_P (res) = false;
  STMT_VINFO_RELATED_STMT (res) = NULL;
1347
  STMT_VINFO_DATA_REF (res) = NULL;
1348 1349 1350 1351 1352 1353 1354

  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
  STMT_VINFO_DR_OFFSET (res) = NULL;
  STMT_VINFO_DR_INIT (res) = NULL;
  STMT_VINFO_DR_STEP (res) = NULL;
  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;

1355
  if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1356 1357 1358
    STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
  else
    STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1359
  STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1360 1361
  STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1362
  STMT_SLP_TYPE (res) = 0;
1363 1364 1365 1366 1367 1368
  DR_GROUP_FIRST_DR (res) = NULL_TREE;
  DR_GROUP_NEXT_DR (res) = NULL_TREE;
  DR_GROUP_SIZE (res) = 0;
  DR_GROUP_STORE_COUNT (res) = 0;
  DR_GROUP_GAP (res) = 0;
  DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1369
  DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1370 1371 1372 1373 1374

  return res;
}


1375 1376 1377 1378 1379 1380 1381
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
1382
  const struct loop *const loop = (const struct loop *)data;
1383 1384 1385 1386 1387 1388
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  block_stmt_iterator si;
1400
  unsigned int i, nbbs;
1401 1402

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1403
  LOOP_VINFO_LOOP (res) = loop;
1404 1405 1406

  bbs = get_loop_body (loop);

1407
  /* Create/Update stmt_info for all stmts in the loop.  */
1408 1409 1410
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
1411 1412
      tree phi;

1413 1414 1415 1416 1417 1418 1419 1420
      /* BBs in a nested inner-loop will have been already processed (because 
	 we will have called vect_analyze_loop_form for any nested inner-loop).
	 Therefore, for stmts in an inner-loop we just want to update the 
	 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new 
	 loop_info of the outer-loop we are currently considering to vectorize 
	 (instead of the loop_info of the inner-loop).
	 For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
1421
	{
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447
	  /* Inner-loop bb.  */
	  gcc_assert (loop->inner && bb->loop_father == loop->inner);
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_vec_info stmt_info = vinfo_for_stmt (phi);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	    }
	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	   {
	      tree stmt = bsi_stmt (si);
	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	   }
	}
      else
	{
	  /* bb in current nest.  */
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_ann_t ann = get_stmt_ann (phi);
	      set_stmt_info (ann, new_stmt_vec_info (phi, res));
	    }
1448

1449 1450 1451 1452 1453 1454
	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	    {
	      tree stmt = bsi_stmt (si);
	      stmt_ann_t ann = stmt_ann (stmt);
	      set_stmt_info (ann, new_stmt_vec_info (stmt, res));
	    }
1455 1456 1457
	}
    }

1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468
  /* CHECKME: We want to visit all BBs before their successors (except for 
     latch blocks, for which this assertion wouldn't hold).  In the simple 
     case of the loop forms we allow, a dfs order of the BBs would the same 
     as reversed postorder traversal, so we are safe.  */

   free (bbs);
   bbs = XCNEWVEC (basic_block, loop->num_nodes);
   nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, 
			      bbs, loop->num_nodes, loop);
   gcc_assert (nbbs == loop->num_nodes);

1469
  LOOP_VINFO_BBS (res) = bbs;
1470
  LOOP_VINFO_NITERS (res) = NULL;
1471
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1472
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1473
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1474
  LOOP_VINFO_VECT_FACTOR (res) = 0;
1475 1476
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1477
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1478 1479 1480 1481
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1482 1483 1484
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1485

1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
  return res;
}


/* Function destroy_loop_vec_info.
 
   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the 
   stmts in the loop.  */

void
1496
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1497 1498 1499 1500 1501 1502
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  block_stmt_iterator si;
  int j;
1503 1504
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
1505 1506 1507 1508 1509 1510 1511 1512 1513

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525
  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

1526 1527 1528
  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
1529 1530 1531 1532 1533
      tree phi;
      stmt_vec_info stmt_info;

      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
        {
1534
          stmt_ann_t ann = stmt_ann (phi);
1535 1536 1537 1538 1539 1540

          stmt_info = vinfo_for_stmt (phi);
          free (stmt_info);
          set_stmt_info (ann, NULL);
        }

1541
      for (si = bsi_start (bb); !bsi_end_p (si); )
1542 1543 1544 1545
	{
	  tree stmt = bsi_stmt (si);
	  stmt_ann_t ann = stmt_ann (stmt);
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1546 1547 1548

	  if (stmt_info)
	    {
1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561
	      /* Check if this is a "pattern stmt" (introduced by the 
		 vectorizer during the pattern recognition pass).  */
	      bool remove_stmt_p = false;
	      tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
	      if (orig_stmt)
		{
		  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
		  if (orig_stmt_info
		      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
		    remove_stmt_p = true; 
		}
			
	      /* Free stmt_vec_info.  */
1562 1563
	      VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
	      free (stmt_info);
1564
	      set_stmt_info (ann, NULL);
1565 1566 1567 1568

	      /* Remove dead "pattern stmts".  */
	      if (remove_stmt_p)
	        bsi_remove (&si, true);
1569
	    }
1570
	  bsi_next (&si);
1571 1572 1573 1574
	}
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
1575 1576
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1577
  VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1578
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1579 1580 1581 1582
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1583 1584

  free (loop_vinfo);
1585
  loop->aux = NULL;
1586 1587 1588 1589 1590 1591 1592 1593
}


/* Function vect_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

1594
bool 
1595
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1596 1597 1598 1599 1600 1601 1602
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  if (DECL_EXTERNAL (decl))
    return false;

1603 1604 1605
  if (TREE_ASM_WRITTEN (decl))
    return false;

1606 1607 1608
  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
1609 1610 1611 1612 1613 1614
    /* This is not 100% correct.  The absolute correct stack alignment
       is STACK_BOUNDARY.  We're supposed to hope, but not assume, that
       PREFERRED_STACK_BOUNDARY is honored by all translation units.
       However, until someone implements forced stack alignment, SSE
       isn't really usable without this.  */  
    return (alignment <= PREFERRED_STACK_BOUNDARY); 
1615 1616 1617 1618 1619 1620 1621 1622
}


/* Function get_vectype_for_scalar_type.

   Returns the vector type corresponding to SCALAR_TYPE as supported
   by the target.  */

1623
tree
1624 1625 1626 1627 1628
get_vectype_for_scalar_type (tree scalar_type)
{
  enum machine_mode inner_mode = TYPE_MODE (scalar_type);
  int nbytes = GET_MODE_SIZE (inner_mode);
  int nunits;
1629
  tree vectype;
1630

1631
  if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
1632 1633 1634 1635 1636 1637
    return NULL_TREE;

  /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
     is expected.  */
  nunits = UNITS_PER_SIMD_WORD / nbytes;

1638
  vectype = build_vector_type (scalar_type, nunits);
1639
  if (vect_print_dump_info (REPORT_DETAILS))
1640
    {
1641 1642
      fprintf (vect_dump, "get vectype with %d units of type ", nunits);
      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1643 1644 1645
    }

  if (!vectype)
1646
    return NULL_TREE;
1647

1648
  if (vect_print_dump_info (REPORT_DETAILS))
1649
    {
1650 1651
      fprintf (vect_dump, "vectype: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
1652 1653
    }

1654 1655
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1656
    {
1657
      if (vect_print_dump_info (REPORT_DETAILS))
1658
        fprintf (vect_dump, "mode not supported by target.");
1659 1660 1661
      return NULL_TREE;
    }

1662
  return vectype;
1663 1664 1665
}


1666
/* Function vect_supportable_dr_alignment
1667

1668 1669
   Return whether the data reference DR is supported with respect to its
   alignment.  */
1670

1671 1672
enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr)
1673
{
1674 1675 1676
  tree stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1677
  enum machine_mode mode = (int) TYPE_MODE (vectype);
1678 1679 1680
  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
  bool invariant_in_outerloop = false;
1681

1682 1683
  if (aligned_access_p (dr))
    return dr_aligned;
1684

1685 1686 1687 1688 1689 1690 1691
  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
    }

1692
  /* Possibly unaligned access.  */
1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD). There are two variants to the explicit
     realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
1713
     step happens to be a multiple of VS, then the misalignment remains fixed
1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

1755 1756
  if (DR_IS_READ (dr))
    {
1757 1758
      if (optab_handler (vec_realign_load_optab, mode)->insn_code != 
						   	     CODE_FOR_nothing
1759 1760
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
1761 1762 1763 1764 1765 1766 1767
	{
	    if (nested_in_vect_loop
		&& TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
	      return dr_explicit_realign;
	    else
	      return dr_explicit_realign_optimized;
	}
1768

1769 1770
      if (optab_handler (movmisalign_optab, mode)->insn_code != 
							     CODE_FOR_nothing)
1771 1772 1773
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
1774

1775 1776 1777
  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
1778 1779


1780
/* Function vect_is_simple_use.
1781

1782 1783 1784 1785
   Input:
   LOOP - the loop that is being vectorized.
   OPERAND - operand of a stmt in LOOP.
   DEF - the defining stmt in case OPERAND is an SSA_NAME.
1786

1787 1788 1789 1790 1791
   Returns whether a stmt with OPERAND can be vectorized.
   Supportable operands are constants, loop invariants, and operands that are
   defined by the current iteration of the loop. Unsupportable operands are 
   those that are defined by a previous iteration of the loop (as is the case
   in reduction/induction computations).  */
1792

1793
bool
1794 1795
vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
		    tree *def, enum vect_def_type *dt)
1796 1797
{ 
  basic_block bb;
1798
  stmt_vec_info stmt_vinfo;
1799
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1800

1801 1802 1803
  *def_stmt = NULL_TREE;
  *def = NULL_TREE;
  
1804
  if (vect_print_dump_info (REPORT_DETAILS))
1805 1806 1807 1808 1809
    {
      fprintf (vect_dump, "vect_is_simple_use: operand ");
      print_generic_expr (vect_dump, operand, TDF_SLIM);
    }
    
1810
  if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1811 1812 1813 1814
    {
      *dt = vect_constant_def;
      return true;
    }
1815 1816 1817 1818 1819 1820
  if (is_gimple_min_invariant (operand))
   {
      *def = operand;
      *dt = vect_invariant_def;
      return true;
   }
1821
    
1822
  if (TREE_CODE (operand) != SSA_NAME)
1823
    {
1824
      if (vect_print_dump_info (REPORT_DETAILS))
1825 1826 1827 1828 1829 1830
        fprintf (vect_dump, "not ssa-name.");
      return false;
    }
    
  *def_stmt = SSA_NAME_DEF_STMT (operand);
  if (*def_stmt == NULL_TREE )
1831
    {
1832
      if (vect_print_dump_info (REPORT_DETAILS))
1833 1834
        fprintf (vect_dump, "no def_stmt.");
      return false;
1835 1836
    }

1837
  if (vect_print_dump_info (REPORT_DETAILS))
1838 1839 1840 1841 1842
    {
      fprintf (vect_dump, "def_stmt: ");
      print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
    }

1843
  /* empty stmt is expected only in case of a function argument.
1844
     (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT).  */
1845
  if (IS_EMPTY_STMT (*def_stmt))
1846
    {
1847
      tree arg = TREE_OPERAND (*def_stmt, 0);
1848
      if (is_gimple_min_invariant (arg))
1849 1850 1851 1852 1853 1854
        {
          *def = operand;
          *dt = vect_invariant_def;
          return true;
        }

1855
      if (vect_print_dump_info (REPORT_DETAILS))
1856 1857
        fprintf (vect_dump, "Unexpected empty stmt.");
      return false;
1858
    }
1859

1860 1861 1862 1863 1864 1865 1866 1867 1868 1869
  bb = bb_for_stmt (*def_stmt);
  if (!flow_bb_inside_loop_p (loop, bb))
    *dt = vect_invariant_def;
  else
    {
      stmt_vinfo = vinfo_for_stmt (*def_stmt);
      *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
    }

  if (*dt == vect_unknown_def_type)
1870
    {
1871
      if (vect_print_dump_info (REPORT_DETAILS))
1872 1873
        fprintf (vect_dump, "Unsupported pattern.");
      return false;
1874
    }
1875

1876
  if (vect_print_dump_info (REPORT_DETAILS))
1877 1878 1879 1880 1881 1882 1883 1884
    fprintf (vect_dump, "type of def: %d.",*dt);

  switch (TREE_CODE (*def_stmt))
    {
    case PHI_NODE:
      *def = PHI_RESULT (*def_stmt);
      break;

1885 1886
    case GIMPLE_MODIFY_STMT:
      *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
1887 1888 1889
      break;

    default:
1890
      if (vect_print_dump_info (REPORT_DETAILS))
1891 1892
        fprintf (vect_dump, "unsupported defining stmt: ");
      return false;
1893
    }
1894

1895 1896 1897 1898
  return true;
}


Dorit Nuzman committed
1899 1900 1901 1902 1903 1904
/* Function supportable_widening_operation

   Check whether an operation represented by the code CODE is a 
   widening operation that is supported by the target platform in 
   vector form (i.e., when operating on arguments of type VECTYPE).
    
1905 1906 1907 1908
   Widening operations we currently support are NOP (CONVERT), FLOAT
   and WIDEN_MULT.  This function checks if these operations are supported
   by the target platform either directly (via vector tree-codes), or via
   target builtins.
Dorit Nuzman committed
1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922

   Output:
   - CODE1 and CODE2 are codes of vector operations to be used when 
   vectorizing the operation, if available. 
   - DECL1 and DECL2 are decls of target builtin functions to be used
   when vectorizing the operation, if available. In this case,
   CODE1 and CODE2 are CALL_EXPR.  */

bool
supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
                                tree *decl1, tree *decl2,
                                enum tree_code *code1, enum tree_code *code2)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1923 1924
  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
Dorit Nuzman committed
1925 1926 1927 1928
  bool ordered_p;
  enum machine_mode vec_mode;
  enum insn_code icode1, icode2;
  optab optab1, optab2;
1929
  tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
Dorit Nuzman committed
1930 1931 1932 1933
  tree type = TREE_TYPE (expr);
  tree wide_vectype = get_vectype_for_scalar_type (type);
  enum tree_code c1, c2;

1934
  /* The result of a vectorized widening operation usually requires two vectors
Dorit Nuzman committed
1935 1936 1937 1938 1939 1940 1941
     (because the widened results do not fit int one vector). The generated 
     vector results would normally be expected to be generated in the same 
     order as in the original scalar computation. i.e. if 8 results are 
     generated in each vector iteration, they are to be organized as follows:
        vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8]. 

     However, in the special case that the result of the widening operation is 
1942
     used in a reduction computation only, the order doesn't matter (because
Dorit Nuzman committed
1943
     when vectorizing a reduction we change the order of the computation). 
1944
     Some targets can take advantage of this and generate more efficient code.
Dorit Nuzman committed
1945 1946
     For example, targets like Altivec, that support widen_mult using a sequence
     of {mult_even,mult_odd} generate the following vectors:
1947 1948 1949 1950 1951 1952
        vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].

     When vectorizaing outer-loops, we execute the inner-loop sequentially
     (each vectorized inner-loop iteration contributes to VF outer-loop 
     iterations in parallel). We therefore don't allow to change the order 
     of the computation in the inner-loop during outer-loop vectorization.  */
Dorit Nuzman committed
1953

1954 1955
   if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
       && !nested_in_vect_loop_p (vect_loop, stmt))
Dorit Nuzman committed
1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991
     ordered_p = false;
   else
     ordered_p = true;

  if (!ordered_p
      && code == WIDEN_MULT_EXPR
      && targetm.vectorize.builtin_mul_widen_even
      && targetm.vectorize.builtin_mul_widen_even (vectype)
      && targetm.vectorize.builtin_mul_widen_odd
      && targetm.vectorize.builtin_mul_widen_odd (vectype))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unordered widening operation detected.");

      *code1 = *code2 = CALL_EXPR;
      *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
      *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
      return true;
    }

  switch (code)
    {
    case WIDEN_MULT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_WIDEN_MULT_HI_EXPR;
          c2 = VEC_WIDEN_MULT_LO_EXPR;
        }
      else
        {
          c2 = VEC_WIDEN_MULT_HI_EXPR;
          c1 = VEC_WIDEN_MULT_LO_EXPR;
        }
      break;

    case NOP_EXPR:
1992
    case CONVERT_EXPR:
Dorit Nuzman committed
1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_HI_EXPR;
          c2 = VEC_UNPACK_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_HI_EXPR;
          c1 = VEC_UNPACK_LO_EXPR;
        }
      break;

2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
    case FLOAT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_FLOAT_HI_EXPR;
          c2 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_FLOAT_HI_EXPR;
          c1 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      break;

2018 2019 2020 2021 2022 2023
    case FIX_TRUNC_EXPR:
      /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
	 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
	 computing the operation.  */
      return false;

Dorit Nuzman committed
2024 2025 2026 2027
    default:
      gcc_unreachable ();
    }

2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038
  if (code == FIX_TRUNC_EXPR)
    {
      /* The signedness is determined from output operand.  */
      optab1 = optab_for_tree_code (c1, type);
      optab2 = optab_for_tree_code (c2, type);
    }
  else
    {
      optab1 = optab_for_tree_code (c1, vectype);
      optab2 = optab_for_tree_code (c2, vectype);
    }
Dorit Nuzman committed
2039 2040 2041 2042 2043

  if (!optab1 || !optab2)
    return false;

  vec_mode = TYPE_MODE (vectype);
2044
  if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
Dorit Nuzman committed
2045
      || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2046
      || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
Dorit Nuzman committed
2047 2048 2049 2050
                                                        == CODE_FOR_nothing
      || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
    return false;

2051 2052
  *code1 = c1;
  *code2 = c2;
Dorit Nuzman committed
2053 2054 2055 2056
  return true;
}


2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072
/* Function supportable_narrowing_operation

   Check whether an operation represented by the code CODE is a 
   narrowing operation that is supported by the target platform in 
   vector form (i.e., when operating on arguments of type VECTYPE).
    
   Narrowing operations we currently support are NOP (CONVERT) and
   FIX_TRUNC. This function checks if these operations are supported by
   the target platform directly via vector tree-codes.

   Output:
   - CODE1 is the code of a vector operation to be used when 
   vectorizing the operation, if available.  */

bool
supportable_narrowing_operation (enum tree_code code,
2073
				 const_tree stmt, const_tree vectype,
2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094
				 enum tree_code *code1)
{
  enum machine_mode vec_mode;
  enum insn_code icode1;
  optab optab1;
  tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
  tree type = TREE_TYPE (expr);
  tree narrow_vectype = get_vectype_for_scalar_type (type);
  enum tree_code c1;

  switch (code)
    {
    case NOP_EXPR:
    case CONVERT_EXPR:
      c1 = VEC_PACK_TRUNC_EXPR;
      break;

    case FIX_TRUNC_EXPR:
      c1 = VEC_PACK_FIX_TRUNC_EXPR;
      break;

2095 2096 2097 2098 2099
    case FLOAT_EXPR:
      /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
	 tree code and optabs used for computing the operation.  */
      return false;

2100 2101 2102 2103
    default:
      gcc_unreachable ();
    }

2104 2105 2106 2107 2108
  if (code == FIX_TRUNC_EXPR)
    /* The signedness is determined from output operand.  */
    optab1 = optab_for_tree_code (c1, type);
  else
    optab1 = optab_for_tree_code (c1, vectype);
2109 2110 2111 2112 2113

  if (!optab1)
    return false;

  vec_mode = TYPE_MODE (vectype);
2114
  if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2115 2116 2117
      || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
    return false;

2118
  *code1 = c1;
2119 2120 2121 2122
  return true;
}


2123 2124 2125 2126 2127 2128
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operations.

   Output:
2129
   REDUC_CODE - the corresponding tree-code to be used to reduce the
2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158
      vector of partial results into a single scalar result (which
      will also reside in a vector).

   Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */

bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
  {
  case MAX_EXPR:
    *reduc_code = REDUC_MAX_EXPR;
    return true;

  case MIN_EXPR:
    *reduc_code = REDUC_MIN_EXPR;
    return true;

  case PLUS_EXPR:
    *reduc_code = REDUC_PLUS_EXPR;
    return true;

  default:
    return false;
  }
}


2159 2160 2161
/* Function vect_is_simple_reduction

   Detect a cross-iteration def-use cucle that represents a simple
2162
   reduction computation. We look for the following pattern:
2163 2164 2165 2166 2167 2168 2169

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)
  
   such that:
2170
   1. operation is commutative and associative and it is safe to 
2171
      change the order of the computation.
2172 2173 2174 2175 2176
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  */
2177 2178

tree
2179
vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
2180
{
2181 2182
  struct loop *loop = (bb_for_stmt (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2183 2184 2185 2186 2187 2188 2189
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  tree def_stmt, def1, def2;
  enum tree_code code;
  int op_type;
  tree operation, op1, op2;
  tree type;
2190 2191 2192 2193
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
2194

2195 2196
  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));

2197 2198 2199
  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2200
    {
2201 2202 2203 2204 2205 2206
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
2207
        {
2208 2209 2210
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL_TREE;
2211
        }
2212 2213 2214 2215 2216 2217 2218 2219 2220
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not ssa_name: ");
	  print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
	}
2221 2222
      return NULL_TREE;
    }
2223

2224 2225 2226
  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
2227
      if (vect_print_dump_info (REPORT_DETAILS))
2228
	fprintf (vect_dump, "reduction: no def_stmt.");
2229 2230 2231
      return NULL_TREE;
    }

2232
  if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
2233
    {
2234
      if (vect_print_dump_info (REPORT_DETAILS))
2235
        print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
2236 2237 2238
      return NULL_TREE;
    }

2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255
  name = GIMPLE_STMT_OPERAND (def_stmt, 0);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

2256
  operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
2257 2258 2259
  code = TREE_CODE (operation);
  if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
2260
      if (vect_print_dump_info (REPORT_DETAILS))
2261 2262 2263 2264 2265 2266 2267
        {
          fprintf (vect_dump, "reduction: not commutative/associative: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

2268
  op_type = TREE_OPERAND_LENGTH (operation);
2269 2270
  if (op_type != binary_op)
    {
2271
      if (vect_print_dump_info (REPORT_DETAILS))
2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282
        {
          fprintf (vect_dump, "reduction: not binary operation: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

  op1 = TREE_OPERAND (operation, 0);
  op2 = TREE_OPERAND (operation, 1);
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    {
2283
      if (vect_print_dump_info (REPORT_DETAILS))
2284 2285 2286 2287 2288 2289 2290
        {
          fprintf (vect_dump, "reduction: uses not ssa_names: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

2291
  /* Check that it's ok to change the order of the computation.  */
2292
  type = TREE_TYPE (operation);
2293 2294
  if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
      || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2295
    {
2296
      if (vect_print_dump_info (REPORT_DETAILS))
2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
        }
      return NULL_TREE;
    }

2308 2309 2310 2311 2312 2313 2314
  /* Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when 
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop durint 
     outer-loop vectorization is safe.  */

2315
  /* CHECKME: check for !flag_finite_math_only too?  */
2316
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2317
      && !nested_in_vect_loop_p (vect_loop, def_stmt)) 
2318
    {
2319
      /* Changing the order of operations changes the semantics.  */
2320
      if (vect_print_dump_info (REPORT_DETAILS))
2321 2322 2323 2324 2325 2326
        {
          fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }
2327 2328
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
	   && !nested_in_vect_loop_p (vect_loop, def_stmt))
2329
    {
2330
      /* Changing the order of operations changes the semantics.  */
2331
      if (vect_print_dump_info (REPORT_DETAILS))
2332 2333 2334
        {
          fprintf (vect_dump, "reduction: unsafe int math optimization: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
2335 2336 2337 2338 2339 2340 2341 2342 2343 2344
        }
      return NULL_TREE;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
2345 2346 2347 2348 2349 2350 2351 2352 2353 2354
        }
      return NULL_TREE;
    }

  /* reduction is safe. we're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization.
   */
  def1 = SSA_NAME_DEF_STMT (op1);
  def2 = SSA_NAME_DEF_STMT (op2);
2355
  if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
2356
    {
2357
      if (vect_print_dump_info (REPORT_DETAILS))
2358 2359 2360 2361 2362 2363 2364
        {
          fprintf (vect_dump, "reduction: no defs for operands: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

2365 2366

  /* Check that one def is the reduction def, defined by PHI,
2367 2368
     the other def is either defined in the loop ("vect_loop_def"),
     or it's an induction (defined by a loop-header phi-node).  */
2369 2370

  if (def2 == phi
2371
      && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
2372
      && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT 
2373 2374 2375 2376
	  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
	  || (TREE_CODE (def1) == PHI_NODE 
	      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
	      && !is_loop_header_bb_p (bb_for_stmt (def1)))))
2377
    {
2378
      if (vect_print_dump_info (REPORT_DETAILS))
2379 2380 2381 2382 2383 2384
        {
          fprintf (vect_dump, "detected reduction:");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return def_stmt;
    }
2385 2386 2387
  else if (def1 == phi
	   && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
	   && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT 
2388 2389 2390 2391
	       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
	       || (TREE_CODE (def2) == PHI_NODE
		   && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
		   && !is_loop_header_bb_p (bb_for_stmt (def2)))))
2392 2393 2394 2395
    {
      /* Swap operands (just for simplicity - so that the rest of the code
	 can assume that the reduction variable is always the last (second)
	 argument).  */
2396
      if (vect_print_dump_info (REPORT_DETAILS))
2397 2398 2399 2400
        {
          fprintf (vect_dump, "detected reduction: need to swap operands:");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
2401 2402
      swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0), 
				    &TREE_OPERAND (operation, 1));
2403 2404 2405 2406
      return def_stmt;
    }
  else
    {
2407
      if (vect_print_dump_info (REPORT_DETAILS))
2408 2409 2410 2411 2412 2413
        {
          fprintf (vect_dump, "reduction: unknown pattern.");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }
2414 2415 2416
}


2417
/* Function vect_is_simple_iv_evolution.
2418

2419 2420
   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution with constant step.  */
2421

2422 2423 2424
bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, 
			     tree * step)
2425
{
2426 2427 2428
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2429

2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440
  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */  
  if (evolution_part == NULL_TREE)
    return false;
  
  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;
  
  step_expr = evolution_part;
2441
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2442

2443
  if (vect_print_dump_info (REPORT_DETAILS))
2444
    {
2445 2446 2447 2448
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2449 2450
    }

2451 2452
  *init = init_expr;
  *step = step_expr;
2453

2454
  if (TREE_CODE (step_expr) != INTEGER_CST)
2455
    { 
2456
      if (vect_print_dump_info (REPORT_DETAILS))
2457
        fprintf (vect_dump, "step unknown.");
2458 2459 2460
      return false;
    }

2461 2462 2463 2464
  return true;
}


2465 2466 2467 2468
/* Function vectorize_loops.
   
   Entry Point to loop vectorization phase.  */

2469
unsigned
2470
vectorize_loops (void)
2471
{
2472
  unsigned int i;
2473
  unsigned int num_vectorized_loops = 0;
2474 2475 2476
  unsigned int vect_loops_num;
  loop_iterator li;
  struct loop *loop;
2477

2478 2479 2480 2481 2482 2483
  vect_loops_num = number_of_loops ();

  /* Bail out if there are no loops.  */
  if (vect_loops_num <= 1)
    return 0;

2484 2485 2486
  /* Fix the verbosity level if not defined explicitly by the user.  */
  vect_set_dump_settings ();

2487 2488
  /* Allocate the bitmap that records which virtual variables that 
     need to be renamed.  */
Diego Novillo committed
2489
  vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2490

2491 2492 2493 2494 2495
  /*  ----------- Analyze loops. -----------  */

  /* If some loop was duplicated, it gets bigger number 
     than all previously defined loops. This fact allows us to run 
     only over initial loops skipping newly generated ones.  */
2496
  FOR_EACH_LOOP (li, loop, 0)
2497 2498 2499
    {
      loop_vec_info loop_vinfo;

2500
      vect_loop_location = find_loop_location (loop);
2501 2502 2503 2504 2505 2506
      loop_vinfo = vect_analyze_loop (loop);
      loop->aux = loop_vinfo;

      if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
	continue;

2507
      vect_transform_loop (loop_vinfo);
2508 2509
      num_vectorized_loops++;
    }
2510
  vect_loop_location = UNKNOWN_LOC;
2511

2512 2513 2514
  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
      || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
	  && num_vectorized_loops > 0))
2515
    fprintf (vect_dump, "vectorized %u loops in function.\n",
2516 2517 2518 2519
	     num_vectorized_loops);

  /*  ----------- Finalize. -----------  */

Diego Novillo committed
2520
  BITMAP_FREE (vect_memsyms_to_rename);
2521

2522
  for (i = 1; i < vect_loops_num; i++)
2523
    {
2524 2525
      loop_vec_info loop_vinfo;

2526
      loop = get_loop (i);
2527
      if (!loop)
2528 2529
	continue;
      loop_vinfo = loop->aux;
2530
      destroy_loop_vec_info (loop_vinfo, true);
2531 2532
      loop->aux = NULL;
    }
2533 2534

  return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2535
}
2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579

/* Increase alignment of global arrays to improve vectorization potential.
   TODO:
   - Consider also structs that have an array field.
   - Use ipa analysis to prune arrays that can't be vectorized?
     This should involve global alignment analysis and in the future also
     array padding.  */

static unsigned int
increase_alignment (void)
{
  struct varpool_node *vnode;

  /* Increase the alignment of all global arrays for vectorization.  */
  for (vnode = varpool_nodes_queue;
       vnode;
       vnode = vnode->next_needed)
    {
      tree vectype, decl = vnode->decl;
      unsigned int alignment;

      if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
	continue;
      vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
      if (!vectype)
	continue;
      alignment = TYPE_ALIGN (vectype);
      if (DECL_ALIGN (decl) >= alignment)
	continue;

      if (vect_can_force_dr_alignment_p (decl, alignment))
	{ 
	  DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
	  DECL_USER_ALIGN (decl) = 1;
	  if (dump_file)
	    { 
	      fprintf (dump_file, "Increasing alignment of decl: ");
	      print_generic_expr (dump_file, decl, TDF_SLIM);
	    }
	}
    }
  return 0;
}

2580
static bool
2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601
gate_increase_alignment (void)
{
  return flag_section_anchors && flag_tree_vectorize;
}

struct tree_opt_pass pass_ipa_increase_alignment = 
{
  "increase_alignment",			/* name */
  gate_increase_alignment,		/* gate */
  increase_alignment,			/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  0,					/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  0, 					/* todo_flags_finish */
  0					/* letter */
};