Commit a2ef9558 by Markus Trippelsdorf Committed by Markus Trippelsdorf

Correct imul (r64) latency for modern Intel CPUs

Since Sandy Bridge, the 64-bit multiplication latency has been three cycles,
not four, so update the costs to reflect reality.

	* x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply
	latencies.

	* gcc.target/i386/wmul-3.c: New test.

From-SVN: r255760
parent d7f06bc3
2017-12-17 Markus Trippelsdorf <markus@trippelsdorf.de>
* x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply
latencies.
2017-12-16 Sandra Loosemore <sandra@codesourcery.com> 2017-12-16 Sandra Loosemore <sandra@codesourcery.com>
* doc/invoke.texi: Fix some typos. * doc/invoke.texi: Fix some typos.
...@@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = { ...@@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = {
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */ COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */ COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */ COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (4)}, /* other */ COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */ 0, /* cost of multiply per each bit set */
/* Expanding div/mod currently doesn't consider parallelism. So the cost /* Expanding div/mod currently doesn't consider parallelism. So the cost
model is not realistic. We compensate by increasing the latencies a bit. */ model is not realistic. We compensate by increasing the latencies a bit. */
...@@ -2341,8 +2341,9 @@ struct processor_costs core_cost = { ...@@ -2341,8 +2341,9 @@ struct processor_costs core_cost = {
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */ COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */ COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */ /* Here we tune for Sandybridge or newer. */
COSTS_N_INSNS (4)}, /* other */ COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */ 0, /* cost of multiply per each bit set */
/* Expanding div/mod currently doesn't consider parallelism. So the cost /* Expanding div/mod currently doesn't consider parallelism. So the cost
model is not realistic. We compensate by increasing the latencies a bit. */ model is not realistic. We compensate by increasing the latencies a bit. */
......
2017-12-17 Markus Trippelsdorf <markus@trippelsdorf.de>
* gcc.target/i386/wmul-3.c: New test.
2017-12-16 Martin Sebor <msebor@redhat.com> 2017-12-16 Martin Sebor <msebor@redhat.com>
PR tree-optimization/78918 PR tree-optimization/78918
......
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -march=sandybridge" } */
#include <stdint.h>
#include <string.h>
/* Lookup table of the two-character decimal spellings "00" .. "99", stored
   back to back: entry N occupies b100_tab[2 * N] and b100_tab[2 * N + 1].
   Each literal below is one tens digit (ten entries, twenty characters).
   The array is sized to exactly 200, so no terminating NUL is stored.  */
static const char b100_tab[200] =
  "00010203040506070809"
  "10111213141516171819"
  "20212223242526272829"
  "30313233343536373839"
  "40414243444546474849"
  "50515253545556575859"
  "60616263646566676869"
  "70717273747576777879"
  "80818283848586878889"
  "90919293949596979899";
/* Write VAL to DST as two groups of ten characters followed by a NUL.

   The first group (dst[0..9]) is derived from val / 10^10 and the second
   (dst[10..19]) from val % 10^10.  Each group is produced as five
   two-character copies out of b100_tab, and dst[20] is set to 0, so DST
   must have room for at least 21 bytes.

   NOTE(review): this function is the body of a code-generation test; the
   dg-final directive that follows it counts imulq instructions.  The
   statement order and the explicit multiplications are therefore
   presumably significant and should not be rearranged.  */
void uint64_to_ascii_ta7_32_base100(uint64_t val, char *dst) {
/* 10 * 1000 * 1000 * 1000 == 10^10: splits VAL into a high and a low part.  */
const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
/* floor (2^57 / 100^4) + 1.  Presumably a rounded-up reciprocal of 10^8 in
   fixed point with 57 fractional bits, so that (x * C) >> 57 yields the
   leading two-digit group of x -- TODO confirm the error bound.  */
const uint64_t POW2_57_DIV_POW100_4 =
((int64_t)(1) << 57) / 100 / 100 / 100 / 100 + 1;
/* Selects the low 32 bits of a 64-bit value.  */
const uint64_t MASK32 = ((int64_t)(1) << 32) - 1;
int64_t hix = val / POW10_10;
int64_t lox = val % POW10_10;
/* LOX with bit 0 cleared; that bit is merged back in just before the last
   table lookup for the low group.  */
int64_t lor = lox & (uint64_t)(-2);
uint64_t hi = hix * POW2_57_DIV_POW100_4;
uint64_t lo = lor * POW2_57_DIV_POW100_4;
/* Bits 57 and up select the first table entry of each group; the factor 2
   converts an entry number into a byte offset within b100_tab.  */
memcpy(dst + 0 * 10 + 0, &b100_tab[(hi >> 57) * 2], 2);
memcpy(dst + 1 * 10 + 0, &b100_tab[(lo >> 57) * 2], 2);
/* Drop 25 bits so that 57 - 25 = 32 bits remain below the part already
   consumed.  The + 1 is presumably a rounding adjustment -- TODO confirm.  */
hi = (hi >> 25) + 1;
lo = (lo >> 25) + 1;
/* Each step below keeps only the low 32 bits and multiplies them by 100;
   the next table entry number is then read from bits 32 and up.  */
hi = (hi & MASK32) * 100;
lo = (lo & MASK32) * 100;
memcpy(dst + 0 * 10 + 2, &b100_tab[(hi >> 32) * 2], 2);
hi = (hi & MASK32) * 100;
memcpy(dst + 1 * 10 + 2, &b100_tab[(lo >> 32) * 2], 2);
lo = (lo & MASK32) * 100;
memcpy(dst + 0 * 10 + 4, &b100_tab[(hi >> 32) * 2], 2);
hi = (hi & MASK32) * 100;
memcpy(dst + 1 * 10 + 4, &b100_tab[(lo >> 32) * 2], 2);
lo = (lo & MASK32) * 100;
memcpy(dst + 0 * 10 + 6, &b100_tab[(hi >> 32) * 2], 2);
hi = (hi & MASK32) * 100;
memcpy(dst + 1 * 10 + 6, &b100_tab[(lo >> 32) * 2], 2);
lo = (lo & MASK32) * 100;
/* Reduce both accumulators to the entry number of the final pair.  */
hi >>= 32;
lo >>= 32;
/* Replace bit 0 of LO with bit 0 of LOX, i.e. the bit that was cleared when
   LOR was formed.  */
lo = (lo & (-2)) | (lox & 1);
memcpy(dst + 0 * 10 + 8, &b100_tab[hi * 2], 2);
memcpy(dst + 1 * 10 + 8, &b100_tab[lo * 2], 2);
/* Terminate the 20-character result.  */
dst[2 * 10] = 0;
}
/* { dg-final { scan-assembler-times "imulq" 11 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment