/* Subroutines used for code generation on IA-32.
Copyright (C) 1988-2014 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "stringpool.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "tm_p.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "function.h"
#include "recog.h"
#include "expr.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "toplev.h"
#include "basic-block.h"
#include "ggc.h"
#include "target.h"
#include "target-def.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "cgraph.h"
#include "pointer-set.h"
#include "hash-table.h"
#include "vec.h"
#include "basic-block.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "tree-eh.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "cfgloop.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
#include "debug.h"
#include "sched-int.h"
#include "sbitmap.h"
#include "fibheap.h"
#include "opts.h"
#include "diagnostic.h"
#include "dumpfile.h"
#include "tree-pass.h"
#include "wide-int.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
static rtx legitimize_pe_coff_symbol (rtx, bool);
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
/* Return the index of the given mode in the mult and division cost tables.  */
#define MODE_INDEX(mode) \
((mode) == QImode ? 0 \
: (mode) == HImode ? 1 \
: (mode) == SImode ? 2 \
: (mode) == DImode ? 3 \
: 4)
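/* Usage sketch (an editor's illustration, not a quote of later code): the
   per-mode cost arrays in struct processor_costs are indexed with this
   macro, e.g.

     total = cost->mult_init[MODE_INDEX (mode)] + nbits * cost->mult_bit;

   Modes wider than DImode fall into the trailing "other" slot (index 4).  */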
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
#define COSTS_N_BYTES(N) ((N) * 2)
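/* Worked example: rtl.h defines COSTS_N_INSNS (N) as (N) * 4, so an add
   costs COSTS_N_INSNS (1) == 4 units; with a 2-byte add, COSTS_N_BYTES (2)
   == 4 as well, which keeps the size-tuning costs below on the same scale
   as the speed-tuning COSTS_N_INSNS values.  */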
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
static stringop_algs ix86_size_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
static stringop_algs ix86_size_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
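/* How to read the stringop tables (a sketch of the encoding; see struct
   stringop_algs in i386.h for the exact fields): each descriptor is roughly
   {alg_for_unknown_size, {{max, alg, noalign}, ...}}, where the {max, alg}
   entries are tried in order and the first one whose max is -1 or at least
   the known block size supplies the algorithm.  Element [0] of each
   two-element array is the 32-bit strategy and element [1] the 64-bit one;
   DUMMY_STRINGOP_ALGS (defined above) fills the 64-bit slot for processors
   where 64-bit tuning does not apply.  */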
const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of an add instruction */
COSTS_N_BYTES (3), /* cost of a lea instruction */
COSTS_N_BYTES (2), /* variable shift costs */
COSTS_N_BYTES (3), /* constant shift costs */
{COSTS_N_BYTES (3), /* cost of starting multiply for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
COSTS_N_BYTES (3), /* cost of movsx */
COSTS_N_BYTES (3), /* cost of movzx */
0, /* "large" insn */
2, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 2}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{2, 2, 2}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
3, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{3, 3}, /* cost of storing MMX registers
in SImode and DImode */
3, /* cost of moving SSE register */
{3, 3, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{3, 3, 3}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
COSTS_N_BYTES (2), /* cost of FMUL instruction. */
COSTS_N_BYTES (2), /* cost of FDIV instruction. */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
ix86_size_memcpy,
ix86_size_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
1, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
1, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Processor costs (relative to an add) */
static stringop_algs i386_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i386_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (6), /* cost of starting multiply for QI */
COSTS_N_INSNS (6), /* HI */
COSTS_N_INSNS (6), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
COSTS_N_INSNS (1), /* cost of multiply per each bit set */
{COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (23), /* SI */
COSTS_N_INSNS (23), /* DI */
COSTS_N_INSNS (23)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (27), /* cost of FMUL instruction. */
COSTS_N_INSNS (88), /* cost of FDIV instruction. */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
i386_memcpy,
i386_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs i486_memcpy[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i486_memset[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (12), /* cost of starting multiply for QI */
COSTS_N_INSNS (12), /* HI */
COSTS_N_INSNS (12), /* SI */
COSTS_N_INSNS (12), /* DI */
COSTS_N_INSNS (12)}, /* other */
1, /* cost of multiply per each bit set */
{COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
COSTS_N_INSNS (40), /* HI */
COSTS_N_INSNS (40), /* SI */
COSTS_N_INSNS (40), /* DI */
COSTS_N_INSNS (40)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
4, /* size of l1 cache.  The 486 has an 8kB
cache shared between code and data, so
4kB is not really precise. */
4, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (16), /* cost of FMUL instruction. */
COSTS_N_INSNS (73), /* cost of FDIV instruction. */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
i486_memcpy,
i486_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium_memset[2] = {
{libcall, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (11), /* cost of starting multiply for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (11), /* SI */
COSTS_N_INSNS (11), /* DI */
COSTS_N_INSNS (11)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
COSTS_N_INSNS (25), /* HI */
COSTS_N_INSNS (25), /* SI */
COSTS_N_INSNS (25), /* DI */
COSTS_N_INSNS (25)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
8, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (39), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
pentium_memcpy,
pentium_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* The PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
   (we ensure the alignment).  For small blocks an inline loop is still a
   noticeable win; for bigger blocks either rep movsl or rep movsb is the way
   to go.  Rep movsb apparently has a more expensive startup time in the CPU,
   but after 4K the difference is down in the noise.  An illustrative
   selection sketch follows the tables below. */
static stringop_algs pentiumpro_memcpy[2] = {
{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentiumpro_memset[2] = {
{rep_prefix_4_byte, {{1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
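/* Illustrative sketch only (an editor's simplification of the selection
   logic in decide_alg later in this file; kept under #if 0 so it is never
   compiled): how a compile-time-known byte count maps onto one of the
   descriptors above.  */
#if 0
static enum stringop_alg
example_pick_alg (const struct stringop_algs *algs, HOST_WIDE_INT nbytes)
{
  unsigned int i;
  /* The first entry whose max is unbounded (-1) or covers NBYTES wins.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || nbytes <= algs->size[i].max)
      return algs->size[i].alg;
  /* Fall back to a library call if nothing matched.  */
  return libcall;
}
#endif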
static const
struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (4)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
COSTS_N_INSNS (17), /* HI */
COSTS_N_INSNS (17), /* SI */
COSTS_N_INSNS (17), /* DI */
COSTS_N_INSNS (17)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{2, 2, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
pentiumpro_memcpy,
pentiumpro_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs geode_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs geode_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (2), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (7), /* SI */
COSTS_N_INSNS (7), /* DI */
COSTS_N_INSNS (7)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (39), /* SI */
COSTS_N_INSNS (39), /* DI */
COSTS_N_INSNS (39)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
1, /* cost for loading QImode using movzbl */
{1, 1, 1}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{1, 1, 1}, /* cost of storing integer registers */
1, /* cost of reg,reg fld/fst */
{1, 1, 1}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 6, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
1, /* cost of moving MMX register */
{1, 1}, /* cost of loading MMX registers
in SImode and DImode */
{1, 1}, /* cost of storing MMX registers
in SImode and DImode */
1, /* cost of moving SSE register */
{1, 1, 1}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{1, 1, 1}, /* cost of storing SSE registers
in SImode, DImode and TImode */
1, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (11), /* cost of FMUL instruction. */
COSTS_N_INSNS (47), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
geode_memcpy,
geode_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs k6_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs k6_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs k6_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (18), /* HI */
COSTS_N_INSNS (18), /* SI */
COSTS_N_INSNS (18), /* DI */
COSTS_N_INSNS (18)}, /* other */
COSTS_N_INSNS (2), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{2, 2, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
6, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
32, /* size of l2 cache.  Some models
have an integrated l2 cache, but
optimizing for the k6 is not important
enough to worry about that. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (2), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
k6_memcpy,
k6_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* For some reason, the Athlon deals better with the REP prefix (relative to
   loops) than the K8 does.  Alignment becomes important after 8 bytes for
   memcpy and after 128 bytes for memset. */
static stringop_algs athlon_memcpy[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs athlon_memset[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs athlon_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (5), /* cost of starting multiply for QI */
COSTS_N_INSNS (5), /* HI */
COSTS_N_INSNS (5), /* SI */
COSTS_N_INSNS (5), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 6}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
5, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (24), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
athlon_memcpy,
athlon_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* The K8 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  A worked
   example of reading the tables follows below. */
static stringop_algs k8_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs k8_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
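/* Worked example of reading these tables: a 64-bit memcpy of a known 4096
   bytes consults k8_memcpy[1]; 4096 exceeds the {16, loop} bucket but fits
   {8192, rep_prefix_8_byte}, so a rep movsq-style copy is chosen, while
   anything past 8192 bytes goes to the libcall and its non-temporal
   stores.  */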
static const
struct processor_costs k8_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 3, 6}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
k8_memcpy,
k8_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
5, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
3, /* vec_unalign_load_cost. */
3, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs amdfam10_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs amdfam10_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
amdfam10_memcpy,
amdfam10_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs bdver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
bdver1_memcpy,
bdver1_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs bdver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
bdver2_memcpy,
bdver2_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs bdver3_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver3_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs bdver3_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
bdver3_memcpy,
bdver3_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs bdver4_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver4_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs bdver4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
   immediately, they are queued.  We set the number of simultaneous prefetches
   to a large constant to reflect this (it is probably not a good idea not
   to limit the number of prefetches at all, as their execution also takes
   some time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
bdver4_memcpy,
bdver4_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably. */
static stringop_algs btver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
btver1_memcpy,
btver1_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs btver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
btver2_memcpy,
btver2_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium4_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium4_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (3), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (4), /* constant shift costs */
{COSTS_N_INSNS (15), /* cost of starting multiply for QI */
COSTS_N_INSNS (15), /* HI */
COSTS_N_INSNS (15), /* SI */
COSTS_N_INSNS (15), /* DI */
COSTS_N_INSNS (15)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
COSTS_N_INSNS (56), /* HI */
COSTS_N_INSNS (56), /* SI */
COSTS_N_INSNS (56), /* DI */
COSTS_N_INSNS (56)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
12, /* cost of moving SSE register */
{12, 12, 12}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
10, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (7), /* cost of FMUL instruction. */
COSTS_N_INSNS (43), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
pentium4_memcpy,
pentium4_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs nocona_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}};
static stringop_algs nocona_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {64, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs nocona_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (10), /* cost of starting multiply for QI */
COSTS_N_INSNS (10), /* HI */
COSTS_N_INSNS (10), /* SI */
COSTS_N_INSNS (10), /* DI */
COSTS_N_INSNS (10)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
COSTS_N_INSNS (66), /* HI */
COSTS_N_INSNS (66), /* SI */
COSTS_N_INSNS (66), /* DI */
COSTS_N_INSNS (66)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
3, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
6, /* cost of moving MMX register */
{12, 12}, /* cost of loading MMX registers
in SImode and DImode */
{12, 12}, /* cost of storing MMX registers
in SImode and DImode */
6, /* cost of moving SSE register */
{12, 12, 12}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{12, 12, 12}, /* cost of storing SSE registers
in SImode, DImode and TImode */
8, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
1024, /* size of l2 cache. */
64, /* size of prefetch block */
8, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (40), /* cost of FDIV instruction. */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
nocona_memcpy,
nocona_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs atom_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs atom_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs atom_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
atom_memcpy,
atom_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs slm_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs slm_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs slm_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
slm_memcpy,
slm_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs intel_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs intel_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs intel_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
intel_memcpy,
intel_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Generic should produce code tuned for Core-i7 (and newer chips)
and btver1 (and newer chips). */
static stringop_algs generic_memcpy[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs generic_memset[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static const
struct processor_costs generic_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
/* On all chips taken into consideration, lea takes 2 cycles or more.
With this higher cost, however, our current implementation of synth_mult
ends up using unnecessary temporary registers, causing regressions on
several SPECfp benchmarks. */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
/* Benchmarks show large regressions on the K8 sixtrack benchmark when this
value is increased to the perhaps more appropriate value of 5. */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
generic_memcpy,
generic_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* core_cost should produce code tuned for the Core family of CPUs. */
static stringop_algs core_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
{libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
static stringop_algs core_memset[2] = {
{libcall, {{6, loop_1_byte, true},
{24, loop, true},
{8192, rep_prefix_4_byte, true},
{-1, libcall, false}}},
{libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
static const
struct processor_costs core_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
/* On all chips taken into consideration, lea takes 2 cycles or more.
With this higher cost, however, our current implementation of synth_mult
ends up using unnecessary temporary registers, causing regressions on
several SPECfp benchmarks. */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
/* FIXME: perhaps the more appropriate value is 5. */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
core_memcpy,
core_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Set by -mtune. */
const struct processor_costs *ix86_tune_cost = &pentium_cost;
/* Set by -mtune or -Os. */
const struct processor_costs *ix86_cost = &pentium_cost;
/* Processor feature/optimization bitmasks. */
#define m_386 (1<<PROCESSOR_I386)
#define m_486 (1<<PROCESSOR_I486)
#define m_PENT (1<<PROCESSOR_PENTIUM)
#define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)
#define m_NEHALEM (1<<PROCESSOR_NEHALEM)
#define m_SANDYBRIDGE (1<<PROCESSOR_SANDYBRIDGE)
#define m_HASWELL (1<<PROCESSOR_HASWELL)
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
#define m_BONNELL (1<<PROCESSOR_BONNELL)
#define m_SILVERMONT (1<<PROCESSOR_SILVERMONT)
#define m_INTEL (1<<PROCESSOR_INTEL)
#define m_GEODE (1<<PROCESSOR_GEODE)
#define m_K6 (1<<PROCESSOR_K6)
#define m_K6_GEODE (m_K6 | m_GEODE)
#define m_K8 (1<<PROCESSOR_K8)
#define m_ATHLON (1<<PROCESSOR_ATHLON)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
#define m_BDVER1 (1<<PROCESSOR_BDVER1)
#define m_BDVER2 (1<<PROCESSOR_BDVER2)
#define m_BDVER3 (1<<PROCESSOR_BDVER3)
#define m_BDVER4 (1<<PROCESSOR_BDVER4)
#define m_BTVER1 (1<<PROCESSOR_BTVER1)
#define m_BTVER2 (1<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
#define m_GENERIC (1<<PROCESSOR_GENERIC)
const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
#undef DEF_TUNE
#define DEF_TUNE(tune, name, selector) name,
#include "x86-tune.def"
#undef DEF_TUNE
};
/* Feature tests against the various tunings. */
unsigned char ix86_tune_features[X86_TUNE_LAST];
/* Feature tests against the various tunings used to create ix86_tune_features
based on the processor mask. */
static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
#undef DEF_TUNE
#define DEF_TUNE(tune, name, selector) selector,
#include "x86-tune.def"
#undef DEF_TUNE
};
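/* For orientation: each selector above is a mask of m_* bits, and the
   runtime table is filled by testing it against the bit for the active
   tuning.  A sketch of the expected initialization (the actual loop
   lives in the option-override code later in this file):

     unsigned int ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   ix86_arch_features below is derived from initial_ix86_arch_features
   and 1u << ix86_arch in the same way.  */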
/* Feature tests against the various architecture variations. */
unsigned char ix86_arch_features[X86_ARCH_LAST];
/* Feature tests against the various architecture variations, used to create
ix86_arch_features based on the processor mask. */
static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
/* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
~(m_386 | m_486 | m_PENT | m_K6),
/* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
~m_386,
/* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
~(m_386 | m_486),
/* X86_ARCH_XADD: Exchange and add was added for 80486. */
~m_386,
/* X86_ARCH_BSWAP: Byteswap was added for 80486. */
~m_386,
};
/* In case the average insn count for a single function invocation is
lower than this constant, emit fast (but longer) prologue and
epilogue code. */
#define FAST_PROLOGUE_INSN_COUNT 20
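/* Illustrative use of the constant (the real consumer is the frame
   layout code later in this file; treat this as a sketch, not the
   exact condition):

     count = frame->nregs ? (frame->nregs - 1) * FAST_PROLOGUE_INSN_COUNT : 0;
     m->use_fast_prologue_epilogue = !expensive_function_p (count);  */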
/* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
/* Array of the smallest class containing reg number REGNO, indexed by
REGNO. Used by REGNO_REG_CLASS in i386.h. */
enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
{
/* ax, dx, cx, bx */
AREG, DREG, CREG, BREG,
/* si, di, bp, sp */
SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
/* FP registers */
FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
/* arg pointer */
NON_Q_REGS,
/* flags, fpsr, fpcr, frame */
NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
/* SSE registers */
SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
SSE_REGS, SSE_REGS,
/* MMX registers */
MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
MMX_REGS, MMX_REGS,
/* REX registers */
NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
/* SSE REX registers */
SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
SSE_REGS, SSE_REGS,
/* AVX-512 SSE registers */
EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
/* Mask registers. */
MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
};
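/* Usage sketch: REGNO_REG_CLASS in i386.h simply indexes this table,
   so, illustratively:

     REGNO_REG_CLASS (AX_REG)  == AREG
     REGNO_REG_CLASS (SP_REG)  == NON_Q_REGS

   (AX_REG and SP_REG being the hard register numbers from i386.h).  */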
/* The "default" register map used in 32bit mode. */
int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
-1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
-1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
-1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
-1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
-1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
};
/* The "default" register map used in 64bit mode. */
int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
{
0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
-1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
};
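/* These two maps are what DBX_REGISTER_NUMBER in i386.h is expected to
   index, roughly:

     #define DBX_REGISTER_NUMBER(n) \
       (TARGET_64BIT ? dbx64_register_map[n] : dbx_register_map[n])

   So, for example, the first SSE register maps to DWARF register 17
   in 64-bit code but 21 in 32-bit code.  */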
/* Define the register numbers to be used in Dwarf debugging information.
The SVR4 reference port C compiler uses the following register numbers
in its Dwarf output code:
0 for %eax (gcc regno = 0)
1 for %ecx (gcc regno = 2)
2 for %edx (gcc regno = 1)
3 for %ebx (gcc regno = 3)
4 for %esp (gcc regno = 7)
5 for %ebp (gcc regno = 6)
6 for %esi (gcc regno = 4)
7 for %edi (gcc regno = 5)
The following three DWARF register numbers are never generated by
the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
believes these numbers have these meanings.
8 for %eip (no gcc equivalent)
9 for %eflags (gcc regno = 17)
10 for %trapno (no gcc equivalent)
It is not at all clear how we should number the FP stack registers
for the x86 architecture. If the version of SDB on x86/svr4 were
a bit less brain dead with respect to floating-point then we would
have a precedent to follow with respect to DWARF register numbers
for x86 FP registers, but the SDB on x86/svr4 is so completely
broken with respect to FP registers that it is hardly worth thinking
of it as something to strive for compatibility with.
The version of x86/svr4 SDB I have at the moment does (partially)
seem to believe that DWARF register number 11 is associated with
the x86 register %st(0), but that's about all. Higher DWARF
register numbers don't seem to be associated with anything in
particular, and even for DWARF regno 11, SDB only seems to
understand that it should say that a variable lives in %st(0) (when
asked via an `=' command) if we said it was in DWARF regno 11,
but SDB still prints garbage when asked for the value of the
variable in question (via a `/' command).
(Also note that the labels SDB prints for various FP stack regs
when doing an `x' command are all wrong.)
Note that these problems generally don't affect the native SVR4
C compiler because it doesn't allow the use of -O with -g and
because when it is *not* optimizing, it allocates a memory
location for each floating-point variable, and the memory
location is what gets described in the DWARF AT_location
attribute for the variable in question.
Regardless of the severe mental illness of the x86/svr4 SDB, we
do something sensible here and we use the following DWARF
register numbers. Note that these are all stack-top-relative
numbers.
11 for %st(0) (gcc regno = 8)
12 for %st(1) (gcc regno = 9)
13 for %st(2) (gcc regno = 10)
14 for %st(3) (gcc regno = 11)
15 for %st(4) (gcc regno = 12)
16 for %st(5) (gcc regno = 13)
17 for %st(6) (gcc regno = 14)
18 for %st(7) (gcc regno = 15)
*/
int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
-1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
-1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
-1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
-1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
-1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
};
/* Define parameter passing and return registers. */
static int const x86_64_int_parameter_registers[6] =
{
DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
};
static int const x86_64_ms_abi_int_parameter_registers[4] =
{
CX_REG, DX_REG, R8_REG, R9_REG
};
static int const x86_64_int_return_registers[4] =
{
AX_REG, DX_REG, DI_REG, SI_REG
};
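/* To make the two conventions concrete (a sketch; the tables above and
   the psABI documents are authoritative): for

     long f (long a, long b, long c, long d);

   the SysV ABI passes a, b, c, d in %rdi, %rsi, %rdx, %rcx, while the
   MS ABI passes them in %rcx, %rdx, %r8, %r9; integer results return
   in %rax, with %rdx holding the second half of a TImode value.  */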
/* Additional registers clobbered by SysV calls: these are call-saved
under the MS ABI but call-clobbered under SysV, so an MS-ABI caller
must assume a SysV callee clobbers them. */
int const x86_64_ms_sysv_extra_clobbered_registers[12] =
{
SI_REG, DI_REG,
XMM6_REG, XMM7_REG,
XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
/* Define the structure for the machine field in struct function. */
struct GTY(()) stack_local_entry {
unsigned short mode;
unsigned short n;
rtx rtl;
struct stack_local_entry *next;
};
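/* The list hangs off cfun->machine and is searched by
   assign_386_stack_local (defined later in this file); conceptually,
   and paraphrasing rather than quoting that function:

     for (s = ix86_stack_locals; s; s = s->next)
       if (s->mode == mode && s->n == n)
	 return validize_mem (copy_rtx (s->rtl));

   with a new slot allocated and chained on when no entry matches.  */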
/* Structure describing stack frame layout.
Stack grows downward:
[arguments]
<- ARG_POINTER
saved pc
saved static chain if ix86_static_chain_on_stack
saved frame pointer if frame_pointer_needed
<- HARD_FRAME_POINTER
[saved regs]
<- reg_save_offset
[padding0]
[saved SSE regs]
<- sse_regs_save_offset
[padding1] |
| <- FRAME_POINTER
[va_arg registers] |
|
[frame] |
|
[padding2] | = to_allocate
<- STACK_POINTER
*/
struct ix86_frame
{
int nsseregs;
int nregs;
int va_arg_size;
int red_zone_size;
int outgoing_arguments_size;
/* The offsets relative to ARG_POINTER. */
HOST_WIDE_INT frame_pointer_offset;
HOST_WIDE_INT hard_frame_pointer_offset;
HOST_WIDE_INT stack_pointer_offset;
HOST_WIDE_INT hfp_save_offset;
HOST_WIDE_INT reg_save_offset;
HOST_WIDE_INT sse_reg_save_offset;
/* When save_regs_using_mov is set, emit prologue using
move instead of push instructions. */
bool save_regs_using_mov;
};
/* Which CPU we are scheduling for. */
enum attr_cpu ix86_schedule;
/* Which CPU we are optimizing for. */
enum processor_type ix86_tune;
/* Which instruction set architecture to use. */
enum processor_type ix86_arch;
/* True if processor has SSE prefetch instruction. */
unsigned char x86_prefetch_sse;
/* -mstackrealign option */
static const char ix86_force_align_arg_pointer_string[]
= "force_align_arg_pointer";
static rtx (*ix86_gen_leave) (void);
static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
/* Preferred alignment for stack boundary in bits. */
unsigned int ix86_preferred_stack_boundary;
/* Alignment for incoming stack boundary in bits, as specified on the
command line. */
static unsigned int ix86_user_incoming_stack_boundary;
/* Default alignment for incoming stack boundary in bits. */
static unsigned int ix86_default_incoming_stack_boundary;
/* Alignment for incoming stack boundary in bits. */
unsigned int ix86_incoming_stack_boundary;
/* Calling-ABI-specific va_list type nodes. */
static GTY(()) tree sysv_va_list_type_node;
static GTY(()) tree ms_va_list_type_node;
/* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
char internal_label_prefix[16];
int internal_label_prefix_len;
/* Fence to use after a loop using movnt. */
tree x86_mfence;
/* Register class used for passing a given 64-bit part of the argument.
These represent the classes documented by the psABI, with the exception
of the SSESF and SSEDF classes, which are basically the SSE class; gcc
just uses SFmode or DFmode moves instead of DImode ones to avoid
reformatting penalties. Similarly we play games with INTEGERSI_CLASS to
use cheaper SImode moves whenever possible (the upper half is only
padding). */
enum x86_64_reg_class
{
X86_64_NO_CLASS,
X86_64_INTEGER_CLASS,
X86_64_INTEGERSI_CLASS,
X86_64_SSE_CLASS,
X86_64_SSESF_CLASS,
X86_64_SSEDF_CLASS,
X86_64_SSEUP_CLASS,
X86_64_X87_CLASS,
X86_64_X87UP_CLASS,
X86_64_COMPLEX_X87_CLASS,
X86_64_MEMORY_CLASS
};
#define MAX_CLASSES 8
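/* As a worked psABI example (illustrative only):

     struct s { double d; long l; };   16 bytes, two eightbytes

   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, so it is
   passed in one SSE register and one integer register, while a type
   needing more than MAX_CLASSES eightbytes (or containing misaligned
   fields) gets X86_64_MEMORY_CLASS and is passed on the stack.  */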
/* Table of constants used by fldpi, fldln2, etc. */
static REAL_VALUE_TYPE ext_80387_constants_table [5];
static bool ext_80387_constants_init = false;
static struct machine_function * ix86_init_machine_status (void);
static rtx ix86_function_value (const_tree, const_tree, bool);
static bool ix86_function_value_regno_p (const unsigned int);
static unsigned int ix86_function_arg_boundary (enum machine_mode,
const_tree);
static rtx ix86_static_chain (const_tree, bool);
static int ix86_function_regparm (const_tree, const_tree);
static void ix86_compute_frame_layout (struct ix86_frame *);
static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
rtx, rtx, int);
static void ix86_add_new_builtins (HOST_WIDE_INT);
static tree ix86_canonical_va_list_type (tree);
static void predict_jump (int);
static unsigned int split_stack_prologue_scratch_regno (void);
static bool i386_asm_output_addr_const_extra (FILE *, rtx);
enum ix86_function_specific_strings
{
IX86_FUNCTION_SPECIFIC_ARCH,
IX86_FUNCTION_SPECIFIC_TUNE,
IX86_FUNCTION_SPECIFIC_MAX
};
static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
const char *, enum fpmath_unit, bool);
static void ix86_function_specific_save (struct cl_target_option *,
struct gcc_options *opts);
static void ix86_function_specific_restore (struct gcc_options *opts,
struct cl_target_option *);
static void ix86_function_specific_print (FILE *, int,
struct cl_target_option *);
static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
static bool ix86_valid_target_attribute_inner_p (tree, char *[],
struct gcc_options *,
struct gcc_options *,
struct gcc_options *);
static bool ix86_can_inline_p (tree, tree);
static void ix86_set_current_function (tree);
static unsigned int ix86_minimum_incoming_stack_boundary (bool);
static enum calling_abi ix86_function_abi (const_tree);
#ifndef SUBTARGET32_DEFAULT_CPU
#define SUBTARGET32_DEFAULT_CPU "i386"
#endif
/* Whether -mtune= or -march= were specified. */
static int ix86_tune_defaulted;
static int ix86_arch_specified;
/* Vectorization library interface and handlers. */
static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
/* Processor target table, indexed by processor number. */
struct ptt
{
const char *const name; /* processor name */
const struct processor_costs *cost; /* Processor costs */
const int align_loop; /* Default alignments. */
const int align_loop_max_skip;
const int align_jump;
const int align_jump_max_skip;
const int align_func;
};
/* This table must be in sync with enum processor_type in i386.h. */
static const struct ptt processor_target_table[PROCESSOR_max] =
{
{"generic", &generic_cost, 16, 10, 16, 10, 16},
{"i386", &i386_cost, 4, 3, 4, 3, 4},
{"i486", &i486_cost, 16, 15, 16, 15, 16},
{"pentium", &pentium_cost, 16, 7, 16, 7, 16},
{"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
{"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
{"nocona", &nocona_cost, 0, 0, 0, 0, 0},
{"core2", &core_cost, 16, 10, 16, 10, 16},
{"nehalem", &core_cost, 16, 10, 16, 10, 16},
{"sandybridge", &core_cost, 16, 10, 16, 10, 16},
{"haswell", &core_cost, 16, 10, 16, 10, 16},
{"bonnell", &atom_cost, 16, 15, 16, 7, 16},
{"silvermont", &slm_cost, 16, 15, 16, 7, 16},
{"intel", &intel_cost, 16, 15, 16, 7, 16},
{"geode", &geode_cost, 0, 0, 0, 0, 0},
{"k6", &k6_cost, 32, 7, 32, 7, 32},
{"athlon", &athlon_cost, 16, 7, 16, 7, 16},
{"k8", &k8_cost, 16, 7, 16, 7, 16},
{"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
{"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
{"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
{"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
{"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
{"btver1", &btver1_cost, 16, 10, 16, 7, 11},
{"btver2", &btver2_cost, 16, 10, 16, 7, 11}
};
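/* A sketch of how this table is consumed once -march=/-mtune= are
   resolved (the real code is in the option-override path later in
   this file):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       align_loops = processor_target_table[ix86_tune].align_loop;

   i.e. both the cost pointer and the -falign-* defaults come from the
   row selected by the tuning.  */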
static unsigned int
rest_of_handle_insert_vzeroupper (void)
{
int i;
/* vzeroupper instructions are inserted immediately after reload to
account for possible spills from 256-bit registers. The pass
reuses the mode-switching infrastructure by re-running the mode
insertion pass, so disable entities that have already been processed. */
for (i = 0; i < MAX_386_ENTITIES; i++)
ix86_optimize_mode_switching[i] = 0;
ix86_optimize_mode_switching[AVX_U128] = 1;
/* Call optimize_mode_switching. */
g->get_passes ()->execute_pass_mode_switching ();
return 0;
}
namespace {
const pass_data pass_data_insert_vzeroupper =
{
RTL_PASS, /* type */
"vzeroupper", /* name */
OPTGROUP_NONE, /* optinfo_flags */
true, /* has_execute */
TV_NONE, /* tv_id */
0, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_df_finish, /* todo_flags_finish */
};
class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
pass_insert_vzeroupper(gcc::context *ctxt)
: rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
{}
/* opt_pass methods: */
virtual bool gate (function *)
{
return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
}
virtual unsigned int execute (function *)
{
return rest_of_handle_insert_vzeroupper ();
}
}; // class pass_insert_vzeroupper
} // anon namespace
rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
return new pass_insert_vzeroupper (ctxt);
}
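/* For reference, a sketch of how this constructor is expected to be
   used during option override (see the pass registration later in
   this file):

     opt_pass *p = make_pass_insert_vzeroupper (g);
     struct register_pass_info insert_vzeroupper_info
       = { p, "reload", 1, PASS_POS_INSERT_AFTER };
     register_pass (&insert_vzeroupper_info);

   placing the pass right after reload, as the comment in
   rest_of_handle_insert_vzeroupper above requires.  */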
/* Return true if a red-zone is in use. */
static inline bool
ix86_using_red_zone (void)
{
return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
}
/* Return a string that documents the current -m options. The caller is
responsible for freeing the string. */
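/* An illustrative (not verbatim) result for a 64-bit SSE4 compile:

     "-m64 -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mmmx
      -mfxsr -march=corei7 -mtune=corei7 -mfpmath=sse"  */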
static char *
ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
const char *tune, enum fpmath_unit fpmath,
bool add_nl_p)
{
struct ix86_target_opts
{
const char *option; /* option string */
HOST_WIDE_INT mask; /* isa mask options */
};
/* This table is ordered so that options like -msse4.2 that imply
preceding options are matched first. */
static struct ix86_target_opts isa_opts[] =
{
{ "-mfma4", OPTION_MASK_ISA_FMA4 },
{ "-mfma", OPTION_MASK_ISA_FMA },
{ "-mxop", OPTION_MASK_ISA_XOP },
{ "-mlwp", OPTION_MASK_ISA_LWP },
{ "-mavx512f", OPTION_MASK_ISA_AVX512F },
{ "-mavx512er", OPTION_MASK_ISA_AVX512ER },
{ "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
{ "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
{ "-msse4a", OPTION_MASK_ISA_SSE4A },
{ "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
{ "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
{ "-mssse3", OPTION_MASK_ISA_SSSE3 },
{ "-msse3", OPTION_MASK_ISA_SSE3 },
{ "-msse2", OPTION_MASK_ISA_SSE2 },
{ "-msse", OPTION_MASK_ISA_SSE },
{ "-m3dnow", OPTION_MASK_ISA_3DNOW },
{ "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
{ "-mmmx", OPTION_MASK_ISA_MMX },
{ "-mabm", OPTION_MASK_ISA_ABM },
{ "-mbmi", OPTION_MASK_ISA_BMI },
{ "-mbmi2", OPTION_MASK_ISA_BMI2 },
{ "-mlzcnt", OPTION_MASK_ISA_LZCNT },
{ "-mhle", OPTION_MASK_ISA_HLE },
{ "-mfxsr", OPTION_MASK_ISA_FXSR },
{ "-mrdseed", OPTION_MASK_ISA_RDSEED },
{ "-mprfchw", OPTION_MASK_ISA_PRFCHW },
{ "-madx", OPTION_MASK_ISA_ADX },
{ "-mtbm", OPTION_MASK_ISA_TBM },
{ "-mpopcnt", OPTION_MASK_ISA_POPCNT },
{ "-mmovbe", OPTION_MASK_ISA_MOVBE },
{ "-mcrc32", OPTION_MASK_ISA_CRC32 },
{ "-maes", OPTION_MASK_ISA_AES },
{ "-msha", OPTION_MASK_ISA_SHA },
{ "-mpclmul", OPTION_MASK_ISA_PCLMUL },
{ "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
{ "-mrdrnd", OPTION_MASK_ISA_RDRND },
{ "-mf16c", OPTION_MASK_ISA_F16C },
{ "-mrtm", OPTION_MASK_ISA_RTM },
{ "-mxsave", OPTION_MASK_ISA_XSAVE },
{ "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
{ "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
{ "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
{ "-mxsavec", OPTION_MASK_ISA_XSAVEC },
{ "-mxsaves", OPTION_MASK_ISA_XSAVES },
};
/* Flag options. */
static struct ix86_target_opts flag_opts[] =
{
{ "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
{ "-mlong-double-128", MASK_LONG_DOUBLE_128 },
{ "-mlong-double-64", MASK_LONG_DOUBLE_64 },
{ "-m80387", MASK_80387 },
{ "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
{ "-malign-double", MASK_ALIGN_DOUBLE },
{ "-mcld", MASK_CLD },
{ "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
{ "-mieee-fp", MASK_IEEE_FP },
{ "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
{ "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
{ "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
{ "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
{ "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
{ "-mno-push-args", MASK_NO_PUSH_ARGS },
{ "-mno-red-zone", MASK_NO_RED_ZONE },
{ "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
{ "-mrecip", MASK_RECIP },
{ "-mrtd", MASK_RTD },
{ "-msseregparm", MASK_SSEREGPARM },
{ "-mstack-arg-probe", MASK_STACK_PROBE },
{ "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
{ "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
{ "-m8bit-idiv", MASK_USE_8BIT_IDIV },
{ "-mvzeroupper", MASK_VZEROUPPER },
{ "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
{ "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
{ "-mprefer-avx128", MASK_PREFER_AVX128},
};
const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
char isa_other[40];
char target_other[40];
unsigned num = 0;
unsigned i, j;
char *ret;
char *ptr;
size_t len;
size_t line_len;
size_t sep_len;
const char *abi;
memset (opts, '\0', sizeof (opts));
/* Add -march= option. */
if (arch)
{
opts[num][0] = "-march=";
opts[num++][1] = arch;
}
/* Add -mtune= option. */
if (tune)
{
opts[num][0] = "-mtune=";
opts[num++][1] = tune;
}
/* Add -m32/-m64/-mx32. */
if ((isa & OPTION_MASK_ISA_64BIT) != 0)
{
if ((isa & OPTION_MASK_ABI_64) != 0)
abi = "-m64";
else
abi = "-mx32";
isa &= ~ (OPTION_MASK_ISA_64BIT
| OPTION_MASK_ABI_64
| OPTION_MASK_ABI_X32);
}
else
abi = "-m32";
opts[num++][0] = abi;
/* Pick out the options in isa options. */
for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
{
if ((isa & isa_opts[i].mask) != 0)
{
opts[num++][0] = isa_opts[i].option;
isa &= ~ isa_opts[i].mask;
}
}
if (isa && add_nl_p)
{
opts[num++][0] = isa_other;
sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
isa);
}
/* Add flag options. */
for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
{
if ((flags & flag_opts[i].mask) != 0)
{
opts[num++][0] = flag_opts[i].option;
flags &= ~ flag_opts[i].mask;
}
}
if (flags && add_nl_p)
{
opts[num++][0] = target_other;
sprintf (target_other, "(other flags: %#x)", flags);
}
/* Add -fpmath= option. */
if (fpmath)
{
opts[num][0] = "-mfpmath=";
switch ((int) fpmath)
{
case FPMATH_387:
opts[num++][1] = "387";
break;
case FPMATH_SSE:
opts[num++][1] = "sse";
break;
case FPMATH_387 | FPMATH_SSE:
opts[num++][1] = "sse+387";
break;
default:
gcc_unreachable ();
}
}
/* Any options? */
if (num == 0)
return NULL;
gcc_assert (num < ARRAY_SIZE (opts));
/* Size the string. */
len = 0;
sep_len = (add_nl_p) ? 3 : 1;
for (i = 0; i < num; i++)
{
len += sep_len;
for (j = 0; j < 2; j++)
if (opts[i][j])
len += strlen (opts[i][j]);
}
/* Build the string. */
ret = ptr = (char *) xmalloc (len);
line_len = 0;
for (i = 0; i < num; i++)
{
size_t len2[2];
for (j = 0; j < 2; j++)
len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
if (i != 0)
{
*ptr++ = ' ';
line_len++;
if (add_nl_p && line_len + len2[0] + len2[1] > 70)
{
*ptr++ = '\\';
*ptr++ = '\n';
line_len = 0;
}
}
for (j = 0; j < 2; j++)
if (opts[i][j])