gcc/config/sparc/niagara7.md


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205

;; Scheduling description for Niagara-7
;;   Copyright (C) 2016-2019 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Automaton that tracks the n7_slot0 and n7_slot1 issue-slot units.
(define_automaton "niagara7_0")

;; The S4 core has a dual-issue queue.  This queue is divided into two
;; slots.  One instruction can be issued each cycle to each slot, and
;; up to 2 instructions are committed each cycle.  Each slot serves
;; several execution units, as depicted below:
;;
;;
;;                 n7_slot0 - Integer unit.
;;                          - Load/Store unit.
;; === QUEUE ==>
;;
;;                 n7_slot1 - Integer unit.
;;                          - Branch unit.
;;                          - Floating-point and graphics unit.
;;                          - 3-cycle crypto unit.

;; One CPU unit per issue slot, both tracked by the niagara7_0 automaton.
(define_cpu_unit "n7_slot0" "niagara7_0")
(define_cpu_unit "n7_slot1" "niagara7_0")

;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the
;; same for multi-instruction insns.
;;
;; This is modelled by reserving both issue slots in the same cycle.

(define_reservation "n7_single_issue" "n7_slot0 + n7_slot1")

;; Insns of type multi, savew, flushw and trap occupy both issue slots
;; for one cycle, so nothing else is issued alongside them.
(define_insn_reservation "n7_single" 1
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "multi,savew,flushw,trap"))
  "n7_single_issue")

;; Single-cycle integer instructions.  An integer unit is available
;; behind each slot, so these may issue in either one.

(define_insn_reservation "n7_integer" 1
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "n7_slot0 | n7_slot1")

;; Flushing the instruction memory takes 27 cycles: the flush issues
;; in either slot, and the remaining 26 cycles reserve no unit.

(define_insn_reservation "n7_iflush" 27
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "iflush"))
  "(n7_slot0 | n7_slot1), nothing*26")

;; Integer multiplication has a latency of 12 cycles and executes in
;; the integer unit, so it may issue in either slot.
;;
;; The array*, edge* and pdistn instructions are modelled the same way.

(define_insn_reservation "n7_imul" 12
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(n7_slot0 | n7_slot1), nothing*11")

;; Integer division has a latency of 35 cycles and executes in the
;; integer unit, so it may issue in either slot.

(define_insn_reservation "n7_idiv" 35
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "idiv"))
  "(n7_slot0 | n7_slot1), nothing*34")

;; Loads, integer and floating-point alike, have a latency of 5 cycles
;; and issue in slot0, which serves the load/store unit.
;;
;; Prefetches share the "load" type but have a latency of only 1
;; cycle; only the "regular" subtype is matched here.

(define_insn_reservation "n7_load" 5
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpload")
            (eq_attr "type" "sload")
            (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))))
  "n7_slot0, nothing*4")

;; Prefetches are "load" insns with subtype "prefetch": one cycle in
;; slot0.
(define_insn_reservation "n7_prefetch" 1
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "n7_slot0")

;; Stores, integer and floating-point alike, have a latency of 1 cycle
;; and execute in the load/store unit, which is served by slot0.

(define_insn_reservation "n7_store" 1
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "store,fpstore"))
  "n7_slot0")

;; Control-transfer instructions execute in the Branch Unit, which is
;; served by slot1.  The type list is split in two only to keep the
;; lines short; together the groups name the same eight types.

(define_insn_reservation "n7_cti" 1
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "cbcond,uncond_cbcond,branch,uncond_branch")
            (eq_attr "type" "call,sibcall,call_no_delay_slot,return")))
  "n7_slot1")

;; Many instructions that execute in the Floating-point and Graphics
;; unit, served by slot1, have a latency of 11 cycles.

(define_insn_reservation "n7_fp" 11
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul")
            (eq_attr "type" "fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu,maxmin"))))
  "n7_slot1, nothing*10")

;; Floating-point division and square-root instructions have high
;; latencies.  They execute in the floating-point and graphics unit,
;; which is served by slot1.

(define_insn_reservation "n7_fpdivs" 24
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpdivs")
            (eq_attr "type" "fpsqrts")))
  "n7_slot1, nothing*23")

;; Insns of type fpdivd and fpsqrtd: 37 cycles, issued in slot1.
(define_insn_reservation "n7_fpdivd" 37
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpdivd")
            (eq_attr "type" "fpsqrtd")))
  "n7_slot1, nothing*36")

;; SIMD VIS instructions executing in the Floating-point and graphics
;; unit (FPG) in slot1 usually have a latency of either 11 or 12
;; cycles.
;;
;; However, the latency for many instructions is only 3 cycles if the
;; consumer can also be executed in 3 cycles.  We model this with a
;; bypass.  In these cases the instructions are executed in the
;; 3-cycle crypto unit which also serves slot1.

;; VIS insns with an 11-cycle latency, issued in slot1.  The vismv and
;; visl types share the same subtype condition, so they are tested
;; together.
(define_insn_reservation "n7_vis_11cycles" 11
  (and (eq_attr "cpu" "niagara7")
       (ior (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64,other"))
            (and (eq_attr "type" "vismv,visl")
                 (eq_attr "subtype" "double,single"))))
  "n7_slot1, nothing*10")

;; VIS insns with a 12-cycle latency, issued in slot1.
(define_insn_reservation "n7_vis_12cycles" 12
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "bmask,viscmp")
            (ior (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "cmask"))
                 (and (eq_attr "type" "vismv")
                      (eq_attr "subtype" "movstouw")))))
  "n7_slot1, nothing*11")

;; Latency drops to 3 cycles when both producer and consumer are
;; reservations whose names match the glob n7_vis_* (n7_vis_11cycles
;; and n7_vis_12cycles).
(define_bypass 3 "n7_vis_*" "n7_vis_*")

;; Some other VIS instructions have a latency of 12 cycles, and won't
;; be executed in the 3-cycle crypto pipe.  The reservation name
;; deliberately does not match n7_vis_*, so the 3-cycle bypass does
;; not apply to it.

(define_insn_reservation "n7_lzd" 12
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "lzd")
            (and (eq_attr "type" "gsr")
                 (eq_attr "subtype" "alignaddr"))))
  "n7_slot1, nothing*11")

;; A couple of VIS instructions feature very low latencies in the M7.
;; The vismv subtype movxtod has a latency of a single cycle.

(define_insn_reservation "n7_single_vis" 1
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movxtod")))
  "n7_slot1")

;; The vismv subtype movdtox has a latency of two cycles.
(define_insn_reservation "n7_double_vis" 2
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movdtox")))
  "n7_slot1, nothing")

;; Reading and writing to the gsr register takes a high number of
;; cycles that is not documented in the PRM.  Let's use the same value
;; as the M8.
;;
;; As in the other reservations in this file, the issue cycle plus the
;; idle cycles add up to the declared latency: 1 + 69 = 70.

(define_insn_reservation "n7_gsr_reg" 70
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "gsr")
       (eq_attr "subtype" "reg"))
  "n7_slot1, nothing*69")