1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
;; Copyright (C) 2004-2014 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>. */
;;
;; Pipeline description for the VR4130 family.
;;
;; The processor issues each 8-byte aligned pair of instructions together,
;; stalling the second instruction if it depends on the first. Thus, if we
;; want two instructions to issue in parallel, we need to make sure that the
;; first one is 8-byte aligned.
;;
;; For the purposes of this pipeline description, we treat the processor
;; like a standard two-way superscalar architecture. If scheduling were
;; the last pass to run, we could use the scheduler hooks to vary the
;; issue rate depending on whether an instruction is at an aligned or
;; unaligned address. Unfortunately, delayed branch scheduling and
;; hazard avoidance are done after the final scheduling pass, and they
;; can change the addresses of many instructions.
;;
;; We get around this in two ways:
;;
;; (1) By running an extra pass at the end of compilation. This pass goes
;; through the function looking for pairs of instructions that could
;; execute in parallel. It makes sure that the first instruction in
;; each pair is suitably aligned, inserting nops if necessary. Doing
;; this gives the same kind of pipeline behavior we would see on a
;; normal superscalar target.
;;
;; This pass is generally a speed improvement, but the extra nops will
;; obviously make the program bigger. It is therefore unsuitable for
;; -Os (at the very least).
;;
;; (2) By modifying the scheduler hooks so that, where possible:
;;
;; (a) dependent instructions are separated by a non-dependent
;; instruction;
;;
;; (b) instructions that use the multiplication unit are separated
;; by non-multiplication instructions; and
;;
;; (c) memory access instructions are separated by non-memory
;; instructions.
;;
;; The idea is to keep conflicting instructions apart wherever possible
;; and thus make the schedule less dependent on alignment.
;; The three automata used by this description.
(define_automaton
  "vr4130_main, vr4130_muldiv, vr4130_mulpre")

;; Units that belong to the "vr4130_main" automaton.
(define_cpu_unit
  "vr4130_alu1, vr4130_alu2, vr4130_dcache"
  "vr4130_main")

;; The only unit in the "vr4130_muldiv" automaton.
(define_cpu_unit
  "vr4130_muldiv"
  "vr4130_muldiv")
;; A fake unit, kept in its own automaton, that exists for pre-reload
;; scheduling of multiplications.  It enforces the true post-reload
;; repeat rate.
(define_cpu_unit
  "vr4130_mulpre"
  "vr4130_mulpre")
;; Coarse instruction class that the scheduling hooks use for (b) above.
;; The "mem" class (types load and store) is the grouping that (c) talks
;; about; the "mul" class covers the HI/LO moves and the imul, imul3,
;; imadd and idiv types; every other instruction is "alu".
(define_attr "vr4130_class" "mul,mem,alu"
  (cond [(eq_attr "type" "load,store")
           (const_string "mem")

         (eq_attr "type" "mfhi,mflo,mthi,mtlo,imul,imul3,imadd,idiv")
           (const_string "mul")]
        (const_string "alu")))
;; Instructions of type multi, unknown, atomic and syncloop reserve
;; vr4130_alu1, vr4130_alu2, vr4130_dcache and vr4130_muldiv together
;; in the cycle they issue.  Latency: 1 cycle.
(define_insn_reservation "vr4130_multi" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "multi,unknown,atomic,syncloop"))
  "vr4130_alu1 + vr4130_alu2 + vr4130_dcache + vr4130_muldiv")
;; Simple integer instructions have a latency of 1 and can issue on
;; either vr4130_alu1 or vr4130_alu2.
(define_insn_reservation "vr4130_int" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "arith,const,logical,move,nop,shift,signext,slt"))
  "vr4130_alu1 | vr4130_alu2")
;; Loads use vr4130_dcache and have a latency of 3 cycles.
(define_insn_reservation "vr4130_load" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "load"))
  "vr4130_dcache")
;; Stores use vr4130_dcache and have a latency of 1 cycle.
(define_insn_reservation "vr4130_store" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "store"))
  "vr4130_dcache")
;; mfhi and mflo use vr4130_muldiv; the moved value has a latency of
;; 3 cycles.
(define_insn_reservation "vr4130_mfhilo" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "mfhi,mflo"))
  "vr4130_muldiv")
;; mthi and mtlo use vr4130_muldiv and have a latency of 1 cycle.
(define_insn_reservation "vr4130_mthilo" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "mthi,mtlo"))
  "vr4130_muldiv")
;; SImode multiplication.  The product reaches LO & HI after one cycle,
;; and moving it into an integer register takes three more (see the
;; mfhi/mflo reservation, vr4130_mfhilo), which gives the latency of 4.
;; The same latencies and repeat rates apply if we use "mtlo; macc"
;; instead of "mult; mflo".  The fake vr4130_mulpre unit is held for two
;; cycles alongside vr4130_muldiv.
(define_insn_reservation "vr4130_mulsi" 4
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "vr4130_muldiv + (vr4130_mulpre * 2)")
;; DImode counterpart of vr4130_mulsi; here the product is available in
;; LO and HI after 3 cycles.  vr4130_muldiv is held for three cycles and
;; the fake vr4130_mulpre unit for four.
(define_insn_reservation "vr4130_muldi" 6
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "(vr4130_muldiv * 3) + (vr4130_mulpre * 4)")
;; Back-to-back maccs can issue in consecutive cycles without stalling.
;; The integer destination, however, cannot be read until 3 cycles
;; have passed.
(define_insn_reservation "vr4130_macc" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "imadd"))
  "vr4130_muldiv")
;; A macc that consumes the result of a vr4130_mulsi or vr4130_macc
;; instruction sees a latency of 1 when the guard function
;; mips_linked_madd_p accepts the pair.
(define_bypass 1
  "vr4130_mulsi,vr4130_macc"
  "vr4130_macc"
  "mips_linked_madd_p")

;; mfhi/mflo after a vr4130_mulsi or vr4130_macc instruction: latency 1.
(define_bypass 1
  "vr4130_mulsi,vr4130_macc"
  "vr4130_mfhilo")

;; mfhi/mflo after a vr4130_muldi instruction: latency 3.
(define_bypass 3
  "vr4130_muldi"
  "vr4130_mfhilo")
;; SImode division: latency 36, with vr4130_muldiv held for all
;; 36 cycles.
(define_insn_reservation "vr4130_divsi" 36
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "vr4130_muldiv * 36")
;; DImode division: latency 72, with vr4130_muldiv held for all
;; 72 cycles.
(define_insn_reservation "vr4130_divdi" 72
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "vr4130_muldiv * 72")
;; Branches, jumps and calls have a latency of 0 and can issue on either
;; vr4130_alu1 or vr4130_alu2.
(define_insn_reservation "vr4130_branch" 0
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "branch,jump,call"))
  "vr4130_alu1 | vr4130_alu2")
|