1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
|
/*
* Header file for all AVC INTER prediction kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
#if !defined(__INTER_HEADER__) // Make sure this file is only included once
#define __INTER_HEADER__
// Module name: inter_header.inc
//
// Header file for all AVC INTER prediction kernels.
// Contents: register aliases, GRF byte offsets, region declarations,
// message descriptor constants and the sub-routine call/return macros.
//
// Marker symbol with no value.
// NOTE(review): presumably tested by shared code to detect an INTER kernel build - confirm against the includers.
#define INTER_KERNEL
//-------------------------------------------------------------------------------------------
// TODO: The definitions below will be merged with the definitions above later
//-------------------------------------------------------------------------------------------

//------------ Input parameters & bit masks

// Software workaround (SW WA) for weighted prediction, special case Weight=128 - 2007/09/06
// Commented-out declarations kept for reference:
//.declare guwR1 Base=r1 ElementSize=2 Type=uw
//.declare guwW128 Base=r63.13 ElementSize=2 Type=uw

// SW_W_128 enables the SW WA for the special Weight=128 case.
// It is switched on for every target before DEV_ILK; on DEV_ILK this header leaves it off.
#if !defined(DEV_ILK)
#define SW_W_128
#endif // !defined(DEV_ILK)

// Dword view of r1.0, declared only while the workaround is active.
#ifdef SW_W_128
.declare    gudW128     Base=r1.0   ElementSize=4 Type=ud
#endif // SW_W_128
// Input fields held in r3. The ":ub"/":uw" tag in each comment is the access
// type the alias is meant to be used with.

// Origin
#define gORIX               r3.4        // :ub, X origin
#define gORIY               r3.5        // :ub, Y origin

// Coded block pattern byte and the masks that pick out each plane
#define gCBP                r3.9        // :ub, CBP (0, 0, Y0, Y1, Y2, Y3, Cb, Cr)
#define nCBPY_MASK          0x3c        // Y0..Y3 bits of gCBP
#define nCBPU_MASK          0x2         // Cb bit of gCBP
#define nCBPV_MASK          0x1         // Cr bit of gCBP

// MB type, field and parity flags, each followed by its mask
#define gFIELDFLAGS         r3.1        // :uw - To compute message descriptor for write
#define gMBTYPE             r3.1        // :ub, MB type
#define nMBTYPE_MASK        0x1f
#define gFIELDMBFLAG        r3.1        // :ub, Field MB flag
#define nFIELDMB_MASK       0x40
#define gMBPARITY           r3.3        // :ub, Bottom field flag
#define nMBPARITY_MASK      0x01

// Weighted prediction flag
#define gWPREDFLAG          r3.0        // :ub, Weighted pred flag
#define nWBIDIR_MASK        0xc0

// Sub-MB shape / prediction mode bytes, plus a word-typed declaration at r3.6
#define gSUBMB_SHAPE        r3.12       // :ub, Sub-MB shape
#define gSUBMB_MODE         r3.13       // :ub, Sub-MB prediction mode
.declare    guwSUBMB_SHAPE_MODE     Base=r3.6   ElementSize=2 Type=uw

// Log2 weight denominators
#define gYWDENOM            r3.14       // :ub, Luma log2 weight denom
#define gCWDENOM            r3.15       // :ub, Chroma log2 weight denom

// Register addresses and the byte-typed BIDX declaration
#define gADDR               r3.24       // :ub, Register addresses of error data / MV
.declare    gubBIDX                 Base=r3.16  ElementSize=1 Type=ub
// Weights/offsets: r8, with dword and word typed views
#define gWGT                r8          // Weights/offsets
.declare    gdWGT       Base=r8     ElementSize=4 Type=d
.declare    gwWGT       Base=r8     ElementSize=2 Type=w

// Motion vectors: r4, with word and dword typed views
#define gMV                 r4          // MVs
.declare    gwMV        Base=r4     ElementSize=2 Type=w
.declare    gdMV        Base=r4     ElementSize=4 Type=d

// Luma error data starting at r10, word and byte typed views
.declare    gwERRORY    Base=r10    ElementSize=2 Type=w    // 16 GRFs
.declare    gubERRORY   Base=r10    ElementSize=1 Type=ub

// Chroma error data starting at r26, word and byte typed views.
// The byte view uses ElementSize=1 so that it agrees with its ub type and with
// gubERRORY; it was previously declared with ElementSize=2.
.declare    gwERRORC    Base=r26    ElementSize=2 Type=w    // 8 GRFs
.declare    gubERRORC   Base=r26    ElementSize=1 Type=ub
//------------ Address registers
// Aliases onto sub-registers of a0. Entries tagged ":ud" are dword-typed aliases,
// the remaining ones are used as words (see pREF0 / pREF0D for the pairing).

// Message descriptor
#define pMSGDSC             a0.0        // ud: Must be the leading dword of the address register

// Inputs: reference, BIDX, weights, error data, MVs
#define pREF                a0.0
#define pBIDX               a0.2
#define pWGT                a0.3
#define pERRORYC            a0.2        // :ud
#define pERRORY             a0.4
#define pERRORC             a0.5
#define pMV                 a0.6
#define pWGT_BIDX           a0.1        // :ud, WGT & BIDX
#define pRECON_MV           a0.3        // :ud, RECON & MV

// Reference pointers 0..7 with dword aliases for the even ones
#define pREF0               a0.0        // :uw
#define pREF0D              a0.0        // :ud
#define pREF1               a0.1
#define pREF2               a0.2
#define pREF2D              a0.1        // :ud
#define pREF3               a0.3
#define pREF4               a0.4
#define pREF4D              a0.2        // :ud
#define pREF5               a0.5
#define pREF6               a0.6
#define pREF6D              a0.3        // :ud
#define pREF7               a0.7

// Result pointers
#define pRES                a0.6
#define pRESD               a0.3        // :ud
#define pRESULT             a0.7

// Short generic names
#define p0                  a0.0
#define p1                  a0.1
//------------ Constants for static/inline/indirect
// Single offsets are derived as 32*<GRF number> (+ byte offset inside the GRF).
// The 32-bit constants pack two 16-bit offsets as (high<<16)+low.

#define nOFFSET_BIDX        112         // = 32*3+4*4
#define nOFFSET_WGT         256         // = 32*8
#define nOFFSET_WGT_BIDX    0x01000070  // = (256<<16)+112

#define nOFFSET_ERROR       0x03400140  // = ((320+128*4)<<16)+320
#define nOFFSET_ERRORY      0x0140      // = 320, low half of nOFFSET_ERROR
#define nOFFSET_ERRORC      0x0340      // = 320+128*4, high half of nOFFSET_ERROR

#define nOFFSET_MV          128         // = 32*4
#define nOFFSET_RECON_MV    0x04400080  // = (1088<<16)+128 // TODO: OFFSET_RECON is obsolete
//------------ Constants for kernel internal variables
// Same conventions as the other nOFFSET_* constants: 32*<GRF number> for a single
// offset, (high<<16)|low for a packed pair.

// Interpolation buffers
#define nOFFSET_INTPY0          0x0640      // = 32*50
#define nOFFSET_INTPY1          0x0780      // = 32*60
#define nOFFSET_INTPC0          0x06c0      // = 32*54
#define nOFFSET_INTPC1          0x0480      // = 32*36
#define nOFFSET_INTP0           0x06c00640  // = (nOFFSET_INTPC0<<16)|nOFFSET_INTPY0
#define nOFFSET_INTP1           0x04800780  // = (nOFFSET_INTPC1<<16)|nOFFSET_INTPY1

// Interim buffer (note: INTERIM2 and INTERIM3 have the same value)
#define nOFFSET_INTERIM         0x0480      // = 32*36
#define nOFFSET_INTERIM2        0x04A00480  // = ((32*37)<<16)|(32*36)
#define nOFFSET_INTERIM3        0x04A00480  // = ((32*36+32)<<16)|(32*36)
#define nOFFSET_INTERIM4        0x04A00490  // = ((32*37)<<16)|(32*36+16)

// Interim 4x4 buffer
#define nOFFSET_INTERIM4x4      0x04C0      // = 32*38
#define nOFFSET_INTERIM4x4_4    0x04E004D0  // = ((32*38+32)<<16)|(32*38+16)
#define nOFFSET_INTERIM4x4_5    0x04D004C0  // = ((32*38+16)<<16)|(32*38)
#define nOFFSET_INTERIM4x4_6    0x04E004C0  // = ((32*38+32)<<16)|(32*38)
#define nOFFSET_INTERIM4x4_7    0x04D004C8  // = ((32*38+16)<<16)|(32*38+8)
#define nOFFSET_INTERIM4x4_8    0x04E004D8  // = ((32*38+32)<<16)|(32*38+24)
#define nOFFSET_INTERIM4x4_9    0x04F004E8  // = ((32*38+48)<<16)|(32*38+40)

// Result and reference buffers
#define nOFFSET_RES             0x540       // = 32*42
#define nOFFSET_REF             0x560       // = 32*43
#define nOFFSET_REFC            0x700       // = 32*56
// Binding table index
#define nBDIX_DESTY         0
#define nBDIX_DESTC         1
#define nBI_LC_DIFF         0x10        // Binding table index diff between luma and chroma

// NOTE(review): presumably the GRF width and half-width in bytes - confirm against users
#define nGRFWIB             32
#define nGRFHWIB            16
//------------ Regions

// Reference data at r43 (dword and byte views) and chroma reference at r56
.declare    gudREF          Base=r43    ElementSize=4 SrcRegion=<16;16,1> Type=ud
.declare    gubREF          Base=r43    ElementSize=1 Type=ub
.declare    gudREFC         Base=r56    ElementSize=4 SrcRegion=<16;16,1> Type=ud

// 16x16 handling
.declare    gudREF21x21     Base=r58    ElementSize=4 SrcRegion=<16;16,1> Type=ud
.declare    gudREF18x10     Base=r66    ElementSize=4 SrcRegion=<16;16,1> Type=ud
.declare    gubREF18x10     Base=r66    ElementSize=1 SrcRegion=<16;16,1> Type=ub

.declare    gudREF16x16     Base=r38    ElementSize=4 Type=ud   // 8 GRFs
.declare    gubREF16x16     Base=r38    ElementSize=1 Type=ub
.declare    gudREFC16x8     Base=r46    ElementSize=4 Type=ud   // 4 GRFs
.declare    gubREFC16x8     Base=r46    ElementSize=1 Type=ub

// TODO
.declare    gubAVG          Base=r56    ElementSize=1 Type=ub
.declare    gubREFY_BWD     Base=r64    ElementSize=1 Type=ub
.declare    gubREFC_BWD     Base=r72    ElementSize=1 Type=ub
// Luma interpolation buffer 0 at r50: word, dword and strided-byte views
.declare    guwINTPY0   Base=r50    ElementSize=2 SrcRegion=<16;16,1> Type=uw
.declare    gudINTPY0   Base=r50    ElementSize=4 Type=ud
.declare    gubINTPY0   Base=r50    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Luma interpolation buffer 1 at r60
.declare    guwINTPY1   Base=r60    ElementSize=2 SrcRegion=<16;16,1> Type=uw
.declare    gudINTPY1   Base=r60    ElementSize=4 Type=ud
.declare    gubINTPY1   Base=r60    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Luma prediction views, same base register as INTPY0
.declare    guwYPRED    Base=r50    ElementSize=2 SrcRegion=<8;8,1> Type=uw
.declare    gubYPRED    Base=r50    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Chroma interpolation buffer 0 at r54: unsigned word, signed word, dword and byte views
.declare    guwINTPC0   Base=r54    ElementSize=2 SrcRegion=<16;16,1> Type=uw
.declare    gwINTPC0    Base=r54    ElementSize=2 SrcRegion=<16;16,1> Type=w
.declare    gudINTPC0   Base=r54    ElementSize=4 Type=ud
.declare    gubINTPC0   Base=r54    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Chroma interpolation buffer 1 at r36
.declare    guwINTPC1   Base=r36    ElementSize=2 SrcRegion=<16;16,1> Type=uw
.declare    gudINTPC1   Base=r36    ElementSize=4 Type=ud
.declare    gubINTPC1   Base=r36    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Chroma prediction views, same base register as INTPC0
.declare    guwCPRED    Base=r54    ElementSize=2 SrcRegion=<16;8,2> Type=uw
.declare    gubCPRED    Base=r54    ElementSize=1 SrcRegion=<32;8,4> Type=ub
// Interim buffer at r36 (same register as the INTPC1 declarations)
#define gINTERIM            r36
.declare    gubINTERIM_BUF      Base=r36    ElementSize=1 SrcRegion=<32;16,2> Type=ub

// Interim 4x4 buffer at r38
#define gINTERIM4x4         r38
.declare    gubINTERIM4x4_BUF   Base=r38    ElementSize=1 SrcRegion=<32;16,2> Type=ub
.declare    gwINTERIM4x4_BUF    Base=r38    ElementSize=2 Type=w

// Second interim buffer at r42: byte, signed word and unsigned word views
.declare    gubINTERIM_BUF2     Base=r42    ElementSize=1 SrcRegion=<8;4,2> Type=ub
.declare    gwINTERIM_BUF2      Base=r42    ElementSize=2 SrcRegion=<16;16,1> Type=w
.declare    guwINTERIM_BUF2     Base=r42    ElementSize=2 Type=uw

// Third interim buffer at r38 (same base register as gINTERIM4x4)
.declare    gwINTERIM_BUF3      Base=r38    ElementSize=2 SrcRegion=<16;16,1> Type=w    // 2 GRFs
.declare    gubINTERIM_BUF3     Base=r38    ElementSize=1 Type=ub

// Word-typed temporary at r42 (same base register as the BUF2 views)
.declare    gwTEMP              Base=r42    ElementSize=2 SrcRegion=<16;16,1> Type=w
//------------ General registers

#define gX                  r3.2        // w
#define gY                  r3.3        // w
#define gMSGDSC_R           r3.6        // ud
#define gMSGDSC_W           r3.7        // ud

// Placement of gwMBTYPE and the loop variables depends on SW_W_128:
// with the workaround they sit in r8/r9, without it gLOOP_SUBMB/gLOOP_SUBMBPT
// use r1.0/r1.1 and gLOOP_DIR uses r8.7.
#ifdef SW_W_128

.declare    gwMBTYPE    Base=r8.6   ElementSize=2 Type=w    // Shared with gLOOP_SUBMB
// TODO
#define gLOOP_SUBMB         r8.6
#define gLOOP_SUBMBPT       r8.7
#define gLOOP_DIR           r9.6

#else

.declare    gwMBTYPE    Base=r1.0   ElementSize=2 Type=w    // Shared with gLOOP_SUBMB
// TODO
#define gLOOP_SUBMB         r1.0
#define gLOOP_SUBMBPT       r1.1
#define gLOOP_DIR           r8.7

#endif // SW_W_128

// Same register in both configurations
#define gLOOPCNT            r9.7        // Loop counter for submodules
// Scratch registers in r34
#define gW0                 r34.6       // Temporary WORD
#define gW1                 r34.7       // Temporary WORD
#define gW2                 r34.8       // Temporary WORD
#define gW3                 r34.9       // Temporary WORD
#define gD0                 r34.3       // Temporary DWORD
#define gW4                 r34.15

// Motion vector components in r34
#define gMVX_INT            r34.0       // :w
#define gMVY_INT            r34.1       // :w
#define gMVX_FRAC           r34.2       // :w
#define gMVY_FRAC           r34.3       // :w
#define gMVX_FRACC          r34.4       // :w
#define gMVY_FRACC          r34.5       // :w

// Interpolation pointers in r34; gpINTP is the dword-typed alias
#define gpINTPY             r34.10
#define gpINTPC             r34.11
#define gpINTP              r34.5       // DW

// Prediction state in r34
#define gPREDFLAG           r34.12
#define gBIDX               r34.13
#define gREFPARITY          r34.14

// Variables in r1
#define gCHRMVADJ           r1.14
#define gPARITY             r1.15
#define gCBP_MASK           r1.1
#define gMVSTEP             r1.13
#define gpADDR              r1.2        // :uw (8 words)

#define gSHAPETEMP          r8.15       // :uw

// Coefficients A..D in r42
#define gCOEFA              r42.0
#define gCOEFB              r42.1
#define gCOEFC              r42.2
#define gCOEFD              r42.3
// Weighted prediction

// Flags
#define gPREDFLAG0          r46.0
#define gPREDFLAG1          r46.2
#define gWEIGHTFLAG         r43.2
#define gBIPRED             r43.3

// Add / shift values, luma (Y) and chroma (C)
#define gYADD               r43.4
#define gCADD               r43.5
#define gYSHIFT             r43.6
#define gCSHIFT             r43.7

// Offsets
#define gOFFSET             r44.0
#define gUOFFSET            r44.1
#define gVOFFSET            r44.2

// Weight (W) / offset (O) pairs in r45, index 0 and 1
#define gWT0                r45.0
#define gO0                 r45.1
#define gWT1                r45.2
#define gO1                 r45.3

// U plane
#define gUW0                r45.4
#define gUO0                r45.5
#define gUW1                r45.6
#define gUO1                r45.7

// V plane
#define gVW0                r45.8
#define gVO0                r45.9
#define gVW1                r45.10
#define gVO1                r45.11

// "_D" aliases in r45
// NOTE(review): presumably dword-typed views starting at gWT0 and gUW0 - confirm at use sites
#define gWT0_D              r45.0
#define gUW0_D              r45.2
//------------ Message-related Registers & constants

#define gMSGSRC             r2          // Message Source

// Message header registers: generic, luma (Y) and chroma (C) names
#define mMSGHDR             m1
#define mMSGHDRY            m1
#define mMSGHDRC            m2

// Numbered message header registers
#define mMSGHDR1            m1
#define mMSGHDR2            m2
#define mMSGHDR3            m3
#define mMSGHDR4            m4

// Message headers with the "W" suffix, luma and chroma
#define mMSGHDRYW           m1
#define mMSGHDRCW           m10
// DWORD block read/write message descriptors for the data port.
// Suffixes: _SC = sampler cache, _TF / _BF = the two field variants; they differ
// from the unsuffixed value only in the bits covered by nDWB_FIELD_MASK, except
// for the DEV_ILK read descriptors flagged below.
#ifdef DEV_ILK

// NOTE(review): the two bit-layout comments below are the same text as in the pre-ILK
// branch and do not match the DEV_ILK values (e.g. the top byte here is 0x02, and the
// _TF/_BF read descriptors use 0xE in place of 0xA) - confirm the layout against the ILK documentation.
// 0000 0100(read) 0001(msg len) xxxx(resp len) 1010 (sampler cache) xxxx (field/frame) xxxx xxxx (bidx)
#define nDWBRMSGDSC_SC      0x0208A002  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache
#define nDWBRMSGDSC_SC_TF   0x0208E602  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache
#define nDWBRMSGDSC_SC_BF   0x0208E702  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache

// 0000 0101(write) 0001(msg len) xxxx(resp len) 0010 (render cache) xxxx (field/frame) xxxx xxxx (bidx)
#define nDWBWMSGDSC         0x02082000  // DWORD Block Write Message Descriptor through Data Port, Render Cache
#define nDWBWMSGDSC_TF      0x02082600  // DWORD Block Write Message Descriptor through Data Port, Render Cache
#define nDWBWMSGDSC_BF      0x02082700  // DWORD Block Write Message Descriptor through Data Port, Render Cache

#else // Pre DEV_ILK

// 0000 0100(read) 0001(msg len) xxxx(resp len) 1010 (sampler cache) xxxx (field/frame) xxxx xxxx (bidx)
#define nDWBRMSGDSC_SC      0x0410A002  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache
#define nDWBRMSGDSC_SC_TF   0x0410A602  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache
#define nDWBRMSGDSC_SC_BF   0x0410A702  // DWORD Block Read Message Descriptor through Data Port, Sampler Cache

// 0000 0101(write) 0001(msg len) xxxx(resp len) 0010 (render cache) xxxx (field/frame) xxxx xxxx (bidx)
#define nDWBWMSGDSC         0x05102000  // DWORD Block Write Message Descriptor through Data Port, Render Cache
#define nDWBWMSGDSC_TF      0x05102600  // DWORD Block Write Message Descriptor through Data Port, Render Cache
#define nDWBWMSGDSC_BF      0x05102700  // DWORD Block Write Message Descriptor through Data Port, Render Cache

#endif // DEV_ILK

// Mask over the field/frame bits of the descriptors above
#define nDWB_FIELD_MASK     0x0600
// Message data payload: byte-typed regions starting one register after the
// corresponding write header (mMSGHDRYW = m1, mMSGHDRCW = m10)
.declare    mbMSGPAYLOADY   Base=m2     ElementSize=1 SrcRegion=REGION(16,1) Type=b
.declare    mbMSGPAYLOADC   Base=m11    ElementSize=1 SrcRegion=REGION(16,1) Type=b

// Destination registers for write commit
#define gREG_WRITE_COMMIT_Y     r10.0
#define gREG_WRITE_COMMIT_UV    r11.0
// Sub-routine call/return helpers for the INTER kernels.
//
// RETURN_REG_INTER is the register that CALL_INTER writes and RETURN_INTER reads.
#define RETURN_REG_INTER r1.5 // Return pointer for all sub-routine calls (type DWORD)
// CALL_INTER(subFunc, skipInst) expands to two instructions:
//   1. add:  RETURN_REG_INTER = ip + 1+skipInst*INST_SIZE
//   2. jmpi: jump to subFunc
// subFunc  - jump target passed to jmpi.
// skipInst - multiplied by INST_SIZE in the return-address term; INST_SIZE is not defined in this file.
// NOTE(review): the term is not parenthesised, so it reads as 1+(skipInst*INST_SIZE) under the
// usual operator precedence - confirm this is intended rather than (1+skipInst)*INST_SIZE.
// The trailing "\n\" places the jmpi on the continuation line of the macro body.
#define CALL_INTER(subFunc, skipInst) add (1) RETURN_REG_INTER<1>:ud ip:ud 1+skipInst*INST_SIZE \n\
jmpi (1) subFunc
// RETURN_INTER moves RETURN_REG_INTER back into ip, returning to the address saved by CALL_INTER.
#define RETURN_INTER mov (1) ip:ud RETURN_REG_INTER<0;1,0>:ud // Return to calling module
// End of inter_header.inc
#endif // !defined(__INTER_HEADER__)
|