vp8/encoder/ppc/fdct_altivec.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

.macro load_c V, LABEL, OFF, R0, R1
;# Load one 16-byte vector from a table addressed by absolute label.
;#   V     destination vector register
;#   LABEL table symbol
;#   OFF   0, or a GPR holding a byte offset from LABEL
;#   R0    GPR scratch; receives the high-adjusted upper half of the address
;#   R1    GPR that receives the full address of LABEL
    lis     \R0, \LABEL@ha
    addi    \R1, \R0, \LABEL@l
    lvx     \V, \OFF, \R1
.endm

;# Forward and inverse DCTs are nearly identical; only differences are
;#   in normalization (fwd is twice unitary, inv is half unitary)
;#   and that they are of course transposes of each other.
;#
;#   The following three accomplish most of the implementation and
;#   are used only by ppc_idct.c and ppc_fdct.c.
.macro prologue
;# Common entry sequence: pushes a 32-byte frame, marks v0-v13 live in VRSAVE
;#   and loads the constant vectors.
;#   On exit: r6  = 16
;#            r9  = address of round_tab
;#            r11 = VRSAVE as it was on entry (consumed by epilogue)
;#            v0-v3 = the four rows of dct_tab
;#            v4,v5 = the two rows of ppc_dctperm_tab
;#            v6    = first row of round_tab
;#   r10 and r12 are used as scratch.
    stwu    r1,-32(r1)          ;# push the frame
    li      r6, 16              ;# byte offset of the second vector in a pair

    mfspr   r11, 256            ;# r11 = incoming VRSAVE
    oris    r12, r11, 0xfffc    ;# add v0..v13 to the in-use mask
    mtspr   256, r12

    load_c v0, dct_tab, 0, r9, r10
    lvx     v1,   r6, r10
    addi    r10, r10, 32        ;# step to the second half of dct_tab
    lvx     v3,   r6, r10
    lvx     v2,    0, r10

    load_c v4, ppc_dctperm_tab,  0, r9, r10
    load_c v5, ppc_dctperm_tab, r6, r9, r10

    ;# Scratch/address registers are swapped here so that r9 ends up holding
    ;# the table address.
    load_c v6, round_tab, 0, r10, r9
.endm

.macro epilogue
;# Common exit sequence: undoes the frame push and VRSAVE change made by
;#   prologue.  Expects r11 to still hold the VRSAVE value saved there.
    mtspr   256, r11            ;# VRSAVE = value on entry
    addi    r1, r1, 32          ;# pop the 32-byte frame
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
;#   For fwd transform, indices are horizontal positions, then frequencies.
;#   For inverse transform, frequencies then positions.
;#   The two resulting  A0..A3  B0..B3  are later combined
;#   and vertically transformed.

.macro two_rows_horiz Dst
;# In:  v8 = a0 a1 a2 a3  b0 b1 b2 b3 (two rows of 16-bit values)
;#      v0-v3 = coefficient vectors, v4/v5 = permute controls,
;#      v6 = rounding vector, v7 = shift count
;# Out: Dst = A0 B0  A1 B1  A2 B2  A3 B3
;# Clobbers v9, v10, v11.
    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    ;# Two independent multiply-sum chains, each seeded with the rounding
    ;# vector and accumulating over v8 then v9.
    vmsumshm v10, v0, v8, v6
    vmsumshm v11, v2, v8, v6
    vmsumshm v10, v1, v9, v10
    vmsumshm v11, v3, v9, v11

    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3

    vpkuwum v10, v10, v11       ;# low halfwords: A0 A1  B0 B1  A2 A3  B2 B3
    vperm   \Dst, v10, v10, v5  ;# regroup by index: A0 B0  A1 B1  A2 B2  A3 B3
.endm

;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;#   forward transform uses transpose.

.macro two_rows_vert Ceven, Codd
;# In:  v12, v13 = the two horizontally transformed row pairs
;#      Ceven, Codd = coefficient vectors (word 0 and word 1 of each are used)
;#      v6 = rounding vector, v7 = shift count
;# Out: v8 = two packed output rows (rows 0,1  or  2,3)
;# Clobbers v9, v10.

    ;# First row of the pair: word 0 of each coefficient vector.
    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32, four times
    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12, four times
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7         ;# park the first row in v10

    ;# Second row of the pair: word 1, with Ceven/Codd in swapped roles.
    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8         ;# v8 = first row, second row as halfwords
.endm

;# Gather two 4-coefficient input rows into v8 through a 16-byte staging
;#   buffer, then run two_rows_horiz on them.
;#   Entry: r0 = first word (two shorts) of the current row, already loaded
;#          r3 = address of the current row
;#          r5 = byte distance to the next row
;#          r8 = staging buffer; lvx ignores the low four address bits, so it
;#               must be 16-byte aligned
;#   Exit:  r3 has advanced by r5 (one row); Dest = two_rows_horiz result.
;#   Clobbers r0 and v8 (plus whatever two_rows_horiz clobbers).
.macro two_rows_h Dest
    stw     r0,  0(r8)          ;# row n,   shorts 0-1
    lwz     r0,  4(r3)
    stw     r0,  4(r8)          ;# row n,   shorts 2-3
    lwzux   r0, r3,r5           ;# r3 += r5, then load from the new row
    stw     r0,  8(r8)          ;# row n+1, shorts 0-1
    lwz     r0,  4(r3)
    stw     r0, 12(r8)          ;# row n+1, shorts 2-3
    lvx     v8,  0,r8           ;# v8 = both rows
    two_rows_horiz \Dest
.endm

    .align 2
;# Forward DCT of one 4x4 block.
;# r3 short *input     advanced row by row while loading
;# r4 short *output    32 bytes are written with two stvx; stvx ignores the
;#                     low four address bits, so this is expected to be
;#                     16-byte aligned
;# r5 int pitch        added to r3 as a byte offset to reach the next row
;# Scratch: r0, r6, r8-r12, v0-v13.  VRSAVE is restored on exit.
vp8_short_fdct4x4_ppc:

    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits

;# Staging buffer for two_rows_h.  It lives in the upper 16 bytes of the
;#   32-byte frame pushed by prologue: still 16-byte aligned relative to r1,
;#   but no longer on top of the back chain word that stwu stored at 0(r1).
    addi    r8, r1, 16


    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround (second row of round_tab)
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4           ;# output rows 0,1
    two_rows_vert v2, v3
    stvx    v8, r6, r4          ;# output rows 2,3

    epilogue

    blr

    .align 2
;# Forward DCT of two horizontally adjacent 4x4 blocks (an 8x4 region).
;# r3 short *input     second block starts 8 bytes after the first
;# r4 short *output    64 bytes are written with four stvx; stvx ignores the
;#                     low four address bits, so this is expected to be
;#                     16-byte aligned
;# r5 int pitch        added to r3 as a byte offset to reach the next row
;# Scratch: r0, r6, r8-r12, v0-v13.  VRSAVE is restored on exit.
vp8_short_fdct8x4_ppc:
    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits

;# Staging buffer for two_rows_h.  It lives in the upper 16 bytes of the
;#   32-byte frame pushed by prologue: still 16-byte aligned relative to r1,
;#   but no longer on top of the back chain word that stwu stored at 0(r1).
    addi    r8,  r1, 16
    addi    r10, r3, 0          ;# remember the block origin; r3 is advanced below

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    ;# Next block
    addi    r3, r10, 8          ;# four shorts to the right of the first block
    addi    r4, r4, 32          ;# next 16 output coefficients
    lvx     v6, 0, r9           ;# v6 = Hround

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
    addi    r8, r1, 16          ;# same staging buffer as for the first block

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr

    .data
;# Constant tables read by prologue with lvx; each is aligned with .align 4
;#   and laid out as 16-byte rows, one row per vector register.

    .align 4
;# vperm control vectors.
;#   Row 0 (v4): swaps the two 4-byte words inside each 8-byte half.
;#   Row 1 (v5): swaps halfwords 1 and 2 inside each 8-byte half.
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
;# 16-bit multiplier rows loaded into v0..v3, each holding the same four
;#   coefficients twice.
;#   NOTE(review): 23170, 30274 and 12540 look like 2^15-scaled cos(pi/4),
;#   cos(pi/8) and sin(pi/8) -- confirm against the C reference.
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# 32-bit rounding terms added before the arithmetic right shift.
;#   Row 0: half of 1<<14, used with the shift-by-14 (horizontal) pass.
;#   Row 1: half of 1<<16, used with the shift-by-16 (vertical) pass.
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))