summaryrefslogtreecommitdiff
path: root/src/intel/vulkan/grl/gpu/instance.h
blob: e463a01dc9029295703ee9c849517942a1beacc1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#pragma once

#include "shared.h"
#include "affinespace.h"
#include "api_interface.h"
#include "qbvh6.h"
#include "libs/lsc_intrinsics.h"

// Reads back the instanceIndex field stored in part1 of the given instance leaf.
GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I)
{
    const uint32_t index = I->part1.instanceIndex;
    return index;
}

// Packs dword 0 of an instance leaf's part0 into the x component of *dst:
//   bits  0..23 : low 24 bits of shaderIndex
//   bits 24..31 : low 8 bits of geomMask (anything above is shifted out)
// The y, z and w components of *dst are left untouched.
GRL_INLINE void encodeDW0_HwInstanceLeafPart0(
    uint32_t shaderIndex,
    uint32_t geomMask,
    uint4 *dst)
{
    const uint32_t shaderBits = shaderIndex & 0x00ffffffu;
    const uint32_t maskBits = geomMask << 24;

    (*dst).x = shaderBits | maskBits;
}

// Packs dword 1 of an instance leaf's part0 into the y component of *dst:
//   bits  0..23 : low 24 bits of instanceContributionToHitGroupIndex
//   bits 24..28 : zero
//   bit  29     : low bit of notProcedural
//   bits 30..31 : low 2 bits of geomFlags
// The x, z and w components of *dst are left untouched.
GRL_INLINE void encodeDW1_HwInstanceLeafPart0(
    uint32_t instanceContributionToHitGroupIndex,
    uint32_t notProcedural,
    uint32_t geomFlags,
    uint4* dst)
{
    const uint32_t hitGroupBits = instanceContributionToHitGroupIndex & 0x00ffffffu;
    const uint32_t typeBit = (notProcedural & 1) << 29;
    const uint32_t flagBits = (geomFlags & 3) << 30;

    (*dst).y = hitGroupBits | typeBit | flagBits;
}

// Packs dwords 2 and 3 of an instance leaf's part0 into the z and w components of *dst:
//   z           : low 32 bits of rootNodePtr
//   w bits 0..15 : bits 32..47 of rootNodePtr
//   w bits 16..31: low 16 bits of instFlags
// The x and y components of *dst are left untouched.
GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0(
    uint64_t rootNodePtr,
    uint32_t instFlags,
    uint4* dst)
{
    const uint ptrLow = (uint)rootNodePtr;
    const uint ptrHigh16 = (uint)(rootNodePtr >> 32) & 0xffff;
    // A 32-bit shift keeps exactly the low 16 flag bits, same as truncating a wider shift.
    const uint flagBits = (uint)(instFlags << 16);

    (*dst).z = ptrLow;
    (*dst).w = ptrHigh16 | flagBits;
}

// Writes part0.DW0 of the leaf:
//   bits  0..23 : low 24 bits of shaderIndex
//   bits 24..31 : low 8 bits of geomMask (anything above is shifted out)
GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I,
                                       uint32_t shaderIndex,
                                       uint32_t geomMask)
{
    const uint32_t shaderBits = shaderIndex & 0x00ffffffu;
    const uint32_t maskBits = geomMask << 24;

    I->part0.DW0 = shaderBits | maskBits;
}

// Writes part0.DW1 of the leaf:
//   bits  0..23 : low 24 bits of instanceContributionToHitGroupIndex
//   bits 24..28 : zero
//   bit  29     : low bit of notProcedural
//   bits 30..31 : low 2 bits of geomFlags
GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I,
                                       uint32_t instanceContributionToHitGroupIndex,
                                       uint32_t notProcedural,
                                       uint32_t geomFlags)
{
    const uint32_t hitGroupBits = instanceContributionToHitGroupIndex & 0x00ffffffu;
    const uint32_t typeBit = (notProcedural & 1) << 29;
    const uint32_t flagBits = (geomFlags & 3) << 30;

    I->part0.DW1 = hitGroupBits | typeBit | flagBits;
}

// Stores the low 48 bits of pBvhPtr into part1.DW0_DW1; the upper 16 bits are cleared.
GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I,
                                          global char *pBvhPtr)
{
    const uint64_t low48Mask = ((uint64_t)1 << 48) - 1;
    const uint64_t address = (uint64_t)pBvhPtr;

    I->part1.DW0_DW1 = address & low48Mask;
}

// Writes part0.DW2_DW3 of the leaf as one 64-bit value:
//   bits  0..47 : low 48 bits of rootNodePtr
//   bits 48..63 : low 16 bits of instFlags
GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I,
                                          uint64_t rootNodePtr,
                                          uint32_t instFlags)
{
    const uint64_t ptrBits = rootNodePtr & (((uint64_t)1 << 48) - 1);
    const uint64_t flagBits = ((uint64_t)instFlags) << 48;

    // The two fields occupy disjoint bit ranges, so OR-ing them combines them without carry.
    I->part0.DW2_DW3 = ptrBits | flagBits;
}

// Builds a complete HwInstanceLeaf from an instance descriptor.
//
// The leaf is written as eight 4-dword stores through store_uint4_L1S_L3WB:
// slots 4..7 (part1) are written first, then slots 0..3 (part0).
//
//   part1: acceleration structure address (low dword, high dword), instance ID,
//          instanceIndex, the nine obj2world.l components, then world2obj.p.
//   part0: packed DW0..DW3 (hit-group index, instanceMask, type bit, root node
//          pointer, instance flags), the nine world2obj.l components, then obj2world.p.
//
// leaf               - destination leaf
// instDesc           - instance descriptor to read from
// instanceIndex      - stored as the fourth dword of part1
// rootNodeByteOffset - byte offset added to the acceleration structure address to
//                      form the root node pointer; NO_NODE_OFFSET yields a null pointer
// instanceMask       - placed in the top byte of part0 DW0
GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf,
    global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
    uint instanceIndex,
    uint rootNodeByteOffset,
    uint instanceMask)
{
    // View the leaf as an array of 4-dword slots: 0..3 = part0, 4..7 = part1.
    global uint4* leafSlots = (global uint4*) (leaf);

    struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform);

    // part1, DW0..3: acceleration structure address (lo, hi), instance ID, instance index.
    qword asAddress = (qword)instDesc->AccelerationStructure;
    uint4 part1Header = (uint4)((uint)asAddress, (uint)(asAddress >> 32),
                                GRL_get_instanceID(instDesc), instanceIndex);

    struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world);

    store_uint4_L1S_L3WB(leafSlots, 4 + 0, part1Header);

    // part1, DW4..11: obj2world.l components vx.xyz, vy.xyz, vz.xy.
    uint4 part1Xform0 = (uint4)(as_uint(obj2world.l.vx.x), as_uint(obj2world.l.vx.y),
                                as_uint(obj2world.l.vx.z), as_uint(obj2world.l.vy.x));
    store_uint4_L1S_L3WB(leafSlots, 4 + 1, part1Xform0);

    uint4 part1Xform1 = (uint4)(as_uint(obj2world.l.vy.y), as_uint(obj2world.l.vy.z),
                                as_uint(obj2world.l.vz.x), as_uint(obj2world.l.vz.y));
    store_uint4_L1S_L3WB(leafSlots, 4 + 2, part1Xform1);

    // part1, DW12..15: obj2world.l.vz.z followed by the p vector of world2obj.
    uint4 part1Xform2 = (uint4)(as_uint(obj2world.l.vz.z), as_uint(world2obj.p.x),
                                as_uint(world2obj.p.y), as_uint(world2obj.p.z));
    store_uint4_L1S_L3WB(leafSlots, 4 + 3, part1Xform2);

    uint hitGroupIndex = GRL_get_InstanceContributionToHitGroupIndex(instDesc);
    global struct BVHBase* bvhBase = (global struct BVHBase*)instDesc->AccelerationStructure;

    // NO_NODE_OFFSET marks a degenerate instance, which gets a null root pointer.
    uint64_t rootNodePtr = 0;
    if (rootNodeByteOffset != NO_NODE_OFFSET)
    {
        rootNodePtr = ((uint64_t)bvhBase) + rootNodeByteOffset;
    }

    // part0, DW0..3: each encode helper fills its own component(s) of part0Header.
    uint4 part0Header;
    encodeDW0_HwInstanceLeafPart0(hitGroupIndex, instanceMask, &part0Header);
    encodeDW1_HwInstanceLeafPart0(
        hitGroupIndex, // for a HW instance leaf this field offsets the hit-group index
        1,             // disable opaque culling: necessary for SW instancing, don't-care for HW instancing
        0,
        &part0Header);
    encodeDW2DW3_HwInstanceLeafPart0(rootNodePtr, GRL_get_InstanceFlags(instDesc), &part0Header);
    store_uint4_L1S_L3WB(leafSlots, 0, part0Header);

    // part0, DW4..11: world2obj.l components vx.xyz, vy.xyz, vz.xy.
    uint4 part0Xform0 = (uint4)(as_uint(world2obj.l.vx.x), as_uint(world2obj.l.vx.y),
                                as_uint(world2obj.l.vx.z), as_uint(world2obj.l.vy.x));
    store_uint4_L1S_L3WB(leafSlots, 1, part0Xform0);

    uint4 part0Xform1 = (uint4)(as_uint(world2obj.l.vy.y), as_uint(world2obj.l.vy.z),
                                as_uint(world2obj.l.vz.x), as_uint(world2obj.l.vz.y));
    store_uint4_L1S_L3WB(leafSlots, 2, part0Xform1);

    // part0, DW12..15: world2obj.l.vz.z followed by the p vector of obj2world.
    uint4 part0Xform2 = (uint4)(as_uint(world2obj.l.vz.z), as_uint(obj2world.p.x),
                                as_uint(obj2world.p.y), as_uint(obj2world.p.z));
    store_uint4_L1S_L3WB(leafSlots, 3, part0Xform2);
}