polly/lib/External/ppcg/gpu_hybrid.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

/*
 * Copyright 2013      Ecole Normale Superieure
 * Copyright 2015      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <string.h>

#include <isl/val.h>
#include <isl/space.h>
#include <isl/union_set.h>
#include <isl/schedule_node.h>

#include "hybrid.h"
#include "gpu_hybrid.h"
#include "gpu_tree.h"
#include "schedule.h"
#include "util.h"

/* Have all domain elements been filtered out before reaching
 * the "node" position in the schedule tree?
 */
static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
{
	isl_union_set *domain;
	isl_bool empty;

	domain = isl_schedule_node_get_domain(node);
	empty = isl_union_set_is_empty(domain);
	isl_union_set_free(domain);

	return empty;
}

/* Given a pointer to a phase in the result of hybrid tiling,
 * map the phase to the device, provided the phase is non-empty.
 * Empty phases can occur if the input schedule domain can be
 * covered by a small number of hexagons that all belong to the same phase.
 *
 * The input has the following form:
 *
 *	M - CT - P - C - ...
 *
 * with M the phase marker, CT the space tiling, P the original
 * parent band and C the original child band.
 * The (outer dimensions of the) C band need to be mapped to threads.
 * The (outer dimension of the) CT band needs to be mapped to blocks.
 * The mapping to shared memory needs to be computed between the CT and
 * the P band.
 *
 * The C band is first shifted to start at zero.
 * Then the appropriate markers are introduced and a kernel is
 * created for the tree rooted at CT.
 * If the "unroll_gpu_tile" option is set, then the AST generator
 * is instructed to unroll the P and C bands.
 */
static __isl_give isl_schedule_node *update_phase(
	__isl_take isl_schedule_node *node, void *user)
{
	struct gpu_gen *gen = user;
	int depth0, depth;
	isl_ctx *ctx;
	isl_id *id;
	isl_bool empty_domain;
	ppcg_ht_phase *phase;

	empty_domain = has_empty_domain(node);
	if (empty_domain < 0)
		return isl_schedule_node_free(node);
	if (empty_domain)
		return node;

	if (!node)
		return NULL;
	ctx = isl_schedule_node_get_ctx(node);

	phase = ppcg_ht_phase_extract_from_mark(node);

	depth0 = isl_schedule_node_get_tree_depth(node);

	node = isl_schedule_node_child(node, 0);

	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	node = ppcg_ht_phase_shift_space_point(phase, node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(ctx, "thread", NULL);
	node = isl_schedule_node_insert_mark(node, id);
	node = isl_schedule_node_parent(node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(ctx, "shared", NULL);
	node = isl_schedule_node_insert_mark(node, id);
	node = isl_schedule_node_parent(node);

	node = gpu_create_kernel(gen, node, 0, NULL);

	depth = isl_schedule_node_get_tree_depth(node);
	node = isl_schedule_node_ancestor(node, depth - depth0);

	return node;
}

/* Apply hybrid tiling on "node" and its parent based on the (valid)
 * bounds on the relative dependence distances "bounds" and
 * the tile sizes in "tile_sizes".
 * The number of elements in "tile_sizes" is at least as large
 * as the sum of the dimensions of the parent and the child node.
 *
 * Convert the tile_sizes to an isl_multi_val in the right space,
 * insert the hybrid tiling and then create a kernel inside each phase.
 * Finally, remove the phase marks.
 */
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
	int *tile_sizes)
{
	isl_multi_val *mv;
	isl_space *space, *space2;

	if (!node || !bounds)
		goto error;

	space2 = isl_schedule_node_band_get_space(node);
	node = isl_schedule_node_parent(node);
	space = isl_schedule_node_band_get_space(node);
	space = isl_space_product(space, space2);
	mv = ppcg_multi_val_from_int_list(space, tile_sizes);

	node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options);

	node = hybrid_tile_foreach_phase(node, &update_phase, gen);

	node = hybrid_tile_drop_phase_marks(node);

	return node;
error:
	isl_schedule_node_free(node);
	ppcg_ht_bounds_free(bounds);
	return NULL;
}