summaryrefslogtreecommitdiff
path: root/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
blob: 1fbe66ff98d7aa97578cf6bb13f729eae0d7423d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect pattern rewriters that make GPU op
// within a region execute asynchronously.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"

namespace mlir {
#define GEN_PASS_DEF_GPUASYNCREGIONPASS
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

namespace {
// Pass that makes GPU ops inside a function execute asynchronously: GPU ops
// get threaded through `!gpu.async.token` values, and `gpu.wait` ops are
// inserted where host synchronization is required.
class GpuAsyncRegionPass
    : public impl::GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  // Walk callbacks implementing the individual rewrite stages; defined below.
  struct ThreadTokenCallback;
  struct DeferWaitCallback;
  struct SingleTokenUseCallback;
  void runOnOperation() override;
};
} // namespace

/// Returns true if `op` may act as a block terminator (i.e. it might carry the
/// IsTerminator trait).
static bool isTerminator(Operation *op) {
  const bool mayEndBlock = op->mightHaveTrait<OpTrait::IsTerminator>();
  return mayEndBlock;
}
/// Returns true unless `op` is known to be free of memory effects.
static bool hasSideEffects(Operation *op) {
  const bool effectFree = isMemoryEffectFree(op);
  return !effectFree;
}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // Visits every op in `block` in order. Uses an early-increment range because
  // visit() may erase the current op (see rewriteAsyncOp below).
  WalkResult operator()(Block *block) {
    for (Operation &op : make_early_inc_range(*block)) {
      if (failed(visit(&op)))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  }

private:
  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through the `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` will therefore only be
  // used inside of its block and GPU execution will always synchronize with
  // the host at block boundaries.
  LogicalResult visit(Operation *op) {
    // gpu.launch is not supported here; it must be outlined beforehand.
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
      // Fold an existing wait into the chain: it depends on the current token
      // (if any), and its async token (null for a synchronous wait) becomes
      // the new current token.
      if (currentToken)
        waitOp.addAsyncDependency(currentToken);
      currentToken = waitOp.getAsyncToken();
      return success();
    }
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    // The synchronous wait has no token result, so this resets currentToken.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Return early if op returns a token already.
    currentToken = asyncOp.getAsyncToken();
    if (currentToken)
      return success();

    // Clone the op to return a token in addition to the other results. The
    // result count of an existing op cannot be changed in place, so a fresh
    // operation is created with the token type appended.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getPropertiesStorage(),
                                    op->getSuccessors(), op->getNumRegions());

    // Clone regions into new op.
    IRMapping mapping;
    for (auto pair : llvm::zip_first(op->getRegions(), newOp->getRegions()))
      std::get<0>(pair).cloneInto(&std::get<1>(pair), mapping);

    // Replace the op with the async clone. The token is the last result; the
    // remaining results map 1:1 onto the original op's results.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  // Creates a gpu.wait op depending on `operands`. Pass a null `resultType`
  // for a host-synchronous wait (its getAsyncToken() is then null).
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands)
        .getAsyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. It's valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};

/// Erases `executeOp` and returns a clone with additional `results`.
///
/// The values in `results` are appended to the region's `async.yield`
/// terminator, and the clone returns matching additional `!async.value`
/// results. All uses of `executeOp`'s results are redirected to the clone's
/// leading results before `executeOp` is erased.
///
/// Note: declared `static` — this is a file-local helper (it sits outside the
/// anonymous namespace above), and LLVM requires internal linkage for such
/// definitions; this also silences -Wmissing-prototypes.
static async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
                                          ValueRange results) {
  // Add values to async.yield op.
  Operation *yieldOp = executeOp.getBody()->getTerminator();
  yieldOp->insertOperands(yieldOp->getNumOperands(), results);

  // Construct new result type list with additional types.
  SmallVector<Type, 2> resultTypes;
  resultTypes.reserve(executeOp.getNumResults() + results.size());
  transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
            [](Type type) {
              // Extract value type from !async.value.
              if (auto valueType = dyn_cast<async::ValueType>(type))
                return valueType.getValueType();
              assert(isa<async::TokenType>(type) && "expected token type");
              return type;
            });
  transform(results, std::back_inserter(resultTypes),
            [](Value value) { return value.getType(); });

  // Clone executeOp with the extra results. The builder expects the value
  // types only: the leading !async.token result is implicit, hence drop_front.
  OpBuilder builder(executeOp);
  auto newOp = builder.create<async::ExecuteOp>(
      executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
      executeOp.getDependencies(), executeOp.getBodyOperands());
  IRMapping mapper;
  // Drop the entry block the builder created before cloning the old region in.
  newOp.getRegion().getBlocks().clear();
  executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);

  // Replace executeOp with cloned one.
  executeOp.getOperation()->replaceAllUsesWith(
      newOp.getResults().drop_back(results.size()));
  executeOp.erase();

  return newOp;
}

// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.getToken()))
      return;
    // async.execute's region is currently restricted to one block.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        // Only a synchronous wait (no async token result) can be deferred.
        if (!waitOp.getAsyncToken())
          worklist.push_back(waitOp);
        return;
      }
      // A later op with side effects may depend on the wait; bail out.
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop on purpose: addAsyncDependencyAfter() may append to
    // `worklist` while we are iterating.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();

      // Erase `gpu.wait` and return async dependencies from execute op instead.
      SmallVector<Value, 4> dependencies = waitOp.getAsyncDependencies();
      waitOp.erase();
      executeOp = addExecuteResults(executeOp, dependencies);

      // Add the async dependency to each user of the `async.execute` token.
      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
      // Snapshot the users first: addAsyncDependencyAfter() mutates the IR
      // around them, which would invalidate a live use-iterator.
      SmallVector<Operation *, 4> users(executeOp.getToken().user_begin(),
                                        executeOp.getToken().user_end());
      for (Operation *user : users)
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to it's users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return !token.use_empty() &&
           llvm::all_of(token.getUsers(), [](Operation *user) {
             return isa<async::ExecuteOp, async::AwaitOp>(user);
           });
  }

  // Add the `asyncToken` as dependency as needed after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    // `it` is positioned just past the synchronization point; `tokens` holds
    // the !gpu.async.token values available at that point.
    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).getResult());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands; the tokens become new block arguments.
          it = executeOp.getBody()->begin();
          executeOp.getBodyOperandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          SmallVector<Location, 1> tokenLocs(asyncTokens.size(),
                                             executeOp.getLoc());
          copy(executeOp.getBody()->addArguments(tokenTypes, tokenLocs),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects. This always finds
    // one because every block ends with a terminator.
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.getToken()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  // Deferred synchronous gpu.wait ops, processed by the destructor. May grow
  // while being processed (see ~DeferWaitCallback and addAsyncDependencyAfter).
  SmallVector<gpu::WaitOp, 8> worklist;
};

// Callback for `async.execute` ops which repeats !gpu.async.token results
// so that each of them is only used once.
struct GpuAsyncRegionPass::SingleTokenUseCallback {
  void operator()(async::ExecuteOp executeOp) {
    // Extract !gpu.async.token results which have multiple uses.
    auto multiUseResults = llvm::make_filter_range(
        executeOp.getBodyResults(), [](OpResult result) {
          if (result.use_empty() || result.hasOneUse())
            return false;
          auto valueType = dyn_cast<async::ValueType>(result.getType());
          return valueType &&
                 isa<gpu::AsyncTokenType>(valueType.getValueType());
        });
    if (multiUseResults.empty())
      return;

    // Indices within !async.execute results (i.e. without the async.token).
    // getResultNumber() counts the leading !async.token result, hence the -1
    // to index into getBodyResults()/the yield operands.
    SmallVector<int, 4> indices;
    transform(multiUseResults, std::back_inserter(indices),
              [](OpResult result) {
                return result.getResultNumber() - 1; // Index without token.
              });

    for (auto index : indices) {
      assert(!executeOp.getBodyResults()[index].getUses().empty());
      // Repeat async.yield token result, one for each use after the first one.
      auto uses = llvm::drop_begin(executeOp.getBodyResults()[index].getUses());
      auto count = std::distance(uses.begin(), uses.end());
      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
      executeOp = addExecuteResults(executeOp, operands);
      // Update 'uses' to refer to the new executeOp: addExecuteResults erased
      // the old op, so the previous use range is invalid.
      uses = llvm::drop_begin(executeOp.getBodyResults()[index].getUses());
      // The repeated results were appended at the end; rewire each extra use
      // to its own copy of the token.
      auto results = executeOp.getBodyResults().take_back(count);
      for (auto pair : llvm::zip(uses, results))
        std::get<0>(pair).set(std::get<1>(pair));
    }
  }
};

// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.
void GpuAsyncRegionPass::runOnOperation() {
  auto tokenWalk = getOperation()->walk(ThreadTokenCallback(getContext()));
  if (tokenWalk.wasInterrupted()) {
    signalPassFailure();
    return;
  }

  // Collect gpu.wait ops that we can move out of async.execute regions.
  getOperation().getRegion().walk(DeferWaitCallback());
  // Makes each !gpu.async.token returned from async.execute op have single use.
  getOperation().getRegion().walk(SingleTokenUseCallback());
}

/// Creates an instance of the GPU async-region pass; ownership is transferred
/// to the caller.
std::unique_ptr<OperationPass<func::FuncOp>> mlir::createGpuAsyncRegionPass() {
  auto pass = std::make_unique<GpuAsyncRegionPass>();
  return pass; // Implicitly converts to the base-class pointer type.
}