-rw-r--r--  mlir/include/mlir/Dialect/GPU/IR/GPUBase.td                            |   5
-rw-r--r--  mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h                          |   8
-rw-r--r--  mlir/include/mlir/Dialect/GPU/IR/GPUOps.td                             | 279
-rw-r--r--  mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp                  | 359
-rw-r--r--  mlir/lib/Dialect/GPU/IR/GPUDialect.cpp                                 |   5
-rw-r--r--  mlir/lib/ExecutionEngine/CMakeLists.txt                                |   4
-rw-r--r--  mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp                       | 138
-rw-r--r--  mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir  |  37
-rw-r--r--  mlir/test/Dialect/GPU/ops.mlir                                         |  31
-rw-r--r--  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel                      |   1
10 files changed, 866 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 50fcd197bb9e..e56af3e05048 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -109,6 +109,11 @@ class MMAMatrixOf<list<Type> allowedTypes> :
"::llvm::cast<::mlir::gpu::MMAMatrixType>($_self).getElementType()",
"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;
+// Generic type for all sparse handles (could be refined).
+def GPU_SparseHandle : DialectType<
+ GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::SparseHandleType>()">, "sparse handle type">,
+ BuildableType<"mlir::gpu::SparseHandleType::get($_builder.getContext())">;
+
//===----------------------------------------------------------------------===//
// GPU Interfaces.
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 4d0208b61dcc..64b8f8f6e8b3 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -163,6 +163,14 @@ public:
// Adds a `gpu.async.token` to the front of the argument list.
void addAsyncDependency(Operation *op, Value token);
+// Represents any sparse handle.
+class SparseHandleType
+ : public Type::TypeBase<SparseHandleType, Type, TypeStorage> {
+public:
+ // Used for generic hooks in TypeBase.
+ using Base::Base;
+};
+
} // namespace gpu
} // namespace mlir
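
A minimal C++ sketch (not part of this change) of how the new parameterless handle type behaves once the dialect is loaded; it is constructed the same way the `BuildableType` in GPUBase.td builds it and round-trips through the parser/printer added in GPUDialect.cpp below:

```c++
#include <cassert>

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/MLIRContext.h"

int main() {
  mlir::MLIRContext ctx;
  ctx.loadDialect<mlir::gpu::GPUDialect>();
  // Parameterless type backed by the default TypeStorage; printed as
  // !gpu.sparse.handle and parsed back from the "sparse.handle" keyword.
  mlir::Type handle = mlir::gpu::SparseHandleType::get(&ctx);
  assert(handle.isa<mlir::gpu::SparseHandleType>());
  (void)handle;
  return 0;
}
```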
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index f682346d8108..982ec0c6d4c8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1533,4 +1533,283 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
}];
}
+//
+// Operations on sparse matrices, called from the host
+// (currently lowers to cuSPARSE for CUDA only; no ROCm lowering).
+//
+
+def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> {
+ let summary = "Create sparse environment operation";
+ let description = [{
+ The `gpu.create_sparse_env` operation initializes a sparse environment.
+ It must be executed prior to any other sparse operation. The operation
+ returns a handle to the new sparse environment.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token in addition to the environment.
+
+ Example:
+
+ ```mlir
+ %env, %token = gpu.create_sparse_env async [%dep]
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+ let results = (outs Res<GPU_SparseHandle>:$env, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
+ }];
+}
+
+def GPU_DestroySparseEnvOp : GPU_Op<"destroy_sparse_env", [GPU_AsyncOpInterface]> {
+ let summary = "Destroy sparse environment operation";
+ let description = [{
+ The `gpu.destroy_sparse_env` operation releases all resources of a sparse
+ environment represented by a handle that was previously created by a
+ `gpu.create_sparse_env` operation.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.destroy_sparse_env async [%dep] %env
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Arg<GPU_SparseHandle>:$env);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $env attr-dict
+ }];
+}
+
+def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
+ let summary = "Create dense vector operation";
+ let description = [{
+ The `gpu.create_dn_vec` operation initializes a dense vector from
+ the given values buffer and size. The buffer must already be copied
+ from the host to the device prior to using this operation. The
+ operation returns a handle to the dense vector descriptor.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the vector descriptor.
+
+ Example:
+
+ ```mlir
+ %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ AnyMemRef:$memref, Index:$size);
+ let results = (outs Res<GPU_SparseHandle>:$dvec, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $memref `,` $size attr-dict `:` type($memref)
+ }];
+}
+
+def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
+ let summary = "Destroy dense vector operation";
+ let description = [{
+    The `gpu.destroy_dn_vec` operation releases all resources of a dense
+ vector represented by a handle that was previously created by a
+ `gpu.create_dn_vec` operation.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.destroy_dn_vec async [%dep] %dvec
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Arg<GPU_SparseHandle>:$dvec);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dvec attr-dict
+ }];
+}
+
+def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
+ let summary = "Create sparse matrix in COO format operation";
+ let description = [{
+ The `gpu.create_coo` operation initializes a sparse matrix in COO format
+ with the given sizes from the given index and values buffers. The buffers
+ must already be copied from the host to the device prior to using this
+ operation. The operation returns a handle to the sparse matrix descriptor.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the matrix descriptor.
+
+ Example:
+
+ ```mlir
+ %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx,
+ %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Index:$rows,
+ Index:$cols,
+ Index:$nnz,
+ AnyMemRef:$rowIdxs,
+ AnyMemRef:$colIdxs,
+ AnyMemRef:$values);
+ let results = (outs Res<GPU_SparseHandle>:$spmat, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict
+ `:` type($rowIdxs) `,` type($colIdxs) `,` type($values)
+ }];
+}
+
+def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
+ let summary = "Create sparse matrix in CSR format operation";
+ let description = [{
+ The `gpu.create_csr` operation initializes a sparse matrix in CSR format
+ with the given sizes from the given position, index, and values buffers.
+ The buffers must already be copied from the host to the device prior to
+ using this operation. The operation returns a handle to the sparse
+ matrix descriptor.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the matrix descriptor.
+
+ Example:
+
+ ```mlir
+ %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos,
+ %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Index:$rows,
+ Index:$cols,
+ Index:$nnz,
+ AnyMemRef:$rowPos,
+ AnyMemRef:$colIdxs,
+ AnyMemRef:$values);
+ let results = (outs Res<GPU_SparseHandle>:$spmat, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict
+ `:` type($rowPos) `,` type($colIdxs) `,` type($values)
+ }];
+}
+
+def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
+ let summary = "Destroy sparse matrix operation";
+ let description = [{
+ The `gpu.destroy_sp_mat` operation releases all resources of a sparse
+    matrix represented by a handle that was previously created by one of
+    the sparse matrix creation operations.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.destroy_sp_mat async [%dep] %spmat
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Arg<GPU_SparseHandle>:$spmat);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $spmat attr-dict
+ }];
+}
+
+def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
+  let summary = "Precompute buffer size for SpMV operation";
+ let description = [{
+ The `gpu.spmv_buffer_size` operation returns the buffer size required
+ to perform the SpMV operation on the given sparse matrix and dense vectors.
+ The operation expects handles returned by previous sparse operations
+ to construct an environment and the operands for SpMV.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+ Example:
+
+ ```mlir
+    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA, %dnX, %dnY
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ GPU_SparseHandle:$env,
+ GPU_SparseHandle:$spmatA,
+ GPU_SparseHandle:$dnX,
+ GPU_SparseHandle:$dnY);
+ let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $env `,` $spmatA `,` $dnX `,` $dnY attr-dict
+ }];
+}
+
+def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
+ let summary = "SpMV operation";
+ let description = [{
+ The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix,
+ dense vectors, and buffer. The operation expects handles returned by previous
+ sparse operations to construct an environment and the operands for SpMV. The
+ buffer must have been allocated on the device.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY : memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ GPU_SparseHandle:$env,
+ GPU_SparseHandle:$spmatA,
+ GPU_SparseHandle:$dnX,
+ GPU_SparseHandle:$dnY,
+ AnyMemRef:$buffer);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
+ }];
+}
+
#endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 43dff49e1cae..033d8c933539 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -192,6 +192,49 @@ protected:
"mgpuSetDefaultDevice",
llvmVoidType,
{llvmInt32Type /* uint32_t devIndex */}};
+ FunctionCallBuilder createSparseEnvCallBuilder = {
+ "mgpuCreateSparseEnv",
+ llvmPointerType,
+ {llvmPointerType /* void *stream */}};
+ FunctionCallBuilder destroySparseEnvCallBuilder = {
+ "mgpuDestroySparseEnv",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createDnVecCallBuilder = {
+ "mgpuCreateDnVec",
+ llvmPointerType,
+ {llvmIntPtrType, llvmPointerType, llvmInt32Type,
+ llvmPointerType /* void *stream */}};
+ FunctionCallBuilder destroyDnVecCallBuilder = {
+ "mgpuDestroyDnVec",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createCooCallBuilder = {
+ "mgpuCreateCoo",
+ llvmPointerType,
+ {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+ llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+ llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createCsrCallBuilder = {
+ "mgpuCreateCsr",
+ llvmPointerType,
+ {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+ llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+ llvmInt32Type, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder destroySpMatCallBuilder = {
+ "mgpuDestroySpMat",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder spMVBufferSizeCallBuilder = {
+ "mgpuSpMVBufferSize",
+ llvmIntPtrType,
+ {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmPointerType /* void *stream */}};
+ FunctionCallBuilder spMVCallBuilder = {
+ "mgpuSpMV",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmPointerType /* void *stream */}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -381,6 +424,121 @@ public:
matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
+
+class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp> {
+public:
+ ConvertCreateSparseEnvOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp>(
+ typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp> {
+public:
+ ConvertDestroySparseEnvOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp>(
+ typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertCreateDnVecOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
+public:
+ ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp> {
+public:
+ ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertCreateCooOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp> {
+public:
+ ConvertCreateCooOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::CreateCooOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertCreateCsrOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp> {
+public:
+ ConvertCreateCsrOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::CreateCsrOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroySpMatOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
+public:
+ ConvertDestroySpMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::DestroySpMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp> {
+public:
+ ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp>(typeConverter) {
+ }
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertSpMVOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp> {
+public:
+ ConvertSpMVOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::SpMVOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
@@ -959,6 +1117,191 @@ LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ auto handle =
+ createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
+LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateDnVecOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pVec =
+ MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers())
+ pVec = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVec);
+ Type dType = op.getMemref().getType().cast<MemRefType>().getElementType();
+ auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ dType.getIntOrFloatBitWidth());
+ auto handle =
+ createDnVecCallBuilder
+ .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::DestroyDnVecOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
+LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateCooOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pRowIdxs =
+ MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
+ Value pColIdxs =
+ MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
+ Value pValues =
+ MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers()) {
+ pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
+ pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
+ pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+ }
+ Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
+ Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
+ auto iw = rewriter.create<LLVM::ConstantOp>(
+ loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
+ auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ dType.getIntOrFloatBitWidth());
+ auto handle =
+ createCooCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
+ pRowIdxs, pColIdxs, pValues, iw, dw, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateCsrOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pRowPos =
+ MemRefDescriptor(adaptor.getRowPos()).allocatedPtr(rewriter, loc);
+ Value pColIdxs =
+ MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
+ Value pValues =
+ MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers()) {
+ pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
+ pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
+ pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+ }
+ Type pType = op.getRowPos().getType().cast<MemRefType>().getElementType();
+ Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
+ Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
+ auto pw = rewriter.create<LLVM::ConstantOp>(
+ loc, llvmInt32Type, pType.isIndex() ? 64 : pType.getIntOrFloatBitWidth());
+ auto iw = rewriter.create<LLVM::ConstantOp>(
+ loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
+ auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ dType.getIntOrFloatBitWidth());
+ auto handle =
+ createCsrCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
+ pRowPos, pColIdxs, pValues, pw, iw, dw, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::DestroySpMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
+LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ auto bufferSize = spMVBufferSizeCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getEnv(), adaptor.getSpmatA(),
+ adaptor.getDnX(), adaptor.getDnY(), stream})
+ .getResult();
+ rewriter.replaceOp(op, {bufferSize, stream});
+ return success();
+}
+
+LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::SpMVOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pBuf =
+ MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers())
+ pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
+ spMVCallBuilder.create(loc, rewriter,
+ {adaptor.getEnv(), adaptor.getSpmatA(),
+ adaptor.getDnX(), adaptor.getDnY(), pBuf, stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,
@@ -967,6 +1310,11 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
return converter.getPointerType(
IntegerType::get(&converter.getContext(), 8));
});
+ converter.addConversion([&converter](gpu::SparseHandleType type) -> Type {
+ return converter.getPointerType(
+ IntegerType::get(&converter.getContext(), 8));
+ });
+
patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
@@ -976,7 +1324,16 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern,
- ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
+ ConvertAsyncYieldToGpuRuntimeCallPattern,
+ ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
+ ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
+ ConvertCreateDnVecOpToGpuRuntimeCallPattern,
+ ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
+ ConvertCreateCooOpToGpuRuntimeCallPattern,
+ ConvertCreateCsrOpToGpuRuntimeCallPattern,
+ ConvertDestroySpMatOpToGpuRuntimeCallPattern,
+ ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
+ ConvertSpMVOpToGpuRuntimeCallPattern>(converter);
patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
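
Read together, the FunctionCallBuilder declarations above fix the C interface the lowered IR calls into at runtime; the matching definitions are added to CudaRuntimeWrappers.cpp further down in this change. As a reading aid, here is a hedged C++ rendering of that interface, with `llvmPointerType` written as `void *` and `llvmIntPtrType` as `intptr_t` (the declarations are reconstructed from the call builders, not copied from a header):

```c++
#include <cstdint>

// Runtime entry points implied by the call builders above (sketch only).
extern "C" {
void *mgpuCreateSparseEnv(void *stream);
void mgpuDestroySparseEnv(void *env, void *stream);
void *mgpuCreateDnVec(intptr_t size, void *values, int32_t dw, void *stream);
void mgpuDestroyDnVec(void *vec, void *stream);
void *mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
                    void *colIdxs, void *values, int32_t iw, int32_t dw,
                    void *stream);
void *mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
                    void *colIdxs, void *values, int32_t pw, int32_t iw,
                    int32_t dw, void *stream);
void mgpuDestroySpMat(void *mat, void *stream);
intptr_t mgpuSpMVBufferSize(void *env, void *mat, void *x, void *y,
                            void *stream);
void mgpuSpMV(void *env, void *mat, void *x, void *y, void *buf, void *stream);
}
```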
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index e5e0327ebf1c..ce502401b86a 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -146,6 +146,7 @@ struct GPUInlinerInterface : public DialectInlinerInterface {
void GPUDialect::initialize() {
addTypes<AsyncTokenType>();
addTypes<MMAMatrixType>();
+ addTypes<SparseHandleType>();
addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
@@ -200,6 +201,9 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
shape, elementType, operand);
}
+ if (keyword == "sparse.handle")
+ return SparseHandleType::get(context);
+
parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
return Type();
}
@@ -207,6 +211,7 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
TypeSwitch<Type>(type)
.Case<AsyncTokenType>([&](Type) { os << "async.token"; })
+ .Case<SparseHandleType>([&](Type) { os << "sparse.handle"; })
.Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
os << "mma_matrix<";
auto shape = fragTy.getShape();
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 14d8ed2095e0..369fd1b8ca77 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -190,6 +190,9 @@ if(LLVM_ENABLE_PIC)
# We need the libcuda.so library.
find_library(CUDA_RUNTIME_LIBRARY cuda)
+ # We need the libcusparse.so library.
+ find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+
add_mlir_library(mlir_cuda_runtime
SHARED
CudaRuntimeWrappers.cpp
@@ -204,6 +207,7 @@ if(LLVM_ENABLE_PIC)
target_link_libraries(mlir_cuda_runtime
PRIVATE
${CUDA_RUNTIME_LIBRARY}
+ ${CUDA_CUSPARSE_LIBRARY}
)
endif()
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 4065c6531669..5040afb0915a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -17,6 +17,7 @@
#include <stdio.h>
#include "cuda.h"
+#include "cusparse.h"
#ifdef _WIN32
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
@@ -35,6 +36,15 @@
fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
}(expr)
+#define CUSPARSE_REPORT_IF_ERROR(expr) \
+ { \
+ cusparseStatus_t status = (expr); \
+ if (status != CUSPARSE_STATUS_SUCCESS) { \
+ fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \
+ cusparseGetErrorString(status)); \
+ } \
+ }
+
thread_local static int32_t defaultDevice = 0;
// Make the primary context of the current default device current for the
@@ -158,7 +168,9 @@ extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count,
value, count, stream));
}
+///
/// Helper functions for writing mlir example code
+///
// Allows to register byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
@@ -211,3 +223,129 @@ mgpuMemHostUnregisterMemRef(int64_t rank,
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
}
+
+///
+/// Wrapper methods for the cuSparse library.
+///
+
+static inline cudaDataType_t dataTp(int32_t width) {
+ switch (width) {
+ case 32:
+ return CUDA_R_32F;
+ default:
+ return CUDA_R_64F;
+ }
+}
+
+static inline cusparseIndexType_t idxTp(int32_t width) {
+ switch (width) {
+ case 32:
+ return CUSPARSE_INDEX_32I;
+ default:
+ return CUSPARSE_INDEX_64I;
+ }
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateSparseEnv(CUstream /*stream*/) {
+ cusparseHandle_t handle = nullptr;
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle))
+ return reinterpret_cast<void *>(handle);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySparseEnv(void *h, CUstream /*stream*/) {
+ cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateDnVec(intptr_t size, void *values, int32_t dw, CUstream /*stream*/) {
+ cusparseDnVecDescr_t vec = nullptr;
+ cudaDataType_t dtp = dataTp(dw);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dtp))
+ return reinterpret_cast<void *>(vec);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyDnVec(void *v, CUstream /*stream*/) {
+ cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v);
+ CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dw,
+ CUstream /*stream*/) {
+ cusparseDnMatDescr_t mat = nullptr;
+ cudaDataType_t dtp = dataTp(dw);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols,
+ values, dtp, CUSPARSE_ORDER_ROW))
+ return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyDnMat(void *m, CUstream /*stream*/) {
+ cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m);
+ CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
+ void *colIdxs, void *values, int32_t iw, int32_t dw,
+ CUstream /*stream*/) {
+ cusparseSpMatDescr_t mat = nullptr;
+ cusparseIndexType_t itp = idxTp(iw);
+ cudaDataType_t dtp = dataTp(dw);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs,
+ colIdxs, values, itp,
+ CUSPARSE_INDEX_BASE_ZERO, dtp))
+ return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
+ void *colIdxs, void *values, int32_t pw, int32_t iw, int32_t dw,
+ CUstream /*stream*/) {
+ cusparseSpMatDescr_t mat = nullptr;
+ cusparseIndexType_t ptp = idxTp(pw);
+ cusparseIndexType_t itp = idxTp(iw);
+ cudaDataType_t dtp = dataTp(dw);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos,
+ colIdxs, values, ptp, itp,
+ CUSPARSE_INDEX_BASE_ZERO, dtp))
+ return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySpMat(void *m, CUstream /*stream*/) {
+ cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
+ CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMVBufferSize(void *h, void *a, void *x, void *y, CUstream /*stream*/) {
+ cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+ cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
+ cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
+ double alpha = 1.0;
+ double beta = 1.0;
+ size_t bufferSize = 0;
+ CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
+ handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
+ CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+ return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
+ cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+ cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
+ cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
+ double alpha = 1.0;
+ double beta = 1.0;
+ CUSPARSE_REPORT_IF_ERROR(
+ cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
+ &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, b))
+}
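
The wrappers are intended to be invoked in the same order as the lowered `gpu` ops in the tests below. A hedged host-side sketch of that sequence, written directly against the functions defined above (the device pointers, sizes, and `stream` are hypothetical arguments that would come from earlier `mgpuStreamCreate`/`mgpuMemAlloc` calls, and the extern "C" declarations are assumed to be visible, e.g. from the interface sketch after the conversion patterns):

```c++
#include <cstdint>

// Sketch: y += A * x with A in COO form (alpha = beta = 1.0 in the wrappers);
// all pointers refer to device memory that was populated beforehand.
void sparseMatVec(intptr_t n, intptr_t nnz, void *dRowIdxs, void *dColIdxs,
                  void *dValues, void *dX, void *dY, void *dBuffer,
                  void *stream) {
  void *env = mgpuCreateSparseEnv(stream);
  // 64-bit indices (MLIR index type) and f64 values, as in the lowering above.
  void *A = mgpuCreateCoo(n, n, nnz, dRowIdxs, dColIdxs, dValues,
                          /*iw=*/64, /*dw=*/64, stream);
  void *x = mgpuCreateDnVec(n, dX, /*dw=*/64, stream);
  void *y = mgpuCreateDnVec(n, dY, /*dw=*/64, stream);
  intptr_t sz = mgpuSpMVBufferSize(env, A, x, y, stream);
  (void)sz; // dBuffer is assumed to provide at least sz bytes of workspace.
  mgpuSpMV(env, A, x, y, dBuffer, stream);
  mgpuDestroyDnVec(y, stream);
  mgpuDestroyDnVec(x, stream);
  mgpuDestroySpMat(A, stream);
  mgpuDestroySparseEnv(env, stream);
}
```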
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
new file mode 100644
index 000000000000..6f163f926939
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
+
+module attributes {gpu.container_module} {
+
+ // CHECK-LABEL: func @matvec
+ // CHECK: llvm.call @mgpuStreamCreate
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuCreateSparseEnv
+ // CHECK: llvm.call @mgpuCreateCoo
+ // CHECK: llvm.call @mgpuCreateDnVec
+ // CHECK: llvm.call @mgpuSpMVBufferSize
+  // CHECK: llvm.call @mgpuSpMV
+ // CHECK: llvm.call @mgpuDestroySpMat
+ // CHECK: llvm.call @mgpuDestroyDnVec
+ // CHECK: llvm.call @mgpuDestroySparseEnv
+ // CHECK: llvm.call @mgpuStreamSynchronize
+ // CHECK: llvm.call @mgpuStreamDestroy
+ func.func @matvec(%arg0: index) {
+ %token0 = gpu.wait async
+ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+ %env, %token3 = gpu.create_sparse_env async [%token2]
+ %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref<?xf64>
+ %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec
+ %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+ %token8 = gpu.destroy_sp_mat async [%token7] %spmat
+ %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+ %token10 = gpu.destroy_sparse_env async [%token9] %env
+ gpu.wait [%token10]
+ return
+ }
+
+}
+
+
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 5bb5efb2a449..00e2421c0283 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -317,6 +317,37 @@ module attributes {gpu.container_module} {
gpu.set_default_device %arg0
return
}
+
+ // CHECK-LABEL: func @sparse_ops
+ func.func @sparse_ops(%arg0: index) {
+ // CHECK: gpu.wait async
+ %token0 = gpu.wait async
+ // CHECK: gpu.alloc async
+ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+ // CHECK: gpu.alloc async
+ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+ // CHECK: gpu.create_sparse_env async
+ %env, %token3 = gpu.create_sparse_env async [%token2]
+ // CHECK: gpu.create_coo async
+ %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ // CHECK: gpu.create_csr async
+ %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ // CHECK: gpu.create_dn_vec async
+ %dnvec, %token6 = gpu.create_dn_vec async [%token5] %mem2, %arg0 : memref<?xf64>
+ // CHECK: gpu.spmv_buffer_size async
+ %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec
+ // CHECK: gpu.spmv async
+ %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+ // CHECK: gpu.destroy_sp_mat async
+ %token9 = gpu.destroy_sp_mat async [%token8] %spmat
+ // CHECK: gpu.destroy_dn_vec async
+ %token10 = gpu.destroy_dn_vec async [%token9] %dnvec
+ // CHECK: gpu.destroy_sparse_env async
+ %token11 = gpu.destroy_sparse_env async [%token10] %env
+ // CHECK: gpu.wait
+ gpu.wait [%token11]
+ return
+ }
}
// Just check that this doesn't crash.
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 7036cd5f3013..c661dd4decab 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7758,6 +7758,7 @@ cc_library(
":LLVMSupportHeaders",
":mlir_c_runner_utils",
"@cuda//:cuda_headers",
+ "@cuda//:cusparse_static",
"@cuda//:libcuda",
],
)