summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author max <maksim.levental@gmail.com> 2023-04-06 15:07:12 -0500
committer max <maksim.levental@gmail.com> 2023-04-06 15:07:12 -0500
commit 8f7c8a6ea765139225878e1dfe90bc1eb6f0067c (patch)
tree 496b99f19b90d911ffe4b7b6d05f8b6630f4a15f
parent e58a49300e757ff61142f6abd227bd1437c1cf87 (diff)
download llvm-8f7c8a6ea765139225878e1dfe90bc1eb6f0067c.tar.gz
Add gpu::HostUnregisterOp
Without explicitly unregistering you will get

```
'cuMemHostRegister(ptr, sizeBytes, 0)' failed with 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'
```

in CUDA (for example) after repeated runs (e.g., when benchmarking the same kernel).

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D147277
-rw-r--r-- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td 13
-rw-r--r-- mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp 43
-rw-r--r-- mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp 16
-rw-r--r-- mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp 16
4 files changed, 88 insertions, 0 deletions
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 32ab246c74f0..860e20720afd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">,
let assemblyFormat = "$value attr-dict `:` type($value)";
}
+def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
+ Arguments<(ins AnyUnrankedMemRef:$value)> {
+ let summary = "Unregisters a memref for access from device.";
+ let description = [{
+ This op unmaps the provided host buffer from the device address space.
+
+ This operation may not be supported in every environment; there is not yet a
+ way to check at runtime whether this feature is supported.
+ }];
+
+ let assemblyFormat = "$value attr-dict `:` type($value)";
+}
+
def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
let summary = "Wait for async gpu ops to complete.";
let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 55a5e4683955..3687bd6718bf 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -161,6 +161,12 @@ protected:
{llvmIntPtrType /* intptr_t rank */,
llvmPointerType /* void *memrefDesc */,
llvmIntPtrType /* intptr_t elementSizeBytes */}};
+ FunctionCallBuilder hostUnregisterCallBuilder = {
+ "mgpuMemHostUnregisterMemRef",
+ llvmVoidType,
+ {llvmIntPtrType /* intptr_t rank */,
+ llvmPointerType /* void *memrefDesc */,
+ llvmIntPtrType /* intptr_t elementSizeBytes */}};
FunctionCallBuilder allocCallBuilder = {
"mgpuMemAlloc",
llvmPointerType /* void * */,
@@ -202,6 +208,20 @@ private:
ConversionPatternRewriter &rewriter) const override;
};
+class ConvertHostUnregisterOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> {
+public:
+ ConvertHostUnregisterOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) {
+ }
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
@@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ Operation *op = hostUnregisterOp.getOperation();
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
+ return failure();
+
+ Location loc = op->getLoc();
+
+ auto memRefType = hostUnregisterOp.getValue().getType();
+ auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
+ auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+
+ auto arguments = getTypeConverter()->promoteOperands(
+ loc, op->getOperands(), adaptor.getOperands(), rewriter);
+ arguments.push_back(elementSize);
+ hostUnregisterCallBuilder.create(loc, rewriter, arguments);
+
+ rewriter.eraseOp(op);
+ return success();
+}
+
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::AllocOp allocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
+ ConvertHostUnregisterOpToGpuRuntimeCallPattern,
ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertMemsetOpToGpuRuntimeCallPattern,
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 44ed5b0cd205..4065c6531669 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -192,6 +192,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
mgpuMemHostRegister(ptr, sizeBytes);
}
+// Unregisters a byte array with the CUDA runtime.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
+ ScopedContext scopedContext;
+ CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
+}
+
+/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
+/// ranked memref descriptor struct of rank `rank`.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+ StridedMemRefType<char, 1> *descriptor,
+ int64_t elementSizeBytes) {
+ auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+ mgpuMemHostUnregister(ptr);
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
}
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 43a7e3c62089..bd3868a8e196 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
mgpuMemHostRegister(ptr, sizeBytes);
}
+// Unregisters a byte array with the ROCm runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mgpuMemHostUnregister(void *ptr) {
+ HIP_REPORT_IF_ERROR(hipHostUnregister(ptr));
+}
+
+// Unregisters a MemRef with the ROCm runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+ StridedMemRefType<char, 1> *descriptor,
+ int64_t elementSizeBytes) {
+ auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+ mgpuMemHostUnregister(ptr);
+}
+
template <typename T>
void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
HIP_REPORT_IF_ERROR(hipSetDevice(0));