Add gpu::HostUnregisterOp

Without explicitly unregistering the host buffer, repeated runs (e.g., when benchmarking the same kernel) fail in CUDA (for example) with the error `'cuMemHostRegister(ptr, sizeBytes, 0)' failed with 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'`. Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D147277
author: max <maksim.levental@gmail.com> 2023-04-06 15:07:12 -0500
committer: max <maksim.levental@gmail.com> 2023-04-06 15:07:12 -0500
commit: 8f7c8a6ea765139225878e1dfe90bc1eb6f0067c (patch)
tree: 496b99f19b90d911ffe4b7b6d05f8b6630f4a15f
parent: e58a49300e757ff61142f6abd227bd1437c1cf87 (diff)
download: llvm-8f7c8a6ea765139225878e1dfe90bc1eb6f0067c.tar.gz
4 files changed, 88 insertions, 0 deletions
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 32ab246c74f0..860e20720afd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">,
   let assemblyFormat = "$value attr-dict `:` type($value)";
 }
 
+def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
+    Arguments<(ins AnyUnrankedMemRef:$value)> {
+  let summary = "Unregisters a memref for access from device.";
+  let description = [{
+      This op unmaps the provided host buffer from the device address space.
+
+      This operation may not be supported in every environment; there is not
+      yet a way to check at runtime whether this feature is supported.
+  }];
+
+  let assemblyFormat = "$value attr-dict `:` type($value)";
+}
+
 def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
   let summary = "Wait for async gpu ops to complete.";
   let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 55a5e4683955..3687bd6718bf 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -161,6 +161,12 @@ protected:
       {llvmIntPtrType /* intptr_t rank */,
        llvmPointerType /* void *memrefDesc */,
        llvmIntPtrType /* intptr_t elementSizeBytes */}};
+  FunctionCallBuilder hostUnregisterCallBuilder = {
+      "mgpuMemHostUnregisterMemRef",
+      llvmVoidType,
+      {llvmIntPtrType /* intptr_t rank */,
+       llvmPointerType /* void *memrefDesc */,
+       llvmIntPtrType /* intptr_t elementSizeBytes */}};
   FunctionCallBuilder allocCallBuilder = {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
@@ -202,6 +208,20 @@ private:
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertHostUnregisterOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> {
+public:
+  ConvertHostUnregisterOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) {
+  }
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
 /// call. Currently it supports CUDA and ROCm (HIP).
 class ConvertAllocOpToGpuRuntimeCallPattern
@@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
+LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Operation *op = hostUnregisterOp.getOperation();
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
+    return failure();
+
+  Location loc = op->getLoc();
+  // The runtime wrapper needs the element size in bytes to scale the offset.
+  auto memRefType = hostUnregisterOp.getValue().getType();
+  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
+  auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+  // Runtime call arguments: the promoted operands, then the element size.
+  auto arguments = getTypeConverter()->promoteOperands(
+      loc, op->getOperands(), adaptor.getOperands(), rewriter);
+  arguments.push_back(elementSize);
+  hostUnregisterCallBuilder.create(loc, rewriter, arguments);
+  // The runtime call stands in for the op, so erase the op itself.
+  rewriter.eraseOp(op);
+  return success();
+}
+
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
   patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
                ConvertDeallocOpToGpuRuntimeCallPattern,
                ConvertHostRegisterOpToGpuRuntimeCallPattern,
+               ConvertHostUnregisterOpToGpuRuntimeCallPattern,
                ConvertMemcpyOpToGpuRuntimeCallPattern,
                ConvertMemsetOpToGpuRuntimeCallPattern,
                ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 44ed5b0cd205..4065c6531669 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -192,6 +192,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
   mgpuMemHostRegister(ptr, sizeBytes);
 }
 
+// Unregisters the host buffer at `ptr` by calling cuMemHostUnregister.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
+  ScopedContext scopedContext;
+  CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
+}
+
+/// Unregisters a memref with the CUDA runtime. `descriptor` points to a ranked
+/// memref descriptor of rank `rank`; the offset is scaled by elementSizeBytes.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+                            StridedMemRefType<char, 1> *descriptor,
+                            int64_t elementSizeBytes) {
+  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  mgpuMemHostUnregister(ptr);
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
   defaultDevice = device;
 }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 43a7e3c62089..bd3868a8e196 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
   mgpuMemHostRegister(ptr, sizeBytes);
 }
 
+// Unregisters the host buffer at `ptr` with the ROCm runtime by calling
+// hipHostUnregister. Helpful until we have transfer functions implemented.
+extern "C" void mgpuMemHostUnregister(void *ptr) {
+  HIP_REPORT_IF_ERROR(hipHostUnregister(ptr));
+}
+
+// Unregisters a MemRef with the ROCm runtime; the descriptor's offset is scaled
+// by elementSizeBytes. Helpful until we have transfer functions implemented.
+extern "C" void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+                            StridedMemRefType<char, 1> *descriptor,
+                            int64_t elementSizeBytes) {
+  auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  mgpuMemHostUnregister(ptr);
+}
+
 template <typename T>
 void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
   HIP_REPORT_IF_ERROR(hipSetDevice(0));
author	max <maksim.levental@gmail.com>	2023-04-06 15:07:12 -0500
committer	max <maksim.levental@gmail.com>	2023-04-06 15:07:12 -0500
commit	8f7c8a6ea765139225878e1dfe90bc1eb6f0067c (patch)
tree	496b99f19b90d911ffe4b7b6d05f8b6630f4a15f
parent	e58a49300e757ff61142f6abd227bd1437c1cf87 (diff)
download	llvm-8f7c8a6ea765139225878e1dfe90bc1eb6f0067c.tar.gz