diff options
author | max <maksim.levental@gmail.com> | 2023-04-06 15:07:12 -0500 |
---|---|---|
committer | max <maksim.levental@gmail.com> | 2023-04-06 15:07:12 -0500 |
commit | 8f7c8a6ea765139225878e1dfe90bc1eb6f0067c (patch) | |
tree | 496b99f19b90d911ffe4b7b6d05f8b6630f4a15f | |
parent | e58a49300e757ff61142f6abd227bd1437c1cf87 (diff) | |
download | llvm-8f7c8a6ea765139225878e1dfe90bc1eb6f0067c.tar.gz |
Add gpu::HostUnregisterOp
Without explicitly unregistering a previously registered host buffer, you will get
```
'cuMemHostRegister(ptr, sizeBytes, 0)' failed with 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'
```
in CUDA (for example) after repeated runs (e.g., when benchmarking the same kernel).
Reviewed By: ftynse
Differential Revision: https://reviews.llvm.org/D147277
-rw-r--r-- | mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 13 | ||||
-rw-r--r-- | mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 43 | ||||
-rw-r--r-- | mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 16 | ||||
-rw-r--r-- | mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp | 16 |
4 files changed, 88 insertions, 0 deletions
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 32ab246c74f0..860e20720afd 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">, let assemblyFormat = "$value attr-dict `:` type($value)"; } +def GPU_HostUnregisterOp : GPU_Op<"host_unregister">, + Arguments<(ins AnyUnrankedMemRef:$value)> { + let summary = "Unregisters a memref for access from device."; + let description = [{ + This op unmaps the provided host buffer from the device address space. + + This operation may not be supported in every environment, there is not yet a + way to check at runtime whether this feature is supported. + }]; + + let assemblyFormat = "$value attr-dict `:` type($value)"; +} + def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { let summary = "Wait for async gpu ops to complete."; let description = [{ diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 55a5e4683955..3687bd6718bf 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -161,6 +161,12 @@ protected: {llvmIntPtrType /* intptr_t rank */, llvmPointerType /* void *memrefDesc */, llvmIntPtrType /* intptr_t elementSizeBytes */}}; + FunctionCallBuilder hostUnregisterCallBuilder = { + "mgpuMemHostUnregisterMemRef", + llvmVoidType, + {llvmIntPtrType /* intptr_t rank */, + llvmPointerType /* void *memrefDesc */, + llvmIntPtrType /* intptr_t elementSizeBytes */}}; FunctionCallBuilder allocCallBuilder = { "mgpuMemAlloc", llvmPointerType /* void * */, @@ -202,6 +208,20 @@ private: ConversionPatternRewriter &rewriter) const override; }; +class ConvertHostUnregisterOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> { +public: + 
ConvertHostUnregisterOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) { + } + +private: + LogicalResult + matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime /// call. Currently it supports CUDA and ROCm (HIP). class ConvertAllocOpToGpuRuntimeCallPattern @@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite( return success(); } +LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + Operation *op = hostUnregisterOp.getOperation(); + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter))) + return failure(); + + Location loc = op->getLoc(); + + auto memRefType = hostUnregisterOp.getValue().getType(); + auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType(); + auto elementSize = getSizeInBytes(loc, elementType, rewriter); + + auto arguments = getTypeConverter()->promoteOperands( + loc, op->getOperands(), adaptor.getOperands(), rewriter); + arguments.push_back(elementSize); + hostUnregisterCallBuilder.create(loc, rewriter, arguments); + + rewriter.eraseOp(op); + return success(); +} + LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::AllocOp allocOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, patterns.add<ConvertAllocOpToGpuRuntimeCallPattern, ConvertDeallocOpToGpuRuntimeCallPattern, ConvertHostRegisterOpToGpuRuntimeCallPattern, + ConvertHostUnregisterOpToGpuRuntimeCallPattern, ConvertMemcpyOpToGpuRuntimeCallPattern, ConvertMemsetOpToGpuRuntimeCallPattern, 
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern, diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 44ed5b0cd205..4065c6531669 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -192,6 +192,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, mgpuMemHostRegister(ptr, sizeBytes); } +// Allows to unregister byte array with the CUDA runtime. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr)); +} + +/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a +/// ranked memref descriptor struct of rank `rank` +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemHostUnregisterMemRef(int64_t rank, + StridedMemRefType<char, 1> *descriptor, + int64_t elementSizeBytes) { + auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostUnregister(ptr); +} + extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; } diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index 43a7e3c62089..bd3868a8e196 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, mgpuMemHostRegister(ptr, sizeBytes); } +// Allows to unregister byte array with the ROCM runtime. Helpful until we have +// transfer functions implemented. +extern "C" void mgpuMemHostUnregister(void *ptr) { + HIP_REPORT_IF_ERROR(hipHostUnregister(ptr)); +} + +// Allows to unregister a MemRef with the ROCm runtime. Helpful until we have +// transfer functions implemented. 
+extern "C" void +mgpuMemHostUnregisterMemRef(int64_t rank, + StridedMemRefType<char, 1> *descriptor, + int64_t elementSizeBytes) { + auto ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostUnregister(ptr); +} + template <typename T> void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) { HIP_REPORT_IF_ERROR(hipSetDevice(0)); |