[CIR][HIP|CUDA] Generate global storing CUDA|HIP stub function #1341

Merged
merged 1 commit on Feb 19, 2025
136 changes: 116 additions & 20 deletions clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -14,14 +14,35 @@

#include "CIRGenCUDARuntime.h"
#include "CIRGenFunction.h"
#include "mlir/IR/Operation.h"
#include "clang/Basic/Cuda.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;
using namespace clang::CIRGen;

CIRGenCUDARuntime::~CIRGenCUDARuntime() {}

CIRGenCUDARuntime::CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {
if (cgm.getLangOpts().OffloadViaLLVM)
llvm_unreachable("NYI");
else if (cgm.getLangOpts().HIP)
Prefix = "hip";
else
Prefix = "cuda";
}

std::string CIRGenCUDARuntime::addPrefixToName(StringRef FuncName) const {
return (Prefix + FuncName).str();
}
std::string
CIRGenCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
return ("__" + Prefix + FuncName).str();
}

void CIRGenCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf,
cir::FuncOp fn,
FunctionArgList &args) {
@@ -31,16 +52,14 @@ void CIRGenCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf,
void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
cir::FuncOp fn,
FunctionArgList &args) {
if (cgm.getLangOpts().HIP)
llvm_unreachable("NYI");

// This requires arguments to be sent to kernels in a different way.
if (cgm.getLangOpts().OffloadViaLLVM)
llvm_unreachable("NYI");

auto &builder = cgm.getBuilder();

// For cudaLaunchKernel, we must add another layer of indirection
// For [cuda|hip]LaunchKernel, we must add another layer of indirection
// to arguments. For example, for function `add(int a, float b)`,
// we need to pass it as `void *args[2] = { &a, &b }`.

@@ -71,7 +90,8 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
LangOptions::GPUDefaultStreamKind::PerThread)
llvm_unreachable("NYI");

std::string launchAPI = "cudaLaunchKernel";
std::string launchAPI = addPrefixToName("LaunchKernel");
const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchAPI);
FunctionDecl *launchFD = nullptr;
for (auto *result : dc->lookup(&launchII)) {
@@ -86,11 +106,11 @@
}

// Use this function to retrieve arguments for cudaLaunchKernel:
// int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
// int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
// *sharedMem, cudaStream_t *stream)
//
// Here cudaStream_t, while also being the 6th argument of cudaLaunchKernel,
// is a pointer to some opaque struct.
// Here [cuda|hip]Stream_t, while also being the 6th argument of
// [cuda|hip]LaunchKernel, is a pointer to some opaque struct.

mlir::Type dim3Ty =
cgf.getTypes().convertType(launchFD->getParamDecl(1)->getType());
@@ -114,26 +134,45 @@
cir::FuncType::get({gridDim.getType(), blockDim.getType(),
sharedMem.getType(), stream.getType()},
cgm.SInt32Ty),
"__cudaPopCallConfiguration");
addUnderscoredPrefixToName("PopCallConfiguration"));
cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});

// Now emit the call to cudaLaunchKernel
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
// [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
// dim3 blockDim,
// void **args, size_t sharedMem,
// cudaStream_t stream);
auto kernelTy =
cir::PointerType::get(&cgm.getMLIRContext(), fn.getFunctionType());
// [cuda|hip]Stream_t stream);

mlir::Value kernel =
builder.create<cir::GetGlobalOp>(loc, kernelTy, fn.getSymName());
mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
// We now pick either the function itself (CUDA) or the stub global (HIP).
auto kernel = [&]() {
if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
KernelHandles[fn.getSymName()])) {
auto kernelTy =
cir::PointerType::get(&cgm.getMLIRContext(), globalOp.getSymType());
mlir::Value kernel = builder.create<cir::GetGlobalOp>(
loc, kernelTy, globalOp.getSymName());
return kernel;
}
if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
KernelHandles[fn.getSymName()])) {
auto kernelTy = cir::PointerType::get(&cgm.getMLIRContext(),
funcOp.getFunctionType());
mlir::Value kernel =
builder.create<cir::GetGlobalOp>(loc, kernelTy, funcOp.getSymName());
mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
return func;
}
llvm_unreachable("Expected stub handle to be cir::GlobalOp or cir::FuncOp");
}();
CallArgList launchArgs;

mlir::Value kernelArgsDecayed =
builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
cir::PointerType::get(cgm.VoidPtrTy));

launchArgs.add(RValue::get(func), launchFD->getParamDecl(0)->getType());
launchArgs.add(RValue::get(kernel), launchFD->getParamDecl(0)->getType());
launchArgs.add(
RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
launchFD->getParamDecl(1)->getType());
@@ -157,13 +196,16 @@

void CIRGenCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args) {
// Device stub and its handle might be different.
if (cgm.getLangOpts().HIP)
llvm_unreachable("NYI");

if (auto globalOp =
llvm::dyn_cast<cir::GlobalOp>(KernelHandles[fn.getSymName()])) {
auto symbol = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
// Set the initializer for the global
cgm.setInitializer(globalOp, symbol);
}
// CUDA 9.0 changed the way to launch kernels.
if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
(cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
cgm.getLangOpts().OffloadViaLLVM)
emitDeviceStubBodyNew(cgf, fn, args);
else
@@ -189,3 +231,57 @@ RValue CIRGenCUDARuntime::emitCUDAKernelCallExpr(CIRGenFunction &cgf,

return RValue::get(nullptr);
}

mlir::Operation *CIRGenCUDARuntime::getKernelHandle(cir::FuncOp fn,
GlobalDecl GD) {

// Check if we already have a kernel handle for this function
auto Loc = KernelHandles.find(fn.getSymName());
if (Loc != KernelHandles.end()) {
auto OldHandle = Loc->second;
// Here we know that fn did not change. Return it.
if (KernelStubs[OldHandle] == fn)
return OldHandle;

// We've found the function name, but fn itself has changed, so we need to
// update the references.
if (cgm.getLangOpts().HIP) {
// For HIP compilation the handle itself does not change, so we only need
// to update the Stub value.
KernelStubs[OldHandle] = fn;
return OldHandle;
}
// For non-HIP compilation, erase the old Stub and fall-through to creating
// new entries.
KernelStubs.erase(OldHandle);
}

// If not targeting HIP, store the function itself
if (!cgm.getLangOpts().HIP) {
KernelHandles[fn.getSymName()] = fn;
KernelStubs[fn] = fn;
return fn;
}

// Create a new CIR global variable to represent the kernel handle
auto &builder = cgm.getBuilder();
auto globalName = cgm.getMangledName(
GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
auto globalOp = cgm.getOrInsertGlobal(
fn->getLoc(), globalName, fn.getFunctionType(), [&] {
return CIRGenModule::createGlobalOp(
cgm, fn->getLoc(), globalName,
builder.getPointerTo(fn.getFunctionType()), true, /* addrSpace=*/{},
/*insertPoint=*/nullptr, fn.getLinkage());
});

globalOp->setAttr("alignment", builder.getI64IntegerAttr(
cgm.getPointerAlign().getQuantity()));
globalOp->setAttr("visibility", fn->getAttr("sym_visibility"));

// Store references
KernelHandles[fn.getSymName()] = globalOp;
KernelStubs[globalOp] = fn;

return globalOp;
}
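
For reference, the host-side stub that emitDeviceStubBodyNew models is roughly the following hand-written C++ (a sketch, not code emitted by this patch). The kernel add(int, float) is hypothetical, the dim3/hipStream_t/hipError_t declarations are assumed to come from a mock header such as the test's ../Inputs/cuda.h, and the two runtime entry points are shown with their HIP prefixes (the cuda* counterparts are analogous).

#include "../Inputs/cuda.h" // assumed mock: dim3, hipStream_t, hipError_t
#include <cstddef>

extern "C" int __hipPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                         size_t *sharedMem,
                                         hipStream_t *stream);
extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
                                      dim3 blockDim, void **args,
                                      size_t sharedMem, hipStream_t stream);

// Hypothetical host stub for: __global__ void add(int a, float b);
void __device_stub__add(int a, float b) {
  // Kernel arguments get one extra level of indirection.
  void *args[2] = {&a, &b};

  // Retrieve the launch configuration pushed by the <<<...>>> call site.
  dim3 gridDim, blockDim;
  size_t sharedMem;
  hipStream_t stream;
  __hipPopCallConfiguration(&gridDim, &blockDim, &sharedMem, &stream);

  // Forward everything to the runtime. The first argument is the kernel
  // handle: the stub itself for CUDA, the handle global for HIP.
  hipLaunchKernel((const void *)&__device_stub__add, gridDim, blockDim, args,
                  sharedMem, stream);
}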
14 changes: 13 additions & 1 deletion clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -29,15 +29,26 @@ class ReturnValueSlot;
class CIRGenCUDARuntime {
protected:
CIRGenModule &cgm;
StringRef Prefix;

// Map a device stub function to the symbol used to identify the kernel in
// host code. For CUDA, that symbol is the device stub function itself; for
// HIP, the two are different.
llvm::DenseMap<StringRef, mlir::Operation *> KernelHandles;

// Map a kernel handle to the kernel stub.
llvm::DenseMap<mlir::Operation *, mlir::Operation *> KernelStubs;

private:
void emitDeviceStubBodyLegacy(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args);
void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args);
std::string addPrefixToName(StringRef FuncName) const;
std::string addUnderscoredPrefixToName(StringRef FuncName) const;

public:
CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {}
CIRGenCUDARuntime(CIRGenModule &cgm);
virtual ~CIRGenCUDARuntime();

virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
@@ -46,6 +57,7 @@ class CIRGenCUDARuntime {
virtual RValue emitCUDAKernelCallExpr(CIRGenFunction &cgf,
const CUDAKernelCallExpr *expr,
ReturnValueSlot retValue);
virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD);
};

} // namespace clang::CIRGen
28 changes: 18 additions & 10 deletions clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -651,9 +651,10 @@ void CIRGenModule::emitGlobalFunctionDefinition(GlobalDecl GD,

// Get or create the prototype for the function.
auto Fn = dyn_cast_if_present<cir::FuncOp>(Op);
if (!Fn || Fn.getFunctionType() != Ty)
Fn = GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/true,
ForDefinition);
if (!Fn || Fn.getFunctionType() != Ty) {
Fn = GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
/*DontDefer=*/true, ForDefinition);
}

// Already emitted.
if (!Fn.isDeclaration())
@@ -2356,10 +2357,17 @@ cir::FuncOp CIRGenModule::GetAddrOfFunction(clang::GlobalDecl GD, mlir::Type Ty,

// As __global__ functions (kernels) always reside on device,
// when we access them from host, we must refer to the kernel handle.
// For CUDA, it's just the device stub. For HIP, it's something different.
if (langOpts.CUDA && !langOpts.CUDAIsDevice && langOpts.HIP &&
// For HIP, host code must never access the device stub directly; it goes
// through the global variable that wraps the stub. For CUDA, the handle is
// simply the device stub itself.
if ((langOpts.HIP || langOpts.CUDA) && !langOpts.CUDAIsDevice &&
cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
llvm_unreachable("NYI");
auto *stubHandle = getCUDARuntime().getKernelHandle(F, GD);
if (IsForDefinition)
return F;

if (langOpts.HIP)
llvm_unreachable("NYI");
}

return F;
@@ -3169,15 +3177,15 @@ CIRGenModule::GetAddrOfGlobal(GlobalDecl GD, ForDefinition_t IsForDefinition) {
auto FInfo =
&getTypes().arrangeCXXMethodDeclaration(cast<CXXMethodDecl>(D));
auto Ty = getTypes().GetFunctionType(*FInfo);
return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
IsForDefinition);
return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
/*DontDefer=*/false, IsForDefinition);
}

if (isa<FunctionDecl>(D)) {
const CIRGenFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
auto Ty = getTypes().GetFunctionType(FI);
return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
IsForDefinition);
return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
/*DontDefer=*/false, IsForDefinition);
}

return getAddrOfGlobalVar(cast<VarDecl>(D), /*Ty=*/nullptr, IsForDefinition)
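
As a rough illustration of the handle that the comment above refers to (a sketch, not the emitted CIR): for HIP, the kernel handle behaves like a constant global, named with the kernel's own mangled name, that holds the address of the host device stub. The stub name below is hypothetical.

// Sketch only. '__device_stub__global_fn' stands in for the host stub the
// compiler emits for '__global__ void global_fn(int)'. The handle is the
// symbol that host code and hipLaunchKernel actually refer to; it
// corresponds to the @_Z9global_fni global checked in the HIP test below.
extern void __device_stub__global_fn(int);
void (*const global_fn_handle)(int) = __device_stub__global_fn;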
14 changes: 0 additions & 14 deletions clang/test/CIR/CodeGen/HIP/simple-device.cpp

This file was deleted.

32 changes: 25 additions & 7 deletions clang/test/CIR/CodeGen/HIP/simple.cpp
@@ -1,16 +1,34 @@
#include "../Inputs/cuda.h"

// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x hip -fhip-new-launch-api \
// RUN: -emit-cir %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s

// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip \
// RUN: -fcuda-is-device -fhip-new-launch-api \
// RUN: -emit-cir %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s

// Attribute for global_fn
// CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cuda_kernel_name<_Z9global_fni>{{.*}}

// This should emit as a normal C++ function.
__host__ void host_fn(int *a, int *b, int *c) {}

// CIR: cir.func @_Z7host_fnPiS_S_
__host__ void host_fn(int *a, int *b, int *c) {}
// CIR-HOST: cir.func @_Z7host_fnPiS_S_
// CIR-DEVICE-NOT: cir.func @_Z7host_fnPiS_S_

// This should only be emitted in the device compilation.
__device__ void device_fn(int* a, double b, float c) {}
// CIR-HOST-NOT: cir.func @_Z9device_fnPidf
// CIR-DEVICE: cir.func @_Z9device_fnPidf

__global__ void global_fn(int a) {}
// CIR-DEVICE: @_Z9global_fni

// CIR-HOST: cir.alloca {{.*}}"kernel_args"
// CIR-HOST: cir.call @__hipPopCallConfiguration

// CHECK-NOT: cir.func @_Z9device_fnPidf
// The host accesses the global stub instead of the function's device stub.
// The stub global has the mangled name of the function.
// CIR-HOST: cir.get_global @_Z9global_fni
// CIR-HOST: cir.call @hipLaunchKernel
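
A hypothetical host-side launch (not part of the test above), assuming the same ../Inputs/cuda.h mock header, would exercise this path end to end: the <<<...>>> syntax pushes the launch configuration that the emitted stub then pops via __hipPopCallConfiguration and forwards to hipLaunchKernel together with the handle global.

// Hypothetical launch site; compiled for the host with -fhip-new-launch-api.
// dim3 construction is assumed to be available from the mock header.
void launch_from_host() {
  global_fn<<<dim3(1), dim3(64)>>>(42);
}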