Optimizing Operators with TE on GPUs
- Generating optimized operators with TE on NVIDIA GPUs
- Generating NVIDIA CUDA code
- Saving and loading the GPU module
- Loading a compiled module
- Packing the two libraries together
- Generating OpenCL code
Generating optimized operators with TE on NVIDIA GPUs
Generating NVIDIA CUDA code
Besides CPUs, TVM can generate, compile, and optimize code for many target platforms. After CPUs, the most widely used target is probably GPUs. In the open-source community that mostly means NVIDIA GPUs, but AMD GPUs are supported as well (via rocm), and TVM can also emit OpenCL; most GPUs can be driven through OpenCL, although performance is another matter.
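The backend is selected simply by the target string passed to tvm.target.Target. Here is a minimal sketch of my own showing the three target kinds that come up later in this post (actually building for them still requires the corresponding backend to be enabled in your TVM build):
import tvm

# the device code generator is chosen by the target string; the host (CPU)
# side is still compiled with LLVM in all three cases
tgt_cuda = tvm.target.Target(target="cuda", host="llvm")    # NVIDIA GPUs
tgt_rocm = tvm.target.Target(target="rocm", host="llvm")    # AMD GPUs
tgt_ocl = tvm.target.Target(target="opencl", host="llvm")   # generic OpenCL devices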
import numpy as np
import tvm
import tvm.testing
from tvm import te

# Change this target to the correct backend for your GPU. For example: cuda (NVIDIA GPUs),
# rocm (Radeon GPUs), opencl (OpenCL devices).
tgt_gpu = tvm.target.Target(target="cuda", host="llvm")
# Recreate the schedule
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
print(type(C))
s = te.create_schedule(C.op)
bx, tx = s[C].split(C.op.axis[0], factor=64)
################################################################################
# Finally we must bind the iteration axis bx and tx to threads in the GPU
# compute grid. The naive schedule is not valid for GPUs, and these are
# specific constructs that allow us to generate code that runs on a GPU.
s[C].bind(bx, te.thread_axis("blockIdx.x"))
s[C].bind(tx, te.thread_axis("threadIdx.x"))
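To see what the bound schedule looks like before compiling it, you can print the lowered TIR; a quick sketch using tvm.lower:
# the loop over i should now appear split into an outer iteration bound to
# blockIdx.x and an inner iteration of extent 64 bound to threadIdx.x
print(tvm.lower(s, [A, B, C], simple_mode=True))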
######################################################################
# Compilation
# -----------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be directly called from the python side.
#
# In the following line, we use ``tvm.build`` to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs) as well as target language
# we want to compile to.
#
# The result of compilation fadd is a GPU device function (if GPU is
# involved) as well as a host wrapper that calls into the GPU
# function. fadd is the generated host wrapper function, it contains
# a reference to the generated device function internally.
fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd")
################################################################################
# The compiled TVM function exposes a concise C API that can be invoked from
# any language.
#
# We provide a minimal array API in python to aid quick testing and prototyping.
# The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard.
#
# - We first create a GPU device.
# - Then ``tvm.nd.array`` copies the data to the GPU.
# - ``fadd`` runs the actual computation
# - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness).
#
# Note that copying the data to and from the memory on the GPU is a required step.
dev = tvm.device(tgt_gpu.kind.name, 0)
n = 1024
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
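You can also get a rough timing for the kernel; a small sketch of my own using the module's time_evaluator helper (the numbers will obviously depend on your GPU):
# run the kernel 10 times on the device and report the mean wall-clock cost
evaluator = fadd.time_evaluator(fadd.entry_name, dev, number=10)
print("myadd (%d elements): %g s" % (n, evaluator(a, b, c).mean))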
################################################################################
# Inspect the Generated GPU Code
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You can inspect the generated code in TVM. The result of ``tvm.build`` is a TVM
# Module. fadd is the host module that contains the host wrapper, it also
# contains a device module for the CUDA (GPU) function.
#
# The following code fetches the device module and prints the content code.
if (
    tgt_gpu.kind.name == "cuda"
    or tgt_gpu.kind.name == "rocm"
    or tgt_gpu.kind.name.startswith("opencl")
):
    dev_module = fadd.imported_modules[0]
    print("-----GPU code-----")
    print(dev_module.get_source())
else:
    print(fadd.get_source())
I seemed to hit an error here:
<class 'tvm.te.tensor.Tensor'>
Traceback (most recent call last):
File "/home/shaowang/tvm/my_tvm/tutorials/get_started/tensor_expr_get_started.py", line 347, in <module>
fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd")
File "/home/shaowang/tvm/my_tvm/python/tvm/driver/build_module.py", line 357, in build
mod_host, mdev = _build_for_device(input_mod, tar, target_host)
File "/home/shaowang/tvm/my_tvm/python/tvm/driver/build_module.py", line 223, in _build_for_device
rt_mod_dev = codegen.build_module(mod_dev, target) if len(mod_dev.functions) != 0 else None
File "/home/shaowang/tvm/my_tvm/python/tvm/target/", line 39, in build_module
return _ffi_api.Build(mod, target)
File "/home/shaowang/tvm/my_tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
File "/home/shaowang/tvm/my_tvm/src/target/opt/build_cuda_on.cc", line 116
TVMError:
---------------------------------------------------------------
An error occurred during the execution of TVM.
For more information, please see: /docs/
---------------------------------------------------------------
Check failed: compile_res == NVRTC_SUCCESS (5 vs. 0) : nvrtc: error: invalid value for --gpu-architecture (-arch)
At first I had no idea what the cause was...
Then I found the cause: the CUDA version did not support the GPU. I was using an A100, but the docker image had CUDA 10, and the A100 needs at least CUDA 11, so I set up a new docker image. (I documented how to set up CUDA and cuDNN in docker in a separate post.)
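It is easy to confirm what the device reports from TVM itself; a small sketch of my own (the A100 reports compute capability 8.0, which the NVRTC shipped with CUDA 10 does not know about, hence the "invalid value for --gpu-architecture" error):
gpu = tvm.cuda(0)
print("GPU present:", gpu.exist)
print("device name:", gpu.device_name)
print("compute capability:", gpu.compute_version)  # "8.0" on an A100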
With the new image the CUDA path ran, and it also printed out the CUDA-like code below:
-----GPU code-----
#ifdef _WIN32
using uint = unsigned int;
using uchar = unsigned char;
using ushort = unsigned short;
using int64_t = long long;
using uint64_t = unsigned long long;
#else
#define uint unsigned int
#define uchar unsigned char
#define ushort unsigned short
#define int64_t long long
#define uint64_t unsigned long long
#endif
extern "C" __global__ void __launch_bounds__(64) myadd_kernel0(float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n, int stride, int stride1, int stride2) {
if (((int)blockIdx.x) < (n >> 6)) {
C[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride)] = (A[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1)] + B[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride2)]);
} else {
if (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) < n) {
C[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride)] = (A[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1)] + B[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride2)]);
}
}
}
At this point we have CUDA code that a human can read. Next, we need to compile this CUDA code into something we can load and run, and check that it still produces the correct result.
Saving and loading the GPU module
Besides compiling and running in the same process as above, we can also save the compiled library and load it onto the GPU later when needed.
The code below does the following:
- save the host module
- compile the CUDA code to PTX and save it (see the note below about cubin)
- use cc.create_shared to invoke the compiler and link the host module into a shared library
code | device | module |
---|---|---|
cuda | nvidia GPU | .so |
rocm | AMD GPU | .hsaco |
opencl | all types of GPUs | .o |
from tvm.contrib import cc
from tvm.contrib import utils
temp = utils.tempdir()
fadd.save(temp.relpath("myadd.o"))
if tgt_gpu.kind.name == "cuda":
    # the upstream tutorial saves "myadd.ptx" here; in my run the device module
    # could only be saved as a cubin (see the note below)
    fadd.imported_modules[0].save(temp.relpath("myadd.cubin"))
if tgt_gpu.kind.name == "rocm":
    fadd.imported_modules[0].save(temp.relpath("myadd.hsaco"))
if tgt_gpu.kind.name.startswith("opencl"):
    fadd.imported_modules[0].save(temp.relpath("myadd.cl"))
cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")])
print(temp.listdir())
Here the host module ends up saved as a .so file, but this can be changed depending on the hardware platform. The device-side code can be saved in several formats; in this example, with an NVIDIA device, the tutorial says it is saved as a .ptx file plus a json file. But a ptx is not an executable either, so at which step does the ptx get compiled? It can be linked in at the import step.
Welp, sure enough, the ptx file could not actually be produced here; the device module had to be saved as a cubin. The code on GitHub has a mistake.
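If you are unsure which format your build produces, a small probe of my own (not from the tutorial) can tell you which extension save() will accept, since the extension has to match the device module's internal format:
# try the tutorial's .ptx name first, then fall back to .cubin
dev_mod = fadd.imported_modules[0]
for name in ("myadd.ptx", "myadd.cubin"):
    try:
        dev_mod.save(temp.relpath(name))
        print("saved device module as", name)
        break
    except Exception as err:
        print("could not save as", name, ":", err)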
Loading a compiled module
We can load the compiled shared libraries back from the file system and run them. The code below loads the host and device modules separately, links them together, and then verifies the result. The code on GitHub does not run as-is; see my version below.
fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so"))
print(tgt_gpu.kind.name)
if tgt_gpu.kind.name == "cuda":
    # the upstream tutorial loads "myadd.ptx" here; load the cubin saved above instead
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cubin"))
    fadd1.import_module(fadd1_dev)
if tgt_gpu.kind.name == "rocm":
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco"))
    fadd1.import_module(fadd1_dev)
if tgt_gpu.kind.name.startswith("opencl"):
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl"))
    fadd1.import_module(fadd1_dev)
fadd1(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
Packing the two libraries together
We do not have to keep the two libraries (one .so, one cubin) separate: TVM has an interface that packs the host and device modules into a single library. In this mode, the device binary is generated and then linked into the host library. Currently this supports Apple's Metal, OpenCL, and CUDA; unfortunately rocm is, once again, not supported...
fadd.export_library(temp.relpath("myadd_pack.so"))
fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so"))
fadd2(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
About the runtime API: these compiled modules no longer depend on the TVM compiler at all; they only depend on a minimal runtime API. Inside the compiled module, TVM wraps the device driver and provides thread-safe, device-agnostic calls.
This means we can compile code for any GPU and only need to ship the required runtime library alongside it.
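As a small deployment-style sketch of my own (assuming the myadd_pack.so produced above has been copied next to the script), a fresh process only needs the TVM runtime to load and call it:
import numpy as np
import tvm

# only the runtime is needed here, not the schedule or the compiler stack
mod = tvm.runtime.load_module("myadd_pack.so")
myadd = mod["myadd"]  # look up the packed function by the name given to tvm.build
dev = tvm.cuda(0)
a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), dev)
b = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), dev)
c = tvm.nd.empty((1024,), "float32", dev)
myadd(a, b, c)
np.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())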
Generating OpenCL code
TVM can also generate OpenCL code. Since we already generated CUDA code above, I did not run the OpenCL experiment myself; it works in basically the same way.
# here `tgt` needs to be an OpenCL target, e.g.:
tgt = tvm.target.Target(target="opencl", host="llvm")
if tgt.kind.name.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    dev = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())