Numba CUDA "vectorize" and "reduce" decorators slower than expected

I have been testing some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be accelerated, and one of the main steps in doing so can be summarized in the following dummy function:
def dummy(arr1, arr2):
    # eps is a small constant guarding against division by zero
    return (arr1 * arr2).sum() / ((arr2 ** 2).sum() + eps)
This function runs reasonably fast on the CPU, but I would like to keep everything on the GPU to avoid host <---> device copies.
To compare the speeds of the different calculations, I wrote a short series of functions:
import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time

# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

@cuda.reduce
def sum_gpu(a, b):
    return a + b

@cuda.jit
def add_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

@cuda.jit
def mult_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

@cuda.jit
def mult_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

@cuda.jit
def add_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]
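With these pieces, a device-resident version of dummy() can be composed without extra host copies. A minimal sketch (dummy_gpu is my own name, and eps=1e-12 is an assumed placeholder; note that each sum_gpu reduction returns a host scalar):

def dummy_gpu(d_arr1, d_arr2, eps=1e-12):
    # Elementwise products of device arrays stay on the device
    d_num = mult_gpu(d_arr1, d_arr2)
    d_den = mult_gpu(d_arr2, d_arr2)
    # Each reduction copies back only a single scalar, not the arrays
    return sum_gpu(d_num) / (sum_gpu(d_den) + eps)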
I also wrote some timer functions:
def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into a nicely formatted string"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)
def test_1d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
def test_2d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
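A methodological caveat I am not certain about: CUDA kernel launches are asynchronous, so timing loops like the ones above may measure only the time to enqueue work rather than the time to finish it. A hypothetical variant (time_gpu is not part of the script above) that forces completion before stopping the clock could look like:

def time_gpu(fn, number=10):
    fn()                   # warm-up call, also triggers JIT compilation
    cuda.synchronize()
    t0 = time.time()
    for _ in range(number):
        fn()
    cuda.synchronize()     # wait for all queued GPU work to finish
    return (time.time() - t0) / number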
Running the test functions
numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)
gives the following output:
Found 1 CUDA devices
id 0 b'GeForce GTX TITAN X' [SUPPORTED]
compute capability: 5.2
pci device id: 0
pci bus id: 3
Summary:
1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us
Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us
Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us
Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us
Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us
Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us
Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us
Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us
Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms
It seems that the @cuda.vectorize decorated functions run more slowly than both the CPU and the custom @cuda.jit functions, while the @cuda.jit functions give the expected orders-of-magnitude speed-ups and nearly constant run times (results not shown). On the other hand, the @cuda.reduce function is much slower than either the @cuda.vectorize reduction or the CPU function.
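One thing I wondered about is allocation overhead: every @cuda.vectorize call allocates a fresh device array for its result. As far as I can tell, Numba's CUDA ufuncs accept an out= keyword, so a preallocated buffer can be reused; a sketch under that assumption, using d_result as created in the test functions above:

# Write the ufunc result into an existing device buffer instead of
# allocating a new one on every call; the result never leaves the GPU.
mult_gpu(d_arr1, d_arr2, out=d_result)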
Is there a reason for the poor performance of the @cuda.vectorize and @cuda.reduce functions? Is it possible to write a CUDA reduction kernel using just Numba?
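For reference, here is what such a hand-written reduction might look like in plain @cuda.jit: a per-block shared-memory tree reduction whose per-block partial sums are combined on the host. This is only a sketch for float32 data, building on the imports above (TPB = 256 is an assumed block size, and block_sum/partials are names I made up), not a tuned implementation:

TPB = 256  # threads per block; an assumed, untuned choice

@cuda.jit
def block_sum(arr, partials):
    # Each block loads up to TPB elements into shared memory...
    sm = cuda.shared.array(TPB, numba.float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    sm[tid] = arr[i] if i < arr.size else 0.0
    cuda.syncthreads()
    # ...then tree-reduces them to a single per-block value.
    s = TPB // 2
    while s > 0:
        if tid < s:
            sm[tid] += sm[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        partials[cuda.blockIdx.x] = sm[0]

blocks = (d_arr1.size + TPB - 1) // TPB
partials = cuda.device_array(blocks, dtype=np.float32)
block_sum[blocks, TPB](d_arr1, partials)
total = partials.copy_to_host().sum()  # combine the per-block sums on the host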
EDIT:
Looks like this is a legitimate Numba bug: https://github.com/numba/numba/issues/2266, https://github.com/numba/numba/issues/2268