Je dois implémenter une fonction de transposition matricielle sur un GPU utilisant la mémoire partagée. Je l'ai fait d'une manière simple sans mémoire partagée qui fonctionne bien et aussi une tentative avec SM. Mais malheureusement, le calcul n'est pas correct et je ne peux pas comprendre pourquoi. Un exemple de travail complet peut être trouvé here et au bas de cette question.Transposition de la matrice CUDA avec la mémoire partagée
EDIT 1
je sais en outre que le premier index du résultat où j'ai une valeur incorrecte est index 32 (de la matrice méplat, de sorte que matr[0][32]
dans un mode à deux dimensions).
S'il y a plus d'informations, je les préviendrai volontiers.
Un court extrait du code entier qui ressemble à la fonction de travail est non énumérés ci-dessous:
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width,
const int height, const int nreps)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_y = blockIdx.x;
int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)* width;
xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)* height;
int r, i;
#pragma unroll
for (r = 0; r < nreps; r++)
{
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
__syncthreads();
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}
La sortie ressemble à ceci:
Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s
Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s
Correct: 50000, Wrong: 0
Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s
Correct: 12352, Wrong: 37648
Voici le plein exemple de travail. J'ai enlevé la plupart du code inutile de sorte qu'il est moins bourré:
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32
cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
int main()
{
int i, width, height, nreps, size, wrong, correct;
double cpuTime, cpuBandwidth;
cudaError_t cudaStatus;
float *matrA, *matrATC, *matrATG, *matrAC;
srand(time(NULL));
nreps = 10000;
width = 500;
height = 100;
size = width * height;
matrA = (float*)malloc(size * sizeof(float)); // matrix A
matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU
for (i = 0; i < size; i++)
{
matrA[i] = (float)i;
}
auto start = std::chrono::high_resolution_clock::now();
//CPU Transpose
cpuMatrTrans(matrATC, matrA, width, height, nreps);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000)/nreps;
cpuBandwidth = (sizeof(float) * size * 2)/(cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
correct = 0;
wrong = 0;
//Naive transpose
matrMagicCuda(matrATG, matrA, width, height, nreps, 1);
//Check if calc was correct
for (i = 0; i < size; i++)
{
if (matrATC[i] != matrATG[i])
{
/*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
return;*/
wrong++;
}
else
{
correct++;
}
}
printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
correct = 0;
wrong = 0;
//Transpose with shared memory
matrMagicCuda(matrATG, matrA, width, height, nreps, 2);
//Check if calc was correct
for (i = 0; i < size; i++)
{
if (matrATC[i] != matrATG[i])
{
/*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
return;*/
wrong++;
}
else
{
correct++;
}
}
//printf("\tTranspose with SM on GPU was executed correctly.\n\n");
printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
correct = 0;
wrong = 0;
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!\n");
return 1;
}
return 0;
}
cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
float elapsed = 0;
float *dev_matrA = 0;
float *dev_matrB = 0;
cudaError_t cudaStatus;
dim3 dim_grid, dim_block;
double gpuBandwidth;
int size = width * height;
dim_block.x = TILE_DIM;
dim_block.y = BLOCK_ROWS;
dim_block.z = 1;
dim_grid.x = (width + TILE_DIM - 1)/TILE_DIM;
dim_grid.y = (height + TILE_DIM - 1)/TILE_DIM;
dim_grid.z = 1;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three matrix
cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input matrix from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
switch (operation)
{
case(1):
{
cudaEventRecord(start);
// Launch a kernel on the GPU with one thread for each element.
naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
elapsed /= nreps;
gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
break;
}
case(2):
{
cudaEventRecord(start);
// Launch a kernel on the GPU with one thread for each element.
notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
elapsed /= nreps;
gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
break;
}
default:
printf("No matching opcode was found.\n");
}
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
goto Error;
}
// Copy output matrix from GPU buffer to host memory.
cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_matrB);
cudaFree(dev_matrA);
return cudaStatus;
}
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
int i, j, r;
#pragma unroll
for (r = 0; r < nreps; r++)
#pragma unroll
for (i = 0; i < height; i++)
#pragma unroll
for (j = 0; j < width; j++)
matrB[j * height + i] = matrA[i * width + j];
}
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
int i, r;
int row = blockIdx.x * TILE_DIM + threadIdx.x;
int col = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = row + width * col;
int index_out = col + height * row;
#pragma unroll
for (r = 0; r < nreps; r++)
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i] = matrA[index_in + i * width];
}
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_y = blockIdx.x;
int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)* width;
xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)* height;
int r, i;
#pragma unroll
for (r = 0; r < nreps; r++)
{
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
__syncthreads();
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}
apparemment que vous voulez être capable de faire une transposition de matrice non carrée? Ou seulement la matrice carrée transposer? Vous devriez inclure un [mcve] * dans votre question *, pas dans un lien externe. –
Oui, je veux aussi faire la transposition sur des matrices non carrées. Par souci de lisibilité, je l'ai placé à l'extérieur, mais comme vous le souhaitez, je vais l'ajouter à la question. – JRsz
Votre code fait des accès hors-limites. Chaque fois que vous rencontrez des problèmes avec un code CUDA, vous devez utiliser [correct cuda error checking] (http://stackoverflow.com/questions/14038589) et exécuter votre code avec 'cuda-memcheck'. Même si vous ne comprenez pas la sortie d'erreur, il sera utile pour les autres qui essaient de vous aider. Notez également que même si je comprends qu'il s'agit probablement d'un exercice d'apprentissage, pour un travail sérieux, il est recommandé d'utiliser une routine de bibliothèque comme cublas [geam] (http://docs.nvidia.com/cuda/cublas/index.html#cublas- lt-t-gt-geam). –