J'essaye de faire un pice de code de travail en parallèle sur un périphérique cuda, mais j'ai un problème, mon pilote gfx continue de se bloquer pendant que le programme fonctionne, mais seulement pour ce programme, autre code cuda fonctionne bien. Et ça donne une mauvaise réponse, mais je pense que c'est à cause du crash!Pilote CUDA crashé du programme
Remarque: il est utilisé sur une carte GFX Quadro 2000M!
Voici ma version parallèle du code.
#include <stdio.h>
#include <time.h>
#include <math.h>
#define N 16
#define threads 512
#define MaxBlocks 6500
__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum,
unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode)
{
int x = ((unsigned int)1 << (N-1));
unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x;
// This while is for if we are over the max amount of blocks
while(currentPathCode < x)
{
int test = ((unsigned int)1 << (N-1));
if(currentPathCode >= test)
return;
unsigned int ui = currentPathCode;
int toDiagPathSum = M[0];
int toDiagRow = 0;
int toDiagCol = 0;
int fromDiagPathSum = M[(N-1)*N+N-1];
int fromDiagRow = N-1;
int fromDiagCol = N-1;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
{
toDiagCol++; // horizontal move
fromDiagCol--;
}
else
{
toDiagRow++; // vertical move
fromDiagRow--;
}
toDiagPathSum += M[toDiagRow*N+toDiagCol];
fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
if (toDiagPathSum < bestToDiagPathSum[toDiagRow])
{
bestToDiagPathSum[toDiagRow] = toDiagPathSum;
bestToDiagPathCode[toDiagRow] = currentPathCode;
}
if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow])
{
bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
bestFromDiagPathCode[fromDiagRow] = currentPathCode;
}
// Next run
currentPathCode = blockDim.x + gridDim.x;
}
}
int main()
{
clock_t start = clock();
//--- create and initialize M matrix (including best path)
int M[N*N];
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++)
M[row*N+col] = 2;
#pragma region Test Path
M[ 0*N+0] = 1;
M[ 0*N+1] = 1;
M[ 0*N+2] = 1;
M[ 1*N+2] = 1;
M[ 1*N+3] = 1;
M[ 2*N+3] = 1;
M[ 3*N+3] = 1;
M[ 3*N+4] = 1;
M[ 3*N+5] = 1;
M[ 3*N+6] = 1;
M[ 4*N+6] = 1;
M[ 5*N+6] = 1;
M[ 6*N+6] = 1;
M[ 7*N+6] = 1;
M[ 8*N+6] = 1;
M[ 9*N+6] = 1;
M[10*N+6] = 1;
M[10*N+7] = 1;
M[10*N+8] = 1;
M[10*N+9] = 1;
M[11*N+9] = 1;
M[11*N+0] = 1;
M[12*N+0] = 1;
M[13*N+10] = 1;
M[13*N+11] = 1;
M[13*N+12] = 1;
M[14*N+12] = 1;
M[15*N+12] = 1;
M[15*N+13] = 1;
M[15*N+14] = 1;
M[15*N+15] = 1;
#pragma endregion Test Path
//--- create and initialize bestToDiag and bestFromDiag arrays
int bestToDiagPathSum[N];
int bestFromDiagPathSum[N];
unsigned int bestToDiagPathCode[N];
unsigned int bestFromDiagPathCode[N];
int biggerThanMaxPathSum = 256*N + 1;
for (int i = 0; i < N; i++)
{
bestToDiagPathSum[i] = biggerThanMaxPathSum;
bestFromDiagPathSum[i] = biggerThanMaxPathSum;
}
//--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
int x = ((unsigned int)1 << (N-1));
// Trick for celin the total blocks
int TotalBlocks = (x+threads)/threads;
if(TotalBlocks > MaxBlocks)
TotalBlocks = MaxBlocks;
int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum;
unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode;
// allocate memory
cudaMalloc((void**)&dev_M, N*N*sizeof(int));
cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int));
cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int));
cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int));
cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int));
// Copy memory to device
cudaMemcpy(dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
// Run code on device
printf("Blocks: %d\n", TotalBlocks);
printf("Threads: %d\n\n", threads);
cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum,
dev_bestToDiagPathCode, dev_bestFromDiagPathCode);
// Insert code here to run while the GPU is running.
// Copy the mem back
cudaMemcpy(M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
int bestPathSum = biggerThanMaxPathSum;
unsigned int bestPathCodePrefix;
unsigned int bestPathCodeSuffix;
int tempSum;
for (int i = 0; i < N; i++)
{
tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
if (tempSum < bestPathSum)
{
bestPathSum = tempSum;
bestPathCodePrefix = bestToDiagPathCode[i];
bestPathCodeSuffix = bestFromDiagPathCode[i];
}
}
//--- output best path sum and best path diagram
printf("Best Path Sum = %d\n\n",bestPathSum);
M[0] = -M[0];
int toDiagRow = 0;
int toDiagCol = 0;
unsigned int ui = bestPathCodePrefix;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
toDiagCol++; // horizontal move
else
toDiagRow++; // vertical move
M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
ui = ui >> 1;
}
M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
int fromDiagRow = N-1;
int fromDiagCol = N-1;
ui = bestPathCodeSuffix;
for (int i = 0; i < N-2; i++)
{
if (ui % 2 == 0)
fromDiagCol--; // horizontal move
else
fromDiagRow--; // vertical move
M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
for (int row = N-1; row >= 0; row--)
{
for (int col = 0; col <= N-1; col++)
if (M[row*N+col] < 0)
{
printf("*");
M[row*N+col] = -M[row*N+col];
}
else
printf("%d",M[row*N+col]);
printf("\n");
}
printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
return 0;
}
Code séquentiel (parfait état)
#include <stdio.h>
#include <time.h>
#include <math.h>
#define N 16
#define threads 512
#define MaxBlocks 6500
int main()
{
clock_t start = clock();
//--- create and initialize M matrix (including best path)
int M[N*N];
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++)
M[row*N+col] = 2;
#pragma region Test Path
M[ 0*N+0] = 1;
M[ 0*N+1] = 1;
M[ 0*N+2] = 1;
M[ 1*N+2] = 1;
M[ 1*N+3] = 1;
M[ 2*N+3] = 1;
M[ 3*N+3] = 1;
M[ 3*N+4] = 1;
M[ 3*N+5] = 1;
M[ 3*N+6] = 1;
M[ 4*N+6] = 1;
M[ 5*N+6] = 1;
M[ 6*N+6] = 1;
M[ 7*N+6] = 1;
M[ 8*N+6] = 1;
M[ 9*N+6] = 1;
M[10*N+6] = 1;
M[10*N+7] = 1;
M[10*N+8] = 1;
M[10*N+9] = 1;
M[11*N+9] = 1;
M[11*N+0] = 1;
M[12*N+0] = 1;
M[13*N+10] = 1;
M[13*N+11] = 1;
M[13*N+12] = 1;
M[14*N+12] = 1;
M[15*N+12] = 1;
M[15*N+13] = 1;
M[15*N+14] = 1;
M[15*N+15] = 1;
#pragma endregion Test Path
//--- create and initialize bestToDiag and bestFromDiag arrays
int bestToDiagPathSum[N];
int bestFromDiagPathSum[N];
unsigned int bestToDiagPathCode[N];
unsigned int bestFromDiagPathCode[N];
int biggerThanMaxPathSum = 256*N + 1;
for (int i = 0; i < N; i++)
{
bestToDiagPathSum[i] = biggerThanMaxPathSum;
bestFromDiagPathSum[i] = biggerThanMaxPathSum;
}
//--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
int toDiagPathSum, toDiagRow, toDiagCol;
int fromDiagPathSum, fromDiagRow, fromDiagCol;
unsigned int ui;
int x = ((unsigned int)1 << (N-1));
for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++)
{
ui = currentPathCode;
toDiagPathSum = M[0];
toDiagRow = 0;
toDiagCol = 0;
fromDiagPathSum = M[(N-1)*N+N-1];
fromDiagRow = N-1;
fromDiagCol = N-1;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
{
toDiagCol++; // horizontal move
fromDiagCol--;
}
else
{
toDiagRow++; // vertical move
fromDiagRow--;
}
toDiagPathSum += M[toDiagRow*N+toDiagCol];
fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
if (toDiagPathSum < bestToDiagPathSum[toDiagRow])
{
bestToDiagPathSum[toDiagRow] = toDiagPathSum;
bestToDiagPathCode[toDiagRow] = currentPathCode;
}
if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow])
{
bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
bestFromDiagPathCode[fromDiagRow] = currentPathCode;
}
}
int bestPathSum = biggerThanMaxPathSum;
unsigned int bestPathCodePrefix;
unsigned int bestPathCodeSuffix;
int tempSum;
for (int i = 0; i < N; i++)
{
tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
if (tempSum < bestPathSum)
{
bestPathSum = tempSum;
bestPathCodePrefix = bestToDiagPathCode[i];
bestPathCodeSuffix = bestFromDiagPathCode[i];
}
}
//--- output best path sum and best path diagram
printf("Best Path Sum = %d\n\n",bestPathSum);
M[0] = -M[0];
toDiagRow = 0;
toDiagCol = 0;
ui = bestPathCodePrefix;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
toDiagCol++; // horizontal move
else
toDiagRow++; // vertical move
M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
ui = ui >> 1;
}
M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
fromDiagRow = N-1;
fromDiagCol = N-1;
ui = bestPathCodeSuffix;
for (int i = 0; i < N-2; i++)
{
if (ui % 2 == 0)
fromDiagCol--; // horizontal move
else
fromDiagRow--; // vertical move
M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
for (int row = N-1; row >= 0; row--)
{
for (int col = 0; col <= N-1; col++)
if (M[row*N+col] < 0)
{
printf("*");
M[row*N+col] = -M[row*N+col];
}
else
printf("%d",M[row*N+col]);
printf("\n");
}
printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
return 0;
}
Juste comme une observation, votre code CUDA semble terriblement complexe. Est-il clair que chaque thread accède à des emplacements mémoire séparés? Le branchement à l'intérieur du code parallèle est déconseillé, mais vous modifiez également la variable d'index ('ui') dans le code ... qui semble très suspect. –
Si je baisse le numéro N, à un nombre faible comme 6, dose ça fonctionne bien, mais si je reçois une dose plus élevée, il pleut, je pense que ce pourrait être un problème de chien de garde, mais comment résoudre ce problème – Androme