2011-11-11 5 views
0

J'essaye de faire un pice de code de travail en parallèle sur un périphérique cuda, mais j'ai un problème, mon pilote gfx continue de se bloquer pendant que le programme fonctionne, mais seulement pour ce programme, autre code cuda fonctionne bien. Et ça donne une mauvaise réponse, mais je pense que c'est à cause du crash!Pilote CUDA crashé du programme

Remarque: il est utilisé sur une carte GFX Quadro 2000M!

Voici ma version parallèle du code.

#include <stdio.h> 
#include <time.h> 
#include <math.h> 

#define N 16 
#define threads 512 
#define MaxBlocks 6500 

__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum, 
     unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode) 
{ 
    int x = ((unsigned int)1 << (N-1)); 
    unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x; 
    // This while is for if we are over the max amount of blocks 
    while(currentPathCode < x) 
    { 
     int test = ((unsigned int)1 << (N-1)); 
     if(currentPathCode >= test) 
      return; 

     unsigned int ui = currentPathCode; 

     int toDiagPathSum = M[0]; 
     int toDiagRow = 0; 
     int toDiagCol = 0; 

     int fromDiagPathSum = M[(N-1)*N+N-1]; 
     int fromDiagRow = N-1; 
     int fromDiagCol = N-1; 

     for (int i = 0; i < N-1; i++) 
     { 
      if (ui % 2 == 0) 
      { 
       toDiagCol++;     // horizontal move 
       fromDiagCol--; 
      } 
      else        
      { 
       toDiagRow++;     // vertical move 
       fromDiagRow--; 
      } 
      toDiagPathSum += M[toDiagRow*N+toDiagCol]; 
      fromDiagPathSum += M[fromDiagRow*N+fromDiagCol]; 
      ui = ui >> 1; 
     } 

     if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
     { 
      bestToDiagPathSum[toDiagRow] = toDiagPathSum; 
      bestToDiagPathCode[toDiagRow] = currentPathCode; 
     } 

     if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
     { 
      bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum; 
      bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
     } 

     // Next run 
     currentPathCode = blockDim.x + gridDim.x; 
    } 
} 

int main() 
{ 
    clock_t start = clock(); 

    //--- create and initialize M matrix (including best path) 

    int M[N*N]; 
    for (int row = 0; row < N; row++) 
     for (int col = 0; col < N; col++) 
      M[row*N+col] = 2; 

#pragma region Test Path 
    M[ 0*N+0] = 1; 
    M[ 0*N+1] = 1; 
    M[ 0*N+2] = 1; 
    M[ 1*N+2] = 1; 
    M[ 1*N+3] = 1; 
    M[ 2*N+3] = 1; 
    M[ 3*N+3] = 1; 
    M[ 3*N+4] = 1; 
    M[ 3*N+5] = 1; 
    M[ 3*N+6] = 1; 
    M[ 4*N+6] = 1; 
    M[ 5*N+6] = 1; 
    M[ 6*N+6] = 1; 
    M[ 7*N+6] = 1; 
    M[ 8*N+6] = 1; 
    M[ 9*N+6] = 1; 
    M[10*N+6] = 1; 
    M[10*N+7] = 1; 
    M[10*N+8] = 1; 
    M[10*N+9] = 1; 
    M[11*N+9] = 1; 
    M[11*N+0] = 1; 
    M[12*N+0] = 1; 
    M[13*N+10] = 1; 
    M[13*N+11] = 1; 
    M[13*N+12] = 1; 
    M[14*N+12] = 1; 
    M[15*N+12] = 1; 
    M[15*N+13] = 1; 
    M[15*N+14] = 1; 
    M[15*N+15] = 1; 
#pragma endregion Test Path 

    //--- create and initialize bestToDiag and bestFromDiag arrays 

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N]; 

    unsigned int bestToDiagPathCode[N]; 
    unsigned int bestFromDiagPathCode[N]; 

    int biggerThanMaxPathSum = 256*N + 1; 
    for (int i = 0; i < N; i++) 
    { 
     bestToDiagPathSum[i] = biggerThanMaxPathSum; 
     bestFromDiagPathSum[i] = biggerThanMaxPathSum; 
    } 

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays 

    int x = ((unsigned int)1 << (N-1)); 

    // Trick for celin the total blocks 
    int TotalBlocks = (x+threads)/threads; 
    if(TotalBlocks > MaxBlocks) 
     TotalBlocks = MaxBlocks; 

    int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum; 
    unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode; 

    // allocate memory 
    cudaMalloc((void**)&dev_M, N*N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int)); 

    // Copy memory to device 
    cudaMemcpy(dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice); 

    // Run code on device 
    printf("Blocks: %d\n", TotalBlocks); 
    printf("Threads: %d\n\n", threads); 
    cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum, 
     dev_bestToDiagPathCode, dev_bestFromDiagPathCode); 

    // Insert code here to run while the GPU is running. 

    // Copy the mem back 
    cudaMemcpy(M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost); 

    int bestPathSum = biggerThanMaxPathSum; 
    unsigned int bestPathCodePrefix; 
    unsigned int bestPathCodeSuffix; 

    int tempSum; 

    for (int i = 0; i < N; i++) 
    { 
     tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)]; 
     if (tempSum < bestPathSum) 
     { 
      bestPathSum = tempSum; 
      bestPathCodePrefix = bestToDiagPathCode[i]; 
      bestPathCodeSuffix = bestFromDiagPathCode[i]; 
     } 
    } 

    //--- output best path sum and best path diagram 

    printf("Best Path Sum = %d\n\n",bestPathSum); 

    M[0] = -M[0]; 
    int toDiagRow = 0; 
    int toDiagCol = 0; 
    unsigned int ui = bestPathCodePrefix; 
    for (int i = 0; i < N-1; i++) 
    { 
     if (ui % 2 == 0) 
      toDiagCol++;     // horizontal move 
     else        
      toDiagRow++;     // vertical move 
     M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol]; 
     ui = ui >> 1; 
    } 

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1]; 
    int fromDiagRow = N-1; 
    int fromDiagCol = N-1; 
    ui = bestPathCodeSuffix; 
    for (int i = 0; i < N-2; i++) 
    { 
     if (ui % 2 == 0) 
      fromDiagCol--;     // horizontal move 
     else        
      fromDiagRow--;     // vertical move 
     M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol]; 
     ui = ui >> 1; 
    } 

    for (int row = N-1; row >= 0; row--) 
    { 
     for (int col = 0; col <= N-1; col++) 
      if (M[row*N+col] < 0) 
      { 
       printf("*"); 
       M[row*N+col] = -M[row*N+col]; 
      } 
      else 
       printf("%d",M[row*N+col]); 
     printf("\n"); 
    } 

    printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC); 

    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 

    return 0; 
} 

Code séquentiel (parfait état)

#include <stdio.h> 
#include <time.h> 
#include <math.h> 

#define N 16 
#define threads 512 
#define MaxBlocks 6500 

int main() 
{ 
    clock_t start = clock(); 

    //--- create and initialize M matrix (including best path) 

    int M[N*N]; 
    for (int row = 0; row < N; row++) 
     for (int col = 0; col < N; col++) 
      M[row*N+col] = 2; 

#pragma region Test Path 
    M[ 0*N+0] = 1; 
    M[ 0*N+1] = 1; 
    M[ 0*N+2] = 1; 
    M[ 1*N+2] = 1; 
    M[ 1*N+3] = 1; 
    M[ 2*N+3] = 1; 
    M[ 3*N+3] = 1; 
    M[ 3*N+4] = 1; 
    M[ 3*N+5] = 1; 
    M[ 3*N+6] = 1; 
    M[ 4*N+6] = 1; 
    M[ 5*N+6] = 1; 
    M[ 6*N+6] = 1; 
    M[ 7*N+6] = 1; 
    M[ 8*N+6] = 1; 
    M[ 9*N+6] = 1; 
    M[10*N+6] = 1; 
    M[10*N+7] = 1; 
    M[10*N+8] = 1; 
    M[10*N+9] = 1; 
    M[11*N+9] = 1; 
    M[11*N+0] = 1; 
    M[12*N+0] = 1; 
    M[13*N+10] = 1; 
    M[13*N+11] = 1; 
    M[13*N+12] = 1; 
    M[14*N+12] = 1; 
    M[15*N+12] = 1; 
    M[15*N+13] = 1; 
    M[15*N+14] = 1; 
    M[15*N+15] = 1; 
#pragma endregion Test Path 

    //--- create and initialize bestToDiag and bestFromDiag arrays 

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N]; 

    unsigned int bestToDiagPathCode[N]; 
    unsigned int bestFromDiagPathCode[N]; 

    int biggerThanMaxPathSum = 256*N + 1; 
    for (int i = 0; i < N; i++) 
    { 
     bestToDiagPathSum[i] = biggerThanMaxPathSum; 
     bestFromDiagPathSum[i] = biggerThanMaxPathSum; 
    } 

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays 
    int toDiagPathSum, toDiagRow, toDiagCol; 
    int fromDiagPathSum, fromDiagRow, fromDiagCol; 
    unsigned int ui; 
    int x = ((unsigned int)1 << (N-1)); 

    for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++) 
    { 
     ui = currentPathCode; 

     toDiagPathSum = M[0]; 
     toDiagRow = 0; 
     toDiagCol = 0; 

     fromDiagPathSum = M[(N-1)*N+N-1]; 
     fromDiagRow = N-1; 
     fromDiagCol = N-1; 

     for (int i = 0; i < N-1; i++) 
     { 
      if (ui % 2 == 0) 
      { 
       toDiagCol++;     // horizontal move 
       fromDiagCol--; 
      } 
      else        
      { 
       toDiagRow++;     // vertical move 
       fromDiagRow--; 
      } 
      toDiagPathSum += M[toDiagRow*N+toDiagCol]; 
      fromDiagPathSum += M[fromDiagRow*N+fromDiagCol]; 
      ui = ui >> 1; 
     } 

     if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
     { 
      bestToDiagPathSum[toDiagRow] = toDiagPathSum; 
      bestToDiagPathCode[toDiagRow] = currentPathCode; 
     } 

     if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
     { 
      bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum; 
      bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
     } 
    } 

    int bestPathSum = biggerThanMaxPathSum; 
    unsigned int bestPathCodePrefix; 
    unsigned int bestPathCodeSuffix; 

    int tempSum; 

    for (int i = 0; i < N; i++) 
    { 
     tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)]; 
     if (tempSum < bestPathSum) 
     { 
      bestPathSum = tempSum; 
      bestPathCodePrefix = bestToDiagPathCode[i]; 
      bestPathCodeSuffix = bestFromDiagPathCode[i]; 
     } 
    } 

    //--- output best path sum and best path diagram 

    printf("Best Path Sum = %d\n\n",bestPathSum); 

    M[0] = -M[0]; 
    toDiagRow = 0; 
    toDiagCol = 0; 
    ui = bestPathCodePrefix; 
    for (int i = 0; i < N-1; i++) 
    { 
     if (ui % 2 == 0) 
      toDiagCol++;     // horizontal move 
     else        
      toDiagRow++;     // vertical move 
     M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol]; 
     ui = ui >> 1; 
    } 

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1]; 
    fromDiagRow = N-1; 
    fromDiagCol = N-1; 
    ui = bestPathCodeSuffix; 
    for (int i = 0; i < N-2; i++) 
    { 
     if (ui % 2 == 0) 
      fromDiagCol--;     // horizontal move 
     else        
      fromDiagRow--;     // vertical move 
     M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol]; 
     ui = ui >> 1; 
    } 

    for (int row = N-1; row >= 0; row--) 
    { 
     for (int col = 0; col <= N-1; col++) 
      if (M[row*N+col] < 0) 
      { 
       printf("*"); 
       M[row*N+col] = -M[row*N+col]; 
      } 
      else 
       printf("%d",M[row*N+col]); 
     printf("\n"); 
    } 

    printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC); 

    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 

    return 0; 
} 
+1

Juste comme une observation, votre code CUDA semble terriblement complexe. Est-il clair que chaque thread accède à des emplacements mémoire séparés? Le branchement à l'intérieur du code parallèle est déconseillé, mais vous modifiez également la variable d'index ('ui') dans le code ... qui semble très suspect. –

+0

Si je baisse le numéro N, à un nombre faible comme 6, dose ça fonctionne bien, mais si je reçois une dose plus élevée, il pleut, je pense que ce pourrait être un problème de chien de garde, mais comment résoudre ce problème – Androme

Répondre