2012-12-15 6 views
6

Dans ce devoir, je dois compléter le code pour multiplier deux matrices rectangle en utilisant CUDA C. Après avoir complété le code, j'ai soumis et la solution était correcte pour l'ensemble de données lorsque les matrices étaient carrées, le résultat ne correspondait pas à la valeur attendue lorsque les matrices n'étaient pas carrées.Multiplier les matrices rectangulaires dans CUDA

Voici le code après avoir ajouté les pièces manquantes:

#include <wb.h> 

#define wbCheck(stmt) do {        \ 
    cudaError_t err = stmt;       \ 
    if (err != cudaSuccess) {       \ 
     wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
     return -1;          \ 
    }             \ 
} while(0) 

// Compute C = A * B 
__global__ void matrixMultiply(float * A, float * B, float * C, 
       int numARows, int numAColumns, 
       int numBRows, int numBColumns, 
       int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    int Row = blockIdx.y * blockDim.y + threadIdx.y; 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    if (numAColumns != numBRows) return ; 
    if ((Row < numARows) && (Col < numBColumns)){ 
     float Cvalue = 0; 
     for (int k = 0 ; k < numAColumns ; ++k) 
     Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; 
     C[Row*numAColumns + Col] = Cvalue; 
    } 

    } 



int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 
    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 

    dim3 DimGrid(numARows/8 , numBColumns/8, 1); 
    dim3 DimBlock(8 , 8, 1); 

    wbTime_start(Compute, "Performing CUDA computation"); 

    //@@ Launch the GPU Kernel here 
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 

    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 
    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
} 

J'espère que vous pouvez me aider à trouver quelle partie est incorrecte.

Répondre

2

Remplacer:

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; 

avec

Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; 
+0

salut Ahmad, merci d'essayer de me aider à trouver quelle partie est incorrecte, et après avoir suivi vos conseils, je trouve que le temps de traitement était beaucoup mieux, mais le résultat toujours pas le résultat attendu par exemple un L'ensemble de données montre que: La solution ne correspond pas aux résultats attendus à la colonne 124 et à la ligne 0. Prévu 457.153 mais a obtenu 422.296. Maintenant, je suis d'accord avec vous doit être une erreur dans la fonction matrixMultiply, peut-être que je dois changer quelque chose d'autre. –

3

Remplacer: for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; C[Row*numAColumns + Col] = Cvalue; }

avec for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; C[Row*numCColumns + Col] = Cvalue; }

+0

merci Ira, avec votre suggestion j'ai encore un ensemble de données correctes, mais je n'arrive toujours pas à obtenir le résultat entièrement attendu pour tous les ensembles de données, par exemple j'ai eu ce cas: La solution ne correspond pas aux résultats attendus à la colonne 200 et la ligne 0. Attendez-vous à 415,556 mais à 0,807. Je pense que vous avez raison, c'est quelque chose dans la fonction matrixMultiply. –

2

remplacer

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; 

pour

Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col]; 

et

C[Row*numAColumns + Col] = Cvalue; 

pour

C[Row*numCColumns+Col] = Cvalue; 
+0

merci ram, vos corrections sont vraies, ils sont similaires à celui Ahmad et Ira mentionné auparavant, même j'ai fait ces corrections, mais je ne peux toujours pas obtenir le bon résultat. –

4

Après l'aide d'Ira, Ahmad, bélier, et Oli Fly, j'ai eu la bonne réponse que suit:

#include <wb.h> 

#define wbCheck(stmt) do {         \ 
     cudaError_t err = stmt;       \ 
     if (err != cudaSuccess) {       \ 
      wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
      return -1;          \ 
     }             \ 
    } while(0) 

// Compute C = A * B 
__global__ void matrixMultiply(float * A, float * B, float * C, 
        int numARows, int numAColumns, 
        int numBRows, int numBColumns, 
        int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    int Row = blockIdx.y * blockDim.y + threadIdx.y; 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    if (numAColumns != numBRows) return; 
    if ((Row < numARows) && (Col < numBColumns)){ 
    float Cvalue = 0; 
    for (int k = 0; k < numAColumns; ++k) 
    Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; 
    C[Row*numCColumns + Col] = Cvalue; 
    } 

} 

int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 
    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 

    dim3 DimGrid((numCColumns - 1)/8 + 1, (numCRows - 1)/8 + 1, 1); 
    dim3 DimBlock(8 , 8, 1); 

    wbTime_start(Compute, "Performing CUDA computation"); 

    //@@ Launch the GPU Kernel here 
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 

    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 
    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
} 
+0

Merci d'avoir posé cette question. Cela m'a vraiment aidé. Une chose que je voudrais demander, avez-vous déjà réussi à travailler pour des jeux de données où les dimensions de la matrice ne sont pas des multiples de 8? –

+0

@ Abraham, certains des ensembles de données avaient les dimensions suivantes: (les dimensions de la matrice A sont 200 * 100, les dimensions de la matrice B sont 100 * 256) et il y avait un autre ensemble de données (dimensions de A * 100 * 128, dimensions de B est 128 * 50) –

+1

Je crois que la réponse pour # 2 est fausse avec ce code parce que l'une des dimensions (100) n'est pas divisible par la taille de bloc (8).La configuration de la grille ne tient pas compte de cela. –

1

nous pouvons utiliser la multiplication matricielle matricielle et j'ai trouvé qu'il a un meilleur temps d'exécution.

#include <wb.h> 

#define wbCheck(stmt) do {         \ 
     cudaError_t err = stmt;       \ 
     if (err != cudaSuccess) {       \ 
      wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
      return -1;          \ 
     }             \ 
    } while(0) 

// Compute C = A * B 
__global__ void matrixMultiplyShared(float * A, float * B, float * C, 
          int numARows, int numAColumns, 
          int numBRows, int numBColumns, 
          int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    //@@ You have to use shared memory for this MP 
    const int TILE_WIDTH = 32; 
    __shared__ float sharedA[TILE_WIDTH][TILE_WIDTH]; 
    __shared__ float sharedB[TILE_WIDTH][TILE_WIDTH]; 
    int bx = blockIdx.x; 
    int by = blockIdx.y; 
    int tx = threadIdx.x; 
    int ty = threadIdx.y; 
    int Row = by*TILE_WIDTH + ty; 
    int Col = bx*TILE_WIDTH + tx; 
    float Cvalue = 0.0; 
    if (numAColumns != numBRows) return ; 
    for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++) 
    { 

     if (i*TILE_WIDTH + tx < numAColumns && Row < numARows){ 
      sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx]; 
     }else{ 
      sharedA[ty][tx] = 0.0; 
     } 

     if (i*TILE_WIDTH + ty < numBRows && Col < numBColumns){ 
      sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col]; 
     }else{ 
      sharedB[ty][tx] = 0.0; 
     } 
     __syncthreads(); 
     if(Row < numARows && Col < numBColumns){ 

      for(int j = 0; j < TILE_WIDTH; j++) 
      Cvalue += sharedA[ty][j] * sharedB[j][tx]; 
     } 

     __syncthreads(); 
    } 

    if (Row < numCRows && Col < numCColumns) 
     C[Row*numCColumns + Col] = Cvalue; 
}  




int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 
    int TILE_WIDTH = 32; 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA , sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 
    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 

    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 
    int dimX = (int)(ceil((float)numCColumns/TILE_WIDTH)); 
    int dimY = (int)(ceil((float)numCRows/TILE_WIDTH)); 
    dim3 DimGrid(dimX, dimY); 
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH); 



    wbTime_start(Compute, "Performing CUDA computation"); 
    //@@ Launch the GPU Kernel here 
    matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 
    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 

    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
}