J'ai utilisé le code suivant, avec cudaEvent, pour mesurer le temps d'exécution de mon programme. Mais j'ai trouvé quelque chose de déroutant :
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>
#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;
// Minimum number of members for a cluster to survive pruning.
const int MINCOUNTS = 20;
// Kernel bandwidth (search radius) of the mean-shift window.
const int h = 10;
// Dimensionality of the input points.
const int dim = 2;
//const int h2 = pow(double(h),double(dim));
// A single dim-dimensional data point.
struct DataType
{
float d[dim];
};
// Parses one line of `dim` space-separated floats into `p`.
static void parsePoint(string line, DataType& p)
{
    string::size_type position;
    for(int j = 0; j < dim - 1; j++)
    {
        position = line.find(" ");
        p.d[j] = atof(line.substr(0, position).c_str());
        line = line.substr(position + 1, line.size() - position);
    }
    p.d[dim-1] = atof(line.c_str());
}

// Loads a point set from the text file `dir_`.
// File layout: line 1 = format flag (must be 1), line 2 = total point
// count `tot`, line 3 = ground-truth point (returned in `gt`), then
// tot-1 data points appended to D_. Exits the process on any format error.
void loadData(thrust::host_vector<DataType>& D_,string dir_, DataType& gt)
{
    fstream in(dir_.c_str(),ios::in);
    if(!in) // fail loudly instead of silently parsing an empty stream
    {
        cout<<"cannot open file : "<<dir_<<endl;
        exit(-1);
    }
    string tline;
    getline(in,tline);
    int flag = atoi(tline.c_str());
    if(flag != 1)
    {
        cout<<"there is problem in file : "<<dir_<<endl;
        exit(-1);
    }
    getline(in,tline);
    int tot = atoi(tline.c_str());
    // First point is the ground truth; the remaining tot-1 lines are data.
    getline(in,tline);
    parsePoint(tline, gt);
    DataType dt;
    for(int i = 0; i < tot-1; i++) // no shadowed loop variable anymore
    {
        getline(in,tline);
        parsePoint(tline, dt);
        D_.push_back(dt);
    }
}
// Initialises the cluster bookkeeping so every point starts as its own
// cluster: gpu_Mchanged1[i] = gpu_Schanged1[i] = i for i in [0, N).
// Fix: the index is now derived from both block AND thread coordinates,
// so the kernel is correct for any launch configuration, not only the
// current <<<N,1>>> launch (with which it behaves identically).
__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1,int N)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if(index < N)
    {
        gpu_Mchanged1[index] = index;
        gpu_Schanged1[index] = index;
    }
}
//parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N, h);
// One mean-shift step: for each active point (row `index` of
// gpu_Schanged1), averages the offsets to all neighbours within bandwidth
// h and writes the shifted position into msPoint[index].
// `input` always holds the ORIGINAL full data set; gpu_Schanged1 maps the
// compacted rows back to original ids.
//
// Fixes vs. the original:
//  - `dis` was an int, silently truncating the squared float distances;
//    it is now a float.
//  - pow(double(h),2.0) was recomputed (in double) inside the innermost
//    loop; the squared bandwidth is hoisted into a float constant.
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h is the band-width of the kernel function;
{
    int index = blockIdx.x;
    const float h2 = (float)h * (float)h; // squared bandwidth
    if(index < tot)
    {
        int i0 = gpu_Schanged1[index];
        int inlierNum = 0;
        for(unsigned int i = 0; i < dim; i++)
            msPoint[index].d[i] = 0;
        for(int i = 0; i < tot; i++)
        {
            int ii = gpu_Schanged1[i];
            float dis = 0;
            for(unsigned int j = 0; j < dim; j++)
            {
                float diff = input[i0].d[j] - input[ii].d[j];
                dis += diff * diff;
                if(dis > h2) // early out once the point is outside the window
                    break;
            }
            if(dis < h2)
            {
                inlierNum++;
                for(unsigned int j = 0; j < dim; j++)
                    msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
            }
        }
        // inlierNum >= 1 here: the point itself always contributes (dis == 0).
        for(unsigned int j = 0; j < dim; j++)
        {
            msPoint[index].d[j] /= inlierNum;
            msPoint[index].d[j] += input[i0].d[j];
        }
    }
}
//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
// For each shifted point msPoint[index], finds the nearest original input
// point among the still-active ids in gpu_Schanged1 (squared Euclidean
// distance) and records it as the point's centre: in gpu_Mchanged1 (by
// row), and in Munchanged (by original id). Sunchanged maps the point's
// original id back to its current row.
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
    int index = blockIdx.x;
    if(index < tot)
    {
        float bestDist = 1000000; // large sentinel, as in the original
        int bestId = -1;
        for(int row = 0; row < tot; row++)
        {
            int candidate = gpu_Schanged1[row];
            float d2 = 0;
            for(int j = 0; j < dim; j++)
            {
                float diff = msPoint[index].d[j] - input[candidate].d[j];
                d2 += diff * diff;
            }
            // <= keeps the LAST of equally-near candidates, matching the
            // original behaviour exactly.
            if(d2 <= bestDist)
            {
                bestDist = d2;
                bestId = candidate;
            }
        }
        Sunchanged[gpu_Schanged1[index]] = index;
        Munchanged[gpu_Schanged1[index]] = bestId;
        gpu_Mchanged1[index] = bestId;
    }
}
////routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
// Path-compression step: every point follows its "moves to" chain in
// gpu_Mchanged1 until it reaches a terminal centre, then records that
// centre for itself.
//
// loop1st == true : gpu_Mchanged1 maps row -> row, so the chain is
// followed by direct indexing until m[m[i]] == m[i].
// loop1st == false: gpu_Mchanged1 holds ORIGINAL ids, so each hop needs a
// linear scan of gpu_Schanged1 to translate an id back to a row.
//
// NOTE(review): the loop1st branch's while(1) terminates only if every
// chain reaches a self-loop; deleteCircle appears responsible for breaking
// cycles beforehand — confirm longer cycles cannot occur.
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
int index = blockIdx.x;
bool find2 = false;
if(index < tot)
{
int lastOne = -1;
int thisOne = -1;
int indexIter = index;
while(1)
{
if(loop1st)
{
// Follow the chain two hops at a time until it stabilises.
lastOne = gpu_Mchanged1[indexIter];
thisOne = gpu_Mchanged1[lastOne];
if(lastOne == thisOne)
{
gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
gpu_Mchanged1[index] = thisOne;
break;
}
indexIter = thisOne;
}
else
{
// gpu_Mchanged1[index] = gpu_Schanged1[index];
while(1)
{
lastOne = gpu_Mchanged1[indexIter];
// Translate the id in lastOne back to a row index by scanning.
for(int i = 0; i < tot; i++)
{
if(i == indexIter)
continue;
if(lastOne == gpu_Schanged1[i])
{
thisOne = i;
find2 = true;
break;
}
}
if(find2 == false)
break; // chain ended: lastOne is nobody's id
indexIter = thisOne;
find2 = false;
}
// Re-point this entry at the terminal centre, unless the chain never
// advanced (thisOne == -1) or looped straight back to this row.
if(thisOne != index && thisOne != -1)
{
gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
}
break;
}
}
}
}
//
// Breaks pointer cycles in gpu_Mchanged1 before path compression: if the
// chain starting at `index` comes back to `index`, the point is promoted
// to its own centre (gpu_Munchanged[index] = index). Only active on the
// first pass (loop1st == true).
__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
int index = blockIdx.x;
int router0, router1;
if(index < N)
{
if(loop1st)
{
// Walk the chain two hops at a time (router0, router1).
router0 = gpu_Mchanged1[index];
router1 = gpu_Mchanged1[router0];
while(1)
{
if(index == router0 || index == router1)
{
// The walk returned to the start: a cycle through `index`.
gpu_Munchanged[index] = index;
break;
}
if(router0 == router1)
break; // reached a fixed point: no cycle through `index`
router0 = gpu_Mchanged1[router1];
router1 = gpu_Mchanged1[router0];
}
}
}
}
// One block per surviving centre: counts how many of the `tot` original
// points (via gpu_Munchanged) are assigned to this centre, stores the
// count in gpu_clusterSize, and marks too-small clusters for removal by
// writing the -1 sentinel into gpu_Schanged1.
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
    int idx = blockIdx.x;
    if(idx >= smallTot)
        return;
    int centre = gpu_Mchanged1[idx];
    int members = 0;
    for(int i = 0; i < tot; i++)
        if(gpu_Munchanged[i] == centre)
            members++;
    gpu_clusterSize[idx] = members;
    if(members <= MINCOUNTS)
        gpu_Schanged1[idx] = -1;
}
// Invalidates (writes the -1 sentinel into gpu_Sunchanged) every point
// whose assigned centre has fewer than MINCOUNTS members according to
// gpu_Kcounts.
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
    int idx = blockIdx.x;
    if(idx >= tot)
        return;
    int centre = gpu_Munchanged[idx];
    if(gpu_Kcounts[centre] < MINCOUNTS)
        gpu_Sunchanged[idx] = -1;
}
// Unary functor returning its argument plus one.
// Fix: the original body was `return a_++;` — a post-increment of the
// by-value parameter, which returns the UNMODIFIED value, so the functor
// never increased anything. `a_ + 1` implements the stated intent.
// Also marked __host__ __device__ so it is usable inside thrust
// algorithms executing on the device.
struct increaseOne: public thrust::unary_function<int, int>
{
    __host__ __device__
    int operator()(int a_){return a_ + 1;}
};
//
// Merges nearby cluster centres: for each centre, finds the closest other
// centre within half the bandwidth; if that neighbour's cluster is
// larger, this centre is re-pointed at it (in both gpu_Munchanged, keyed
// by original id, and gpu_Mchanged1, keyed by row).
//
// Fix: the merge radius was computed per thread as pow(double(h/2),2.0)
// (double-precision); it is hoisted into a float constant with the same
// value (integer division h/2 deliberately preserved).
//
// NOTE(review): other blocks may rewrite gpu_Mchanged1[i] below while
// this block is still reading it, so these reads race with those writes
// and results can differ between runs — confirm whether a double-buffered
// update is needed.
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
    int index = blockIdx.x;
    const float mergeRadius2 = (float)((h/2) * (h/2)); // == pow(double(h/2),2.0)
    float dis = 0;
    float disMin = mergeRadius2;
    int disMinIndex = -1;
    bool flag = false;
    if(index < tot)
    {
        for(int i = 0; i < tot; i++)
        {
            if(index == i)
                continue;
            dis = 0;
            for(int j = 0; j < dim; j++)
            {
                float diff = gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j];
                dis += diff * diff;
            }
            if(dis < disMin)
            {
                flag = true;
                disMin = dis;
                disMinIndex = i;
            }
        }
        // Only the smaller of the two clusters gives up its centre.
        if(flag)
            if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
            {
                gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
                gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];
            }
    }
}
// Unary predicate matching the -1 sentinel used to flag removed entries;
// callable from both host and device (e.g. for thrust::remove_if).
struct is_minus_one
{
    __host__ __device__
    bool operator()(const int value)
    {
        return value == -1;
    }
};
typedef thrust::device_vector<int>::iterator dintiter; // shorthand: iterator over a device int vector
// Entry point: loads a 2-D point set, then measures `loopTime` timed runs
// of the GPU mean-shift clustering pipeline with cudaEvent timers.
//
// NOTE(review): all clustering state (device_Schanged1, device_Mchanged1,
// loop1st, the shrinking N, ...) is initialised ONCE before the timing
// loop and never reset inside it, so every iteration after the first runs
// on the already-converged result and its while(1) exits almost at once.
// Together with one-time CUDA context creation in the first iteration,
// this explains why the first reported time is hundreds of ms while the
// rest are a few ms.
int main(int argc, char** argv)
{
// int h = 100;
using namespace std;
thrust::host_vector<DataType> host_input;
// string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
DataType gt; // ground-truth point (first point of the file)
loadData(host_input,dir, gt);
cudaEvent_t start,stop;
float time; // elapsed milliseconds of one timing iteration
cudaEventCreate(&start);
cudaEventCreate(&stop);
int loopTime = 100; // number of timing repetitions
float timeSum = 0;
thrust::device_vector<DataType> device_input = host_input; // copy host data to the device
int N = device_input.size();
int rN = N; // original point count (N shrinks as clusters merge)
int lastSize, thisSize;
DataType *gpu_input;
gpu_input = thrust::raw_pointer_cast(&device_input[0]);
thrust::device_vector<DataType> device_msPoint; // mean-shift targets, one per active point
device_msPoint.resize(N);
DataType *gpu_msPoint;
thrust::device_vector<int> device_Sunchanged; // original id -> current row
device_Sunchanged.resize(N);
int *gpu_Sunchanged;
gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);
thrust::device_vector<int> device_Munchanged; // original id -> assigned centre
device_Munchanged.resize(N);
int *gpu_Munchanged;
gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);
thrust::device_vector<int> device_Schanged1; // compacted list of surviving point ids
device_Schanged1.resize(N);
int *gpu_Schanged1;
gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
thrust::device_vector<int> device_Mchanged1; // compacted list of centre assignments
device_Mchanged1.resize(N);
int *gpu_Mchanged1;
gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);
thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;
thrust::device_vector<int> device_clusterSize;
initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N); // every point starts as its own cluster
bool loop1st = true;
dintiter Mend, Send, Cend;
int *gpu_clusterSize;
gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);
// NOTE(review): the raw pointers above are NOT refreshed after the
// resize() calls inside the loop; this only works while the vectors never
// grow past their initial capacity — confirm this is intended.
for(int i = 0; i < loopTime; i++)
{
cudaFree(0); // intended to absorb lazy context-creation cost (only matters once)
cudaEventRecord(start,0);
while(1) // iterate until the number of clusters stops changing
{
lastSize = device_Schanged1.size();
N = lastSize;
device_msPoint.resize(N);
parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); //the size of the gpu_msPoint is as the same as the gpu_Mchanged1; but the gpu_input is the original data size
device_Mchanged1.resize(N);
nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
// Compact identical centres: sort assignments, keep one row per centre.
thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
//
new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
N = new_end.first - device_Mchanged1.begin();
device_Mchanged1.resize(N);
device_Schanged1.resize(N);
device_clusterSize.clear();
device_clusterSize.resize(N);
gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN); // marks clusters with <= MINCOUNTS members via -1
// Drop the -1-marked rows from all three parallel arrays.
Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);
N = Send - device_Schanged1.begin();
device_Schanged1.resize(N);
device_Mchanged1.resize(N);
device_clusterSize.resize(N);
mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N); // fold nearby centres into the larger cluster
thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
N = new_end.first - device_Mchanged1.begin();
device_Mchanged1.resize(N);
device_Schanged1.resize(N);
thisSize = N;
if(lastSize == thisSize) // converged: cluster count unchanged
break;
loop1st = false;
thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
device_Mchanged1.clear();
gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop); // block until `stop` is reached so the elapsed time is complete
cudaEventElapsedTime(&time, start, stop);
// for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
// cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;
timeSum += time;
cout<<i<<" "<<time<<endl;
}
cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;
return 0;
}
La valeur de la variable `time` n'est pas la même à chaque itération de la boucle ; voici le résultat que j'obtiens :
0 385.722
1 3.67507
2 3.64183
3 2.40269
Mais le code que je teste fait exactement la même chose à chaque itération. Quel résultat devrais-je croire ? Je suis vraiment perplexe à ce sujet. Merci.
Ce code n'est toujours pas correct. Vous appelez 'cudaEventRecord (stop, 0);' deux fois. Pourquoi ne pas copier et coller à partir de votre code actuel? Je ne demande pas le code que vous essayez de chronométrer, juste la séquence API cuda que vous utilisez. Je ne crois pas que ce code que vous avez posté reflète ce que vous êtes en train de faire. –
Il est possible que votre première mesure soit affectée par l'heure de démarrage du GPU (création du contexte, etc.). Vous pouvez mettre une instruction 'cudaFree (0);' avant votre boucle de synchronisation. –
Alors, comment puis-je supprimer l'influence du démarrage du GPU ? Ou est-il seulement raisonnable de ne considérer que le premier résultat, 0 385.722 ? – GaoYuan