Implémentation OpenMP plus lente que l'implémentation en série

J'essaie actuellement de me familiariser avec OpenMP. Pour la pratique, j'ai implémenté un algorithme "d'apprentissage" gourmand avec OpenMP. Alors je mesurais le temps avecImplémentation OpenMP plus lente que l'implémentation en série

time ./a.out

je comparais avec ma mise en œuvre de série et peu importe combien d'itérations mon programme est en train de faire le OpenMP une est alway significative plus lente.

Voici mon code, les commentaires devraient, espérons tout expliquer:

#include <omp.h> 
#include <iostream> 
#include <vector> 
#include <cstdlib> 
#include <cmath> 
#include <stdio.h> 
#include <ctime> 

#define THREADS 4 

using namespace std; 

struct TrainData { 
    double input; 
    double output; 
}; 

//Long Term Memory struct 
struct LTM { 
     double a; //paramter a of the polynom 
     double b; 
     double c; 
     double score; //score to be minimized! 

     LTM() 
     { 
      a=0; 
      b=0; 
      c=0; 
      score=0; 
     } 

     //random LTM with paramters from low to high (including low and high) 
     LTM(int low, int high) 
     { 
      score=0; 
      a= rand() % high + low; 
      b= rand() % high + low; 
      c= rand() % high + low; 

     } 

     LTM(double _a, double _b, double _c) 
     { 
      a=_a; 
      b=_b; 
      c=_c; 
     } 

     void print() 
     { 
      cout<<"Score: "<<score<<endl; 
      cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl; 
     } 
}; 

//the acutal polynom function evaluating with passed LTM 
inline double evaluate(LTM &ltm, const double &x) 
{ 
    double ret; 
    ret = ltm.a*x*x + ltm.b*x + ltm.c; 

    return ret; 
} 


//scoring function calculates the Root Mean Square error (RMS) 
inline double score_function(LTM &ltmnew, vector<TrainData> &td) 
{ 
    double score; 
    double val; 
    int tdsize=td.size(); 
    score=0; 

    for(int i=0; i< tdsize; i++) 
    { 
     val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input); 
     val *= val; 
     score += val; 
    } 

    score /= (double)tdsize; 

    score = sqrt(score); 

    return score; 
} 

LTM iterate(int iterations, vector<TrainData> td, int low, int high) 
{ 
    LTM fav = LTM(low,high); 
    fav.score = score_function(fav, td); 
    fav.print(); 
    LTM favs[THREADS]; // array for collecting the favorites of each thread 

    #pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td) 
    { 
     #pragma omp master 
     printf("Threads: %d\n", omp_get_num_threads()); 

     LTM cand; 
     #pragma omp for private(cand) 
     for(int i=0; i<iterations; i++) 
     { 
      cand = LTM(low, high); 
      cand.score = score_function(cand, td); 

      if(cand.score < fav.score) 
       fav = cand; 
     } 

     //save the favorite before ending the parallel section 
     #pragma omp critical 
     favs[omp_get_thread_num()] = fav; 
    } 

    //search for the best one in the array 
    for(int i=0; i<THREADS; i++) 
    { 
     if(favs[i].score < fav.score) 
      fav=favs[i]; 
    } 

    return fav; 
} 

//generate training data from -50 up to 50 with the train LTM 
void generateTrainData(vector<TrainData> *td, LTM train) 
{ 
    #pragma omp parallel for schedule(dynamic, 25) 
    for(int i=-50; i< 50; i++) 
    { 
     struct TrainData d; 
     d.input = i; 
     d.output = evaluate(train, (double)i); 
     #pragma omp critical 
     td->push_back(d); 

     //cout<<"input: "<<d.input<<" -> "<<d.output<<endl; 
    } 

} 

int main(int argc, char *argv[]) 
{ 

    int its= 10000000; //number of iterations 
    int a=2; 
    int b=4; 
    int c=6; 

    srand(time(NULL)); 
    LTM pol = LTM(a,b,c); //original polynom parameters 
    vector<TrainData> td; 

    //first genarte some training data and save it to td 
    generateTrainData(&td, pol); 

    //try to find the best solution 
    LTM fav = iterate(its, td, 1, 6); 


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score); 

    return 0; 
}

A mon PC à la maison, il a fallu 12 ans avec cette mise en œuvre. Le serial un seulement 6s. Si j'augmente le nombre d'itérations par le facteur 10, il sera d'environ 2min/1min (omp/serial).

Quelqu'un peut-il m'aider?

Source

2017-05-19 Talto

'rand()' n'est pas utilisable dans les contextes concurrents car il utilise un état global. Utilisez 'drand48_r()' à la place. –

D'accord, grâce aux commentaires de ma première question, je pourrais résoudre les problèmes de performance.

Comme dans les commentaires dit le problème était la fonction rand() que j'utilisais. Je les ai remplacés par un thread approprié drand48_r().

Comme:

... 
LTM(double low, double high, struct drand48_data *buff) 
{ 
    score=0; 
    double x; 
    drand48_r(buff,&x); 
    a= low + x * (high - low); 
    drand48_r(buff,&x); 
    b= low + x * (high - low); 
    drand48_r(buff,&x); 
    c= low + x * (high - low); 

} 
...

maintenant j'ai fois moins d'une seconde! Merci! :)

Source

2017-05-19 12:08:45 Talto

Vous pouvez également utiliser les [générateurs aléatoires C++ 11] (http://en.cppreference.com/w/cpp/numeric/random), qui ont une API plus agréable, offrent plus de choix pour l'algorithme PRNG, et peuvent adapter la sortie dans différentes distributions statistiques. Déclarez simplement votre variable moteur aléatoire dans le bloc '#pragma omp parallel' afin qu'elle soit locale à chaque thread. – Wyzard

Implémentation OpenMP plus lente que l'implémentation en série

Répondre

Questions connexes