3. C++ et le monde extérieur

3.7. Thrust

Thrust is a parallel algorithms library which resembles the C++ Standard Template Library (STL).

Thrust's high-level interface greatly enhances developer productivity while enabling performance portability between GPUs and multicore CPUs. Interoperability with established technologies (such as CUDA, TBB and OpenMP) facilitates integration with existing software.

Thrust provides a rich collection of data parallel primitives such as scan, sort, and reduce, which can be composed together to implement complex algorithms with concise, readable source code.

By describing your computation in terms of these high-level abstractions you provide Thrust with the freedom to select the most efficient implementation automatically. As a result, Thrust can be utilized in rapid prototyping of CUDA applications, where programmer productivity matters most, as well as in production, where robustness and absolute performance are crucial.

Thrust nécessite CUDA et est installée par défaut avec les nouvelles versions de CUDA.

3.7.1. Les vecteurs host et device

Voici un exemple de tri d'un vecteur d'entiers réalisé sur le GPU.

CODE
thrust_vectors.cu

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h> 
#include <thrust/copy.h>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <algorithm>
 
using namespace std;
 
int method = 1;
 
int main(int argc, char *argv[]) {
    srand(19702013);
 
    if (argc > 1) {
        method = atoi(argv[1]);
    }
    
    if (method == 1) {
        vector<int> v(32 << 20);
        
        generate(v.begin(), v.end(), rand);
        sort(v.begin(), v.end());
        vector<int>::iterator it;
        for (it = v.begin(); it != v.begin() + 50; ++it) {
            cout << *it << " ";
        }
        cout << endl;
        
        
    } else {
        // generate 32M random numbers on the host
        thrust::host_vector<int> h_vec(32 << 20);
        thrust::generate(h_vec.begin(), h_vec.end(), rand);
 
        // transfer data to the device
        thrust::device_vector<int> d_vec = h_vec;
 
        // sort data on the device (846M keys per second on GeForce GTX 480)
        thrust::sort(d_vec.begin(), d_vec.end());
 
        // transfer data back to host
        thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
 
        thrust::host_vector<int>::iterator it;
        for (it = h_vec.begin(); it != h_vec.begin() + 50; ++it) {
            cout << *it << " ";
        }
        cout << endl;
    }   
    return 0;
}
 
 
 

145 177 261 278 329 371 406 418 428 437 477 494 629 791 834 897 1012 1212 1258 1298 1343 1363 1381 1407 1442 1575 1588 1631 1861 1871 1911 2006 2190 2215 2470 2507 2635 2644 2672 2752 2843 2876 2900 2907 3177 3178 3197 3240 3377 3558 ...

Temps de calcul tri sur 32 * 2^20 entiers
Technologie utilisée	Matériel	Temps	Accélération
STL	Intel Core i5 4570 @ 3.20 Ghz	2.72s	1.0
Thrust	NVidia GTX 770 1536 Cores	0.36s	7.5

3.7.2. Les algorithmes

Thrust dispose des algorithmes de base pour le parallèlisme :

thrust::fill : remplir un vecteur avec une valeur
thrust::sequence : générer une séquence i, i+1, ....
thrust::transform : transformer un vecteur en appliquant une opération
thrust::reduce : réduction
thrust::sort : tri des élements d'un vecteur

3.7.2.a Exemple 1

Calcul de Y = X mod Z

CODE
thrust_algos_1.cu

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
 
// =====================
// print vector
// =====================
void print(string s, 
    thrust::device_vector<int>::iterator it_b,
    thrust::device_vector<int>::iterator it_e) {
    
    cout << s << " = [ ";
    thrust::device_vector<int>::iterator it;
    for (it = it_b; it != it_e; ++it) {
        cout << *it << " ";
    }
    cout << "]" << endl;
}
 
// =====================
// main function
// =====================
int main(void) {
 
    // allocate three device_vectors with 10 elements
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);
 
    // initialize X to 0,1,2,3, ....
    thrust::sequence(X.begin(), X.end());
    print("X", X.begin(), X.end());
    
    // compute Y = -X
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
    print("Y", Y.begin(), Y.end());
 
    // fill Z with twos
    thrust::fill(Z.begin(), Z.end(), 2);
    print("Z", Z.begin(), Z.end());
 
    // compute Y = X mod 2
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());
    cout << "After transform: " << endl;
    print("X", X.begin(), X.end());
    print("Y", Y.begin(), Y.end());
    print("Z", Z.begin(), Z.end());
    
    // replace all the ones in Y with tens
    thrust::replace(Y.begin(), Y.end(), 1, 10);
    cout << "After replace:" << endl;
    print("Y", Y.begin(), Y.end());
    
    return 0;    
}
 

X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After transform: 
X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 1 0 1 0 1 0 1 0 1 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After replace:
Y = [ 0 10 0 10 0 10 0 10 0 10 ]

3.7.2.b Exemple 2

Calcul de SAXPY : Y = A * X + Y

CODE
thrust_algos_2.cu

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
 
// =====================
// print vector
// =====================
void print(string s, 
    thrust::device_vector<int>::iterator it_b,
    thrust::device_vector<int>::iterator it_e) {
    
    cout << s << " = [ ";
    thrust::device_vector<int>::iterator it;
    for (it = it_b; it != it_e; ++it) {
        cout << *it << " ";
    }
    cout << "]" << endl;
}
 
// =====================
// functor to compute Y <- A * X + Y
// =====================
struct saxpy_functor {
    const float a;
 
    saxpy_functor(float _a) : a(_a) {}
 
    __host__ __device__
    float operator()(const float& x, const float& y) const { 
        return a * x + y;
    }
};
 
 
void saxpy_fast(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
    // Y <- A * X + Y
    thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}
 
void saxpy_slow(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
    thrust::device_vector<float> temp(X.size());
   
    // temp <- A
    thrust::fill(temp.begin(), temp.end(), A);
    
    // temp <- A * X
    thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies<float>());
 
    // Y <- A * X + Y
    thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus<float>());
}
 
int method = 1;
int size = 10;
 
// =====================
// main function
// =====================
int main(int argc, char *argv[]) {
 
    if (argc > 1) {
        method = atoi(argv[1]);
    }   
    
    if (argc > 2) {
        size = atoi(argv[2]);
    }
    
    // allocate device_vectors with size elements
    thrust::device_vector<float> X(size);
    thrust::device_vector<float> Y(size);
 
    thrust::sequence(X.begin(), X.end());
    thrust::sequence(Y.begin(), Y.end());
 
    if (method == 1) {
        saxpy_fast(1.5, X, Y);
    } else {
        saxpy_slow(1.5, X, Y);
    }
 
    thrust::device_vector<float>::iterator it;
    cout << "Y = [ ";
    for (it = Y.begin(); it != Y.begin() + 10; ++it) {
        cout << *it << " ";
    }
    cout << "]" << endl;
    return 0;    
}
 

Y = [ 0 2.5 5 7.5 10 12.5 15 17.5 20 22.5 ]

3.7.2.c Exemple 3

Algorithme de réduction : somme des éléments du vecteur X

CODE
thrust_algos_3.cu

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
 
 
int size = 10;
 
// =====================
// main function
// =====================
int main(int argc, char *argv[]) {
 
    if (argc > 1) {
        size = atoi(argv[1]);
    }
    
    // allocate device_vector with size elements
    thrust::device_vector<float> X(size);
 
    thrust::sequence(X.begin(), X.end());
 
    float sum = thrust::reduce(X.begin(), X.end(), (float) 0, thrust::plus<float>());
 
    cout << "sum = " << sum << endl;
    return 0;    
}
 

sum = 4.99995e+09

3.7.2.d Autres exemples

On trouve bien d'autres exemples ici.

Page de Jean-Michel Richer