3. C++ et le monde extérieur



sommaire chapitre 3

3.7. Thrust

Thrust is a parallel algorithms library which resembles the C++ Standard Template Library (STL).

Thrust's high-level interface greatly enhances developer productivity while enabling performance portability between GPUs and multicore CPUs. Interoperability with established technologies (such as CUDA, TBB and OpenMP) facilitates integration with existing software.

Thrust provides a rich collection of data parallel primitives such as scan, sort, and reduce, which can be composed together to implement complex algorithms with concise, readable source code.

By describing your computation in terms of these high-level abstractions you provide Thrust with the freedom to select the most efficient implementation automatically. As a result, Thrust can be utilized in rapid prototyping of CUDA applications, where programmer productivity matters most, as well as in production, where robustness and absolute performance are crucial.

Thrust nécessite CUDA et est installée par défaut avec les nouvelles versions de CUDA.

3.7.1. Les vecteurs host et device

Voici un exemple de tri d'un vecteur d'entiers réalisé sur le GPU.

  1. #include <thrust/host_vector.h>
  2. #include <thrust/device_vector.h>
  3. #include <thrust/generate.h>
  4. #include <thrust/sort.h>
  5. #include <thrust/copy.h>
  6. #include <cstdlib>
  7. #include <iostream>
  8. #include <vector>
  9. #include <algorithm>
  10.  
  11. using namespace std;
  12.  
  13. int method = 1;
  14.  
  15. int main(int argc, char *argv[]) {
  16.     srand(19702013);
  17.  
  18.     if (argc > 1) {
  19.         method = atoi(argv[1]);
  20.     }
  21.    
  22.     if (method == 1) {
  23.         vector<int> v(32 << 20);
  24.        
  25.         generate(v.begin(), v.end(), rand);
  26.         sort(v.begin(), v.end());
  27.         vector<int>::iterator it;
  28.         for (it = v.begin(); it != v.begin() + 50; ++it) {
  29.             cout << *it << " ";
  30.         }
  31.         cout << endl;
  32.        
  33.        
  34.     } else {
  35.         // generate 32M random numbers on the host
  36.         thrust::host_vector<int> h_vec(32 << 20);
  37.         thrust::generate(h_vec.begin(), h_vec.end(), rand);
  38.  
  39.         // transfer data to the device
  40.         thrust::device_vector<int> d_vec = h_vec;
  41.  
  42.         // sort data on the device (846M keys per second on GeForce GTX 480)
  43.         thrust::sort(d_vec.begin(), d_vec.end());
  44.  
  45.         // transfer data back to host
  46.         thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
  47.  
  48.         thrust::host_vector<int>::iterator it;
  49.         for (it = h_vec.begin(); it != h_vec.begin() + 50; ++it) {
  50.             cout << *it << " ";
  51.         }
  52.         cout << endl;
  53.     }  
  54.     return 0;
  55. }
  56.  
  57.  
  58.  
145 177 261 278 329 371 406 418 428 437 477 494 629 791 834 897 1012 1212 1258 1298 1343 1363 1381 1407 1442 1575 1588 1631 1861 1871 1911 2006 2190 2215 2470 2507 2635 2644 2672 2752 2843 2876 2900 2907 3177 3178 3197 3240 3377 3558 ...
 Technologie utilisée   Matériel   Temps   Accélération 
 STL   Intel Core i5 4570 @ 3.20 Ghz    2.72s   1.0 
 Thrust   NVidia GTX 770 1536 Cores   0.36s   7.5 
Temps de calcul tri sur 32 * 2^20 entiers

3.7.2. Les algorithmes

Thrust dispose des algorithmes de base pour le parallèlisme :

3.7.2.a  Exemple 1

Calcul de Y = X mod Z

  1. #include <thrust/device_vector.h>
  2. #include <thrust/transform.h>
  3. #include <thrust/sequence.h>
  4. #include <thrust/copy.h>
  5. #include <thrust/fill.h>
  6. #include <thrust/replace.h>
  7. #include <thrust/functional.h>
  8. #include <iostream>
  9. using namespace std;
  10.  
  11. // =====================
  12. // print vector
  13. // =====================
  14. void print(string s,
  15.     thrust::device_vector<int>::iterator it_b,
  16.     thrust::device_vector<int>::iterator it_e) {
  17.    
  18.     cout << s << " = [ ";
  19.     thrust::device_vector<int>::iterator it;
  20.     for (it = it_b; it != it_e; ++it) {
  21.         cout << *it << " ";
  22.     }
  23.     cout << "]" << endl;
  24. }
  25.  
  26. // =====================
  27. // main function
  28. // =====================
  29. int main(void) {
  30.  
  31.     // allocate three device_vectors with 10 elements
  32.     thrust::device_vector<int> X(10);
  33.     thrust::device_vector<int> Y(10);
  34.     thrust::device_vector<int> Z(10);
  35.  
  36.     // initialize X to 0,1,2,3, ....
  37.     thrust::sequence(X.begin(), X.end());
  38.     print("X", X.begin(), X.end());
  39.    
  40.     // compute Y = -X
  41.     thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
  42.     print("Y", Y.begin(), Y.end());
  43.  
  44.     // fill Z with twos
  45.     thrust::fill(Z.begin(), Z.end(), 2);
  46.     print("Z", Z.begin(), Z.end());
  47.  
  48.     // compute Y = X mod 2
  49.     thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());
  50.     cout << "After transform: " << endl;
  51.     print("X", X.begin(), X.end());
  52.     print("Y", Y.begin(), Y.end());
  53.     print("Z", Z.begin(), Z.end());
  54.    
  55.     // replace all the ones in Y with tens
  56.     thrust::replace(Y.begin(), Y.end(), 1, 10);
  57.     cout << "After replace:" << endl;
  58.     print("Y", Y.begin(), Y.end());
  59.    
  60.     return 0;    
  61. }
  62.  
X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After transform: 
X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 1 0 1 0 1 0 1 0 1 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After replace:
Y = [ 0 10 0 10 0 10 0 10 0 10 ]

3.7.2.b  Exemple 2

Calcul de SAXPY : Y = A * X + Y

  1. #include <thrust/device_vector.h>
  2. #include <thrust/transform.h>
  3. #include <thrust/sequence.h>
  4. #include <thrust/copy.h>
  5. #include <thrust/fill.h>
  6. #include <thrust/replace.h>
  7. #include <thrust/functional.h>
  8. #include <iostream>
  9. using namespace std;
  10.  
  11. // =====================
  12. // print vector
  13. // =====================
  14. void print(string s,
  15.     thrust::device_vector<int>::iterator it_b,
  16.     thrust::device_vector<int>::iterator it_e) {
  17.    
  18.     cout << s << " = [ ";
  19.     thrust::device_vector<int>::iterator it;
  20.     for (it = it_b; it != it_e; ++it) {
  21.         cout << *it << " ";
  22.     }
  23.     cout << "]" << endl;
  24. }
  25.  
  26. // =====================
  27. // functor to compute Y <- A * X + Y
  28. // =====================
  29. struct saxpy_functor {
  30.     const float a;
  31.  
  32.     saxpy_functor(float _a) : a(_a) {}
  33.  
  34.     __host__ __device__
  35.     float operator()(const float& x, const float& y) const {
  36.         return a * x + y;
  37.     }
  38. };
  39.  
  40.  
  41. void saxpy_fast(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
  42.     // Y <- A * X + Y
  43.     thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
  44. }
  45.  
  46. void saxpy_slow(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
  47.     thrust::device_vector<float> temp(X.size());
  48.    
  49.     // temp <- A
  50.     thrust::fill(temp.begin(), temp.end(), A);
  51.    
  52.     // temp <- A * X
  53.     thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies<float>());
  54.  
  55.     // Y <- A * X + Y
  56.     thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus<float>());
  57. }
  58.  
  59. int method = 1;
  60. int size = 10;
  61.  
  62. // =====================
  63. // main function
  64. // =====================
  65. int main(int argc, char *argv[]) {
  66.  
  67.     if (argc > 1) {
  68.         method = atoi(argv[1]);
  69.     }  
  70.    
  71.     if (argc > 2) {
  72.         size = atoi(argv[2]);
  73.     }
  74.    
  75.     // allocate device_vectors with size elements
  76.     thrust::device_vector<float> X(size);
  77.     thrust::device_vector<float> Y(size);
  78.  
  79.     thrust::sequence(X.begin(), X.end());
  80.     thrust::sequence(Y.begin(), Y.end());
  81.  
  82.     if (method == 1) {
  83.         saxpy_fast(1.5, X, Y);
  84.     } else {
  85.         saxpy_slow(1.5, X, Y);
  86.     }
  87.  
  88.     thrust::device_vector<float>::iterator it;
  89.     cout << "Y = [ ";
  90.     for (it = Y.begin(); it != Y.begin() + 10; ++it) {
  91.         cout << *it << " ";
  92.     }
  93.     cout << "]" << endl;
  94.     return 0;    
  95. }
  96.  
Y = [ 0 2.5 5 7.5 10 12.5 15 17.5 20 22.5 ]

3.7.2.c  Exemple 3

Algorithme de réduction : somme des éléments du vecteur X

  1. #include <thrust/device_vector.h>
  2. #include <thrust/transform.h>
  3. #include <thrust/sequence.h>
  4. #include <thrust/copy.h>
  5. #include <thrust/fill.h>
  6. #include <thrust/replace.h>
  7. #include <thrust/functional.h>
  8. #include <iostream>
  9. using namespace std;
  10.  
  11.  
  12. int size = 10;
  13.  
  14. // =====================
  15. // main function
  16. // =====================
  17. int main(int argc, char *argv[]) {
  18.  
  19.     if (argc > 1) {
  20.         size = atoi(argv[1]);
  21.     }
  22.    
  23.     // allocate device_vector with size elements
  24.     thrust::device_vector<float> X(size);
  25.  
  26.     thrust::sequence(X.begin(), X.end());
  27.  
  28.     float sum = thrust::reduce(X.begin(), X.end(), (float) 0, thrust::plus<float>());
  29.  
  30.     cout << "sum = " << sum << endl;
  31.     return 0;    
  32. }
  33.  
sum = 4.99995e+09

3.7.2.d  Autres exemples

On trouve bien d'autres exemples ici.