3. C++ et le monde extérieur
sommaire chapitre 3
3.7. Thrust
Thrust is a parallel algorithms library which resembles the C++ Standard Template Library (STL).
Thrust's high-level interface greatly enhances developer productivity while enabling performance
portability between GPUs and multicore CPUs. Interoperability with established technologies (such as CUDA, TBB
and OpenMP) facilitates integration with existing software.
Thrust provides a rich collection of data parallel primitives such as scan, sort, and reduce, which can be composed together to implement complex algorithms with concise, readable source code.
By describing your computation in terms of these high-level abstractions you provide Thrust with the freedom to select the most efficient implementation automatically. As a result, Thrust can be utilized in rapid prototyping of CUDA applications, where programmer productivity matters most, as well as in production, where robustness and absolute performance are crucial.
Thrust nécessite CUDA et est installée par défaut avec les nouvelles versions de CUDA.
3.7.1. Les vecteurs host et device
Voici un exemple de tri d'un vecteur d'entiers réalisé sur le GPU.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;
int method = 1;
int main(int argc, char *argv[]) {
srand(19702013);
if (argc > 1) {
method = atoi(argv[1]);
}
if (method == 1) {
vector<int> v(32 << 20);
generate(v.begin(), v.end(), rand);
sort(v.begin(), v.end());
vector<int>::iterator it;
for (it = v.begin(); it != v.begin() + 50; ++it) {
cout << *it << " ";
}
cout << endl;
} else {
// generate 32M random numbers on the host
thrust::host_vector<int> h_vec(32 << 20);
thrust::generate(h_vec.begin(), h_vec.end(), rand);
// transfer data to the device
thrust::device_vector<int> d_vec = h_vec;
// sort data on the device (846M keys per second on GeForce GTX 480)
thrust::sort(d_vec.begin(), d_vec.end());
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
thrust::host_vector<int>::iterator it;
for (it = h_vec.begin(); it != h_vec.begin() + 50; ++it) {
cout << *it << " ";
}
cout << endl;
}
return 0;
}
145 177 261 278 329 371 406 418 428 437 477 494 629 791 834 897 1012 1212 1258 1298 1343 1363 1381 1407 1442 1575 1588 1631 1861 1871 1911 2006 2190 2215 2470 2507 2635 2644 2672 2752 2843 2876 2900 2907 3177 3178 3197 3240 3377 3558 ...
STL |
Intel Core i5 4570 @ 3.20 Ghz |
2.72s |
1.0 |
Thrust |
NVidia GTX 770 1536 Cores |
0.36s |
7.5 |
Temps de calcul tri sur 32 * 2^20 entiers
3.7.2. Les algorithmes
Thrust dispose des algorithmes de base pour le parallèlisme :
- thrust::fill : remplir un vecteur avec une valeur
- thrust::sequence : générer une séquence i, i+1, ....
- thrust::transform : transformer un vecteur en appliquant une opération
- thrust::reduce : réduction
- thrust::sort : tri des élements d'un vecteur
3.7.2.a Exemple 1
Calcul de Y = X mod Z
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
// =====================
// print vector
// =====================
void print(string s,
thrust::device_vector<int>::iterator it_b,
thrust::device_vector<int>::iterator it_e) {
cout << s << " = [ ";
thrust::device_vector<int>::iterator it;
for (it = it_b; it != it_e; ++it) {
cout << *it << " ";
}
cout << "]" << endl;
}
// =====================
// main function
// =====================
int main(void) {
// allocate three device_vectors with 10 elements
thrust::device_vector<int> X(10);
thrust::device_vector<int> Y(10);
thrust::device_vector<int> Z(10);
// initialize X to 0,1,2,3, ....
thrust::sequence(X.begin(), X.end());
print("X", X.begin(), X.end());
// compute Y = -X
thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
print("Y", Y.begin(), Y.end());
// fill Z with twos
thrust::fill(Z.begin(), Z.end(), 2);
print("Z", Z.begin(), Z.end());
// compute Y = X mod 2
thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());
cout << "After transform: " << endl;
print("X", X.begin(), X.end());
print("Y", Y.begin(), Y.end());
print("Z", Z.begin(), Z.end());
// replace all the ones in Y with tens
thrust::replace(Y.begin(), Y.end(), 1, 10);
cout << "After replace:" << endl;
print("Y", Y.begin(), Y.end());
return 0;
}
X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After transform:
X = [ 0 1 2 3 4 5 6 7 8 9 ]
Y = [ 0 1 0 1 0 1 0 1 0 1 ]
Z = [ 2 2 2 2 2 2 2 2 2 2 ]
After replace:
Y = [ 0 10 0 10 0 10 0 10 0 10 ]
3.7.2.b Exemple 2
Calcul de SAXPY : Y = A * X + Y
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
// =====================
// print vector
// =====================
void print(string s,
thrust::device_vector<int>::iterator it_b,
thrust::device_vector<int>::iterator it_e) {
cout << s << " = [ ";
thrust::device_vector<int>::iterator it;
for (it = it_b; it != it_e; ++it) {
cout << *it << " ";
}
cout << "]" << endl;
}
// =====================
// functor to compute Y <- A * X + Y
// =====================
struct saxpy_functor {
const float a;
saxpy_functor(float _a) : a(_a) {}
__host__ __device__
float operator()(const float& x, const float& y) const {
return a * x + y;
}
};
void saxpy_fast(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
// Y <- A * X + Y
thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}
void saxpy_slow(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y) {
thrust::device_vector<float> temp(X.size());
// temp <- A
thrust::fill(temp.begin(), temp.end(), A);
// temp <- A * X
thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies<float>());
// Y <- A * X + Y
thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus<float>());
}
int method = 1;
int size = 10;
// =====================
// main function
// =====================
int main(int argc, char *argv[]) {
if (argc > 1) {
method = atoi(argv[1]);
}
if (argc > 2) {
size = atoi(argv[2]);
}
// allocate device_vectors with size elements
thrust::device_vector<float> X(size);
thrust::device_vector<float> Y(size);
thrust::sequence(X.begin(), X.end());
thrust::sequence(Y.begin(), Y.end());
if (method == 1) {
saxpy_fast(1.5, X, Y);
} else {
saxpy_slow(1.5, X, Y);
}
thrust::device_vector<float>::iterator it;
cout << "Y = [ ";
for (it = Y.begin(); it != Y.begin() + 10; ++it) {
cout << *it << " ";
}
cout << "]" << endl;
return 0;
}
Y = [ 0 2.5 5 7.5 10 12.5 15 17.5 20 22.5 ]
3.7.2.c Exemple 3
Algorithme de réduction : somme des éléments du vecteur X
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
using namespace std;
int size = 10;
// =====================
// main function
// =====================
int main(int argc, char *argv[]) {
if (argc > 1) {
size = atoi(argv[1]);
}
// allocate device_vector with size elements
thrust::device_vector<float> X(size);
thrust::sequence(X.begin(), X.end());
float sum = thrust::reduce(X.begin(), X.end(), (float) 0, thrust::plus<float>());
cout << "sum = " << sum << endl;
return 0;
}
sum = 4.99995e+09
3.7.2.d Autres exemples
On trouve bien d'autres exemples ici.