3. C++ et le monde extérieur
3.3. Librairies dédiées au calcul scientifique
3.3.1. Complex<T> de la STL
Pour les calculs avec des nombres complexes on peut utiliser la STL qui dispose d'un
type Complex<T> à instancier.
#include <iostream>
#include <complex>
#include <cstdlib>
#include <valarray>
using namespace std;
typedef complex<double> Complex;
int main() {
Complex c2(2, 0);
Complex ci(0, 1);
cout << ci << " + " << c2 << " = " << ci+c2 << '\n'
<< ci << " * " << ci << " = " << ci*ci << '\n'
<< ci << " + " << c2 << " / " << ci << " = " << ci+c2/ci << '\n'
<< 1 << " / " << ci << " = " << 1./ci << '\n';
cout << "norm of c2 = " << norm(c2) << endl;
cout << "norm of ci = " << norm(ci) << endl;
cout << "real(c2) = " << real(c2) << endl;
cout << "imag(ci) = " << imag(ci) << endl;
valarray<complex<double> > v(10);
for (size_t i = 0; i <v.size(); ++i) {
v[i].real( static_cast<double>(rand() % 1000) / 1000);
v[i].imag( static_cast<double>(rand() % 1000) / 1000);
}
for (auto x : v) {
cout << x << endl;
}
complex<double> sc = v.sum();
cout << "===========" << endl;
cout << "sum = " << sc << endl;
exit(EXIT_SUCCESS);
}
(0,1) + (2,0) = (2,1)
(0,1) * (0,1) = (-1,0)
(0,1) + (2,0) / (0,1) = (0,-1)
1 / (0,1) = (0,-1)
norm of c2 = 4
norm of ci = 1
real(c2) = 2
imag(ci) = 1
(0.383,0.886)
(0.777,0.915)
(0.793,0.335)
(0.386,0.492)
(0.649,0.421)
(0.362,0.027)
(0.69,0.059)
(0.763,0.926)
(0.54,0.426)
(0.172,0.736)
===========
sum = (5.515,5.223)
3.3.2. BLAS
BLAS
(Basic Linear Algebra Subprograms) est une librairie de sous-programmes
optimisés pour calcul matriciel :
- BLAS niveau 1 : scalar, vector et vector-vector
- BLAS niveau 2 : matrix-vector
- BLAS niveau 3 : matrix-matrix
BLAS Reference Card
Sous Ubuntu, installer les packages : libblas3 libblas-dev
Exemple produit de matrices avec double DGEMM (Double GEneral Matrix Matrix) :
$$ C ← α op(A) × op(B) + β C $$
où $α$ et $β$ sont des réels
et $op$ est soit l'identité, la transposée ou la transposée conjuguée.
#include <iostream>
#include <numeric>
using namespace std;
#include <cblas.h>
#include <xmmintrin.h>
/*
extern "C" void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const double alpha, const double *A,
const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc);
*/
int DIM = 1000;
double *A;
double *B;
double *C;
int main(int argc, char *argv[]) {
char TRANS = 'N';
double alpha = 1.0;
double beta = 0.0;
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
int size = DIM*DIM;
// allocate
A = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
B = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
C = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
// fill
for (int i=0; i<size; ++i) {
A[i] = (rand() % 131) * 0.01;
B[i] = (rand() % 131) * 0.01;
}
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, DIM, DIM, DIM,
alpha, A, DIM, B, DIM, beta, C, DIM);
// check result
double sum = accumulate(&C[0], &C[size], 0.0);
cout << "sum = " << sum << endl;
_mm_free(A);
_mm_free(B);
_mm_free(C);
return 0;
}
3.3.3. LAPACK
LAPACK (or Linear Algebra PACKage) provides routines :
- for solving systems of simultaneous linear equations
- least-squares solutions of linear systems of equations
- eigenvalue problems, and singular value problems
Sous Ubuntu installer les packages liblapack3 (dynamique) ou liblapack-dev (statique)
3.3.4. Intel MKL (Math Kernel Library)
Intel Math Kernel Library (Intel® MKL) includes a wealth of math processing routines to accelerate application performance and reduce development time. Intel® MKL includes highly vectorized and threaded Linear Algebra, Fast Fourier Transforms (FFT), Vector Math and Statistics functions. The easiest way to take advantage of all of that processing power is to use a carefully optimized computing math library. Even the best compiler can’t compete with the level of performance possible from a hand-optimized library. If your application already relies on the BLAS or LAPACK functionality, simply re-link with Intel® MKL to get better performance on Intel and compatible architectures.
Voir MKL Cookbook
et MKL Link Advisor
3.3.4.a Exemple produit de matrix avec double
On utilise encore DGEMM de MKL :
/*
source /opt/intel/bin/iccvars.sh intel64
icpc -o mkl_matrix_product.exe mkl_matrix_product.cpp -O3 -msse2 -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_intel_ilp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -lpthread -lm
icpc -o mkl_matrix_product.exe mkl_matrix_product.cpp -O3 -msse2 -DMKL_ILP64 -openmp -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_intel_ilp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm
*/
#include <iostream>
#include <numeric>
using namespace std;
#include <mkl.h>
#include <xmmintrin.h>
/*
extern "C" void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const double alpha, const double *A,
const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc);
*/
int DIM = 1000;
double *A;
double *B;
double *C;
// ----------------------------------------------
// main function
// ----------------------------------------------
int main(int argc, char *argv[]) {
char TRANS = 'N';
double alpha = 1.0;
double beta = 0.0;
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
if (argc > 2) {
mkl_set_num_threads(atoi(argv[2]));
}
int size = DIM*DIM;
// allocate
A = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
B = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
C = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
// fill
for (int i=0; i<size; ++i) {
A[i] = (rand() % 131) * 0.01;
B[i] = (rand() % 131) * 0.01;
}
// ------------------------------------------
// perform product with MKL
// ------------------------------------------
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, DIM, DIM, DIM,
alpha, A, DIM, B, DIM, beta, C, DIM);
// check result
double sum = accumulate(&C[0], &C[size], 0.0);
cout << "sum = " << sum << endl;
_mm_free(A);
_mm_free(B);
_mm_free(C);
return 0;
}
3.3.4.b Exemple de valeurs propres
On utilise DGEEV
#include <iostream>
#include <numeric>
using namespace std;
#include <mkl.h>
#include <mkl_lapack.h>
#include <xmmintrin.h>
int DIM = 10;
double *A;
double *work;
int main(int argc, char *argv[]) {
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
if (argc > 2) {
mkl_set_num_threads(atoi(argv[2]));
}
MKL_INT n, lda, lwork, ldvl, ldvr, info, idxi, idxj;
double *wr, *wi, *A, *work, *vl, *vr;
char jobvl, jobvr;
n = DIM;
lda = DIM;
ldvl = 1;
ldvr = DIM;
lwork = 4*DIM;
jobvl = 'N'; // or 'V'
jobvr = 'N'; // or 'V'
int size = DIM*DIM;
A = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
wr = static_cast<double *>(_mm_malloc(DIM * sizeof(double), 16));
wi = static_cast<double *>(_mm_malloc(DIM * sizeof(double), 16));
vl = static_cast<double *>(_mm_malloc(DIM * ldvl * sizeof(double), 16));
vr = static_cast<double *>(_mm_malloc(DIM * ldvr * sizeof(double), 16));
work = static_cast<double *>(_mm_malloc(lwork * sizeof(double), 16));
for (int i=0; i<DIM; ++i) {
for (int j=0; j<DIM; ++j) {
A[i*DIM +j] = 1.0/(i+j+2.0);
}
}
/*
cerr << "A = [" << endl;
for (int i=0; i<size; ++i) cerr << A[i] << " ";
cerr << endl << "]" << endl;
*/
//void dgeev( const char* jobvl, const char* jobvr, const MKL_INT* n, double* a,
// const MKL_INT* lda, double* wr, double* wi, double* vl,
// const MKL_INT* ldvl, double* vr, const MKL_INT* ldvr, double* work,
// const MKL_INT* lwork, MKL_INT* info );
dgeev(&jobvl, &jobvr, &n, A, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, &info);
cerr << "A = [" << endl;
for (int i=0; i<10; ++i) cerr << A[i] << " ";
cerr << endl << "]" << endl;
cerr << "wr = [" << endl;
for (int i=0; i<10; ++i) cerr << wr[i] << " " << endl;
cerr << endl << "]" << endl;
cerr << "wi = [" << endl;
for (int i=0; i<10; ++i) cerr << wi[i] << " " << endl;
cerr << endl << "]" << endl;
return 0;
}
Compiler avec :
icpc -o mkl_eigen.exe mkl_eigen.cpp -O3 -msse2 -DMKL_ILP64 -openmp -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_intel_ilp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm
3.3.5. GNU GSL
The GNU Scientific Library (GSL) is a numerical library for C and C++ programmers. It is free software under the GNU General Public License.
The library provides a wide range of mathematical routines such as random number generators, special functions and least-squares fitting. There are over 1000 functions in total with an extensive test suite.
Sous Ubuntu, installer les packages : libgsl0ldbl et libgsl0-dev
3.3.5.a Produit de matrices
#include <iostream>
#include <numeric>
using namespace std;
#include <gsl/gsl_blas.h>
#include <xmmintrin.h>
/*
extern "C" int gsl_blas_dgemm (CBLAS_TRANSPOSE_t TransA,
CBLAS_TRANSPOSE_t TransB,
double alpha,
const gsl_matrix * A,
const gsl_matrix * B,
double beta,
gsl_matrix * C);
*/
int DIM = 1000;
double *A;
double *B;
double *C;
int main(int argc, char *argv[]) {
char TRANS = 'N';
double alpha = 1.0;
double beta = 0.0;
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
int size = DIM*DIM;
// allocate
A = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
B = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
C = static_cast<double *>(_mm_malloc(size * sizeof(double), 16));
// fill
for (int i=0; i<size; ++i) {
A[i] = (rand() % 131) * 0.01;
B[i] = (rand() % 131) * 0.01;
}
gsl_matrix_view a = gsl_matrix_view_array(A, DIM, DIM);
gsl_matrix_view b = gsl_matrix_view_array(B, DIM, DIM);
gsl_matrix_view c = gsl_matrix_view_array(C, DIM, DIM);
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans,
alpha, &a.matrix, &b.matrix, beta, &c.matrix);
// check result
double sum = accumulate(&C[0], &C[size], 0.0);
cout << "sum = " << sum << endl;
_mm_free(A);
_mm_free(B);
_mm_free(C);
return 0;
}
3.3.5.b Exemple de régression linéaire multiple
// g++ -o gsl_regr_lin_mult.exe gsl_regr_lin_mult.cpp -Wall -lgsl -lgslcblas
// export LD_LIBRARY_PATH=/usr/local/lib
#include <stdio.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_multifit.h>
#include <iostream>
#include <iomanip>
using namespace std;
// structure used to store Data, we want to predict
// for each person (name), the weight in function of
// gender, age, height:
// weight = c0 + c1 * gender + c2 * age + c3 * height
// we have 5 persons so N=5
// we have 4 coefficients to find: c0, c1, c2, c3, P=4
// we should obtain: -337.945 54.6301 2.96528 5.37169
typedef struct {
string name;
int gender; // 1 male, 0 female
int age; // years
int height; // inches
int weight; // pounds
} Data;
Data data[] = {
{"Joe ", 1, 25, 72, 178},
{"Jill", 0, 32, 68, 122},
{"Jack", 1, 27, 69, 167},
{"John", 1, 45, 67, 210},
{"Jane", 0, 38, 62, 108}
};
int main() {
int N = sizeof(data) / sizeof(Data);
int P = 4; // cste, gender, age, weight
cout << "N = " << N << endl;
cout << "P = " << P << endl;
gsl_vector *y; // observed data (height)
gsl_matrix *X; // data used to predict : cste, gender, age, weight
gsl_vector *c; // the coefficients c0, c1, c2, c3
gsl_matrix *cov;
// allocate space for the matrices and vectors
X = gsl_matrix_alloc(N, P); // this is an input
y = gsl_vector_alloc(N); //this is an input
c = gsl_vector_alloc(P); //this is an output
cov = gsl_matrix_alloc(P, P); //this is an output
//now put the data into the X matrix, row by row
for (int i=0; i<N; ++i) {
gsl_matrix_set(X, i, 0, static_cast<double>(1)); // because cste
gsl_matrix_set(X, i, 1, static_cast<double>(data[i].gender));
gsl_matrix_set(X, i, 2, static_cast<double>(data[i].age));
gsl_matrix_set(X, i, 3, static_cast<double>(data[i].height));
}
// fill vector of observed data
for (int i=0; i<N; ++i) {
gsl_vector_set(y, i, data[i].weight);
}
double chisq;
// allocate temporary work space for gsl
gsl_multifit_linear_workspace *work;
work = gsl_multifit_linear_alloc(N, P);
// now do the fit
gsl_multifit_linear(X, y, c, cov, &chisq, work);
cout << "coefficients:" << endl;
for (int j = 0; j<P; j++) {
cout << "c" << j << " = " << std::setprecision(9);
cout << gsl_vector_get(c, j) << endl;
}
cout << endl;
cout << "person, observed weight <=> predicted weight" << endl;
double err = 0.0, errsq = 0.0;
for (int i=0; i<N; ++i) {
double r = gsl_vector_get(c, 0);
r += data[i].gender * gsl_vector_get(c, 1);
r += data[i].age * gsl_vector_get(c, 2);
r += data[i].height * gsl_vector_get(c, 3);
cout << data[i].name << ", " << data[i].weight;
cout << " <=> " << std::setprecision(9) << r << endl;
err += fabs(data[i].weight - r);
errsq += fabs(data[i].weight - r) * fabs(data[i].weight - r);
}
cout << "error = " << err << endl;
cout << "errorsq = " << errsq << endl;
cout << endl;
cout << "chi square = " << chisq << endl;
cout << "covariance matrix:" << endl;
for (int i=0; i<P; ++i) {
for (int j = 0; j<P; j++) {
cout.setf( std::ios::fixed, std:: ios::floatfield );
cout << gsl_matrix_get(cov, i, j) << " ";
}
cout << endl;
}
//************************************
//very important stuff at the end here
//************************************
// don't forget to deallocate - there is no garbage collector in C!
gsl_matrix_free(X);
gsl_matrix_free(cov);
gsl_vector_free(y);
gsl_vector_free(c);
return 0;
}
N = 5
P = 4
coefficients:
c0 = -337.944703
c1 = 54.6301002
c2 = 2.96528275
c3 = 5.37168933
person, observed weight <=> predicted weight
Joe , 178 <=> 177.579098
Jill, 122 <=> 122.21922
Jack, 167 <=> 167.394596
John, 210 <=> 210.026306
Jane, 108 <=> 107.78078
error = 1.28024338
errorsq = 0.429670723
chi square = 0.429670723
covariance matrix:
165.856820545 7.759985511 -0.613595628 -2.217940588
7.759985511 0.773607220 -0.023221288 -0.110185781
-0.613595628 -0.023221288 0.003537017 0.007535385
-2.217940588 -0.110185781 0.007535385 0.030064648
3.3.5.c Exemple de calcul avec valeurs propres
#include <iostream>
#include <numeric>
using namespace std;
#include <gsl/gsl_blas.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_eigen.h>
#include <xmmintrin.h>
#include <omp.h>
int DIM = 10;
int num_threds = 1;
gsl_matrix *A;
gsl_matrix *B;
gsl_vector *v;
gsl_eigen_symmv_workspace *wrk;
int main(int argc, char *argv[]) {
char TRANS = 'N';
double alpha = 1.0;
double beta = 0.0;
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
if (argc > 2) {
omp_set_num_threads(atoi(argv[2]));
}
int size = DIM*DIM;
A = gsl_matrix_alloc(DIM, DIM);
for (int i=0; i<DIM; ++i) {
for (int j=0; j<DIM; ++j) {
gsl_matrix_set(A, i, j, 1.0/(i+j+2.0));
}
}
/*
cerr << "A = [" << endl;
for (int i=0; i<DIM; ++i) {
for (int j=0; j<DIM; j++) {
cerr << gsl_matrix_get(A, j, i) << " ";
}
cerr << endl;
}
cerr << "]" << endl;
*/
v = gsl_vector_alloc(DIM);
B = gsl_matrix_alloc(DIM, DIM);
wrk = gsl_eigen_symmv_alloc(DIM);
gsl_eigen_symmv(A, v, B, wrk);
cerr << endl << "v = [";
for (int i=0; i<DIM; ++i) {
cerr << gsl_vector_get(v, i) << " ";
}
cerr << "]";
gsl_eigen_symmv_free(wrk);
gsl_vector_free(v);
gsl_matrix_free(A);
gsl_matrix_free(B);
return 0;
}
Compiler avec :
g++ -o gsl_eigen.exe gsl_eigen.cpp -O2 -lgsl -lgslcblas -lm
Résultat :
./gsl_eigen.exe 3
0: 0.875115 0.743136 0.527843 0.411255
1: 0.0409049 0.638787 -0.376612 -0.670906
2: 0.000646659 0.19925 -0.761278 0.617053
3.3.6. PETSc
PETSc is a suite of data structures and routines for the scalable (parallel)
solution of scientific applications modeled by partial differential equations. It supports MPI, shared memory pthreads,
and GPUs through CUDA or OpenCL, as well as hybrid MPI-shared memory pthreads or MPI-GPU parallelism
- developé par Argonne National Laboratory depuis 1991
- surcouche MPI
- très difficile à utiliser
3.3.6.a Exemple produit de matrices
#include <petscmat.h>
#include <petscsys.h>
#include <xmmintrin.h>
#include <iostream>
using namespace std;
PetscInt DIM = 1000;
static char help[] = "Empty program.\n\n";
int main(int argc, char *argv[]) {
PetscMPIInt rank;
PetscErrorCode ierr;
Mat A, B, C;
PetscScalar fill=4.0;
srand(19702013);
if (argc > 1) {
DIM = atoi(argv[1]);
}
ierr = PetscInitialize(&argc,&argv,(char *)0,help); CHKERRQ(ierr);
// get rank from MPI ordering:
MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
PetscOptionsGetInt(NULL,"-size",&DIM,NULL);
int size = DIM*DIM;
PetscScalar *a, *b;
PetscMalloc1(size*sizeof(PetscScalar), &a);
PetscMalloc1(size*sizeof(PetscScalar), &b);
// fill
for (int i=0; i<size; ++i) {
a[i] = (rand() % 131) * 0.01;
b[i] = (rand() % 131) * 0.01;
}
MatCreate(MPI_COMM_SELF, &A);
MatSetSizes(A,DIM, DIM, DIM, DIM);
MatSetType(A,MATSEQDENSE);
MatSeqDenseSetPreallocation(A,a);
MatSeqDenseSetLDA(A,DIM);
MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
MatCreate(MPI_COMM_SELF, &B);
MatSetSizes(B,DIM, DIM, DIM, DIM);
MatSetType(B,MATSEQDENSE);
MatSeqDenseSetPreallocation(B,b);
MatSeqDenseSetLDA(B,DIM);
MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,fill,&C);CHKERRQ(ierr);
/*if (rank == 0) {
cout << "rank = " << rank << endl;
MatView(A, PETSC_VIEWER_STDOUT_SELF);
MatView(B, PETSC_VIEWER_STDOUT_SELF);
MatView(C, PETSC_VIEWER_STDOUT_SELF);
}*/
PetscFree(a);
PetscFree(b);
MatDestroy(&A);
MatDestroy(&B);
MatDestroy(&C);
ierr = PetscFinalize();CHKERRQ(ierr);
return 0;
}
3.3.7. Comparaison BLAS, GSL, MKL, PETSC
3.3.7.a Produit de matrices
blas |
0.63 |
5.24 |
43.14 |
349.87 |
gsl |
0.73 |
7.96 |
67.09 |
518.74 |
mkl (1 thread) |
0.07 |
1.40 |
3.28 |
22.28 |
mkl (2 threads) |
0.04 |
1.05 |
2.18 |
12.35 |
mkl (4 threads) |
0.04 |
0.86 |
1.66 |
6.98 |
petsc (4 threads) |
3.24 |
26.49 |
3m36 |
30m(*) |
Produit de matrices carrées : temps de calcul (en secondes) pour différentes librairies sur Intel i5-4570 CPU at 3.20GHz 64 bits (double)
(*) Note : valeur estimée (facteur 8.15)
3.3.7.b Valeurs propres
gsl (1 thread) |
3.92 |
41.54 |
89.82 |
> 1h |
mkl (1 thread) |
1.25 |
11.70 |
88.65 |
665.09 |
mkl (2 threads) |
0.92 |
9.70 |
74.93 |
658.76 |
mkl (4 threads) |
0.82 |
9.35 |
72.12 |
582.31 |
mkl (4 threads) (1) |
0.58 |
6.68 |
51.04 |
397.83 |
python (1 thread) |
1.62 |
10.21 |
75.20 |
457.58 |
python (2 threads) + openblas (2) |
1.98 |
9.57 |
55.71 |
375.14 |
python (4 threads) + openblas (3) |
1.93 |
9.98 |
57.81 |
385.33 |
Valeurs propres : temps de calcul pour différentes librairies sur Intel i5-4570 CPU at 3.20GHz 64 bits (double)
- (1) : solution MKL avec jobvl = jobvr = 'N', les vecteurs droit et gauche ne sont pas calculés ce qui prend moins de temps.
- (2) : avec la solution python + openblas on devient compétitif par rapport à MKL à mesure que la dimension
de la matrice augmente.
- (3) : avec la solution python + openblas l'utilisation de 4 threads est moins efficace qu'avec 2 threads ce qui semble étrange.