JDFTx  1.2.0
BlasExtra.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #ifndef JDFTX_CORE_BLASEXTRA_H
21 #define JDFTX_CORE_BLASEXTRA_H
22 
27 #include <gsl/gsl_cblas.h>
28 #include <cstdlib>
29 #include <cstdio>
30 #include <cfloat>
31 #include <core/scalar.h>
32 #include <core/Thread.h>
33 
34 #ifdef GPU_ENABLED
35 #include <cublas.h>
36 #include <cuda_runtime.h>
37 #endif
38 
48 template<typename Ty, typename Tx> void eblas_mul(const int N, const Tx* X, const int incX, Ty* Y, const int incY);
49 
51 inline void eblas_dmul(const int N, const double* X, const int incX, double* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
53 inline void eblas_zmul(const int N, const complex* X, const int incX, complex* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
55 inline void eblas_zmuld(const int N, const double* X, const int incX, complex* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
56 #ifdef GPU_ENABLED
57 //GPU versions of the above functions implemented in BlasExtra.cu
59 void eblas_dmul_gpu(const int N, const double* X, const int incX, double* Y, const int incY);
61 void eblas_zmul_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY);
63 void eblas_zmuld_gpu(const int N, const double* X, const int incX, complex* Y, const int incY);
64 #endif
65 
75 template<typename Ty, typename Tx> void eblas_div(const int N, const Tx* X, const int incX, Ty* Y, const int incY);
76 
78 inline void eblas_ddiv(const int N, const double* X, const int incX, double* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
80 inline void eblas_zdiv(const int N, const complex* X, const int incX, complex* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
82 inline void eblas_zdivd(const int N, const double* X, const int incX, complex* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
83 #ifdef GPU_ENABLED
84 //GPU versions of the above functions implemented in BlasExtra.cu
86 void eblas_ddiv_gpu(const int N, const double* X, const int incX, double* Y, const int incY);
88 void eblas_zdiv_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY);
90 void eblas_zdivd_gpu(const int N, const double* X, const int incX, complex* Y, const int incY);
91 #endif
92 
103 void eblas_lincomb(const int N,
104  const complex& sX, const complex* X, const int incX,
105  const complex& sY, const complex* Y, const int incY,
106  complex* Z, const int incZ);
107 
108 #ifdef GPU_ENABLED
109 void eblas_lincomb_gpu(const int N,
111  const complex& sX, const complex* X, const int incX,
112  const complex& sY, const complex* Y, const int incY,
113  complex* Z, const int incZ);
114 #endif
115 
118 void eblas_zgemm(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
119  const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
120  const complex& beta, complex *C, const int ldc);
121 #ifdef GPU_ENABLED
122 void eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
124  const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
125  const complex& beta, complex *C, const int ldc);
126 #endif
127 
128 //Sparse<->dense vector operations:
136 void eblas_scatter_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y, bool conjugate=false);
138 void eblas_scatter_zaxpy(const int Nindex, complex a, const int* index, const complex* x, complex* y, bool conjugate=false);
140 void eblas_scatter_daxpy(const int Nindex, double a, const int* index, const double* x, double* y);
141 
149 void eblas_gather_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y, bool conjugate=false);
151 void eblas_gather_zaxpy(const int Nindex, complex a, const int* index, const complex* x, complex* y, bool conjugate=false);
153 void eblas_gather_daxpy(const int Nindex, double a, const int* index, const double* x, double* y);
154 
155 #ifdef GPU_ENABLED
156 void eblas_scatter_zdaxpy_gpu(const int Nindex, double a, const int* index, const complex* x, complex* y, bool conjugate=false);
159 void eblas_scatter_zaxpy_gpu(const int Nindex, complex a, const int* index, const complex* x, complex* y, bool conjugate=false);
161 void eblas_scatter_daxpy_gpu(const int Nindex, double a, const int* index, const double* x, double* y);
163 void eblas_gather_zdaxpy_gpu(const int Nindex, double a, const int* index, const complex* x, complex* y, bool conjugate=false);
165 void eblas_gather_zaxpy_gpu(const int Nindex, complex a, const int* index, const complex* x, complex* y, bool conjugate=false);
167 void eblas_gather_daxpy_gpu(const int Nindex, double a, const int* index, const double* x, double* y);
168 #endif
169 
175 void eblas_accumNorm(int N, const double& a, const complex* x, double* y);
183 void eblas_accumProd(int N, const double& a, const complex* xU, const complex* xC, double* yRe, double* yIm);
184 #ifdef GPU_ENABLED
185 void eblas_accumNorm_gpu(int N, const double& a, const complex* x, double* y);
188 void eblas_accumProd_gpu(int N, const double& a, const complex* xU, const complex* xC, double* yRe, double* yIm);
189 #endif
190 
196 void eblas_symmetrize(int N, int n, const int* symmIndex, double* x);
198 void eblas_symmetrize(int N, int n, const int* symmIndex, complex* x);
199 #ifdef GPU_ENABLED
200 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, double* x);
203 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, complex* x);
204 #endif
205 
206 //Threaded-wrappers for BLAS1 functions (Cblas)
212 template<typename T> void eblas_copy(T* dest, const T* src, int N) { memcpy(dest, src, N*sizeof(T)); }
216 void eblas_zero(int N, double* x);
218 void eblas_zero(int N, complex* x);
220 void eblas_dscal(int N, double a, double* x, int incx);
222 void eblas_zdscal(int N, double a, complex* x, int incx);
224 void eblas_zscal(int N, const complex& a, complex* x, int incx);
226 void eblas_daxpy(int N, double a, const double* x, int incx, double* y, int incy);
228 void eblas_zaxpy(int N, const complex& a, const complex* x, int incx, complex* y, int incy);
230 complex eblas_zdotc(int N, const complex* x, int incx, const complex* y, int incy);
232 double eblas_ddot(int N, const double* x, int incx, const double* y, int ncy);
234 double eblas_dznrm2(int N, const complex* x, int incx);
236 double eblas_dnrm2(int N, const double* x, int incx);
237 
238 //Wrappers/alternate names for CUBLAS functions (auto-selectable with Cblas ones above using callPref)
239 #ifdef GPU_ENABLED
240 template<typename T> void eblas_copy_gpu(T* dest, const T* src, int N) { cudaMemcpy(dest, src, N*sizeof(T), cudaMemcpyDeviceToDevice); }
243 void eblas_zero_gpu(int N, double* x);
245 void eblas_zero_gpu(int N, complex* x);
247 #define eblas_dscal_gpu cublasDscal
248 void eblas_zdscal_gpu(int N, double a, complex* x, int incx);
251 void eblas_zscal_gpu(int N, const complex& a, complex* x, int incx);
253 #define eblas_daxpy_gpu cublasDaxpy
254 void eblas_zaxpy_gpu(int N, const complex& a, const complex* x, int incx, complex* y, int incy);
257 complex eblas_zdotc_gpu(int N, const complex* x, int incx, const complex* y, int incy);
259 #define eblas_ddot_gpu cublasDdot
260 double eblas_dznrm2_gpu(int N, const complex* x, int incx);
263 #define eblas_dnrm2_gpu cublasDnrm2
264 
265 #endif
266 
268 #ifdef GPU_ENABLED
269  #define callPref(functionName) functionName##_gpu //gpu versions available, so use them preferentially
270 #else
271  #define callPref(functionName) functionName //only cpu version available
272 #endif
273 
274 
282 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
283 #ifdef GPU_ENABLED
284 void eblas_capMinMax_gpu(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
286 #endif
287 
288 
289 //###################################################################################################
290 //#### Implementation ####
291 //##########################
292 
294 
295 //Elementwise multiply implementation:
296 template<typename Ty, typename Tx>
297 void eblas_mul_sub(size_t iMin, size_t iMax, const Tx* X, const int incX, Ty* Y, const int incY)
298 { for(size_t i=iMin; i<iMax; i++) Y[incY*i] *= X[incX*i];
299 }
300 
301 template<typename Ty, typename Tx>
302 void eblas_mul(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
303 { if(incY==0) die("incY cannot be = 0")
304  threadLaunch((N<100000) ? 1 : 0, //force single threaded for small problem sizes
305  eblas_mul_sub<Ty,Tx>, N, X, incX, Y, incY);
306 }
307 
308 //Elementwise divide implementation:
309 template<typename Ty, typename Tx>
310 void eblas_div_sub(size_t iMin, size_t iMax, const Tx* X, const int incX, Ty* Y, const int incY)
311 { for(size_t i=iMin; i<iMax; i++) Y[incY*i] /= X[incX*i];
312 }
313 template<typename Ty, typename Tx>
314 void eblas_div(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
315 { if(incY==0) die("incY cannot be = 0")
316  threadLaunch((N<100000) ? 1 : 0, //force single threaded for small problem sizes
317  eblas_div_sub<Ty,Tx>, N, X, incX, Y, incY);
318 }
320 
321 #endif // JDFTX_CORE_BLASEXTRA_H
void eblas_zmul_gpu(const int N, const complex *X, const int incX, complex *Y, const int incY)
Equivalent of eblas_zmul() for GPU data pointers.
void eblas_zdscal(int N, double a, complex *x, int incx)
Scale a complex array by a real scale factor: threaded wrapper to the cblas_zdscal BLAS1 function...
void eblas_scatter_daxpy_gpu(const int Nindex, double a, const int *index, const double *x, double *y)
Equivalent of eblas_scatter_daxpy() for GPU data pointers.
void eblas_capMinMax_gpu(const int N, double *x, double &xMin, double &xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX)
Equivalent of eblas_capMinMax() for GPU data pointers.
void eblas_zmul(const int N, const complex *X, const int incX, complex *Y, const int incY)
Specialization of eblas_mul() for complex[] *= complex[].
Definition: BlasExtra.h:53
void eblas_zaxpy(int N, const complex &a, const complex *x, int incx, complex *y, int incy)
Scaled-accumulate on complex arrays: threaded wrapper to the cblas_zaxpy BLAS1 function.
void eblas_accumProd(int N, const double &a, const complex *xU, const complex *xC, double *yRe, double *yIm)
Accumulate elementwise product of two complex arrays xU and xC into real and imaginary parts yRe and ...
void eblas_dmul_gpu(const int N, const double *X, const int incX, double *Y, const int incY)
Equivalent of eblas_dmul() for GPU data pointers.
void eblas_accumProd_gpu(int N, const double &a, const complex *xU, const complex *xC, double *yRe, double *yIm)
Equivalent of eblas_accumProd() for GPU data pointers.
void eblas_ddiv(const int N, const double *X, const int incX, double *Y, const int incY)
Specialization of eblas_div() for double[] /= double[].
Definition: BlasExtra.h:78
void eblas_zmuld_gpu(const int N, const double *X, const int incX, complex *Y, const int incY)
Equivalent of eblas_zmuld() for GPU data pointers.
double eblas_ddot(int N, const double *x, int incx, const double *y, int ncy)
Dot product of real arrays: threaded wrapper to the cblas_ddot BLAS1 function.
void eblas_zdiv_gpu(const int N, const complex *X, const int incX, complex *Y, const int incY)
Equivalent of eblas_zdiv() for GPU data pointers.
complex eblas_zdotc(int N, const complex *x, int incx, const complex *y, int incy)
Dot product of complex arrays: threaded wrapper to the cblas_zdotc BLAS1 function.
void eblas_zdiv(const int N, const complex *X, const int incX, complex *Y, const int incY)
Specialization of eblas_div() for complex[] /= complex[].
Definition: BlasExtra.h:80
void eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K, const complex &alpha, const complex *A, const int lda, const complex *B, const int ldb, const complex &beta, complex *C, const int ldc)
Wrap cublasZgemm to provide the same interface as eblas_zgemm()
void eblas_copy_gpu(T *dest, const T *src, int N)
Equivalent of eblas_copy() for GPU data pointers.
Definition: BlasExtra.h:241
void eblas_gather_zdaxpy(const int Nindex, double a, const int *index, const complex *x, complex *y, bool conjugate=false)
Gather y += a * x(index)
void eblas_scatter_daxpy(const int Nindex, double a, const int *index, const double *x, double *y)
Equivalent of eblas_scatter_zdaxpy() for real data arrays.
void eblas_zaxpy_gpu(int N, const complex &a, const complex *x, int incx, complex *y, int incy)
Equivalent of eblas_zaxpy() for GPU data pointers.
void eblas_div(const int N, const Tx *X, const int incX, Ty *Y, const int incY)
Templated elementwise divide Y /= X for arrays X, Y.
void threadLaunch(int nThreads, Callable *func, size_t nJobs, Args...args)
A simple utility for running muliple threads.
void eblas_gather_zaxpy(const int Nindex, complex a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_gather_zdaxpy() with a complex scale factor.
void eblas_zdscal_gpu(int N, double a, complex *x, int incx)
Equivalent of eblas_zdscal() for GPU data pointers.
void eblas_zscal_gpu(int N, const complex &a, complex *x, int incx)
Equivalent of eblas_zscal for GPU data pointers.
void eblas_gather_zaxpy_gpu(const int Nindex, complex a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_gather_zaxpy() for GPU data pointers.
void eblas_symmetrize(int N, int n, const int *symmIndex, double *x)
Symmetrize an array x, using N n-fold equivalence classes in symmIndex.
void eblas_scatter_zaxpy_gpu(const int Nindex, complex a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_scatter_zaxpy() for GPU data pointers.
void eblas_lincomb_gpu(const int N, const complex &sX, const complex *X, const int incX, const complex &sY, const complex *Y, const int incY, complex *Z, const int incZ)
Equivalent of eblas_lincomb() for GPU data pointers.
void eblas_accumNorm(int N, const double &a, const complex *x, double *y)
Accumulate elementwise norm of a complex array x into y i.e. y += a x conj(x)
void eblas_dscal(int N, double a, double *x, int incx)
Scale a real array: threaded wrapper to the cblas_dscal BLAS1 function.
void eblas_zgemm(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K, const complex &alpha, const complex *A, const int lda, const complex *B, const int ldb, const complex &beta, complex *C, const int ldc)
Threaded complex matrix multiply (threaded wrapper around zgemm) All the parameters have the same mea...
Utilities for threading (wrappers around std::thread)
complex eblas_zdotc_gpu(int N, const complex *x, int incx, const complex *y, int incy)
Equivalent of eblas_zdotc() for GPU data pointers.
void eblas_zmuld(const int N, const double *X, const int incX, complex *Y, const int incY)
Specialization of eblas_mul() for complex[] *= double[].
Definition: BlasExtra.h:55
void eblas_symmetrize_gpu(int N, int n, const int *symmIndex, double *x)
Equivalent of eblas_symmetrize() for real GPU data pointers.
void eblas_zscal(int N, const complex &a, complex *x, int incx)
Scale a complex array by a complex scale factor: threaded wrapper to the cblas_zscal BLAS1 function...
void eblas_gather_daxpy(const int Nindex, double a, const int *index, const double *x, double *y)
Equivalent of eblas_scatter_zdaxpy() for real data arrays.
void eblas_gather_zdaxpy_gpu(const int Nindex, double a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_gather_zdaxpy() for GPU data pointers.
void eblas_zdivd(const int N, const double *X, const int incX, complex *Y, const int incY)
Specialization of eblas_div() for complex[] /= double[].
Definition: BlasExtra.h:82
void eblas_scatter_zdaxpy_gpu(const int Nindex, double a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_scatter_zdaxpy() for GPU data pointers.
double eblas_dnrm2(int N, const double *x, int incx)
2-norm of a real array: threaded wrapper to the cblas_dnrm2 BLAS1 function
void eblas_dmul(const int N, const double *X, const int incX, double *Y, const int incY)
Specialization of eblas_mul() for double[] *= double[].
Definition: BlasExtra.h:51
void eblas_scatter_zdaxpy(const int Nindex, double a, const int *index, const complex *x, complex *y, bool conjugate=false)
Scatter y(index) += a * x.
#define die(...)
Quit with an error message (formatted using printf()). Must be called from all processes.
Definition: Util.h:114
void eblas_ddiv_gpu(const int N, const double *X, const int incX, double *Y, const int incY)
Equivalent of eblas_ddiv() for GPU data pointers.
double eblas_dznrm2_gpu(int N, const complex *x, int incx)
Equivalent of eblas_dznrm2() for GPU data pointers.
void eblas_zdivd_gpu(const int N, const double *X, const int incX, complex *Y, const int incY)
Equivalent of eblas_zdivd() for GPU data pointers.
void eblas_accumNorm_gpu(int N, const double &a, const complex *x, double *y)
Equivalent of eblas_accumNorm() for GPU data pointers.
void eblas_copy(T *dest, const T *src, int N)
Copy a data array.
Definition: BlasExtra.h:212
void eblas_zero_gpu(int N, double *x)
Equivalent of eblas_zero() for GPU data pointers.
Complex number (need to define our own because we need operators for gpu code as well) ...
Definition: scalar.h:49
void eblas_mul(const int N, const Tx *X, const int incX, Ty *Y, const int incY)
Templated elementwise multiply Y *= X for arrays X, Y.
void eblas_lincomb(const int N, const complex &sX, const complex *X, const int incX, const complex &sY, const complex *Y, const int incY, complex *Z, const int incZ)
Elementwise linear combination Z = sX * X + sY * Y.
void eblas_daxpy(int N, double a, const double *x, int incx, double *y, int incy)
Scaled-accumulate on real arrays: threaded wrapper to the cblas_daxpy BLAS1 function.
void eblas_gather_daxpy_gpu(const int Nindex, double a, const int *index, const double *x, double *y)
Equivalent of eblas_gather_daxpy() for GPU data pointers.
void eblas_zero(int N, double *x)
Zero a data array.
double eblas_dznrm2(int N, const complex *x, int incx)
2-norm of a complex array: threaded wrapper to the cblas_dznrm2 BLAS1 function
void eblas_scatter_zaxpy(const int Nindex, complex a, const int *index, const complex *x, complex *y, bool conjugate=false)
Equivalent of eblas_scatter_zdaxpy() with a complex scale factor.
void eblas_capMinMax(const int N, double *x, double &xMin, double &xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX)
Find the minimum and maximum of a data array and optionally cap it from above and/or below...