#ifndef PARGEMSLR_PARALLEL_H
#define PARGEMSLR_PARALLEL_H

#ifdef PARGEMSLR_OPENMP

#include <cuda_runtime.h>
#include "cublas_v2.h"

#define PARGEMSLR_TIMES_NUM 60
#define PARGEMSLR_BUILDTIME_PARTITION 0 // time for the entire partitioning
#define PARGEMSLR_BUILDTIME_IE 1 // time for splitting the interior and external nodes
#define PARGEMSLR_BUILDTIME_METIS 2 // time for calling METIS
#define PARGEMSLR_BUILDTIME_STRUCTURE 3 // time for building the structure once the number of domains is known
#define PARGEMSLR_BUILDTIME_RCM 4 // time for applying the RCM ordering
#define PARGEMSLR_BUILDTIME_ILUT 5 // time for the ILUT factorization
#define PARGEMSLR_BUILDTIME_LRC 6 // time for building the low-rank correction
#define PARGEMSLR_BUILDTIME_ARNOLDI 7 // time for the standard Arnoldi
#define PARGEMSLR_BUILDTIME_BUILD_RES 8 // time for building the result of the Arnoldi
#define PARGEMSLR_BUILDTIME_SOLVELU 9 // time for the ILU solve in the setup phase
#define PARGEMSLR_BUILDTIME_SOLVELU_L 10 // time for the ILU solve in the setup phase on the last level
#define PARGEMSLR_BUILDTIME_SOLVELR 11 // time for applying the low-rank correction in the setup phase
#define PARGEMSLR_BUILDTIME_SOLVEEBFC 12 // time for the solve with EB^{-1}FC^{-1}
#define PARGEMSLR_BUILDTIME_EXTRACTMAT 13 // time for extracting E, B, F, and C on the first level
#define PARGEMSLR_BUILDTIME_MOVEDATA 14 // time for moving data between levels
#define PARGEMSLR_BUILDTIME_LOCALPERM 15 // time for the local permutation
#define PARGEMSLR_BUILDTIME_EMV 16 // time for matvec with E on all levels
#define PARGEMSLR_BUILDTIME_FMV 17 // time for matvec with F on all levels
#define PARGEMSLR_BUILDTIME_GEN_MAT 18 // time for generating the matrix
#define PARGEMSLR_BUILDTIME_DECOMP 19 // time for decompositions (Hess, Schur, eig, ordschur, ...)
#define PARGEMSLR_BUILDTIME_MGS 20 // time for MGS in the Arnoldi
#define PARGEMSLR_BUILDTIME_EBFC 21 // time for EB^{-1}FC^{-1} in the Arnoldi
#define PARGEMSLR_PRECTIME_PRECOND 30 // time for applying the preconditioner
#define PARGEMSLR_PRECTIME_ILUT 31 // time for the ILU solve in the solve phase
#define PARGEMSLR_PRECTIME_ILUT_L 32 // time for the ILU solve in the solve phase on the last level
#define PARGEMSLR_PRECTIME_LRC 33 // time for applying the low-rank correction
#define PARGEMSLR_PRECTIME_EMV 34 // time for matvec with E on all levels
#define PARGEMSLR_PRECTIME_FMV 35 // time for matvec with F on all levels
#define PARGEMSLR_PRECTIME_INNER 36 // time for the inner iteration
#define PARGEMSLR_PRECTIME_MOVEDATA 37 // time for moving data between levels
#define PARGEMSLR_PRECTIME_LOCALPERM 38 // time for the local permutation
#define PARGEMSLR_ITERTIME_AMV 40 // time for matvec with A
#define PARGEMSLR_ITERTIME_MGS 41 // time for MGS during the solve
#define PARGEMSLR_TOTAL_GEN_MAT_TIME 50 // time for transferring data to the device
#define PARGEMSLR_TOTAL_SETUP_TIME 51 // time for the setup phase
#define PARGEMSLR_TOTAL_SOLVE_TIME 52 // time for the solve phase
#define PARGEMSLR_BUILDTIME_BMV 53 // time for matvec with B on all levels
#define PARGEMSLR_BUILDTIME_CMV 54 // time for matvec with C on all levels
#define PARGEMSLR_BUILDTIME_SMV 55 // time for matvec with S on all levels
#define PARGEMSLR_PRECTIME_BMV 56 // time for matvec with B on all levels
#define PARGEMSLR_PRECTIME_CMV 57 // time for matvec with C on all levels
#define PARGEMSLR_PRECTIME_SMV 58 // time for matvec with S on all levels
#define PARGEMSLR_GLOBAL_FIRM_TIME_CALL(num, ...) {\
   pargemslr::PargemslrMpiTime( (*(pargemslr::ParallelLogClass::_gcomm)), pargemslr::ParallelLogClass::_times_buffer_start[num]);\
   __VA_ARGS__; /* execute the call being timed */\
   pargemslr::PargemslrMpiTime( (*(pargemslr::ParallelLogClass::_gcomm)), pargemslr::ParallelLogClass::_times_buffer_end[num]);\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
#define PARGEMSLR_LOCAL_FIRM_TIME_CALL(num, ...) {\
   PARGEMSLR_CUDA_SYNCHRONIZE;\
   pargemslr::ParallelLogClass::_times_buffer_start[num] = MPI_Wtime();\
   __VA_ARGS__; /* execute the call being timed */\
   PARGEMSLR_CUDA_SYNCHRONIZE;\
   pargemslr::ParallelLogClass::_times_buffer_end[num] = MPI_Wtime();\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
#define PARGEMSLR_FIRM_TIME_CALL(comm,num, ...) {\
   pargemslr::PargemslrMpiTime( (comm), pargemslr::ParallelLogClass::_times_buffer_start[num]);\
   __VA_ARGS__; /* execute the call being timed */\
   pargemslr::PargemslrMpiTime( (comm), pargemslr::ParallelLogClass::_times_buffer_end[num]);\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
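
/*
   Illustrative usage sketch (not part of the original header): the "firm" timing
   macros wrap a call, time it, and accumulate the elapsed time into the given
   slot; "precond" and "A" are hypothetical.

      PARGEMSLR_GLOBAL_FIRM_TIME_CALL( PARGEMSLR_TOTAL_SETUP_TIME, precond.Setup(A) );
*/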
#ifdef PARGEMSLR_TIMING

#define PARGEMSLR_PRINT_TIMING_RESULT(print_level, ...) {\
   PARGEMSLR_PRINT("\n");\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("Time info:\n");\
   PARGEMSLR_PRINT("\tLoad matrix time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_GEN_MAT_TIME]);\
   PARGEMSLR_PRINT("\tPartition time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_PARTITION]+pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_STRUCTURE]);\
   PARGEMSLR_PRINT("\tSetup time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SETUP_TIME]-pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_PARTITION]-pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_STRUCTURE]);\
   PARGEMSLR_PRINT("\tSolve time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SOLVE_TIME]);\
   PARGEMSLR_PRINT("\tTotal time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SETUP_TIME]+pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SOLVE_TIME]);\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("\n");\
   PARGEMSLR_PRINT("\n");\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("Time detail:\n");\
   PARGEMSLR_PRINT("\tMatvec with A time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_ITERTIME_AMV]);\
   PARGEMSLR_PRINT("\tPrecond setup time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SETUP_TIME]);\
   PARGEMSLR_PRINT("\t-GeMSLR reordering time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_PARTITION]);\
   PARGEMSLR_PRINT("\t-GeMSLR Setup Structure time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_STRUCTURE]);\
   PARGEMSLR_PRINT("\t-GeMSLR ILU setup time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_ILUT]);\
   PARGEMSLR_PRINT("\t--GeMSLR ILU reordering time: %fs - (note: this is the time on p0.)\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_RCM]);\
   PARGEMSLR_PRINT("\t-GeMSLR low-rank setup time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_LRC]);\
   PARGEMSLR_PRINT("\t--GeMSLR arnoldi iter time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_ARNOLDI]);\
   PARGEMSLR_PRINT("\t---GeMSLR MGS time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_MGS]);\
   PARGEMSLR_PRINT("\t---GeMSLR EB^{-1}FC^{-1} time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_EBFC]);\
   PARGEMSLR_PRINT("\t---GeMSLR setup ILU solve time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_SOLVELU]);\
   PARGEMSLR_PRINT("\t---GeMSLR setup ILU solve last lev: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_SOLVELU_L]);\
   PARGEMSLR_PRINT("\t---GeMSLR setup LRC apply time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_SOLVELR]);\
   PARGEMSLR_PRINT("\t---GeMSLR setup sparse matvec time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_EMV]+pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_FMV]);\
   PARGEMSLR_PRINT("\t--GeMSLR build result time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_BUILD_RES]);\
   PARGEMSLR_PRINT("\t--GeMSLR Lapack Dcomp time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_DECOMP]);\
   PARGEMSLR_PRINT("\tPrecond applying time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_PRECOND]);\
   PARGEMSLR_PRINT("\t-GeMSLR ILU solve time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_ILUT]);\
   PARGEMSLR_PRINT("\t-GeMSLR ILU solve last lev: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_ILUT_L]);\
   PARGEMSLR_PRINT("\t-GeMSLR sparse matvec time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_EMV]+pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_FMV]\
      +pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_BMV]+pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_SMV]+pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_CMV]);\
   PARGEMSLR_PRINT("\t-GeMSLR LRC apply time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_PRECTIME_LRC]);\
   PARGEMSLR_PRINT("\tIterative solve MGS time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_ITERTIME_MGS]);\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("\n");\

#define PARGEMSLR_GLOBAL_TIME_CALL(num, ...) {\
   pargemslr::PargemslrMpiTime( (*(pargemslr::ParallelLogClass::_gcomm)), pargemslr::ParallelLogClass::_times_buffer_start[num]);\
   __VA_ARGS__; /* execute the call being timed */\
   pargemslr::PargemslrMpiTime( (*(pargemslr::ParallelLogClass::_gcomm)), pargemslr::ParallelLogClass::_times_buffer_end[num]);\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
#define PARGEMSLR_LOCAL_TIME_CALL(num, ...) {\
   PARGEMSLR_CUDA_SYNCHRONIZE;\
   pargemslr::ParallelLogClass::_times_buffer_start[num] = MPI_Wtime();\
   __VA_ARGS__; /* execute the call being timed */\
   PARGEMSLR_CUDA_SYNCHRONIZE;\
   pargemslr::ParallelLogClass::_times_buffer_end[num] = MPI_Wtime();\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
#define PARGEMSLR_TIME_CALL(comm,num, ...) {\
   pargemslr::PargemslrMpiTime( (comm), pargemslr::ParallelLogClass::_times_buffer_start[num]);\
   __VA_ARGS__; /* execute the call being timed */\
   pargemslr::PargemslrMpiTime( (comm), pargemslr::ParallelLogClass::_times_buffer_end[num]);\
   pargemslr::ParallelLogClass::_times[num] += pargemslr::ParallelLogClass::_times_buffer_end[num] - pargemslr::ParallelLogClass::_times_buffer_start[num];\
}
#define PARGEMSLR_RESET_TIME std::fill(pargemslr::ParallelLogClass::_times.begin(), pargemslr::ParallelLogClass::_times.end(), 0.0);

/* Without PARGEMSLR_TIMING the timing macros simply execute the wrapped call (assumed fallback behavior). */

#define PARGEMSLR_GLOBAL_TIME_CALL(num, ...) {\
   __VA_ARGS__;\
}

#define PARGEMSLR_LOCAL_TIME_CALL(num, ...) {\
   __VA_ARGS__;\
}

#define PARGEMSLR_TIME_CALL(comm,num, ...) {\
   __VA_ARGS__;\
}

#define PARGEMSLR_PRINT_TIMING_RESULT(print_level, ...) {\
   PARGEMSLR_PRINT("\n");\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("Time info:\n");\
   PARGEMSLR_PRINT("\tLoad matrix time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_GEN_MAT_TIME]);\
   PARGEMSLR_PRINT("\tPartition time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_PARTITION]);\
   PARGEMSLR_PRINT("\tSetup time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SETUP_TIME]-pargemslr::ParallelLogClass::_times[PARGEMSLR_BUILDTIME_PARTITION]);\
   PARGEMSLR_PRINT("\tSolve time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SOLVE_TIME]);\
   PARGEMSLR_PRINT("\tTotal time: %fs\n",pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SETUP_TIME]+pargemslr::ParallelLogClass::_times[PARGEMSLR_TOTAL_SOLVE_TIME]);\
   PargemslrPrintDashLine(pargemslr::pargemslr_global::_dash_line_width);\
   PARGEMSLR_PRINT("\n");\

#define PARGEMSLR_RESET_TIME
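
/*
   Illustrative usage sketch (not part of the original header): with PARGEMSLR_TIMING
   enabled, the non-firm macros accumulate into the same slots and a summary can be
   printed at the end; "precond", "A", and "print_level" are hypothetical.

      PARGEMSLR_RESET_TIME
      PARGEMSLR_GLOBAL_TIME_CALL( PARGEMSLR_TOTAL_SETUP_TIME, precond.Setup(A) );
      PARGEMSLR_PRINT_TIMING_RESULT( print_level );
*/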
#ifdef PARGEMSLR_CUDA

      /* Device handles and descriptors shared by the library (CUDA build only). */
      static curandGenerator_t      _curand_gen;               /* cuRAND random number generator */
      static cublasHandle_t         _cublas_handle;            /* cuBLAS handle */
      static cusparseHandle_t       _cusparse_handle;          /* cuSPARSE handle */
      static cudaStream_t           _stream;                   /* CUDA stream used by the library */
      static cusparseIndexBase_t    _cusparse_idx_base;        /* index base for cuSPARSE (0- or 1-based) */
      static cusparseMatDescr_t     _mat_des;                  /* general matrix descriptor */
      static cusparseMatDescr_t     _matL_des;                 /* lower-triangular matrix descriptor */
      static cusparseMatDescr_t     _matU_des;                 /* upper-triangular matrix descriptor */
      static cusparseSolvePolicy_t  _ilu_solve_policy;         /* solve policy for the ILU triangular solves */
      static void                   *_cusparse_buffer;         /* working buffer for cuSPARSE routines */
      static size_t                 _cusparse_buffer_length;   /* length of the cuSPARSE working buffer */

#if (PARGEMSLR_CUDA_VERSION == 11)
      static cusparseIndexType_t    _cusparse_idx_type;        /* index type for the CUDA 11 generic cuSPARSE API */
      static cusparseSpMVAlg_t      _cusparse_spmv_algorithm;  /* SpMV algorithm for the CUDA 11 generic cuSPARSE API */
} parallel_log, *parallel_logp;
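
/*
   Illustrative usage sketch (not part of the original header): the typedef above
   exposes the global parallel information through static members, e.g.

      if( pargemslr::parallel_log::_grank == 0 )
      {
         PARGEMSLR_PRINT("Running on %d MPI ranks\n", pargemslr::parallel_log::_gsize);
      }
*/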
int PargemslrSetOpenmpNumThreads(int nthreads);

#ifdef PARGEMSLR_OPENMP

int PargemslrGetOpenmpThreadNum();

int PargemslrGetOpenmpNumThreads();

int PargemslrGetOpenmpMaxNumThreads();

int PargemslrGetOpenmpGlobalMaxNumThreads();
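
/*
   Illustrative usage sketch (not part of the original header): set the number of
   OpenMP threads used by the library, then query the current limit.

      pargemslr::PargemslrSetOpenmpNumThreads(4);
      int max_threads = pargemslr::PargemslrGetOpenmpMaxNumThreads(); // OpenMP builds only
*/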
int PargemslrNLocalToNGlobal(int n_local, long int &n_start, long int &n_global, MPI_Comm &comm);

int PargemslrNLocalToNGlobal(int nrow_local, int ncol_local, long int &nrow_start, long int &ncol_start, long int &nrow_global, long int &ncol_global, MPI_Comm &comm);
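
/*
   Illustrative usage sketch (assumed semantics, not part of the original header):
   obtain the global problem size and this rank's starting offset from a local size.

      int      n_local = 1000;
      long int n_start, n_global;
      MPI_Comm comm    = MPI_COMM_WORLD;
      pargemslr::PargemslrNLocalToNGlobal( n_local, n_start, n_global, comm);
*/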
int PargemslrInit(int *argc, char ***argv);

int PargemslrInitMpi(MPI_Comm comm);

int PargemslrInitOpenMP(int nthreads);

int PargemslrInitCUDA();

int PargemslrPrintParallelInfo();

int PargemslrFinalize();

int PargemslrFinalizeMpi();

int PargemslrFinalizeOpenMP();

int PargemslrFinalizeCUDA();

int PargemslrMpiTime(MPI_Comm comm, double &t);
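
/*
   Illustrative usage sketch (not part of the original header): a typical driver
   initializes the library, times a region, and shuts everything down.

      int main(int argc, char **argv)
      {
         pargemslr::PargemslrInit(&argc, &argv);
         pargemslr::PargemslrPrintParallelInfo();
         double t0, t1;
         pargemslr::PargemslrMpiTime(MPI_COMM_WORLD, t0);
         // ... setup and solve ...
         pargemslr::PargemslrMpiTime(MPI_COMM_WORLD, t1);
         pargemslr::PargemslrFinalize();
         return 0;
      }
*/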
/* Typed wrappers around MPI_Isend / MPI_Irecv / MPI_Send / MPI_Recv. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiIsend(T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
int PargemslrMpiIrecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
int PargemslrMpiSend(T *buf, int count, int dest, int tag, MPI_Comm comm);

template <typename T>
int PargemslrMpiRecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Status *status);
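
/*
   Illustrative usage sketch (not part of the original header): exchange buffers
   with a neighboring rank through the typed wrappers; "nbr" is a hypothetical
   neighbor rank.

      std::vector<double> sendbuf(100), recvbuf(100);
      MPI_Request         requests[2];
      pargemslr::PargemslrMpiIsend( sendbuf.data(), 100, nbr, 0, MPI_COMM_WORLD, &requests[0]);
      pargemslr::PargemslrMpiIrecv( recvbuf.data(), 100, nbr, 0, MPI_COMM_WORLD, &requests[1]);
      MPI_Waitall( 2, requests, MPI_STATUSES_IGNORE);
*/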
template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiIsend(T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiIsend(T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiIrecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiIrecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Request *request);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiSend(T *buf, int count, int dest, int tag, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiSend(T *buf, int count, int dest, int tag, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiRecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Status *status);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiRecv(T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Status *status);
/* Typed wrapper around MPI_Bcast. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiBcast(T *buf, int count, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiBcast(T *buf, int count, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiBcast(T *buf, int count, int root, MPI_Comm comm);
/* Typed wrapper around MPI_Scan. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiScan(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiScan(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiScan(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);
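
/*
   Illustrative usage sketch (not part of the original header, assuming MPI_Scan
   semantics): an inclusive scan of the local sizes yields each rank's starting offset.

      int n_local = 1000, n_scan;
      pargemslr::PargemslrMpiScan( &n_local, &n_scan, 1, MPI_SUM, MPI_COMM_WORLD);
      int n_start = n_scan - n_local; // first global index owned by this rank
*/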
/* Typed wrapper around MPI_Reduce. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiReduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiReduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiReduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm);
/* Typed wrapper around MPI_Allreduce. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiAllreduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllreduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllreduce(T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm);
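
/*
   Illustrative usage sketch (not part of the original header): reduce a local dot
   product to a global one on every rank.

      double local_dot = 0.0, global_dot = 0.0;
      pargemslr::PargemslrMpiAllreduce( &local_dot, &global_dot, 1, MPI_SUM, MPI_COMM_WORLD);
*/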
/* In-place variant of the MPI_Allreduce wrapper. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiAllreduceInplace(T *buf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllreduceInplace(T *buf, int count, MPI_Op op, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllreduceInplace(T *buf, int count, MPI_Op op, MPI_Comm comm);
/* Typed wrapper around MPI_Gather. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiGather(T *sendbuf, int count, T *recvbuf, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiGather(T *sendbuf, int count, T *recvbuf, int root, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiGather(T *sendbuf, int count, T *recvbuf, int root, MPI_Comm comm);
/* Typed wrapper around MPI_Allgather. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiAllgather(T *sendbuf, int count, T *recvbuf, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllgather(T *sendbuf, int count, T *recvbuf, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllgather(T *sendbuf, int count, T *recvbuf, MPI_Comm comm);
/* Typed wrapper around MPI_Allgatherv. */
#ifdef MPI_C_FLOAT_COMPLEX

template <typename T>
int PargemslrMpiAllgatherv(T *sendbuf, int count, T *recvbuf, int *recvcounts, int *recvdisps, MPI_Comm comm);

template <typename T>
typename std::enable_if<!PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllgatherv(T *sendbuf, int count, T *recvbuf, int *recvcounts, int *recvdisps, MPI_Comm comm);

template <typename T>
typename std::enable_if<PargemslrIsComplex<T>::value, int>::type
PargemslrMpiAllgatherv(T *sendbuf, int count, T *recvbuf, int *recvcounts, int *recvdisps, MPI_Comm comm);
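
/*
   Illustrative usage sketch (not part of the original header): gather variable-sized
   local arrays from all ranks; "np", "n_local", and "sendbuf" are hypothetical.

      std::vector<int> recvcounts(np), recvdisps(np);
      pargemslr::PargemslrMpiAllgather( &n_local, 1, recvcounts.data(), MPI_COMM_WORLD);
      recvdisps[0] = 0;
      for(int i = 1 ; i < np ; i ++)
      {
         recvdisps[i] = recvdisps[i-1] + recvcounts[i-1];
      }
      std::vector<double> recvbuf(recvdisps[np-1] + recvcounts[np-1]);
      pargemslr::PargemslrMpiAllgatherv( sendbuf.data(), n_local, recvbuf.data(),
                                         recvcounts.data(), recvdisps.data(), MPI_COMM_WORLD);
*/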
#ifdef PARGEMSLR_CUDA

/* Synchronize with the device (CUDA builds only). */
int PargemslrCudaSynchronize();

/* Map a C++ element type to the corresponding MPI_Datatype. */
template <typename T>
MPI_Datatype PargemslrMpiDataType();

template<>
MPI_Datatype PargemslrMpiDataType<int>();

template<>
MPI_Datatype PargemslrMpiDataType<long int>();

template<>
MPI_Datatype PargemslrMpiDataType<float>();

template<>
MPI_Datatype PargemslrMpiDataType<double>();

template<>
MPI_Datatype PargemslrMpiDataType<complexs>();

template<>
MPI_Datatype PargemslrMpiDataType<complexd>();
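
/*
   Illustrative usage sketch (not part of the original header): the datatype helper
   maps a C++ type to its MPI datatype, which is convenient when calling MPI
   directly; "buf" and "n" are hypothetical.

      MPI_Datatype dtype = pargemslr::PargemslrMpiDataType<double>();
      MPI_Bcast( buf, n, dtype, 0, MPI_COMM_WORLD);
*/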
#ifdef PARGEMSLR_OPENMP

#define PARGEMSLR_OPENMP_SCHEDULE_DEFAULT schedule(static)
#define PARGEMSLR_OPENMP_SCHEDULE_STATIC schedule(static)
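
/*
   Illustrative usage sketch (not part of the original header): the schedule macros
   are meant to be spliced into OpenMP pragmas.

      #pragma omp parallel for PARGEMSLR_OPENMP_SCHEDULE_DEFAULT
      for(int i = 0 ; i < n ; i ++)
      {
         y[i] += x[i]; // "x", "y", and "n" are hypothetical
      }
*/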
#ifdef PARGEMSLR_DEBUG

#define PARGEMSLR_MPI_CALL(...) {\
   assert( (__VA_ARGS__) == MPI_SUCCESS);\
}

#else

#define PARGEMSLR_MPI_CALL(...) {\
   (__VA_ARGS__);\
}

#endif
#ifdef PARGEMSLR_CUDA

#ifndef PARGEMSLR_CUDA_VERSION
#define PARGEMSLR_CUDA_VERSION 11
#endif

#define PARGEMSLR_CUDA_SYNCHRONIZE PargemslrCudaSynchronize();
#ifdef PARGEMSLR_DEBUG

#define PARGEMSLR_CUDA_CALL(...) {\
   assert( (__VA_ARGS__) == cudaSuccess);\
}

#define PARGEMSLR_CURAND_CALL(...) {\
   assert( (__VA_ARGS__) == CURAND_STATUS_SUCCESS);\
}

#define PARGEMSLR_CUBLAS_CALL(...) {\
   assert( (__VA_ARGS__) == CUBLAS_STATUS_SUCCESS);\
}

#define PARGEMSLR_CUSPARSE_CALL(...) {\
   assert( (__VA_ARGS__) == CUSPARSE_STATUS_SUCCESS);\
}
#else

#define PARGEMSLR_CUDA_CALL(...) {\
   (__VA_ARGS__);\
}

#define PARGEMSLR_CURAND_CALL(...) {\
   (__VA_ARGS__);\
}

#define PARGEMSLR_CUBLAS_CALL(...) {\
   (__VA_ARGS__);\
}

#define PARGEMSLR_CUSPARSE_CALL(...) {\
   (__VA_ARGS__);\
}

#endif
#define PARGEMSLR_THRUST_CALL(thrust_function, ...) thrust::thrust_function( __VA_ARGS__)

#else

#define PARGEMSLR_CUDA_SYNCHRONIZE

#endif
#define PARGEMSLR_GLOBAL_SEQUENTIAL_RUN(...) {\
   for(int pgsri = 0 ; pgsri < pargemslr::parallel_log::_gsize ; pgsri++)\
   {\
      if( pargemslr::parallel_log::_grank == pgsri)\
      {\
         __VA_ARGS__;\
      }\
      MPI_Barrier(*(pargemslr::parallel_log::_gcomm));\
   }\
}
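
/*
   Illustrative usage sketch (not part of the original header): execute a block on
   one MPI rank at a time, e.g. for ordered debug output.

      PARGEMSLR_GLOBAL_SEQUENTIAL_RUN(
         PARGEMSLR_PRINT("Hello from rank %d\n", pargemslr::parallel_log::_grank);
      );
*/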