84 __typeof__(a) _a = (a); \
85 __typeof__(b) _b = (b); \
91 __typeof__(a) _a = (a); \
92 __typeof__(b) _b = (b); \
129 str_mess = (
char *) malloc(100 *
sizeof(
char));
130 if (error_number == 1)
132 "Error on line %d of %s. Toeplitz band width > vector size\n",
134 if (error_number == 2)
135 sprintf(str_mess,
"Error on line %d of %s. Bad allocation.\n", line,
137 if (error_number == 3)
139 "Error on line %d of %s. Error at fftw multithread "
142 if (error_number == 7)
143 sprintf(str_mess,
"Error on line %d of %s.\n", line, file);
144 fprintf(stderr,
"%s", str_mess);
145 printf(
"%s", str_mess);
186 min_pow2 = (int) ceil(log(min_bs) / log(2));
187 bs = pow(2, min_pow2);
192 }
else if (bs_flag == 3) {
194 min_pow2 = (int) ceil(log(min_bs) / log(2));
195 bs = pow(2, min_pow2);
200 }
else if (bs_flag == 4 || bs_flag == 0) {
202 min_pow2 = (int) ceil(log(min_bs) / log(2));
203 bs = pow(2, min_pow2);
208 }
else if (bs_flag == 5) {
211 while (bs < 2 * (lambda - 1) * log(bs + 1) && bs < n) { bs = bs * 2; }
213 }
else if (bs_flag == 6) {
218 min_pow2 = (int) ceil(log(min_bs) / log(2));
225 bs = pow(2, min_pow2);
238 printf(
"Error. Wrong value for bs_flag. Set to auto mode.\n");
240 min_pow2 = (int) ceil(log(min_bs) / log(2));
241 bs = pow(2, min_pow2);
249 printf(
"Computed optimal blocksize is %d (with lambda = %d)\n", bs,
269 if (flag_nfft == 0) nfft = NFFT_DEFAULT;
270 else if (flag_nfft == 1)
272 else if (flag_nfft == 2)
275 printf(
"Error. Wrong value for flag_nfft. Set to auto mode.\n");
299 fftw_complex **T_fft,
double *T, fftw_complex **V_fft,
300 double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
304 VERBOSE = flag_stgy.flag_verbose;
322 int n_thread = omp_get_max_threads();
329 flag_stgy.fixed_nfft);
333 printf(
"Using %d threads\n", n_thread);
334 printf(
"nfft = %d\n", *nfft);
338 int fftw_flag = flag_stgy.flag_fftw;
341 #ifdef fftw_MULTITHREADING
355 rhs_init_fftw(nfft, (*blocksize), V_fft, V_rfft, plan_f, plan_b, fftw_flag);
362 printf(
"Initialization finished successfully\n");
371 #ifdef fftw_MULTITHREADING
383 status = fftw_init_threads();
387 fftw_plan_with_nthreads(fftw_n_thread);
390 printf(
"Using multithreaded FFTW with %d threads\n", fftw_n_thread);
409 double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
412 *V_fft = (fftw_complex *) fftw_malloc((*nfft) * (fft_size / 2 + 1)
413 *
sizeof(fftw_complex));
414 *V_rfft = (
double *) fftw_malloc((*nfft) * fft_size *
sizeof(double));
415 if (*V_fft == 0 || *V_rfft == 0)
418 *plan_f = fftw_plan_many_dft_r2c(1, &fft_size, (*nfft), *V_rfft, &fft_size,
419 1, fft_size, *V_fft, NULL, 1,
420 fft_size / 2 + 1, fftw_flag);
421 *plan_b = fftw_plan_many_dft_c2r(1, &fft_size, (*nfft), *V_fft, NULL, 1,
422 fft_size / 2 + 1, *V_rfft, &fft_size, 1,
423 fft_size, fftw_flag);
442 fftw_complex **T_fft) {
445 int circ_fftw_flag = FFTW_ESTIMATE;
447 *T_fft = (fftw_complex *) fftw_malloc((fft_size / 2 + 1)
448 *
sizeof(fftw_complex));
450 double *T_circ = (
double *) (*T_fft);
454 plan_f_T = fftw_plan_dft_r2c_1d(fft_size, T_circ, *T_fft, circ_fftw_flag);
457 #pragma omp parallel for
458 for (i = 0; i < fft_size + 2; i++) T_circ[i] = 0.0;
461 for (i = 1; i < lambda; i++) {
463 T_circ[fft_size - i] = T[i];
466 fftw_execute(plan_f_T);
467 fftw_destroy_plan(plan_f_T);
485 int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft,
double **V_rfft,
486 fftw_plan *plan_f, fftw_plan *plan_b) {
487 fftw_destroy_plan(*plan_f);
488 fftw_destroy_plan(*plan_b);
492 #ifdef fftw_MULTITHREADING
493 fftw_cleanup_threads();
514 int copy_block(
int ninrow,
int nincol,
double *Vin,
int noutrow,
int noutcol,
515 double *Vout,
int inrow,
int incol,
int nblockrow,
int nblockcol,
516 int outrow,
int outcol,
double norm,
int set_zero_flag) {
517 int i, j, p, offsetIn, offsetOut;
520 if ((nblockcol > nincol) || (nblockrow > ninrow) || (nblockcol > noutcol)
521 || (nblockrow > noutrow)) {
522 printf(
"Error in routine copy_block. Bad size setup.\n");
527 #pragma omp parallel for
528 for (i = 0; i < noutrow * noutcol;
533 offsetIn = ninrow * incol + inrow;
534 offsetOut = noutrow * outcol + outrow;
538 for (i = 0; i < nblockcol * nblockrow; i++) {
541 Vout[offsetOut + j * noutrow + p] =
542 Vin[offsetIn + j * ninrow + p] * norm;
569 int scmm_direct(
int fft_size,
int nfft, fftw_complex *C_fft,
int ncol,
570 double *V_rfft,
double **CV, fftw_complex *V_fft,
571 fftw_plan plan_f_V, fftw_plan plan_b_CV) {
573 int sizeT = fft_size / 2 + 1;
577 fftw_execute(plan_f_V);
584 #pragma omp parallel for private(idx)
585 for (i = 0; i < ncol * sizeT; i++) {
587 V_fft[i][0] = C_fft[idx][0] * V_fft[i][0] - C_fft[idx][1] * V_fft[i][1];
588 V_fft[i][1] = C_fft[idx][0] * V_fft[i][1] + C_fft[idx][1] * V_fft[i][0];
613 fftw_execute(plan_b_CV);
651 int scmm_basic(
double **V,
int blocksize,
int m, fftw_complex *C_fft,
652 double **CV, fftw_complex *V_fft,
double *V_rfft,
int nfft,
653 fftw_plan plan_f_V, fftw_plan plan_b_CV) {
656 int nloop = (int) ceil((1.0 * m) / nfft);
659 int ncol = min(nfft, m);
664 #pragma omp parallel for
665 for (i = 0; i < blocksize * ncol; i++)
672 for (k = 0; k < nloop;
675 ncol = m - (nloop - 1) * nfft;
683 copy_block(blocksize, m, (*V), blocksize, ncol, V_rfft, 0, k * nfft,
684 blocksize, ncol, 0, 0, 1.0, 0);
688 scmm_direct(blocksize, nfft, C_fft, ncol, V_rfft, CV, V_fft, plan_f_V,
693 copy_block(blocksize, ncol, V_rfft, blocksize, m, (*CV), 0, 0,
694 blocksize, ncol, 0, k * nfft, 1.0 / ((
double) blocksize), 0);
731 int stmm_core(
double **V,
int n,
int m,
double *T, fftw_complex *T_fft,
732 int blocksize,
int lambda, fftw_complex *V_fft,
double *V_rfft,
733 int nfft, fftw_plan plan_f, fftw_plan plan_b,
int flag_offset,
747 int distcorrmin = lambda - 1;
754 if (flag_offset == 1)
755 nbloc = ceil((1.0 * (n - 2 * distcorrmin)) / blocksize_eff);
757 nbloc = ceil((1.0 * n)
764 double *V_bloc, *TV_bloc;
765 V_bloc = (
double *) calloc(blocksize * m,
sizeof(
double));
766 TV_bloc = (
double *) calloc(blocksize * m,
sizeof(
double));
767 if ((V_bloc == 0) || (TV_bloc == 0))
771 if (flag_offset == 1) offset = distcorrmin;
779 currentsize = min(blocksize - distcorrmin + offset, n - iV);
783 copy_block(n, m, *V, blocksize, m, V_bloc, 0, 0, currentsize, m,
784 distcorrmin - offset, 0, 1.0, 0);
790 status =
scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft,
791 V_rfft, nfft, plan_f, plan_b);
794 printf(
"Error in stmm_core.");
801 iV = blocksize_eff - distcorrmin + offset;
804 currentsize = min(blocksize, n - iV);
809 copy_block(n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0, 0,
814 currentsize = min(blocksize_eff, n - iTV);
815 copy_block(blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize, m,
819 iTV += blocksize_eff;
821 for (k = 1; k < nbloc; k++) {
828 status =
scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft,
829 V_rfft, nfft, plan_f, plan_b);
831 if (status != 0)
break;
836 if (k != nbloc - 1) {
837 currentsize = min(blocksize, n - iV);
840 (currentsize != blocksize);
842 copy_block(n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0,
843 0, 1.0, flag_resetk);
847 currentsize = min(blocksize_eff, n - iTV);
848 copy_block(blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize,
850 iTV += blocksize_eff;
888 int stmm_main(
double **V,
int n,
int m,
int id0,
int l,
double *T,
889 fftw_complex *T_fft,
int lambda, fftw_complex *V_fft,
890 double *V_rfft, fftw_plan plan_f, fftw_plan plan_b,
int blocksize,
891 int nfft, Flag flag_stgy) {
895 int distcorrmin = lambda - 1;
896 int flag_prod_strategy_nofft = 0;
897 int flag_shortcut_m_eff_eq_1 = 1;
898 int flag_shortcut_nbcol_eq_1 = 1;
899 int flag_nfullcol_in_middle =
901 int flag_optim_offset_for_nfft = 0;
902 int flag_no_rshp = flag_stgy.flag_no_rshp;
903 int flag_nofft = flag_stgy.flag_nofft;
905 int m_eff = (id0 + l - 1) / n - id0 / n + 1;
917 if (m_eff == 1 && flag_shortcut_m_eff_eq_1 == 1 && nfft == 1
918 || flag_no_rshp == 1 && id0 == 0 && l == n * m) {
927 stmm_core(V, nr, m_eff, T, T_fft, blocksize, lambda, V_fft, V_rfft,
928 nfft, plan_f, plan_b, flag_offset, flag_nofft);
940 0, (l - (n - id0 % n) % n - (id0 + l) % n)
943 if (flag_nfullcol_in_middle == 1)
944 nloop_middle = ceil(1.0 * (nfullcol) / nfft);
946 nloop_middle = (nfullcol) / nfft;
948 if (flag_nfullcol_in_middle == 1) m_middle = nfullcol;
950 m_middle = nfft * nloop_middle;
953 int vmiddle_size = n * m_middle;
957 printf(
"nloop_middle=%d , m_middle=%d\n", nloop_middle, m_middle);
961 if (nloop_middle > 0) {
963 int offset_middle = (n - id0 % n) % n;
964 Vmiddle = (*V) + offset_middle;
967 stmm_core(&Vmiddle, n, m_middle, T, T_fft, blocksize, lambda, V_fft,
968 V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
974 int v1edge_size = min(l, (n - id0 % n) % n);
975 int v2edge_size = max(l - (v1edge_size + vmiddle_size), 0);
976 int vedge_size = v1edge_size + v2edge_size;
979 if (vedge_size > 0) {
981 int m_v1edge, m_v2edge;
982 m_v1edge = (v1edge_size > 0) * 1;
983 m_v2edge = m - (m_v1edge + m_middle);
984 int nbcol = m_v1edge + m_v2edge;
986 nocol = (
int *) calloc(nbcol,
sizeof(
double));
989 if (m_v1edge == 1) nocol[0] = 0;
990 for (i = (m_v1edge); i < nbcol; i++) nocol[i] = m_middle + i;
993 printf(
"nbcol=%d , m_v1edge=%d , m_v2edge=%d\n", nbcol, m_v1edge,
997 if (nbcol == 1 && nfft == 1 && flag_shortcut_nbcol_eq_1 == 1) {
1001 int offset_edge = n * nocol[0];
1003 Vedge = (*V) + offset_edge;
1004 int flag_offset = 0;
1005 stmm_core(&Vedge, vedge_size, nbcol, T, T_fft, blocksize, lambda,
1006 V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset,
1015 int lconc = vedge_size;
1018 int v1_size = lconc + (distcorrmin) * (nbcol - 1);
1019 int fft_size = ceil(1.0 * v1_size / nfft) + 2 * distcorrmin;
1021 int flag_format_rshp = (nfft > 1) * 2 + (nfft == 1 && nbcol > 1) * 1
1022 + (nfft == 1 && nbcol == 1) * 0;
1023 int nrshp, mrshp, lrshp;
1026 vedge_size, &nrshp, &mrshp, &lrshp);
1030 Vrshp = (
double *) calloc(lrshp,
sizeof(
double));
1035 fprintf(file,
"nrshp=%d , mrshp=%d , lrshp=%d\n", nrshp, mrshp,
1037 fprintf(file,
"flag_format_rshp=%d\n", flag_format_rshp);
1040 build_reshape(Vin, nocol, nbcol, lconc, n, m, id0, l, lambda, nfft,
1041 Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
1044 if (flag_format_rshp == 2 && flag_optim_offset_for_nfft == 1)
1050 stmm_core(&Vrshp, nrshp, mrshp, T, T_fft, blocksize, lambda, V_fft,
1051 V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
1054 nfft, Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
1079 int mpi_stmm(
double **V,
int n,
int m,
int id0,
int l,
double *T,
int lambda,
1080 Flag flag_stgy, MPI_Comm comm) {
1086 MPI_Comm_rank(comm, &rank);
1087 MPI_Comm_size(comm, &size);
1093 int cfirst = id0 / n;
1094 int clast = idf / n;
1095 int clast_r = (idf - 1) / n;
1096 int m_eff = clast_r - cfirst + 1;
1097 double *V1, *Lambda;
1101 int right = rank + 1;
1102 int left = rank - 1;
1103 int v1_size = l + 2 * lambda;
1104 if (rank == 0 || cfirst * n == id0) {
1106 left = MPI_PROC_NULL;
1108 if (rank == (size - 1) || clast * n == idf) {
1110 right = MPI_PROC_NULL;
1114 Lambda = (
double *) malloc(2 * lambda *
sizeof(
double));
1117 for (i = 0; i < lambda; i++) {
1118 Lambda[i] = (*V)[i];
1119 Lambda[i + lambda] = (*V)[i + l - lambda];
1123 printf(
"[rank %d] Left comm with %d | Right comm with %d\n", rank, left,
1127 MPI_Sendrecv_replace(Lambda, lambda, MPI_DOUBLE, left, MPI_USER_TAG, right,
1130 MPI_Sendrecv_replace((Lambda + lambda), lambda, MPI_DOUBLE, right,
1131 MPI_USER_TAG, left, MPI_USER_TAG, comm,
1140 if (left == MPI_PROC_NULL && right == MPI_PROC_NULL)
1142 else if (left == MPI_PROC_NULL) {
1143 *V = realloc(*V, v1_size *
sizeof(
double));
1147 V1 = (
double *) malloc(v1_size *
sizeof(
double));
1149 if (left != MPI_PROC_NULL) {
1150 for (i = 0; i < lambda; i++) V1[i] = Lambda[i + lambda];
1153 if (right != MPI_PROC_NULL) {
1154 for (i = 0; i < lambda; i++) V1[i + v1_size - lambda] = Lambda[i];
1159 if (left != MPI_PROC_NULL) {
1161 #pragma omp parallel for
1162 for (i = offset; i < l + offset; i++) V1[i] = (*V)[i - offset];
1165 fftw_complex *V_fft, *T_fft;
1167 fftw_plan plan_f, plan_b;
1171 int nfft, blocksize;
1173 tpltz_init(v1_size, lambda, &nfft, &blocksize, &T_fft, T, &V_fft, &V_rfft,
1174 &plan_f, &plan_b, flag_stgy);
1177 printf(
"[rank %d] Before middle-level call : blocksize=%d, nfft=%d\n",
1178 rank, blocksize, nfft);
1180 stmm_main(&V1, n, m, id0, v1_size, T, T_fft, lambda, V_fft, V_rfft, plan_f,
1181 plan_b, blocksize, nfft, flag_stgy);
1188 if (left != MPI_PROC_NULL) offset = lambda;
1189 if (left == MPI_PROC_NULL && right == MPI_PROC_NULL)
1191 else if (left == MPI_PROC_NULL) {
1192 V1 = realloc(V1, l *
sizeof(
double));
1196 #pragma omp parallel for
1197 for (i = offset; i < l + offset; i++) (*V)[i - offset] = V1[i];
1200 if (left != MPI_PROC_NULL) free(V1);
int define_blocksize(int n, int lambda, int bs_flag, int fixed_bs)
Defines an optimal size of the block used in the sliding windows algorithm.
int rhs_init_fftw(const int *nfft, int fft_size, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b, int fftw_flag)
Initializes fftw array and plan for the right hand side, general matrix V.
int print_error_message(int error_number, char const *file, int line)
Prints error message corresponding to an error number.
int stmm_core(double **V, int n, int m, double *T, fftw_complex *T_fft, int blocksize, int lambda, fftw_complex *V_fft, double *V_rfft, int nfft, fftw_plan plan_f, fftw_plan plan_b, int flag_offset, int flag_nofft)
int scmm_direct(int fft_size, int nfft, fftw_complex *C_fft, int ncol, double *V_rfft, double **CV, fftw_complex *V_fft, fftw_plan plan_f_V, fftw_plan plan_b_CV)
int define_nfft(int n_thread, int flag_nfft, int fixed_nfft)
int circ_init_fftw(const double *T, int fft_size, int lambda, fftw_complex **T_fft)
int copy_block(int ninrow, int nincol, double *Vin, int noutrow, int noutcol, double *Vout, int inrow, int incol, int nblockrow, int nblockcol, int outrow, int outcol, double norm, int set_zero_flag)
int tpltz_init(int n, int lambda, int *nfft, int *blocksize, fftw_complex **T_fft, double *T, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b, Flag flag_stgy)
int scmm_basic(double **V, int blocksize, int m, fftw_complex *C_fft, double **CV, fftw_complex *V_fft, double *V_rfft, int nfft, fftw_plan plan_f_V, fftw_plan plan_b_CV)
int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b)
int stmm_main(double **V, int n, int m, int id0, int l, double *T, fftw_complex *T_fft, int lambda, fftw_complex *V_fft, double *V_rfft, fftw_plan plan_f, fftw_plan plan_b, int blocksize, int nfft, Flag flag_stgy)
int mpi_stmm(double **V, int n, int m, int id0, int l, double *T, int lambda, Flag flag_stgy, MPI_Comm comm)
int fftw_init_omp_threads(int fftw_n_thread)
Initialize omp threads for fftw plans.
int stmm_simple_basic(double **V, int n, int m, double *T, int lambda, double **TV)
Perform the product of a Toeplitz matrix by a matrix without using FFT's.
int build_reshape(double *Vin, int *nocol, int nbcol, int lconc, int n, int m, int id0, int l, int lambda, int nfft, double *Vrshp, int nrshp, int mrshp, int lrshp, int flag_format_rshp)
int define_rshp_size(int flag_format_rshp, int fft_size, int nfft, int v1_size, int vedge_size, int *nrshp, int *mrshp, int *lrshp)
int extract_result(double *Vout, int *nocol, int nbcol, int lconc, int n, int m, int id0, int l, int lambda, int nfft, double *Vrshp, int nrshp, int mrshp, int lrshp, int flag_format_rshp)