MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
toeplitz.c
74 #include "toeplitz.h"
75 
76 #ifdef _OPENMP
77 
78 #include <omp.h>
79 
80 #endif
81 
82 #define max(a, b) \
83  ({ \
84  __typeof__(a) _a = (a); \
85  __typeof__(b) _b = (b); \
86  _a > _b ? _a : _b; \
87  })
88 
89 #define min(a, b) \
90  ({ \
91  __typeof__(a) _a = (a); \
92  __typeof__(b) _b = (b); \
93  _a < _b ? _a : _b; \
94  })
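/* These min/max helpers use GCC/Clang statement expressions so that each
   argument is evaluated exactly once, unlike the classic ternary macro.
   Standalone illustration (NAIVE_MIN and next() exist only for this demo):

#include <stdio.h>

#define NAIVE_MIN(a, b) ((a) < (b) ? (a) : (b))
#define min(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); _a < _b ? _a : _b; })

static int calls = 0;
static int next(void) { return ++calls; }

int main(void) {
    calls = 0;
    int x = NAIVE_MIN(next(), 10); // next() runs twice here
    printf("naive: result=%d, calls=%d\n", x, calls);

    calls = 0;
    int y = min(next(), 10); // arguments evaluated exactly once
    printf("stmt-expr: result=%d, calls=%d\n", y, calls);
    return 0;
}
*/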
95 
// r1.2 - Frederic Dauvergne (APC)
// This file contains the main part of the Toeplitz algebra module. It includes
// the elementary product routines (using FFT) and the initialization routines.
// It also contains the MPI version of the Toeplitz matrix product with global
// row-wise order distribution of the data.
//
// todo:
// - add non-blocking communication in stmm, as is done for the stbmm routine
// - scmm_direct does not need the nfft parameter
105 
106 
107 //=========================================================================
108 // Global parameters
109 
// Verbose mode (set in tpltz_init from flag_stgy.flag_verbose)
int VERBOSE;
int VERBOSE_FIRSTINIT = 1;
// Rank of the calling process, used only for printing when VERBOSE mode is on
int PRINT_RANK = -1;
118 
119 //=========================================================================
120 
// Prints the error message corresponding to an error number.
int print_error_message(int error_number, char const *file, int line) {
    char str_mess[256];
    if (error_number == 1)
        snprintf(str_mess, sizeof(str_mess),
                 "Error on line %d of %s. Toeplitz band width > vector size\n",
                 line, file);
    else if (error_number == 2)
        snprintf(str_mess, sizeof(str_mess),
                 "Error on line %d of %s. Bad allocation.\n", line, file);
    else if (error_number == 3)
        snprintf(str_mess, sizeof(str_mess),
                 "Error on line %d of %s. Error at fftw multithread "
                 "initialization.\n",
                 line, file);
    else // error_number == 7 or any unrecognized code
        snprintf(str_mess, sizeof(str_mess), "Error on line %d of %s.\n", line,
                 file);
    fprintf(stderr, "%s", str_mess);
    printf("%s", str_mess);
    return error_number;
}
148 
149 
150 //=========================================================================
151 
// Defines an optimal size of the block used in the sliding windows algorithm.
int define_blocksize(int n, int lambda, int bs_flag, int fixed_bs) {
    int bs;       // computed optimal block size
    int min_bs;   // minimum block size used for the computation
    int min_pow2; // minimum power-of-two exponent used for the block size
                  // computation

    // debug overrides (disabled):
    // bs_flag = 5;
    // fixed_bs = pow(2, 15); // 2^14 preferred over 2^15: same speed, smaller block

    if (bs_flag == 1) {
        bs = fixed_bs;

    } else if (bs_flag == 2) {
        // note: this formula needs to be checked - there seems to be a problem
        // when bs = 2*lambda: not enough data left in the middle of the block
        min_bs = 2 * lambda;
        min_pow2 = (int) ceil(log(min_bs) / log(2));
        bs = pow(2, min_pow2);
        if (bs > n) // avoid a block size much bigger than the matrix; happens
                    // mostly when the matrix is small compared to its bandwidth
            bs = min_bs;

    } else if (bs_flag == 3) {
        min_bs = 3 * lambda;
        min_pow2 = (int) ceil(log(min_bs) / log(2));
        bs = pow(2, min_pow2);
        if (bs > n) // same capping as above
            bs = min_bs;

    } else if (bs_flag == 4 || bs_flag == 0) {
        min_bs = 4 * lambda;
        min_pow2 = (int) ceil(log(min_bs) / log(2));
        bs = pow(2, min_pow2);
        if (bs > n) // same capping as above
            bs = min_bs;

    } else if (bs_flag == 5) {
        // different formula to compute the optimal block size
        bs = 1;
        while (bs < 2 * (lambda - 1) * log(bs + 1) && bs < n) { bs = bs * 2; }

    } else if (bs_flag == 6) {
        // same as the auto mode (bs_flag == 4 or 0), but with a constraint on
        // the minimal block size and (disabled below) on the number of subblocks
        min_bs = 4 * lambda;
        min_pow2 = (int) ceil(log(min_bs) / log(2));

        // impose a minimum block size of 2^14; this is an empirical choice,
        // justified by the FFTW3 speed benchmarks (see the official FFTW website)
        min_pow2 = max(min_pow2, 14);
        bs = pow(2, min_pow2);

        if (bs > n) // same capping as above
            bs = min_bs;

        // test if there are enough subblocks for the sliding windows algorithm:
        // int nbloc_bs = ceil((1.0 * n) / (bs - 2 * distcorrmin));
        // if (nbloc_bs < 8) // empirical condition to avoid too few subblocks
        //     bs = 0;       // switch to the no-sliding-windows algorithm

    } else {
        printf("Error. Wrong value for bs_flag. Set to auto mode.\n");
        min_bs = 4 * lambda;
        min_pow2 = (int) ceil(log(min_bs) / log(2));
        bs = pow(2, min_pow2);
        if (bs > n) // same capping as above
            bs = min_bs;
    }


    if (PRINT_RANK == 0 && VERBOSE > 1)
        printf("Computed optimal blocksize is %d (with lambda = %d)\n", bs,
               lambda);

    return bs;
}
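/* The default strategy (bs_flag 0 or 4) rounds 4*lambda up to the next power
   of two and falls back to 4*lambda when that exceeds the matrix size n. A
   standalone sketch of that arithmetic (independent of the routine above):

#include <stdio.h>
#include <math.h>

static int default_blocksize(int n, int lambda) {
    int min_bs = 4 * lambda;
    int min_pow2 = (int) ceil(log(min_bs) / log(2));
    int bs = (int) pow(2, min_pow2);
    if (bs > n) bs = min_bs;
    return bs;
}

int main(void) {
    int lambdas[] = {10, 100, 1000};
    for (int i = 0; i < 3; i++) // prints 64, 512, 4096 for n = 1000000
        printf("n=%d lambda=%d -> bs=%d\n", 1000000, lambdas[i],
               default_blocksize(1000000, lambdas[i]));
    return 0;
}
*/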
254 
255 
256 //=========================================================================
257 
266 int define_nfft(int n_thread, int flag_nfft, int fixed_nfft) {
267  int nfft;
268 
269  if (flag_nfft == 0) nfft = NFFT_DEFAULT;
270  else if (flag_nfft == 1)
271  nfft = fixed_nfft;
272  else if (flag_nfft == 2)
273  nfft = n_thread;
274  else {
275  printf("Error. Wrong value for flag_nfft. Set to auto mode.\n");
276  nfft = NFFT_DEFAULT;
277  }
278 
279  return nfft;
280 }
281 
282 
283 //=========================================================================
284 
298 int tpltz_init(int n, int lambda, int *nfft, int *blocksize,
299  fftw_complex **T_fft, double *T, fftw_complex **V_fft,
300  double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
301  Flag flag_stgy) {
302 
303  // Set the VERBOSE global variable
304  VERBOSE = flag_stgy.flag_verbose;
305 
306 
307  // initialize block size
308  *blocksize =
309  define_blocksize(n, lambda, flag_stgy.flag_bs, flag_stgy.fixed_bs);
310 
311 
    // note: if bs == 0 one could set flag_stgy.flag_bs = 9999 to switch to the
    // no-sliding-windows algorithm

    // number of threads available to OpenMP
    // (a NB_OMPTHREADS cap via omp_set_num_threads was used here previously)
#ifdef _OPENMP
    int n_thread = omp_get_max_threads();
#else
    int n_thread = 1;
#endif
326 
327  // initialize nfft
328  *nfft = define_nfft(n_thread, flag_stgy.flag_nfft,
329  flag_stgy.fixed_nfft); //*nfft=n_thread;
330 
331 
332  if (PRINT_RANK == 0 && VERBOSE > 0 && VERBOSE_FIRSTINIT == 1) {
333  printf("Using %d threads\n", n_thread);
334  printf("nfft = %d\n", *nfft);
335  }
336 
337  // initialize fftw plan allocation flag
338  int fftw_flag = flag_stgy.flag_fftw; // FFTW_FLAG;
339 
340  // initialize fftw for omp threads
341  #ifdef fftw_MULTITHREADING
342  fftw_init_omp_threads(n_thread);
343  #endif
344 
345  // initialize fftw array and plan for T (and make it circulant first)
346  // t1=MPI_Wtime();
347  circ_init_fftw(T, (*blocksize), lambda, T_fft);
348  // t2= MPI_Wtime();
349 
350  // if (PRINT_RANK==0 && VERBOSE>0)
351  // printf("time circ_init_fftw=%f\n", t2-t1);
352 
353  // initialize fftw array and plan for V
354  // t1=MPI_Wtime();
355  rhs_init_fftw(nfft, (*blocksize), V_fft, V_rfft, plan_f, plan_b, fftw_flag);
356  // t2= MPI_Wtime();
357 
358  // if (PRINT_RANK==0 && VERBOSE>0)
359  // printf("time rhs_init_fftw=%f\n", t2-t1);
360 
361  if (PRINT_RANK == 0 && VERBOSE > 1)
362  printf("Initialization finished successfully\n");
363 
364  VERBOSE_FIRSTINIT = 0;
365 
366  return 0;
367 }
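/* Example (standalone sketch, not part of this module): how the init, product
   and cleanup routines of this file fit together for a single Toeplitz block
   applied to one data vector. It assumes the Flag struct from toeplitz.h
   exposes (at least) the fields read above - flag_bs, fixed_bs, flag_nfft,
   fixed_nfft, flag_fftw, flag_verbose - and that zeroing it selects the
   automatic defaults of define_blocksize/define_nfft. Error handling omitted.

#include <string.h>
#include "toeplitz.h"

int toeplitz_times_vector(double *V, int n, double *T, int lambda) {
    fftw_complex *T_fft, *V_fft;
    double       *V_rfft;
    fftw_plan     plan_f, plan_b;
    int           nfft, blocksize;

    Flag flag_stgy;
    memset(&flag_stgy, 0, sizeof flag_stgy); // zero == automatic choices

    // block size, number of simultaneous FFTs, FFTW plans and work arrays
    tpltz_init(n, lambda, &nfft, &blocksize, &T_fft, T, &V_fft, &V_rfft,
               &plan_f, &plan_b, flag_stgy);

    // sliding-windows product: m = 1 column, no offset, FFT-based
    int status = stmm_core(&V, n, 1, T, T_fft, blocksize, lambda, V_fft,
                           V_rfft, nfft, plan_f, plan_b, 0, 0);

    tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);
    return status;
}
*/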
368 
369 
370 //=========================================================================
371 #ifdef fftw_MULTITHREADING
// Initializes omp threads for fftw plans.
379 int fftw_init_omp_threads(int fftw_n_thread) {
380  int status;
381 
382  // initialize fftw omp threads
383  status = fftw_init_threads();
384  if (status == 0) return print_error_message(3, __FILE__, __LINE__);
385 
386  // set the number of FFTW threads
387  fftw_plan_with_nthreads(fftw_n_thread);
388 
389  if (PRINT_RANK == 0 && VERBOSE > 1 && VERBOSE_FIRSTINIT == 1)
390  printf("Using multithreaded FFTW with %d threads\n", fftw_n_thread);
391 
392  return 0;
393 }
394 #endif
395 
396 
397 //=========================================================================
398 
// Initializes fftw arrays and plans for the right hand side, general matrix V.
408 int rhs_init_fftw(const int *nfft, int fft_size, fftw_complex **V_fft,
409  double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
410  int fftw_flag) {
411  // allocate fftw arrays and plans for V
412  *V_fft = (fftw_complex *) fftw_malloc((*nfft) * (fft_size / 2 + 1)
413  * sizeof(fftw_complex));
414  *V_rfft = (double *) fftw_malloc((*nfft) * fft_size * sizeof(double));
415  if (*V_fft == 0 || *V_rfft == 0)
416  return print_error_message(2, __FILE__, __LINE__);
417 
418  *plan_f = fftw_plan_many_dft_r2c(1, &fft_size, (*nfft), *V_rfft, &fft_size,
419  1, fft_size, *V_fft, NULL, 1,
420  fft_size / 2 + 1, fftw_flag);
421  *plan_b = fftw_plan_many_dft_c2r(1, &fft_size, (*nfft), *V_fft, NULL, 1,
422  fft_size / 2 + 1, *V_rfft, &fft_size, 1,
423  fft_size, fftw_flag);
424 
425 
426  return 0;
427 }
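/* Standalone reminder of FFTW's normalization convention: an unnormalized
   forward r2c followed by a backward c2r multiplies the data by fft_size.
   This is why scmm_basic below rescales the extracted result by
   1.0 / blocksize.

#include <stdio.h>
#include <fftw3.h>

int main(void) {
    const int n = 8;
    double       *x  = fftw_malloc(n * sizeof(double));
    fftw_complex *xf = fftw_malloc((n / 2 + 1) * sizeof(fftw_complex));

    fftw_plan pf = fftw_plan_dft_r2c_1d(n, x, xf, FFTW_ESTIMATE);
    fftw_plan pb = fftw_plan_dft_c2r_1d(n, xf, x, FFTW_ESTIMATE);

    for (int i = 0; i < n; i++) x[i] = i + 1;
    fftw_execute(pf);
    fftw_execute(pb);

    for (int i = 0; i < n; i++) printf("%g ", x[i] / n); // recovers 1 ... 8
    printf("\n");

    fftw_destroy_plan(pf);
    fftw_destroy_plan(pb);
    fftw_free(x);
    fftw_free(xf);
    return 0;
}
*/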
428 
429 
430 //=========================================================================
431 
441 int circ_init_fftw(const double *T, int fft_size, int lambda,
442  fftw_complex **T_fft) {
443  // routine variable
444  int i;
445  int circ_fftw_flag = FFTW_ESTIMATE;
446  // allocation for T_fft
447  *T_fft = (fftw_complex *) fftw_malloc((fft_size / 2 + 1)
448  * sizeof(fftw_complex));
449  if (*T_fft == 0) return print_error_message(2, __FILE__, __LINE__);
450  double *T_circ = (double *) (*T_fft);
451 
452  // inplace fft
453  fftw_plan plan_f_T;
454  plan_f_T = fftw_plan_dft_r2c_1d(fft_size, T_circ, *T_fft, circ_fftw_flag);
455 
456  // make T circulant
457 #pragma omp parallel for
458  for (i = 0; i < fft_size + 2; i++) T_circ[i] = 0.0;
459 
460  T_circ[0] = T[0];
461  for (i = 1; i < lambda; i++) {
462  T_circ[i] = T[i];
463  T_circ[fft_size - i] = T[i];
464  }
465 
466  fftw_execute(plan_f_T);
467  fftw_destroy_plan(plan_f_T);
468 
469  return 0;
470 }
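/* Standalone sketch of the circulant embedding performed above: the first
   column of the symmetric banded Toeplitz kernel (T[0..lambda-1]) becomes the
   first column of a circulant matrix of size fft_size, with T[1..lambda-1]
   mirrored at the end of the array. (In circ_init_fftw the real buffer has
   fft_size + 2 doubles because, for even fft_size, it aliases the
   fft_size/2 + 1 complex bins of the in-place transform.)

#include <stdio.h>

int main(void) {
    const int fft_size = 16, lambda = 4;
    double T[4] = {4.0, 3.0, 2.0, 1.0}; // first column of the Toeplitz band
    double T_circ[16] = {0.0};          // first column of the circulant matrix

    T_circ[0] = T[0];
    for (int i = 1; i < lambda; i++) {
        T_circ[i] = T[i];            // band below the diagonal
        T_circ[fft_size - i] = T[i]; // wrapped band (circulant symmetry)
    }

    for (int i = 0; i < fft_size; i++) printf("%g ", T_circ[i]);
    printf("\n"); // prints: 4 3 2 1 0 0 0 0 0 0 0 0 0 1 2 3
    return 0;
}
*/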
471 
472 
473 //=========================================================================
474 
485 int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft, double **V_rfft,
486  fftw_plan *plan_f, fftw_plan *plan_b) {
487  fftw_destroy_plan(*plan_f);
488  fftw_destroy_plan(*plan_b);
489  fftw_free(*T_fft);
490  fftw_free(*V_fft);
491  fftw_free(*V_rfft);
492 #ifdef fftw_MULTITHREADING
493  fftw_cleanup_threads();
494 #endif
495  fftw_cleanup();
496 
497  return 0;
498 }
499 
500 
501 //=========================================================================
502 
514 int copy_block(int ninrow, int nincol, double *Vin, int noutrow, int noutcol,
515  double *Vout, int inrow, int incol, int nblockrow, int nblockcol,
516  int outrow, int outcol, double norm, int set_zero_flag) {
517  int i, j, p, offsetIn, offsetOut;
518 
519  // do some size checks first
520  if ((nblockcol > nincol) || (nblockrow > ninrow) || (nblockcol > noutcol)
521  || (nblockrow > noutrow)) {
522  printf("Error in routine copy_block. Bad size setup.\n");
523  return print_error_message(7, __FILE__, __LINE__);
524  }
525 
526  if (set_zero_flag) {
        // zero the whole output matrix (memset would also work, but this loop
        // keeps the OpenMP threading)
#pragma omp parallel for // private(i) num_threads(NB_OMPTHREADS_CPBLOCK)
        for (i = 0; i < noutrow * noutcol; i++) Vout[i] = 0.0;
531  }
532 
533  offsetIn = ninrow * incol + inrow;
534  offsetOut = noutrow * outcol + outrow;
535 
536  // #pragma omp parallel for private(i,j,p)
537  // num_threads(NB_OMPTHREADS_CPBLOCK)
538  for (i = 0; i < nblockcol * nblockrow; i++) { // copy the block
539  j = i / nblockrow;
540  p = i % nblockrow;
541  Vout[offsetOut + j * noutrow + p] =
542  Vin[offsetIn + j * ninrow + p] * norm;
543  }
544 
545  return 0;
546 }
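/* Minimal standalone illustration of the column-major sub-block copy done by
   copy_block above (not calling the routine itself): copy an
   nblockrow x nblockcol block from position (inrow, incol) of Vin to position
   (outrow, outcol) of Vout, scaled by norm.

#include <stdio.h>

int main(void) {
    // 4x3 input, 5x4 output, both column-major (index = col * nrows + row)
    double Vin[12], Vout[20] = {0.0};
    for (int i = 0; i < 12; i++) Vin[i] = i + 1;

    int ninrow = 4, noutrow = 5;
    int nblockrow = 2, nblockcol = 2; // copy a 2x2 block
    int inrow = 1, incol = 1;         // from Vin(1,1)
    int outrow = 2, outcol = 0;       // to   Vout(2,0)
    double norm = 0.5;

    int offsetIn = ninrow * incol + inrow;
    int offsetOut = noutrow * outcol + outrow;
    for (int i = 0; i < nblockcol * nblockrow; i++) {
        int j = i / nblockrow, p = i % nblockrow;
        Vout[offsetOut + j * noutrow + p] =
                Vin[offsetIn + j * ninrow + p] * norm;
    }

    // expected: Vout(2,0)=3.0 Vout(3,0)=3.5 Vout(2,1)=5.0 Vout(3,1)=5.5
    for (int r = 0; r < 5; r++, printf("\n"))
        for (int c = 0; c < 4; c++) printf("%4.1f ", Vout[c * noutrow + r]);
    return 0;
}
*/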
547 
548 
549 //=========================================================================
550 
569 int scmm_direct(int fft_size, int nfft, fftw_complex *C_fft, int ncol,
570  double *V_rfft, double **CV, fftw_complex *V_fft,
571  fftw_plan plan_f_V, fftw_plan plan_b_CV) {
572  // routine variables
573  int sizeT = fft_size / 2 + 1;
574  int i, idx;
575 
576  // perform forward FFT
577  fftw_execute(plan_f_V); // input in V_rfft; output in V_fft
578 
579  // printf("ncol=%d, fft_size=%d, sizeT=%d\n", ncol, fft_size, sizeT);
580 
581  // double t1, t2;
582  // t1=MPI_Wtime();
583 
584 #pragma omp parallel for private(idx) // num_threads(nfft)
585  for (i = 0; i < ncol * sizeT; i++) {
586  idx = i % sizeT;
587  V_fft[i][0] = C_fft[idx][0] * V_fft[i][0] - C_fft[idx][1] * V_fft[i][1];
588  V_fft[i][1] = C_fft[idx][0] * V_fft[i][1] + C_fft[idx][1] * V_fft[i][0];
589  }
590 
591  // t2= MPI_Wtime();
592  // printf("Computation time : %lf s.\n", t2-t1);
593 
594 
    // note: an earlier per-column nested-loop parallelization indexed with
    // i = icol * idx was wrong (the flat index is icol * sizeT + idx);
    // keep the single flattened loop above.
610 
611 
    // perform backward FFTs
613  fftw_execute(plan_b_CV); // input in V_fft; output in V_rfft
614 
615  return 0;
616 }
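/* The loop above is an element-wise complex product: every frequency bin of
   each column of V_fft is multiplied by the matching bin of the circulant
   spectrum C_fft, written out in real/imaginary parts. The same operation in
   C99 complex arithmetic, as a standalone comparison:

#include <stdio.h>
#include <complex.h>

int main(void) {
    const int sizeT = 3, ncol = 2;
    double complex C_fft[3] = {1.0 + 2.0 * I, 0.5 - 1.0 * I, 2.0};
    double complex V_fft[6] = {1.0, I, 2.0 + I, -1.0, 3.0, 1.0 - I};

    // multiply every column of V_fft bin-by-bin by the circulant spectrum
    for (int i = 0; i < ncol * sizeT; i++) V_fft[i] *= C_fft[i % sizeT];

    for (int i = 0; i < ncol * sizeT; i++)
        printf("%g%+gi\n", creal(V_fft[i]), cimag(V_fft[i]));
    return 0;
}
*/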
617 
618 
619 //=========================================================================
620 
651 int scmm_basic(double **V, int blocksize, int m, fftw_complex *C_fft,
652  double **CV, fftw_complex *V_fft, double *V_rfft, int nfft,
653  fftw_plan plan_f_V, fftw_plan plan_b_CV) {
654  // routine variables
655  int i, k; // loop index
656  int nloop = (int) ceil((1.0 * m) / nfft); // number of subblocks
657 
    // loop over sets of columns
    int ncol = min(nfft, m); // number of columns copied from the data to the
                             // working matrix; equals the number of
                             // simultaneous FFTs
662 
663 
    // zero the working matrix (memset would also work, but this loop keeps
    // the OpenMP threading)
#pragma omp parallel for // num_threads(NB_OMPTHREADS_BASIC) // schedule(dynamic, 1)
    for (i = 0; i < blocksize * ncol; i++) V_rfft[i] = 0.0;
667 
668 
    // bug fixed: conflict between num_threads and nfft
    // #pragma omp parallel for schedule(dynamic,1) num_threads(8) // num_threads(nfft)
672  for (k = 0; k < nloop;
673  k++) { // this is the main loop over the set of columns
674  if (k == nloop - 1) // last loop ncol may be smaller than nfft
675  ncol = m - (nloop - 1) * nfft;
676 
        // init fftw matrices:
        // extract a block of ncol full-length columns from the data matrix and
        // embed it in a bigger matrix, padding each column with lambda zeros.
        // Note that all columns are zero padded thanks to the
        // zero-initialization above.
682 
683  copy_block(blocksize, m, (*V), blocksize, ncol, V_rfft, 0, k * nfft,
684  blocksize, ncol, 0, 0, 1.0, 0);
        // note: all nfft vectors are always transformed below in a single go;
        // if ncol < nfft, the extra work is simply wasted.
687 
688  scmm_direct(blocksize, nfft, C_fft, ncol, V_rfft, CV, V_fft, plan_f_V,
689  plan_b_CV);
690  // note: the parameter CV is not really used
691 
692  // extract the relevant part from the result
693  copy_block(blocksize, ncol, V_rfft, blocksize, m, (*CV), 0, 0,
694  blocksize, ncol, 0, k * nfft, 1.0 / ((double) blocksize), 0);
695 
696  } // end of loop over the column-sets
697 
698 
699  return 0;
700 }
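/* The batching bookkeeping used above, in isolation: the m data columns are
   processed in batches of nfft simultaneous FFTs, and the last batch may hold
   fewer columns.

#include <stdio.h>
#include <math.h>

int main(void) {
    int m = 10, nfft = 4;                     // 10 columns, 4 FFTs at a time
    int nloop = (int) ceil((1.0 * m) / nfft); // 3 batches
    for (int k = 0; k < nloop; k++) {
        int ncol = (k == nloop - 1) ? m - (nloop - 1) * nfft : nfft;
        printf("batch %d: columns %d..%d (ncol = %d)\n", k, k * nfft,
               k * nfft + ncol - 1, ncol);
    }
    return 0;
}
*/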
701 
702 
703 //=========================================================================
704 
731 int stmm_core(double **V, int n, int m, double *T, fftw_complex *T_fft,
732  int blocksize, int lambda, fftw_complex *V_fft, double *V_rfft,
733  int nfft, fftw_plan plan_f, fftw_plan plan_b, int flag_offset,
734  int flag_nofft) {
735 
736  // double t1,t2;
737 
738  // t1= MPI_Wtime();
739 
740  // cheating:
741  // flag_offset = 1;
742 
743  // routine variable
744  int status;
745  int i, j, k, p; // loop index
746  int currentsize;
747  int distcorrmin = lambda - 1;
748 
    int blocksize_eff =
            blocksize - 2 * distcorrmin; // useful part of each block once the
                                         // overlaps on both sides are removed
752  int nbloc; // number of subblock of slide/overlap algorithm
753 
754  if (flag_offset == 1)
755  nbloc = ceil((1.0 * (n - 2 * distcorrmin)) / blocksize_eff);
756  else
757  nbloc = ceil((1.0 * n)
758  / blocksize_eff); // we need n because of reshaping
759 
760  // if(PRINT_RANK==0 && VERBOSE>0)
761  // printf("nbloc=%d, n=%d, m=%d, blocksize=%d, blocksize_eff=%d\n", nbloc,
762  // n, m, blocksize, blocksize_eff);
763 
764  double *V_bloc, *TV_bloc;
765  V_bloc = (double *) calloc(blocksize * m, sizeof(double));
766  TV_bloc = (double *) calloc(blocksize * m, sizeof(double));
767  if ((V_bloc == 0) || (TV_bloc == 0))
768  return print_error_message(2, __FILE__, __LINE__);
769 
770  int offset = 0;
771  if (flag_offset == 1) offset = distcorrmin;
772 
773  int iV = 0; //"-distcorrmin+offset"; //first index in V
774  int iTV = offset; // first index in TV
775 
776  //"k=0";
777  // first subblock separately as it requires some padding. prepare the block
778  // of the data vector with the overlaps on both sides
779  currentsize = min(blocksize - distcorrmin + offset, n - iV);
780  // note: if flag_offset=0, pad first distcorrmin elements with zeros (for
781  // the first subblock only)
782  // and if flag_offset=1 there is no padding with zeros.
783  copy_block(n, m, *V, blocksize, m, V_bloc, 0, 0, currentsize, m,
784  distcorrmin - offset, 0, 1.0, 0);
785 
786  // do block computation
787  if (flag_nofft == 1)
788  status = stmm_simple_basic(&V_bloc, blocksize, m, T, lambda, &TV_bloc);
789  else
790  status = scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft,
791  V_rfft, nfft, plan_f, plan_b);
792 
793  if (status != 0) {
794  printf("Error in stmm_core.");
795  return print_error_message(7, __FILE__, __LINE__);
796  }
797 
798 
799  // now copy first the new chunk of the data matrix **before** overwriting
800  // the input due to overlaps !
801  iV = blocksize_eff - distcorrmin + offset;
802 
803  if (nbloc > 1) {
804  currentsize = min(blocksize, n - iV); // not to overshoot
805 
806  int flag_reset =
807  (currentsize
808  != blocksize); // with flag_reset=1, always "memset" the block.
809  copy_block(n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0, 0,
810  1.0, flag_reset);
811  }
812 
    // and now store the output back in V
814  currentsize = min(blocksize_eff, n - iTV); // to trim the extra rows
815  copy_block(blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize, m,
816  iTV, 0, 1.0, 0);
817 
818 
819  iTV += blocksize_eff;
820  // now continue with all the other subblocks
821  for (k = 1; k < nbloc; k++) {
822 
        // do block computation
824  if (flag_nofft == 1)
825  status = stmm_simple_basic(&V_bloc, blocksize, m, T, lambda,
826  &TV_bloc);
827  else
828  status = scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft,
829  V_rfft, nfft, plan_f, plan_b);
830 
831  if (status != 0) break;
832 
833 
834  iV += blocksize_eff;
835  // copy first the next subblock to process
836  if (k != nbloc - 1) {
837  currentsize = min(blocksize, n - iV); // not to overshoot
838 
839  int flag_resetk =
840  (currentsize != blocksize); // with flag_reset=1, always
841  // "memset" the block.
842  copy_block(n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0,
843  0, 1.0, flag_resetk);
844  }
845 
846  // and then store the output in V
847  currentsize = min(blocksize_eff, n - iTV); // not to overshoot
848  copy_block(blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize,
849  m, iTV, 0, 1.0, 0);
850  iTV += blocksize_eff;
851 
    } // end block computation
853 
854 
855  free(V_bloc);
856  free(TV_bloc);
857 
858 
859  // t2= MPI_Wtime();
860 
861  // if (PRINT_RANK==0 && VERBOSE>0)
862  // printf("time stmm_core=%f\n", t2-t1);
863 
864  return status;
865 }
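/* Index bookkeeping of the sliding-windows algorithm above, in isolation
   (flag_offset = 0 case): each block of blocksize rows keeps only its
   blocksize_eff = blocksize - 2*(lambda - 1) central rows of the result and
   the window advances by that amount, so consecutive blocks overlap by
   lambda - 1 rows on each side.

#include <stdio.h>
#include <math.h>

int main(void) {
    int n = 100, lambda = 9, blocksize = 32;
    int distcorrmin = lambda - 1;                    // 8
    int blocksize_eff = blocksize - 2 * distcorrmin; // 16 useful rows per block
    int nbloc = (int) ceil((1.0 * n) / blocksize_eff);

    int iV = 0, iTV = 0; // read index in V, write index in the result
    for (int k = 0; k < nbloc; k++) {
        int currentsize = (blocksize_eff < n - iTV) ? blocksize_eff : n - iTV;
        printf("block %d: read V from %d, write rows %d..%d\n", k, iV, iTV,
               iTV + currentsize - 1);
        iV = (k == 0) ? blocksize_eff - distcorrmin : iV + blocksize_eff;
        iTV += blocksize_eff;
    }
    return 0;
}
*/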
866 
867 
868 //=========================================================================
869 
888 int stmm_main(double **V, int n, int m, int id0, int l, double *T,
889  fftw_complex *T_fft, int lambda, fftw_complex *V_fft,
890  double *V_rfft, fftw_plan plan_f, fftw_plan plan_b, int blocksize,
891  int nfft, Flag flag_stgy) {
892 
893  // routine variable
894  int i, j, k, p; // loop index
895  int distcorrmin = lambda - 1;
896  int flag_prod_strategy_nofft = 0; // 0: ffts 1: no ffts
897  int flag_shortcut_m_eff_eq_1 = 1; // 1;//1;
898  int flag_shortcut_nbcol_eq_1 = 1; // 1;//1;
899  int flag_nfullcol_in_middle =
900  0; // 0; //in the case where m=1 can be good to direct stmm_core too
901  int flag_optim_offset_for_nfft = 0;
902  int flag_no_rshp = flag_stgy.flag_no_rshp; // 0;
903  int flag_nofft = flag_stgy.flag_nofft; // 1;
904 
905  int m_eff = (id0 + l - 1) / n - id0 / n + 1; // number of columns
906  int nfullcol;
907  int nloop_middle; // change it to number of full column to improve memory
908 
909  FILE *file;
910  file = stdout;
911 
912  if (l < distcorrmin) // test to avoid communications errors
913  return print_error_message(1, __FILE__, __LINE__);
914 
915 
    // shortcut for m == 1 (if flag_shortcut_m_eff_eq_1 == 1 and nfft == 1),
    // or when no reshaping is requested and the chunk covers the full n x m data
    if ((m_eff == 1 && flag_shortcut_m_eff_eq_1 == 1 && nfft == 1)
        || (flag_no_rshp == 1 && id0 == 0 && l == n * m)) {
919 
920  int flag_offset = 0;
921 
922  // if (flag_prod_strategy_nofft==1) //need to have T as input to make
923  // it work
924  // stmm_simple_core(V, n, m, T, blocksize, lambda, nfft, flag_offset);
925  // else
926  int nr = min(l, n);
927  stmm_core(V, nr, m_eff, T, T_fft, blocksize, lambda, V_fft, V_rfft,
928  nfft, plan_f, plan_b, flag_offset, flag_nofft);
929 
930 
931  return 0;
932  } // End shortcut for m==1
933 
934 
935  // the middle
936  int m_middle;
937 
938  // define splitting for the product computation
939  nfullcol = max(
940  0, (l - (n - id0 % n) % n - (id0 + l) % n)
941  / n); // check how many full columns input data we have
942 
943  if (flag_nfullcol_in_middle == 1)
944  nloop_middle = ceil(1.0 * (nfullcol) / nfft);
945  else
946  nloop_middle = (nfullcol) / nfft;
947 
948  if (flag_nfullcol_in_middle == 1) m_middle = nfullcol;
949  else
950  m_middle = nfft * nloop_middle;
951 
952 
953  int vmiddle_size = n * m_middle;
954 
955 
956  if (PRINT_RANK == 0 && VERBOSE > 2)
957  printf("nloop_middle=%d , m_middle=%d\n", nloop_middle, m_middle);
958 
959 
960  // compute the middle if needed
961  if (nloop_middle > 0) {
962  double *Vmiddle;
963  int offset_middle = (n - id0 % n) % n;
964  Vmiddle = (*V) + offset_middle;
965 
966  int flag_offset = 0;
967  stmm_core(&Vmiddle, n, m_middle, T, T_fft, blocksize, lambda, V_fft,
968  V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
969 
970  } //(nloop_middle>0)
971 
972 
    // edge (first + last columns + extra column from the Euclidean division)
974  int v1edge_size = min(l, (n - id0 % n) % n);
975  int v2edge_size = max(l - (v1edge_size + vmiddle_size), 0);
976  int vedge_size = v1edge_size + v2edge_size;
977 
978  // compute the edges if needed
979  if (vedge_size > 0) {
980 
981  int m_v1edge, m_v2edge;
982  m_v1edge = (v1edge_size > 0) * 1; // m_v1 = 1 or 0 cannot be more
983  m_v2edge = m - (m_v1edge + m_middle);
984  int nbcol = m_v1edge + m_v2edge;
        int *nocol;
        nocol = (int *) calloc(nbcol, sizeof(int));
987 
988  // define the columns for the edge computation
989  if (m_v1edge == 1) nocol[0] = 0;
990  for (i = (m_v1edge); i < nbcol; i++) nocol[i] = m_middle + i;
991 
992  if (PRINT_RANK == 0 && VERBOSE > 2)
993  printf("nbcol=%d , m_v1edge=%d , m_v2edge=%d\n", nbcol, m_v1edge,
994  m_v2edge);
995 
        // shortcut for nbcol == 1
997  if (nbcol == 1 && nfft == 1 && flag_shortcut_nbcol_eq_1 == 1) {
998  // this is the case where no reshaping is needed. This is equivalent
999  // to flag_format_rshp==0
1000  double *Vedge;
            int offset_edge = n * nocol[0]; // works because all the previous
                                            // columns are necessarily full
1003  Vedge = (*V) + offset_edge;
1004  int flag_offset = 0;
1005  stmm_core(&Vedge, vedge_size, nbcol, T, T_fft, blocksize, lambda,
1006  V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset,
1007  flag_nofft);
1008 
        } else { // general case to compute the edges
1010 
1011  double *Vin;
1012  Vin = (*V);
1013 
1014  // size for the different kinds of reshaping
1015  int lconc = vedge_size; // another way to compute : lconc = n*nbcol
1016  // - (nocol[0]==0)*(id0%n) -
1017  // (nocol[nbcol-1]==(m-1))*(n-(id0+l)%n);
1018  int v1_size = lconc + (distcorrmin) * (nbcol - 1);
1019  int fft_size = ceil(1.0 * v1_size / nfft) + 2 * distcorrmin;
1020 
1021  int flag_format_rshp = (nfft > 1) * 2 + (nfft == 1 && nbcol > 1) * 1
1022  + (nfft == 1 && nbcol == 1) * 0;
1023  int nrshp, mrshp, lrshp;
1024 
1025  define_rshp_size(flag_format_rshp, fft_size, nfft, v1_size,
1026  vedge_size, &nrshp, &mrshp, &lrshp);
1027 
1028  // allocate Vrshp for computation
1029  double *Vrshp;
1030  Vrshp = (double *) calloc(lrshp, sizeof(double));
1031  double *Vout;
1032  Vout = (*V);
1033 
1034  if (PRINT_RANK == 0 && VERBOSE > 2) {
1035  fprintf(file, "nrshp=%d , mrshp=%d , lrshp=%d\n", nrshp, mrshp,
1036  lrshp);
1037  fprintf(file, "flag_format_rshp=%d\n", flag_format_rshp);
1038  }
1039 
1040  build_reshape(Vin, nocol, nbcol, lconc, n, m, id0, l, lambda, nfft,
1041  Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
1042 
1043  int flag_offset;
1044  if (flag_format_rshp == 2 && flag_optim_offset_for_nfft == 1)
1045  flag_offset = 1;
1046  else
1047  flag_offset = 0;
1048 
1049  // compute Vrshp
1050  stmm_core(&Vrshp, nrshp, mrshp, T, T_fft, blocksize, lambda, V_fft,
1051  V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
1052 
1053  extract_result(Vout, nocol, nbcol, lconc, n, m, id0, l, lambda,
1054  nfft, Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
1055 
            free(Vrshp);

        } // end of the general case to compute the edges

        free(nocol);
    } // end (vedge_size > 0)
1059 
1060 
1061  return 0;
1062 }
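/* The column splitting done above, in isolation: the local chunk (elements
   id0 .. id0+l-1 of the column-major n x m data) is split into a possibly
   truncated first column, a set of full columns handled in one stmm_core
   call, and a possibly truncated last column. (For simplicity this ignores
   the nfft batching of the middle part, i.e. it behaves as if
   flag_nfullcol_in_middle were set.)

#include <stdio.h>

int main(void) {
    int n = 100, m = 5;     // global data: 5 columns of length 100
    int id0 = 150, l = 230; // this chunk: elements 150 .. 379
    int m_eff = (id0 + l - 1) / n - id0 / n + 1; // columns touched
    int nfullcol = (l - (n - id0 % n) % n - (id0 + l) % n) / n;
    if (nfullcol < 0) nfullcol = 0;      // full columns inside the chunk
    int v1edge_size = (n - id0 % n) % n; // truncated first column
    if (v1edge_size > l) v1edge_size = l;
    int vmiddle_size = n * nfullcol;
    int v2edge_size = l - (v1edge_size + vmiddle_size); // truncated last column
    if (v2edge_size < 0) v2edge_size = 0;

    // prints: m_eff=3 nfullcol=1 v1edge=50 middle=100 v2edge=80
    printf("m_eff=%d nfullcol=%d v1edge=%d middle=%d v2edge=%d\n", m_eff,
           nfullcol, v1edge_size, vmiddle_size, v2edge_size);
    return 0;
}
*/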
1063 
1064 
1065 //=========================================================================
1066 #ifdef W_MPI
1079 int mpi_stmm(double **V, int n, int m, int id0, int l, double *T, int lambda,
1080  Flag flag_stgy, MPI_Comm comm) {
1081 
1082  // mpi variables
1083  int rank; // rank process
1084  int size; // number of processes
1085  MPI_Status status;
1086  MPI_Comm_rank(comm, &rank);
1087  MPI_Comm_size(comm, &size);
1088 
1089 
1090  // routine variables
1091  int i, j, k; // some index
1092  int idf = id0 + l; // first index of scattered V for rank "rank + 1";
1093  int cfirst = id0 / n; // first column index
1094  int clast = idf / n; // last column index
1095  int clast_r = (idf - 1) / n;
1096  int m_eff = clast_r - cfirst + 1;
1097  double *V1, *Lambda;
1098 
1099  // Mpi communication conditions
1100  // Mpi comm is needed when columns are truncated
1101  int right = rank + 1;
1102  int left = rank - 1;
1103  int v1_size = l + 2 * lambda; // size including comm
1104  if (rank == 0 || cfirst * n == id0) { // no left comm
1105  v1_size -= lambda;
1106  left = MPI_PROC_NULL;
1107  }
1108  if (rank == (size - 1) || clast * n == idf) { // no right comm
1109  v1_size -= lambda;
1110  right = MPI_PROC_NULL;
1111  }
1112 
1113  // init data to send
1114  Lambda = (double *) malloc(2 * lambda * sizeof(double));
1115  if (Lambda == 0) return print_error_message(2, __FILE__, __LINE__);
1116 
1117  for (i = 0; i < lambda; i++) {
1118  Lambda[i] = (*V)[i];
1119  Lambda[i + lambda] = (*V)[i + l - lambda];
1120  }
1121 
1122  if (PRINT_RANK == 0 && VERBOSE > 2)
1123  printf("[rank %d] Left comm with %d | Right comm with %d\n", rank, left,
1124  right);
1125 
1126  // send and receive data
1127  MPI_Sendrecv_replace(Lambda, lambda, MPI_DOUBLE, left, MPI_USER_TAG, right,
1128  MPI_USER_TAG, comm,
1129  &status); // 1st comm
1130  MPI_Sendrecv_replace((Lambda + lambda), lambda, MPI_DOUBLE, right,
1131  MPI_USER_TAG, left, MPI_USER_TAG, comm,
1132  &status); // 2nd comm
1133 
1134 
    if (l < lambda) { // checked only after the sendrecv so that the other
                      // processes do not get stuck in the communication
        free(Lambda);
        return print_error_message(1, __FILE__, __LINE__);
    }
1138 
1139  // copy received data
1140  if (left == MPI_PROC_NULL && right == MPI_PROC_NULL) // 0--0 : nothing to do
1141  V1 = *V;
1142  else if (left == MPI_PROC_NULL) { // 0--1 : realloc
1143  *V = realloc(*V, v1_size * sizeof(double));
1144  if (*V == NULL) return print_error_message(2, __FILE__, __LINE__);
1145  V1 = *V;
1146  } else // 1--1 or 1--0 : new allocation
1147  V1 = (double *) malloc(v1_size * sizeof(double));
1148 
1149  if (left != MPI_PROC_NULL) {
1150  for (i = 0; i < lambda; i++) V1[i] = Lambda[i + lambda];
1151  id0 -= lambda;
1152  }
    if (right != MPI_PROC_NULL) {
        for (i = 0; i < lambda; i++) V1[i + v1_size - lambda] = Lambda[i];
    }
    free(Lambda); // halo buffer no longer needed
1156 
1157  // Copy input matrix V
1158  int offset = 0;
1159  if (left != MPI_PROC_NULL) {
1160  offset = lambda;
1161 #pragma omp parallel for
1162  for (i = offset; i < l + offset; i++) V1[i] = (*V)[i - offset];
1163  }
1164 
1165  fftw_complex *V_fft, *T_fft;
1166  double *V_rfft;
1167  fftw_plan plan_f, plan_b;
1168 
1169 
1170  // Compute matrix product
1171  int nfft, blocksize;
1172 
1173  tpltz_init(v1_size, lambda, &nfft, &blocksize, &T_fft, T, &V_fft, &V_rfft,
1174  &plan_f, &plan_b, flag_stgy);
1175 
1176  if (PRINT_RANK == 0 && VERBOSE > 1)
1177  printf("[rank %d] Before middle-level call : blocksize=%d, nfft=%d\n",
1178  rank, blocksize, nfft);
1179 
1180  stmm_main(&V1, n, m, id0, v1_size, T, T_fft, lambda, V_fft, V_rfft, plan_f,
1181  plan_b, blocksize, nfft, flag_stgy);
1182 
1183 
1184  tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);
1185 
1186  // Copy output matrix TV
1187  offset = 0;
1188  if (left != MPI_PROC_NULL) offset = lambda;
1189  if (left == MPI_PROC_NULL && right == MPI_PROC_NULL) // 0--0
1190  *V = V1;
1191  else if (left == MPI_PROC_NULL) { // 0--1
1192  V1 = realloc(V1, l * sizeof(double));
1193  if (V1 == NULL) return print_error_message(2, __FILE__, __LINE__);
1194  *V = V1;
1195  } else { // 1--0 or 1--1
1196 #pragma omp parallel for
1197  for (i = offset; i < l + offset; i++) (*V)[i - offset] = V1[i];
1198  }
1199 
1200  if (left != MPI_PROC_NULL) free(V1);
1201 
1202  return 0;
1203 }
1204 
1205 #endif
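/* Ghost-zone bookkeeping of mpi_stmm, in isolation: each process owns l
   consecutive elements of the globally row-distributed data and needs lambda
   ghost elements from each neighbour, except when its chunk starts or ends
   exactly on a column boundary or it is the first/last rank.

#include <stdio.h>

int main(void) {
    int n = 100, lambda = 8, size = 3; // column length, half bandwidth, ranks
    int l = 110;                       // elements owned by each rank
    for (int rank = 0; rank < size; rank++) {
        int id0 = rank * l, idf = id0 + l;
        int cfirst = id0 / n, clast = idf / n;
        int v1_size = l + 2 * lambda;
        if (rank == 0 || cfirst * n == id0) v1_size -= lambda;       // no left halo
        if (rank == size - 1 || clast * n == idf) v1_size -= lambda; // no right halo
        printf("rank %d: id0=%d, v1_size=%d\n", rank, id0, v1_size);
    }
    return 0;
}
*/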