MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
toeplitz_block.c
Go to the documentation of this file.
1 
62 #include "toeplitz.h"
63 
64 #define max(a, b) \
65  ({ \
66  __typeof__(a) _a = (a); \
67  __typeof__(b) _b = (b); \
68  _a > _b ? _a : _b; \
69  })
70 
71 #define min(a, b) \
72  ({ \
73  __typeof__(a) _a = (a); \
74  __typeof__(b) _b = (b); \
75  _a < _b ? _a : _b; \
76  })
77 
78 // r1.1 - Frederic Dauvergne (APC)
79 // This is the routines related to the Toeplitz blocks diagonal routine.
80 // There is a sequential equivalent routine in the file toeplitz_seq.c
81 
82 // todo:
83 //- remove the nooptimize communication
84 
85 //=========================================================================
86 #ifdef W_MPI
90 
115 int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise,
116  Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all,
117  int64_t idp, int local_V_size, Flag flag_stgy, MPI_Comm comm) {
118 #else // for sequential use only
119 int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise,
120  Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all,
121  int64_t idp, int local_V_size, Flag flag_stgy) {
122 #endif
123 
124 
125  // MPI parameters
126  int rank; // process rank
127  int size; // process number
128 
129 #ifdef W_MPI
130  MPI_Status status;
131  MPI_Comm_rank(comm, &rank);
132  MPI_Comm_size(comm, &size);
133 
134 #else
135  rank = 0;
136  size = 1;
137 #endif
138 
139  PRINT_RANK = rank;
140 
141  FILE *file;
142  file = stdout;
143 
144  int i, j, k; // some indexes
145 
146 
147  // identification of the mpi neighbours process to communicate when there is
148  // a shared block
149  int right = rank + 1;
150  int left = rank - 1;
151 
152 
153  // Define the indices for each process
154  int idv0, idvn; // indice of the first and the last block of V for each
155  // processes
156 
157  int *nnew;
158  nnew = (int *) calloc(nb_blocks_local, sizeof(int));
159  int64_t idpnew;
160  int local_V_size_new;
161  int n_rowwise = local_V_size;
162 
163  int status_params = get_overlapping_blocks_params(
164  nb_blocks_local, tpltzblocks, local_V_size, nrow, idp, &idpnew,
165  &local_V_size_new, nnew, &idv0, &idvn);
166 
167 
168  if (PRINT_RANK == 0 && VERBOSE > 2)
169  printf("status_params=%d\n", status_params);
170 
171  if (status_params == 0) {
172  free(nnew);
173  return (0); // no work to be done
174  }
175 
176  if (tpltzblocks[idv0].lambda == 0 || tpltzblocks[idvn].lambda == 0)
177  return print_error_message(2, __FILE__, __LINE__);
178 
179 
180  if (PRINT_RANK == 0 && VERBOSE > 2) { // print on screen news parameters
181  // definition if VERBOSE
182  fprintf(file, "new parameters caracteristics:\n");
183  fprintf(file, "[%d] idp=%ld ; idpnew=%ld\n", rank, idp, idpnew);
184  fprintf(file, "[%d] local_V_size=%d ; local_V_size_new=%d\n", rank,
185  local_V_size, local_V_size_new);
186  for (i = 0; i < nb_blocks_local; i++)
187  fprintf(file, "[%d] n[%d]=%d ; nnew[%d]=%d\n", rank, i,
188  (tpltzblocks[i].n), i, nnew[i]);
189  for (i = 0; i < nb_blocks_local; i++)
190  fprintf(file, "[%d] tpltzblocks[%d].idv=%ld\n", rank, i,
191  tpltzblocks[i].idv);
192  }
193 
194 
195  int vShft = idpnew - idp; // new first element of relevance in V
196 
197  // Define the column indices:
198  // index of the first and the last column of V for the current process
199  int idvm0 = idpnew / nrow;
200  int idvmn = (idpnew + local_V_size_new - 1) / nrow;
201  // number of columns of V for the current process
202  int ncol_rank = idvmn - idvm0 + 1;
203  // number of blocks for the current process with possibly repetitions
204  int nb_blocks_rank;
205 
206  if (ncol_rank == 1) // Empty process not allowed
207  nb_blocks_rank = idvn - idv0 + 1;
208  else
209  nb_blocks_rank =
210  (ncol_rank - 2) * nb_blocks_local + (nb_blocks_local - idv0)
211  + (idvn + 1); // in this case nb_blocks_local = nblocs_all
212 
213  if (PRINT_RANK == 0 && VERBOSE > 2)
214  fprintf(file, "[%d] nb_blocks_rank=%d, nb_blocks_local=%d\n", rank,
215  nb_blocks_rank, nb_blocks_local);
216 
217  // Define the indices for the first and the last element in each blocks
218  int idvp0 = idpnew % nrow
219  - tpltzblocks[idv0].idv; // index of the first element of the
220  // process in the first block
221  int idvpn; // reverse index of the last element of the process in the last
222  // block
223  // It's the number of remaining elements needed to fully complete the last
224  // block
225  idvpn = tpltzblocks[idvn].idv + nnew[idvn] - 1
226  - (idpnew + local_V_size_new - 1) % nrow;
227 
228 
229  // Define the offsets for the first and last blocks of the process for V1
230  int offset0, offsetn;
231  int distcorrmin_idv0 = (tpltzblocks[idv0].lambda) - 1;
232  int distcorrmin_idvn = (tpltzblocks[idvn].lambda) - 1;
233 
234  // if(idvp0 != 0)
235  offset0 = min(idvp0, distcorrmin_idv0);
236  // if(idvpn != 0)
237  offsetn = min(idvpn, distcorrmin_idvn);
238 
239 
240  int toSendLeft = 0;
241  int toSendRight = 0;
242 
243 #ifdef W_MPI
244  if (offset0 != 0) {
245  toSendLeft = min(tpltzblocks[idv0].idv + nnew[idv0] - idpnew % nrow,
246  distcorrmin_idv0);
247  }
248  if (offsetn != 0) {
249  toSendRight =
250  min((idpnew + local_V_size_new) % nrow - tpltzblocks[idvn].idv,
251  distcorrmin_idvn);
252  }
253 
254  int flag_optimlambda = 1; // to allocate only the memory place needed
255 
256  int lambdaOut_offset;
257  int lambdaIn_offset;
258  double *LambdaOut;
259  int lambdaOut_size, lambdaIn_size;
260 
261  if (flag_optimlambda == 1) {
262  LambdaOut = (double *) calloc((toSendLeft + toSendRight) * m_rowwise,
263  sizeof(double));
264  lambdaOut_offset = toSendLeft * m_rowwise;
265  lambdaIn_offset = offset0 * m_rowwise;
266  lambdaOut_size = (toSendLeft + toSendRight) * m_rowwise;
267  lambdaIn_size = (offset0 + offsetn) * m_rowwise;
268  } else {
269  LambdaOut = (double *) calloc(
270  (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
271  * m_rowwise,
272  sizeof(double));
273  lambdaOut_offset = tpltzblocks[idv0].lambda * m_rowwise;
274  lambdaIn_offset = tpltzblocks[idv0].lambda * m_rowwise;
275  lambdaOut_size = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
276  * m_rowwise;
277  lambdaIn_size = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
278  * m_rowwise;
279  }
280 
281 
282  if (offset0 != 0) {
283  for (j = 0; j < m_rowwise; j++)
284  for (i = 0; i < toSendLeft; i++)
285  LambdaOut[i + j * toSendLeft] =
286  (*V)[i + j * n_rowwise]; // good because toSendLeft=0 if
287  // it
288  } // doesnt start on a the first block.
289  if (offsetn != 0) {
290  for (j = 0; j < m_rowwise; j++)
291  for (i = 0; i < toSendRight; i++)
292  LambdaOut[i + j * toSendRight + lambdaOut_offset] =
293  (*V)[i + j * n_rowwise + local_V_size - toSendRight];
294  } // good too using same argument than for offset0!=0
295  // if local_V_size!=local_V_size_new+vShft mean there is extra
296  // terms a the end and so offsetn=0
297  // idpnew+local_V_size_new = idp+local_V_size and vShft=idpnew-idp
298  // so local_V_size=vShft+local_V_size_new
299  if (rank == 0 || offset0 == 0) left = MPI_PROC_NULL;
300  if (rank == size - 1 || offsetn == 0) right = MPI_PROC_NULL;
301 
302  double *LambdaIn = (double *) calloc(lambdaIn_size, sizeof(double));
303 
304 
305  int flag_blockingcomm = 0; // to use blocking comm
306  MPI_Request requestLeft_r, requestLeft_s;
307  MPI_Request requestRight_r, requestRight_s;
308 
309  if (flag_blockingcomm == 1) {
310  // send and receive data
312  // to the Left
313  MPI_Sendrecv(LambdaOut, toSendLeft * m_rowwise, MPI_DOUBLE, left,
314  MPI_USER_TAG, (LambdaIn + lambdaIn_offset),
315  offsetn * m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm,
316  &status);
317 
318  // to the Right
319  MPI_Sendrecv((LambdaOut + lambdaOut_offset), toSendRight * m_rowwise,
320  MPI_DOUBLE, right, MPI_USER_TAG, LambdaIn,
321  offset0 * m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm,
322  &status);
324  } else {
326  // to the Left
327  MPI_Irecv((LambdaIn + lambdaIn_offset), offsetn * m_rowwise, MPI_DOUBLE,
328  right, MPI_USER_TAG, comm, &requestLeft_r);
329  MPI_Isend(LambdaOut, toSendLeft * m_rowwise, MPI_DOUBLE, left,
330  MPI_USER_TAG, comm, &requestLeft_s);
331 
332  // to the Right
333  MPI_Irecv(LambdaIn, offset0 * m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG,
334  comm, &requestRight_r);
335  MPI_Isend((LambdaOut + lambdaOut_offset), toSendRight * m_rowwise,
336  MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestRight_s);
338  }
339 
340 #endif
341 
342 
343  // size of the first and the last block for the current process
344  int v0rank_size, vnrank_size;
345  if (nb_blocks_rank == 1) { // only one block
346  v0rank_size = ((idpnew + local_V_size_new - 1) % nrow + 1)
347  - idpnew % nrow + offset0 + offsetn;
348  vnrank_size = 0; // just for convenience - no really need it
349  } else { // more than one block
350  v0rank_size =
351  tpltzblocks[idv0].idv + nnew[idv0] - idpnew % nrow + offset0;
352  vnrank_size = ((idpnew + local_V_size_new - 1) % nrow + 1)
353  - tpltzblocks[idvn].idv + offsetn;
354  }
355 
356 
357 #ifdef W_MPI
358 
359  if (flag_blockingcomm != 1) {
360  // MPI_Wait for lambda comm
362  MPI_Wait(&requestLeft_r, &status);
363  MPI_Wait(&requestLeft_s, &status);
364  MPI_Wait(&requestRight_r, &status);
365  MPI_Wait(&requestRight_s, &status);
367  }
368 
369 
370  free(LambdaOut);
371 
372 #endif
373 
374 
375  //---------------------------------------
376  // initialization for the blocks loop
377 
378  int idv1 = 0; // old index of *V1
379  int idv2 = 0; // index
380 
381 
382  int mid; // local number of column for the current block
383  // index of the first element of the process inside the first block
384  int offset_id0;
385  offset_id0 = idvp0;
386 
387  // fftw variables
388  fftw_complex *V_fft, *T_fft;
389  double *V_rfft;
390  fftw_plan plan_f, plan_b;
391  // init local block vector
392  double *V1block;
393  // int lambdaShft;
394 
395 
396  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
397  // loop on the blocks inside the process
398  int nfft, blocksize;
399  int iblock; // index for the loop on the blocks
400  // int loopindex;
401  int id; // indice of the current block
402 
403  int vblock_size;
404  int id0block;
405 
406  int jj;
407 
408 
409  for (iblock = idv0; iblock < idv0 + nb_blocks_rank; iblock++) {
410  id = iblock % nb_blocks_local; // index of current block
411 
412 
413  if (nnew[id] > 0) { // the block is ok
414 
415 #ifdef W_MPI
416  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
417  // first case : First block of the process
418  if (iblock == idv0) {
419  if (PRINT_RANK == 0 && VERBOSE > 2)
420  fprintf(file, "[%d] First block...\n", rank);
421 
422  vblock_size = v0rank_size;
423  id0block = (offset_id0 - offset0);
424 
425  V1block = (double *) calloc(vblock_size * m_rowwise,
426  sizeof(double));
427 
428  for (j = 0; j < m_rowwise; j++) {
429 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
430  for (i = 0; i < offset0; i++)
431  V1block[i + j * vblock_size] =
432  LambdaIn[i + j * offset0];
433  }
434  // note: check if copyblock could be used instead.
435 
436 
437  // if (nb_blocks_rank == 1)
438  // currentsize_middlepart=vblock_size-offset0-offsetn =
439  // local_V_size_new else
440  // currentsize_middlepart=vblock_size-offset0
441  int currentsize_middlepart =
442  min(vblock_size - offset0, local_V_size_new);
443 
444  for (j = 0; j < m_rowwise; j++) {
445 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
446  for (i = 0; i < currentsize_middlepart; i++)
447  V1block[offset0 + i + j * vblock_size] =
448  (*V)[i + vShft + j * n_rowwise];
449  }
450 
451  if (nb_blocks_rank == 1) {
452  for (j = 0; j < m_rowwise; j++) {
453 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
454  for (i = 0; i < offsetn; i++) {
455  V1block[vblock_size - offsetn + i
456  + j * vblock_size] =
457  LambdaIn[i + lambdaIn_offset + j * offsetn];
458  }
459  }
460  }
461 
462 
463  // init Toeplitz arrays
464  tpltz_init(vblock_size, tpltzblocks[id].lambda, &nfft,
465  &blocksize, &T_fft, (tpltzblocks[id].T_block),
466  &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
467 
468  // Toeplitz computation
469  if (PRINT_RANK == 0 && VERBOSE > 2)
470  fprintf(file,
471  "[%d] Before stmm_main call : nfft = %d, blocksize "
472  "= %d\n",
473  rank, nfft, blocksize);
474  stmm_main(&V1block, vblock_size, m_rowwise, 0,
475  m_rowwise * vblock_size, (tpltzblocks[id].T_block),
476  T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f,
477  plan_b, blocksize, nfft, flag_stgy);
478 
479  tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);
480 
481 
482  int currentsize = min(vblock_size - offset0, local_V_size_new);
483  for (j = 0; j < m_rowwise; j++) {
484 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
485  for (i = 0; i < currentsize; i++)
486  (*V)[vShft + i + j * n_rowwise] =
487  V1block[offset0 + i + j * vblock_size];
488  }
489 
490  free(V1block);
491 
492  } // end (First case)
493 
494 
495  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
496  // Generic case : Generic block of the process
497  else if (iblock != idv0 && iblock != idv0 + nb_blocks_rank - 1) {
498 #endif
499 
500  if (PRINT_RANK == 0 && VERBOSE > 2)
501  fprintf(file, "[%d] generic block...\n", rank);
502 
503  vblock_size = nnew[id];
504  id0block = 0;
505 
506  V1block = (double *) calloc(vblock_size * m_rowwise,
507  sizeof(double));
508 
509  idv1 = (tpltzblocks[id].idv) - idp % nrow - vShft + offset0
510  + nrow * ((iblock / nb_blocks_local)); // no need
511  // idv2 = idv[id]-idp%nrow + nrow*( (iblock/nb_blocks_local) );
512  idv2 = (tpltzblocks[id].idv) - (idpnew) % nrow + vShft
513  + nrow * ((iblock / nb_blocks_local));
514 
515  for (j = 0; j < m_rowwise; j++) {
516 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
517  for (i = 0; i < vblock_size; i++)
518  V1block[i + j * vblock_size] =
519  (*V)[i + idv2 + j * n_rowwise];
520  // V1block[i] = (*V)[i+idv1-offset0+vShft];
521  }
522 
523  // init Toeplitz arrays
524  tpltz_init(nnew[id], tpltzblocks[id].lambda, &nfft, &blocksize,
525  &T_fft, (tpltzblocks[id].T_block), &V_fft, &V_rfft,
526  &plan_f, &plan_b, flag_stgy);
527 
528  // Toeplitz computation
529  if (PRINT_RANK == 0 && VERBOSE > 2)
530  fprintf(file,
531  "[%d] Before stmm_main call : nfft = %d, blocksize "
532  "= %d\n",
533  rank, nfft, blocksize);
534  stmm_main(&V1block, vblock_size, m_rowwise, 0,
535  m_rowwise * vblock_size, (tpltzblocks[id].T_block),
536  T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f,
537  plan_b, blocksize, nfft, flag_stgy);
538 
539 
540  tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);
541 
542 
543  for (j = 0; j < m_rowwise; j++) {
544 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
545  for (i = 0; i < vblock_size; i++) {
546  (*V)[i + idv2 + j * n_rowwise] =
547  V1block[i + j * vblock_size];
548  }
549  }
550 
551 
552  free(V1block);
553 
554 #ifdef W_MPI
555  } // end (Generic case)
556 
557  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
558  // Last case : Last block of the process
559  else if (iblock == idv0 + nb_blocks_rank - 1 && iblock != idv0) {
560  if (PRINT_RANK == 0 && VERBOSE > 2)
561  fprintf(file, "[%d] last block...\n", rank);
562 
563  vblock_size = vnrank_size;
564  id0block = 0;
565 
566  V1block = (double *) calloc(vblock_size * m_rowwise,
567  sizeof(double));
568 
569  idv1 = (tpltzblocks[id].idv) - idp % nrow - vShft + offset0
570  + nrow * ((iblock / nb_blocks_local));
571  idv2 = (tpltzblocks[id].idv) - (idpnew) % nrow + vShft
572  + nrow * ((iblock / nb_blocks_local));
573 
574 
575  for (j = 0; j < m_rowwise; j++) {
576 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
577  for (i = 0; i < vblock_size - offsetn; i++)
578  V1block[i + j * vblock_size] =
579  (*V)[i + idv2 + j * n_rowwise];
580  // V1block[i] = (*V)[i+idv1-offset0+vShft];
581  }
582 
583  for (j = 0; j < m_rowwise; j++) {
584 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
585  for (i = 0; i < offsetn; i++)
586  V1block[vblock_size - offsetn + i + j * vblock_size] =
587  LambdaIn[i + lambdaIn_offset + j * offsetn];
588  }
589 
590 
591  // init Toeplitz arrays
592  tpltz_init(vblock_size, tpltzblocks[id].lambda, &nfft,
593  &blocksize, &T_fft, (tpltzblocks[id].T_block),
594  &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
595 
596  // Toeplitz computation
597  if (PRINT_RANK == 0 && VERBOSE > 2)
598  fprintf(file,
599  "[%d] Before stmm_main call : nfft = %d, blocksize "
600  "= %d\n",
601  rank, nfft, blocksize);
602 
603  stmm_main(&V1block, vblock_size, m_rowwise, 0,
604  vblock_size * m_rowwise, (tpltzblocks[id].T_block),
605  T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f,
606  plan_b, blocksize, nfft, flag_stgy);
607 
608  tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);
609 
610  for (j = 0; j < m_rowwise; j++) {
611 #pragma omp parallel for // num_threads(NB_OMPTHREADS_STBMM)
612  for (i = 0; i < vnrank_size - offsetn; i++) {
613  (*V)[idv2 + i + j * n_rowwise] =
614  V1block[i + j * vblock_size];
615  }
616  }
617 
618 
619  free(V1block);
620 
621  } // end of last block
622  else {
623  break;
624  } // error //we can put the generic case here instead of between
625  // first and last cases
626 #endif
627  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
628  } // end of if(nnew[id]>0)
629  } // end of loop over the blocks
630 
631 
632  free(LambdaIn);
633 
634 
635  return 0;
636 }
637 
638 // #endif
639 
640 //====================================================================
641 
645 
652 int get_overlapping_blocks_params(int nbloc, Block *tpltzblocks,
653  int local_V_size, int64_t nrow, int64_t idp,
654  int64_t *idpnew, int *local_V_size_new,
655  int *nnew, int *ifirstBlock,
656  int *ilastBlock) {
657  int ib, nblockOK = 0, nfullcol_data;
658  int64_t firstrow, lastrow;
659  int64_t idptmp;
660 
661 
662  // check how many full columns input data have
663  nfullcol_data = max(0, (local_V_size - (nrow - idp % nrow) % nrow
664  - (idp + local_V_size) % nrow)
665  / nrow);
666 
667  if (nfullcol_data > 0) {
668 
669  for (ib = 0; ib < nbloc; ib++) {
670  if (tpltzblocks[ib].idv < nrow) {
671  nnew[ib] = min(tpltzblocks[ib].n,
672  nrow - tpltzblocks[ib].idv); // block used for
673  // the product
674  nblockOK++;
675  }
676  }
677 
678  } else { // no full column observed
679 
680  firstrow = idp % nrow;
681  lastrow = (idp + local_V_size - 1) % nrow;
682 
683  if (firstrow < lastrow) { // just one column partially observed
684 
685  for (ib = 0; ib < nbloc; ib++) {
686  if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow)
687  && (tpltzblocks[ib].idv < lastrow + 1)) {
688  nnew[ib] =
689  min(tpltzblocks[ib].n,
690  nrow - tpltzblocks[ib].idv); // block used for
691  // the product
692  nblockOK++;
693  }
694  }
695 
696  } else { // two columns partially observed
697 
698  for (ib = 0; ib < nbloc; ib++) {
699  if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow)
700  && (tpltzblocks[ib].idv
701  < nrow)) { // intersects first partial column
702  nnew[ib] =
703  min(tpltzblocks[ib].n,
704  nrow - tpltzblocks[ib].idv); // block used for
705  // the product
706  nblockOK++;
707  }
708 
709  if ((tpltzblocks[ib].idv < lastrow + 1)
710  && (tpltzblocks[ib].idv + tpltzblocks[ib].n
711  > 0)) { // intersects second partial column
712  nnew[ib] =
713  min(tpltzblocks[ib].n,
714  nrow - tpltzblocks[ib].idv); // block used for
715  // the product
716  nblockOK++; // may overcount but we do not care
717  } // could use else insteed!
718  }
719  }
720  }
721  if (PRINT_RANK == 0 && VERBOSE > 2) printf("nblockOK=%d\n", nblockOK);
722 
723 
724  if (nblockOK == 0) return (0); // no blocks overlapping with the data
725 
726  // find the first and last relevant blocks for the begining and end of the
727  // local data V
728 
729  // first block
730  idptmp = idp;
731 
732  for (*ifirstBlock = -1; *ifirstBlock == -1;) {
733  for (ib = 0; ib < nbloc; ib++) {
734  if (nnew[ib] != 0 && idptmp % nrow < tpltzblocks[ib].idv + nnew[ib])
735  break;
736  }
737 
738  if (ib < nbloc && tpltzblocks[ib].idv <= idptmp % nrow) {
739  *ifirstBlock = ib;
740  *idpnew = idptmp;
741  } else if (ib < nbloc && tpltzblocks[ib].idv > idptmp % nrow) {
742  *ifirstBlock = ib;
743  // int64_t extrabegining = tpltzblocks[ib].idv-idp%nrow; //note I
744  // put int64 just to be sure. Never used
745  // *idpnew = idp+extrabegining;//tpltzblocks[ib].idv;
746  int idvfirstcolumn = idptmp / nrow;
747  *idpnew = tpltzblocks[ib].idv + idvfirstcolumn * nrow;
748  } else { // ib=nb_blocs
749  idptmp += nrow - idptmp % nrow; //(int) (nrow-idptmp%nrow);
750  // idtmp = (int) ceil((1.0*idpnew)/(1.0*nrow))*nrow; // go
751  // to the first element of the next column
752  }
753  }
754 
755 
756  // last block
757  idptmp = idp + local_V_size - 1;
758 
759  for (*ilastBlock = -1; *ilastBlock == -1;) {
760  for (ib = nbloc - 1; ib >= 0; ib--) {
761  if (nnew[ib] != 0 && tpltzblocks[ib].idv <= idptmp % nrow) break;
762  }
763 
764 
765  if (ib >= 0 && idptmp % nrow < tpltzblocks[ib].idv + nnew[ib]) {
766  *ilastBlock = ib;
767  *local_V_size_new = local_V_size - (*idpnew) + idp;
768  } else if (ib >= 0 && tpltzblocks[ib].idv + nnew[ib] <= idptmp % nrow) {
769  *ilastBlock = ib;
770  // int64_t extraend =
771  // (local_V_size-1+idp)%nrow+1-(tpltzblocks[ib].idv+nnew[ib]);
772  // //note I put int64 just to be sure *local_V_size_new =
773  //(local_V_size+idp)%nrow-(idv[*ilastBlock]+nnew[*ilastBlock]);
774  // idv[*ilastBlock]+nnew[*ilastBlock]-(*idpnew);
775  //*local_V_size_new = local_V_size-(*idpnew)+idp-extraend; //compute
776  //twice ... ? remove this one
777 
778  int idvlastcolumn = idptmp / nrow;
779  *local_V_size_new = tpltzblocks[ib].idv + nnew[ib]
780  + idvlastcolumn * nrow - (*idpnew);
781 
782  } else {
783  idptmp = idptmp - (idptmp % nrow)
784  - 1; //(int) idptmp - (idptmp%nrow)-1;
785  // idtmp = (int) floor( (1.0*idpnew)/(1.0*nrow))*nrow-1; //
786  // go to the last element of the previous column
787  }
788  }
789 
790  return (1);
791 }
int print_error_message(int error_number, char const *file, int line)
Prints error message corresponding to an error number.
Definition: toeplitz.c:127
int PRINT_RANK
Definition: toeplitz.c:117
int tpltz_init(int n, int lambda, int *nfft, int *blocksize, fftw_complex **T_fft, double *T, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b, Flag flag_stgy)
Definition: toeplitz.c:298
int VERBOSE
Verbose mode.
Definition: toeplitz.c:113
int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b)
Definition: toeplitz.c:485
int stmm_main(double **V, int n, int m, int id0, int l, double *T, fftw_complex *T_fft, int lambda, fftw_complex *V_fft, double *V_rfft, fftw_plan plan_f, fftw_plan plan_b, int blocksize, int nfft, Flag flag_stgy)
Definition: toeplitz.c:888
int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int64_t idp, int local_V_size, Flag flag_stgy, MPI_Comm comm)