66 __typeof__(a) _a = (a); \
67 __typeof__(b) _b = (b); \
73 __typeof__(a) _a = (a); \
74 __typeof__(b) _b = (b); \
115 int mpi_stbmm(
double **V, int64_t nrow,
int m,
int m_rowwise,
116 Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all,
117 int64_t idp,
int local_V_size, Flag flag_stgy, MPI_Comm comm) {
119 int mpi_stbmm(
double **V, int64_t nrow,
int m,
int m_rowwise,
120 Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all,
121 int64_t idp,
int local_V_size, Flag flag_stgy) {
131 MPI_Comm_rank(comm, &rank);
132 MPI_Comm_size(comm, &size);
149 int right = rank + 1;
158 nnew = (
int *) calloc(nb_blocks_local,
sizeof(
int));
160 int local_V_size_new;
161 int n_rowwise = local_V_size;
163 int status_params = get_overlapping_blocks_params(
164 nb_blocks_local, tpltzblocks, local_V_size, nrow, idp, &idpnew,
165 &local_V_size_new, nnew, &idv0, &idvn);
169 printf(
"status_params=%d\n", status_params);
171 if (status_params == 0) {
176 if (tpltzblocks[idv0].lambda == 0 || tpltzblocks[idvn].lambda == 0)
182 fprintf(file,
"new parameters caracteristics:\n");
183 fprintf(file,
"[%d] idp=%ld ; idpnew=%ld\n", rank, idp, idpnew);
184 fprintf(file,
"[%d] local_V_size=%d ; local_V_size_new=%d\n", rank,
185 local_V_size, local_V_size_new);
186 for (i = 0; i < nb_blocks_local; i++)
187 fprintf(file,
"[%d] n[%d]=%d ; nnew[%d]=%d\n", rank, i,
188 (tpltzblocks[i].n), i, nnew[i]);
189 for (i = 0; i < nb_blocks_local; i++)
190 fprintf(file,
"[%d] tpltzblocks[%d].idv=%ld\n", rank, i,
195 int vShft = idpnew - idp;
199 int idvm0 = idpnew / nrow;
200 int idvmn = (idpnew + local_V_size_new - 1) / nrow;
202 int ncol_rank = idvmn - idvm0 + 1;
207 nb_blocks_rank = idvn - idv0 + 1;
210 (ncol_rank - 2) * nb_blocks_local + (nb_blocks_local - idv0)
214 fprintf(file,
"[%d] nb_blocks_rank=%d, nb_blocks_local=%d\n", rank,
215 nb_blocks_rank, nb_blocks_local);
218 int idvp0 = idpnew % nrow
219 - tpltzblocks[idv0].idv;
225 idvpn = tpltzblocks[idvn].idv + nnew[idvn] - 1
226 - (idpnew + local_V_size_new - 1) % nrow;
230 int offset0, offsetn;
231 int distcorrmin_idv0 = (tpltzblocks[idv0].lambda) - 1;
232 int distcorrmin_idvn = (tpltzblocks[idvn].lambda) - 1;
235 offset0 = min(idvp0, distcorrmin_idv0);
237 offsetn = min(idvpn, distcorrmin_idvn);
245 toSendLeft = min(tpltzblocks[idv0].idv + nnew[idv0] - idpnew % nrow,
250 min((idpnew + local_V_size_new) % nrow - tpltzblocks[idvn].idv,
254 int flag_optimlambda = 1;
256 int lambdaOut_offset;
259 int lambdaOut_size, lambdaIn_size;
261 if (flag_optimlambda == 1) {
262 LambdaOut = (
double *) calloc((toSendLeft + toSendRight) * m_rowwise,
264 lambdaOut_offset = toSendLeft * m_rowwise;
265 lambdaIn_offset = offset0 * m_rowwise;
266 lambdaOut_size = (toSendLeft + toSendRight) * m_rowwise;
267 lambdaIn_size = (offset0 + offsetn) * m_rowwise;
269 LambdaOut = (
double *) calloc(
270 (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
273 lambdaOut_offset = tpltzblocks[idv0].lambda * m_rowwise;
274 lambdaIn_offset = tpltzblocks[idv0].lambda * m_rowwise;
275 lambdaOut_size = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
277 lambdaIn_size = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)
283 for (j = 0; j < m_rowwise; j++)
284 for (i = 0; i < toSendLeft; i++)
285 LambdaOut[i + j * toSendLeft] =
286 (*V)[i + j * n_rowwise];
290 for (j = 0; j < m_rowwise; j++)
291 for (i = 0; i < toSendRight; i++)
292 LambdaOut[i + j * toSendRight + lambdaOut_offset] =
293 (*V)[i + j * n_rowwise + local_V_size - toSendRight];
299 if (rank == 0 || offset0 == 0) left = MPI_PROC_NULL;
300 if (rank == size - 1 || offsetn == 0) right = MPI_PROC_NULL;
302 double *LambdaIn = (
double *) calloc(lambdaIn_size,
sizeof(
double));
305 int flag_blockingcomm = 0;
306 MPI_Request requestLeft_r, requestLeft_s;
307 MPI_Request requestRight_r, requestRight_s;
309 if (flag_blockingcomm == 1) {
313 MPI_Sendrecv(LambdaOut, toSendLeft * m_rowwise, MPI_DOUBLE, left,
314 MPI_USER_TAG, (LambdaIn + lambdaIn_offset),
315 offsetn * m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm,
319 MPI_Sendrecv((LambdaOut + lambdaOut_offset), toSendRight * m_rowwise,
320 MPI_DOUBLE, right, MPI_USER_TAG, LambdaIn,
321 offset0 * m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm,
327 MPI_Irecv((LambdaIn + lambdaIn_offset), offsetn * m_rowwise, MPI_DOUBLE,
328 right, MPI_USER_TAG, comm, &requestLeft_r);
329 MPI_Isend(LambdaOut, toSendLeft * m_rowwise, MPI_DOUBLE, left,
330 MPI_USER_TAG, comm, &requestLeft_s);
333 MPI_Irecv(LambdaIn, offset0 * m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG,
334 comm, &requestRight_r);
335 MPI_Isend((LambdaOut + lambdaOut_offset), toSendRight * m_rowwise,
336 MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestRight_s);
344 int v0rank_size, vnrank_size;
345 if (nb_blocks_rank == 1) {
346 v0rank_size = ((idpnew + local_V_size_new - 1) % nrow + 1)
347 - idpnew % nrow + offset0 + offsetn;
351 tpltzblocks[idv0].idv + nnew[idv0] - idpnew % nrow + offset0;
352 vnrank_size = ((idpnew + local_V_size_new - 1) % nrow + 1)
353 - tpltzblocks[idvn].idv + offsetn;
359 if (flag_blockingcomm != 1) {
362 MPI_Wait(&requestLeft_r, &status);
363 MPI_Wait(&requestLeft_s, &status);
364 MPI_Wait(&requestRight_r, &status);
365 MPI_Wait(&requestRight_s, &status);
388 fftw_complex *V_fft, *T_fft;
390 fftw_plan plan_f, plan_b;
409 for (iblock = idv0; iblock < idv0 + nb_blocks_rank; iblock++) {
410 id = iblock % nb_blocks_local;
418 if (iblock == idv0) {
420 fprintf(file,
"[%d] First block...\n", rank);
422 vblock_size = v0rank_size;
423 id0block = (offset_id0 - offset0);
425 V1block = (
double *) calloc(vblock_size * m_rowwise,
428 for (j = 0; j < m_rowwise; j++) {
429 #pragma omp parallel for
430 for (i = 0; i < offset0; i++)
431 V1block[i + j * vblock_size] =
432 LambdaIn[i + j * offset0];
441 int currentsize_middlepart =
442 min(vblock_size - offset0, local_V_size_new);
444 for (j = 0; j < m_rowwise; j++) {
445 #pragma omp parallel for
446 for (i = 0; i < currentsize_middlepart; i++)
447 V1block[offset0 + i + j * vblock_size] =
448 (*V)[i + vShft + j * n_rowwise];
451 if (nb_blocks_rank == 1) {
452 for (j = 0; j < m_rowwise; j++) {
453 #pragma omp parallel for
454 for (i = 0; i < offsetn; i++) {
455 V1block[vblock_size - offsetn + i
457 LambdaIn[i + lambdaIn_offset + j * offsetn];
464 tpltz_init(vblock_size, tpltzblocks[
id].lambda, &nfft,
465 &blocksize, &T_fft, (tpltzblocks[
id].T_block),
466 &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
471 "[%d] Before stmm_main call : nfft = %d, blocksize "
473 rank, nfft, blocksize);
474 stmm_main(&V1block, vblock_size, m_rowwise, 0,
475 m_rowwise * vblock_size, (tpltzblocks[
id].T_block),
476 T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f,
477 plan_b, blocksize, nfft, flag_stgy);
482 int currentsize = min(vblock_size - offset0, local_V_size_new);
483 for (j = 0; j < m_rowwise; j++) {
484 #pragma omp parallel for
485 for (i = 0; i < currentsize; i++)
486 (*V)[vShft + i + j * n_rowwise] =
487 V1block[offset0 + i + j * vblock_size];
497 else if (iblock != idv0 && iblock != idv0 + nb_blocks_rank - 1) {
501 fprintf(file,
"[%d] generic block...\n", rank);
503 vblock_size = nnew[id];
506 V1block = (
double *) calloc(vblock_size * m_rowwise,
509 idv1 = (tpltzblocks[id].idv) - idp % nrow - vShft + offset0
510 + nrow * ((iblock / nb_blocks_local));
512 idv2 = (tpltzblocks[id].idv) - (idpnew) % nrow + vShft
513 + nrow * ((iblock / nb_blocks_local));
515 for (j = 0; j < m_rowwise; j++) {
516 #pragma omp parallel for
517 for (i = 0; i < vblock_size; i++)
518 V1block[i + j * vblock_size] =
519 (*V)[i + idv2 + j * n_rowwise];
524 tpltz_init(nnew[
id], tpltzblocks[
id].lambda, &nfft, &blocksize,
525 &T_fft, (tpltzblocks[
id].T_block), &V_fft, &V_rfft,
526 &plan_f, &plan_b, flag_stgy);
531 "[%d] Before stmm_main call : nfft = %d, blocksize "
533 rank, nfft, blocksize);
534 stmm_main(&V1block, vblock_size, m_rowwise, 0,
535 m_rowwise * vblock_size, (tpltzblocks[
id].T_block),
536 T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f,
537 plan_b, blocksize, nfft, flag_stgy);
543 for (j = 0; j < m_rowwise; j++) {
544 #pragma omp parallel for
545 for (i = 0; i < vblock_size; i++) {
546 (*V)[i + idv2 + j * n_rowwise] =
547 V1block[i + j * vblock_size];
559 else if (iblock == idv0 + nb_blocks_rank - 1 && iblock != idv0) {
561 fprintf(file,
"[%d] last block...\n", rank);
563 vblock_size = vnrank_size;
566 V1block = (
double *) calloc(vblock_size * m_rowwise,
569 idv1 = (tpltzblocks[id].idv) - idp % nrow - vShft + offset0
570 + nrow * ((iblock / nb_blocks_local));
571 idv2 = (tpltzblocks[id].idv) - (idpnew) % nrow + vShft
572 + nrow * ((iblock / nb_blocks_local));
575 for (j = 0; j < m_rowwise; j++) {
576 #pragma omp parallel for
577 for (i = 0; i < vblock_size - offsetn; i++)
578 V1block[i + j * vblock_size] =
579 (*V)[i + idv2 + j * n_rowwise];
583 for (j = 0; j < m_rowwise; j++) {
584 #pragma omp parallel for
585 for (i = 0; i < offsetn; i++)
586 V1block[vblock_size - offsetn + i + j * vblock_size] =
587 LambdaIn[i + lambdaIn_offset + j * offsetn];
592 tpltz_init(vblock_size, tpltzblocks[
id].lambda, &nfft,
593 &blocksize, &T_fft, (tpltzblocks[
id].T_block),
594 &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
599 "[%d] Before stmm_main call : nfft = %d, blocksize "
601 rank, nfft, blocksize);
603 stmm_main(&V1block, vblock_size, m_rowwise, 0,
604 vblock_size * m_rowwise, (tpltzblocks[
id].T_block),
605 T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f,
606 plan_b, blocksize, nfft, flag_stgy);
610 for (j = 0; j < m_rowwise; j++) {
611 #pragma omp parallel for
612 for (i = 0; i < vnrank_size - offsetn; i++) {
613 (*V)[idv2 + i + j * n_rowwise] =
614 V1block[i + j * vblock_size];
652 int get_overlapping_blocks_params(
int nbloc, Block *tpltzblocks,
653 int local_V_size, int64_t nrow, int64_t idp,
654 int64_t *idpnew,
int *local_V_size_new,
655 int *nnew,
int *ifirstBlock,
657 int ib, nblockOK = 0, nfullcol_data;
658 int64_t firstrow, lastrow;
663 nfullcol_data = max(0, (local_V_size - (nrow - idp % nrow) % nrow
664 - (idp + local_V_size) % nrow)
667 if (nfullcol_data > 0) {
669 for (ib = 0; ib < nbloc; ib++) {
670 if (tpltzblocks[ib].idv < nrow) {
671 nnew[ib] = min(tpltzblocks[ib].n,
672 nrow - tpltzblocks[ib].idv);
680 firstrow = idp % nrow;
681 lastrow = (idp + local_V_size - 1) % nrow;
683 if (firstrow < lastrow) {
685 for (ib = 0; ib < nbloc; ib++) {
686 if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow)
687 && (tpltzblocks[ib].idv < lastrow + 1)) {
689 min(tpltzblocks[ib].n,
690 nrow - tpltzblocks[ib].idv);
698 for (ib = 0; ib < nbloc; ib++) {
699 if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow)
700 && (tpltzblocks[ib].idv
703 min(tpltzblocks[ib].n,
704 nrow - tpltzblocks[ib].idv);
709 if ((tpltzblocks[ib].idv < lastrow + 1)
710 && (tpltzblocks[ib].idv + tpltzblocks[ib].n
713 min(tpltzblocks[ib].n,
714 nrow - tpltzblocks[ib].idv);
724 if (nblockOK == 0)
return (0);
732 for (*ifirstBlock = -1; *ifirstBlock == -1;) {
733 for (ib = 0; ib < nbloc; ib++) {
734 if (nnew[ib] != 0 && idptmp % nrow < tpltzblocks[ib].idv + nnew[ib])
738 if (ib < nbloc && tpltzblocks[ib].idv <= idptmp % nrow) {
741 }
else if (ib < nbloc && tpltzblocks[ib].idv > idptmp % nrow) {
746 int idvfirstcolumn = idptmp / nrow;
747 *idpnew = tpltzblocks[ib].idv + idvfirstcolumn * nrow;
749 idptmp += nrow - idptmp % nrow;
757 idptmp = idp + local_V_size - 1;
759 for (*ilastBlock = -1; *ilastBlock == -1;) {
760 for (ib = nbloc - 1; ib >= 0; ib--) {
761 if (nnew[ib] != 0 && tpltzblocks[ib].idv <= idptmp % nrow)
break;
765 if (ib >= 0 && idptmp % nrow < tpltzblocks[ib].idv + nnew[ib]) {
767 *local_V_size_new = local_V_size - (*idpnew) + idp;
768 }
else if (ib >= 0 && tpltzblocks[ib].idv + nnew[ib] <= idptmp % nrow) {
778 int idvlastcolumn = idptmp / nrow;
779 *local_V_size_new = tpltzblocks[ib].idv + nnew[ib]
780 + idvlastcolumn * nrow - (*idpnew);
783 idptmp = idptmp - (idptmp % nrow)
int print_error_message(int error_number, char const *file, int line)
Prints error message corresponding to an error number.
int tpltz_init(int n, int lambda, int *nfft, int *blocksize, fftw_complex **T_fft, double *T, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b, Flag flag_stgy)
int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft, double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b)
int stmm_main(double **V, int n, int m, int id0, int l, double *T, fftw_complex *T_fft, int lambda, fftw_complex *V_fft, double *V_rfft, fftw_plan plan_f, fftw_plan plan_b, int blocksize, int nfft, Flag flag_stgy)
int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int64_t idp, int local_V_size, Flag flag_stgy, MPI_Comm comm)