67 __typeof__(a) _a = (a); \
68 __typeof__(b) _b = (b); \
74 __typeof__(a) _a = (a); \
75 __typeof__(b) _b = (b); \
119 int mpi_gstbmm(
double **V,
int nrow,
int m,
int m_rowwise, Block *tpltzblocks,
120 int nb_blocks_local,
int nb_blocks_all,
int id0p,
121 int local_V_size, int64_t *id0gap,
int *lgap,
int ngap,
122 Flag flag_stgy, MPI_Comm comm) {
128 MPI_Comm_rank(comm, &rank);
129 MPI_Comm_size(comm, &size);
133 int flag_skip_build_gappy_blocks = flag_stgy.flag_skip_build_gappy_blocks;
140 reset_gaps(V, id0p, local_V_size, m, nrow, m_rowwise, id0gap, lgap, ngap);
145 int nb_blockgappy_max;
148 Block *tpltzblocks_gappy;
156 fprintf(file,
"[%d] flag_skip_build_gappy_blocks=%d\n", rank,
157 flag_skip_build_gappy_blocks);
159 if (flag_skip_build_gappy_blocks == 1) {
164 mpi_stbmm(V, nrow, m, m_rowwise, tpltzblocks, nb_blocks_local,
165 nb_blocks_all, id0p, local_V_size, flag_stgy, MPI_COMM_WORLD);
168 for (Tsize = i = 0; i < nb_blocks_local; i++)
169 Tsize += tpltzblocks[i].lambda;
171 for (i = 0; i < nb_blocks_local; i++) {
172 if (tpltzblocks[i].lambda > lambdamax)
173 lambdamax = tpltzblocks[i].lambda;
177 nb_blockgappy_max = nb_blocks_local + ngap;
178 Tgappysize_max = Tsize + lambdamax * ngap;
181 tpltzblocks_gappy = (Block *) calloc(nb_blockgappy_max,
sizeof(Block));
190 int flag_param_distmin_fixed = flag_stgy.flag_param_distmin_fixed;
192 id0gap, lgap, ngap, tpltzblocks_gappy,
193 &nb_blocks_gappy, flag_param_distmin_fixed);
196 fprintf(file,
"[%d] nb_blocks_gappy=%d\n", rank, nb_blocks_gappy);
197 for (i = 0; i < nb_blocks_gappy; i++)
198 fprintf(file,
"[%d] idvgappy[%d]=%ld ; ngappy[%d]=%d\n", rank,
199 i, tpltzblocks_gappy[i].idv, i, tpltzblocks_gappy[i].n);
206 mpi_stbmm(V, nrow, m, m_rowwise, tpltzblocks_gappy, nb_blocks_local,
207 nb_blocks_all, id0p, local_V_size, flag_stgy, MPI_COMM_WORLD);
214 reset_gaps(V, id0p, local_V_size, m, nrow, m_rowwise, id0gap, lgap, ngap);
227 int reset_gaps(
double **V,
int id0,
int local_V_size,
int m,
int nrow,
228 int m_rowwise,
const int64_t *id0gap,
const int *lgap,
232 for (j = 0; j < m; j++) {
234 #pragma omp parallel for private(i) schedule(dynamic, 1)
235 for (k = 0; k < ngap; k++)
236 for (i = 0; i < lgap[k]; i++)
237 if (id0gap[k] + i + j * nrow >= id0
238 && id0gap[k] + i + j * nrow < id0 + local_V_size) {
239 for (l = 0; l < m_rowwise; l++)
240 (*V)[id0gap[k] + i + j * nrow - id0
241 + l * local_V_size] = 0.;
275 int nb_blocks_all,
const int64_t *id0gap,
276 const int *lgap,
int ngap, Block *tpltzblocks_gappy,
277 int *nb_blocks_gappy_final,
278 int flag_param_distmin_fixed) {
283 int igapfirstblock, igaplastblock;
285 int param_distmin = 0;
286 if (flag_param_distmin_fixed != 0) param_distmin = flag_param_distmin_fixed;
290 int igaplastblock_prev = -1;
291 int lambdaShftgappy = 0;
295 int flag_igapfirstinside, flag_igaplastinside;
296 int nbloc = nb_blocks_local;
297 int nblocks_gappy = 0;
299 int idvtmp_firstblock;
301 int nb_blockgappy_max = nb_blocks_local + ngap;
314 for (k = 0; k < ngap; k++) {
317 for (igapfirstblock = -1; igapfirstblock == -1;) {
320 for (ib = 0; ib < nbloc; ib++) {
321 if (tpltzblocks[ib].n != 0
322 && idtmp % nrow < tpltzblocks[ib].idv + tpltzblocks[ib].n)
326 if (ib < nbloc && tpltzblocks[ib].idv <= idtmp) {
328 flag_igapfirstinside = 1;
329 }
else if (ib < nbloc && tpltzblocks[ib].idv > idtmp) {
331 flag_igapfirstinside = 0;
334 flag_igapfirstinside = 0;
339 for (igaplastblock = -1; igaplastblock == -1;) {
340 idtmp = id0gap[k] + lgap[k] - 1;
342 for (ib = nbloc - 1; ib >= 0; ib--) {
343 if (tpltzblocks[ib].n != 0 && tpltzblocks[ib].idv <= idtmp)
347 if (ib >= 0 && idtmp < tpltzblocks[ib].idv + tpltzblocks[ib].n) {
349 flag_igaplastinside = 1;
351 && tpltzblocks[ib].idv + tpltzblocks[ib].n <= idtmp) {
353 flag_igaplastinside = 0;
356 flag_igaplastinside = 0;
360 if (igapfirstblock == igaplastblock)
361 distcorr_min = tpltzblocks[igapfirstblock].lambda
368 if (lgap[k] > max(distcorr_min, param_distmin) && igapfirstblock != -2
369 && igaplastblock != -2) {
371 idvtmp_firstblock = max(tpltzblocks[igapfirstblock].idv,
372 id0gap[k_prev] + lgap[k_prev]);
375 if (igapfirstblock != igaplastblock) {
379 }
else if (id0gap[k] - idvtmp_firstblock
380 >= tpltzblocks[igapfirstblock].lambda
381 && tpltzblocks[igaplastblock].idv
382 + tpltzblocks[igaplastblock].n
383 - (id0gap[k] + lgap[k])
384 >= tpltzblocks[igaplastblock].lambda) {
387 }
else if (igapfirstblock == igaplastblock) {
389 int ngappyleft_tmp = id0gap[k] - idvtmp_firstblock;
390 int leftadd = max(0, tpltzblocks[igapfirstblock].lambda
392 int ngappyright_tmp = tpltzblocks[igaplastblock].idv
393 + tpltzblocks[igaplastblock].n
394 - (id0gap[k] + lgap[k]);
395 int rightadd = max(0, tpltzblocks[igapfirstblock].lambda
397 int restgap = lgap[k] - (leftadd + rightadd);
400 flag_gapok = (restgap >= max(0, param_distmin));
406 if (flag_gapok == 1) {
409 for (
id = igaplastblock_prev + 1;
id < igapfirstblock;
id++) {
411 tpltzblocks_gappy[nblocks_gappy].T_block =
412 tpltzblocks[id].T_block;
413 tpltzblocks_gappy[nblocks_gappy].lambda =
414 tpltzblocks[id].lambda;
415 tpltzblocks_gappy[nblocks_gappy].n = tpltzblocks[id].n;
416 tpltzblocks_gappy[nblocks_gappy].idv = tpltzblocks[id].idv;
418 nblocks_gappy = nblocks_gappy + 1;
423 if (igaplastblock_prev == igapfirstblock && k != 0) {
424 nblocks_gappy = nblocks_gappy - 1;
431 if (flag_igapfirstinside == 1
432 && (id0gap[k] - idvtmp_firstblock)
436 tpltzblocks_gappy[nblocks_gappy].T_block =
437 tpltzblocks[igapfirstblock].T_block;
438 tpltzblocks_gappy[nblocks_gappy].lambda =
439 tpltzblocks[id].lambda;
440 tpltzblocks_gappy[nblocks_gappy].n =
441 id0gap[k] - idvtmp_firstblock;
442 tpltzblocks_gappy[nblocks_gappy].n =
443 max(tpltzblocks_gappy[nblocks_gappy].n,
444 tpltzblocks[igapfirstblock].lambda);
446 tpltzblocks_gappy[nblocks_gappy].idv = idvtmp_firstblock;
447 nblocks_gappy = nblocks_gappy + 1;
451 if (flag_igaplastinside == 1
452 && (tpltzblocks[igaplastblock].idv
453 + tpltzblocks[igaplastblock].n - (id0gap[k] + lgap[k]))
457 tpltzblocks_gappy[nblocks_gappy].T_block =
458 tpltzblocks[igaplastblock].T_block;
459 tpltzblocks_gappy[nblocks_gappy].lambda =
460 tpltzblocks[id].lambda;
461 tpltzblocks_gappy[nblocks_gappy].n =
462 tpltzblocks[igaplastblock].idv
463 + tpltzblocks[igaplastblock].n
464 - (id0gap[k] + lgap[k]);
466 0, tpltzblocks[igapfirstblock].lambda
467 - tpltzblocks_gappy[nblocks_gappy].n);
469 tpltzblocks_gappy[nblocks_gappy].n =
470 max(tpltzblocks_gappy[nblocks_gappy].n,
471 tpltzblocks[igaplastblock].lambda);
473 tpltzblocks_gappy[nblocks_gappy].idv =
474 id0gap[k] + lgap[k] - rightadd0;
476 nblocks_gappy = nblocks_gappy + 1;
478 lambdaShftgappy + tpltzblocks[igaplastblock].lambda;
481 igaplastblock_prev = igaplastblock;
489 for (
id = igaplastblock_prev + 1;
id < nb_blocks_local;
id++) {
491 tpltzblocks_gappy[nblocks_gappy].T_block = tpltzblocks[id].T_block;
492 tpltzblocks_gappy[nblocks_gappy].lambda = tpltzblocks[id].lambda;
493 tpltzblocks_gappy[nblocks_gappy].n = tpltzblocks[id].n;
494 tpltzblocks_gappy[nblocks_gappy].idv = tpltzblocks[id].idv;
495 nblocks_gappy = nblocks_gappy + 1;
496 lambdaShftgappy = lambdaShftgappy + tpltzblocks[id].lambda;
499 *nb_blocks_gappy_final = nblocks_gappy;
int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int64_t idp, int local_V_size, Flag flag_stgy, MPI_Comm comm)
int mpi_gstbmm(double **V, int nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int id0p, int local_V_size, int64_t *id0gap, int *lgap, int ngap, Flag flag_stgy, MPI_Comm comm)
int reset_gaps(double **V, int id0, int local_V_size, int m, int nrow, int m_rowwise, const int64_t *id0gap, const int *lgap, int ngap)
Set the data to zeros at the gaps location.
int build_gappy_blocks(int nrow, int m, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, const int64_t *id0gap, const int *lgap, int ngap, Block *tpltzblocks_gappy, int *nb_blocks_gappy_final, int flag_param_distmin_fixed)