MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
butterfly_extra.c
21 #ifdef W_MPI
22 #include <mpi.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 
51 int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS,
52  int **com_indices, int *com_count, int steps,
53  MPI_Comm comm) {
54 
55  int i, k, p2k;
56  int rank, size, rk, sk;
57  int tag;
58  MPI_Request s_request, r_request;
59  int nbuf, *buf;
60  int **I, *nI;
61  int **J, *nJ;
62 
63  MPI_Comm_size(comm, &size);
64  MPI_Comm_rank(comm, &rank);
65 
66  I = (int **) malloc(steps * sizeof(int *));
67  nI = (int *) malloc(steps * sizeof(int));
68  tag = 0;
69  p2k = size / 2;
70 
71  for (k = 0; k < steps;
72  k++) { // butterfly first pass: bottom up (fill arrays nI and I)
73  sk = (rank + size - p2k) % size;
74  rk = (rank + p2k) % size;
75 
76  if (k == 0) { // S^0 := A
77  nS[k] = count;
78  S[k] = (int *) malloc(nS[k] * sizeof(int));
79  memcpy(S[k], indices, nS[k] * sizeof(int));
80  } else { // S^k := S^{k-1} \cup R^{k-1}
81  nS[k] = card_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k]);
82  S[k] = (int *) malloc(nS[k] * sizeof(int));
83  set_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k], S[k]);
84  }
85 
86  MPI_Irecv(&nI[steps - k - 1], 1, MPI_INT, rk, tag, comm,
87  &r_request); // receive number of indices
88  MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm,
89  &s_request); // send number of indices
90  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
91  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
92 
93  I[steps - k - 1] = (int *) malloc(nI[steps - k - 1] * sizeof(int));
94 
95  tag++;
96  MPI_Irecv(I[steps - k - 1], nI[steps - k - 1], MPI_INT, rk, tag, comm,
97  &r_request); // receive indices
98  MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
99  &s_request); // send indices
100  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
101  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
102 
103  p2k /= 2;
104  tag++;
105  }
106 
107  J = (int **) malloc(steps * sizeof(int *));
108  nJ = (int *) malloc(steps * sizeof(int));
109 
110  tag = 0;
111  p2k = 1;
112  for (k = 0; k < steps;
113  k++) { // butterfly second pass: top down (fill arrays nJ and J)
114  free(S[k]);
115  sk = (rank + p2k) % size;
116  rk = (rank + size - p2k) % size;
117  if (k == 0) {
118  nJ[k] = count;
119  J[k] = (int *) malloc(nJ[k] * sizeof(int));
120  memcpy(J[k], indices, nJ[k] * sizeof(int));
121  } else {
122  nJ[k] = card_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1]);
123  J[k] = (int *) malloc(nJ[k] * sizeof(int));
124  set_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1],
125  J[k]); // J^k = R^{k-1} \cup J^{k-1}
126  free(R[k - 1]);
127  }
128  if (k != steps - 1) {
129  MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm, &r_request);
130  MPI_Isend(&nJ[k], 1, MPI_INT, sk, tag, comm, &s_request);
131  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
132  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
133 
134  R[k] = (int *) malloc(nR[k] * sizeof(int));
135  tag++;
136 
137  MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm, &r_request);
138  MPI_Isend(J[k], nJ[k], MPI_INT, sk, tag, comm, &s_request);
139  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
140  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
141  }
142  p2k *= 2;
143  tag++;
144  }
145 
146 
147  tag = 0;
148  p2k = 1;
149  for (k = 0; k < steps; k++) { // butterfly last pass: the send set is
150  // S = I \cap J, so send S and receive R
151  sk = (rank + p2k) % size;
152  rk = (rank + size - p2k) % size;
153 
154  nS[k] = card_and(I[k], nI[k], J[k], nJ[k]);
155  S[k] = (int *) malloc(nS[k] * sizeof(int));
156  set_and(I[k], nI[k], J[k], nJ[k], S[k]); // S^k=I^k \cap J^k
157 
158  free(I[k]);
159  free(J[k]);
160 
161  MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm,
162  &r_request); // receive size
163  MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm, &s_request); // send size
164  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
165  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
166 
167  R[k] = (int *) malloc(nR[k] * sizeof(int));
168  tag++;
169 
170  MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm,
171  &r_request); // receive indices
172  MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
173  &s_request); // send indices
174  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
175  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
176 
177  p2k *= 2;
178  tag++;
179  }
180 
181  // Now we work locally
182  int **USR, *nUSR, **U, *nU;
183 
184  USR = (int **) malloc(steps * sizeof(int *));
185  nUSR = (int *) malloc(steps * sizeof(int));
186  U = (int **) malloc(steps * sizeof(int *));
187  nU = (int *) malloc(steps * sizeof(int));
188 
189  for (k = 0; k < steps; k++) {
190  nUSR[k] = card_or(S[k], nS[k], R[k], nR[k]);
191  USR[k] = (int *) malloc(nUSR[k] * sizeof(int));
192  set_or(S[k], nS[k], R[k], nR[k], USR[k]);
193  }
194  for (k = 0; k < steps; k++) {
195  if (k == 0) {
196  nU[k] = nUSR[k];
197  U[k] = (int *) malloc(nU[k] * sizeof(int));
198  memcpy(U[k], USR[k], nU[k] * sizeof(int));
199  } else {
200  nU[k] = card_or(U[k - 1], nU[k - 1], USR[k], nUSR[k]);
201  U[k] = (int *) malloc(nU[k] * sizeof(int));
202  set_or(U[k - 1], nU[k - 1], USR[k], nUSR[k], U[k]);
203  }
204  }
205  *com_count = nU[steps - 1];
206  *com_indices = (int *) malloc(*com_count * sizeof(int));
207  memcpy(*com_indices, U[steps - 1], *com_count * sizeof(int));
208  //====================================================================
209 
210  for (k = 0; k < steps; k++) {
211  subset2map(*com_indices, *com_count, S[k], nS[k]);
212  subset2map(*com_indices, *com_count, R[k], nR[k]);
213  }
214  for (k = 0; k < steps; k++) { free(USR[k]); free(U[k]); } // free local work sets
215  free(USR); free(nUSR); free(U); free(nU);
216  free(I); free(nI); free(J); free(nJ);
217  return 0;
218 }
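
The caller is expected to pre-allocate the per-step tables R, nR, S and nS and to pass steps = log2(number of processes); the butterfly pattern assumes a power-of-two communicator size and an ordered list of the locally used global indices. The driver below is a hypothetical sketch (setup_butterfly is not part of MIDAPACK) of one way to wire this up:

#include <math.h>
#include <mpi.h>
#include <stdlib.h>

/* prototype as defined in butterfly_extra.c */
int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS,
                   int **com_indices, int *com_count, int steps, MPI_Comm comm);

/* Hypothetical driver, not part of this file: allocate the per-step tables
   and build the communication scheme for a sorted local index list.        */
int setup_butterfly(int *indices, int count, MPI_Comm comm, int ***R, int **nR,
                    int ***S, int **nS, int **com_indices, int *com_count,
                    int *steps) {
    int size;
    MPI_Comm_size(comm, &size);
    *steps = (int) log2((double) size);   /* assumes size is a power of two */

    *R  = (int **) malloc(*steps * sizeof(int *));
    *S  = (int **) malloc(*steps * sizeof(int *));
    *nR = (int *) malloc(*steps * sizeof(int));
    *nS = (int *) malloc(*steps * sizeof(int));

    return butterfly_init(indices, count, *R, *nR, *S, *nS, com_indices,
                          com_count, *steps, comm);
}
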
219 
220 
235 double butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS,
236  int nSmax, double *val, int steps, MPI_Comm comm) {
237  double st, t;
238  t = 0.0;
239  int k, p2k, tag;
240  int rank, size, rk, sk;
241  MPI_Request s_request, r_request;
242  double *sbuf, *rbuf;
243 
244  MPI_Comm_size(comm, &size);
245  MPI_Comm_rank(comm, &rank);
246 
247  sbuf = (double *) malloc(nSmax * sizeof(double));
248  rbuf = (double *) malloc(nRmax * sizeof(double));
249  tag = 0;
250  p2k = 1;
251 
252  for (k = 0; k < steps; k++) {
253  sk = (rank + p2k) % size;
254  rk = (rank + size - p2k) % size;
255 
256  m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
257 
258  st = MPI_Wtime();
259  MPI_Irecv(rbuf, nR[k], MPI_DOUBLE, rk, tag, comm, &r_request);
260  MPI_Isend(sbuf, nS[k], MPI_DOUBLE, sk, tag, comm, &s_request);
261 
262  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
263  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
264 
265  t = t + MPI_Wtime() - st;
266 
267  s2m_sum(val, rbuf, R[k], nR[k]); // sum receive buffer into values
268 
269  p2k *= 2;
270  tag++;
271  }
272  free(sbuf);
273  free(rbuf);
274  return t;
275 }
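
Since butterfly_init remaps every S[k] and R[k] into positions of com_indices (via subset2map), the value array handed to butterfly_reduce must hold one double per entry of com_indices. A minimal sketch, assuming R, nR, S, nS, com_indices, com_count and steps come from a preceding butterfly_init call, and leaving the filling of the local contributions schematic:

/* Sketch: sum shared values across processes with the butterfly scheme. */
int nSmax = 0, nRmax = 0;
for (int k = 0; k < steps; k++) {   /* largest buffers needed by any step */
    if (nS[k] > nSmax) nSmax = nS[k];
    if (nR[k] > nRmax) nRmax = nR[k];
}

double *val = (double *) calloc(com_count, sizeof(double));
/* ... write this process's contributions into val, laid out as com_indices ... */

double t_comm = butterfly_reduce(R, nR, nRmax, S, nS, nSmax, val, steps, comm);
/* val now accumulates the contributions exchanged through the butterfly;
   the return value is the time spent in the MPI exchanges.               */
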
276 
277 int truebutterfly_init(int *indices, int count, int **R, int *nR, int **S,
278  int *nS, int **com_indices, int *com_count, int steps,
279  MPI_Comm comm) {
280 
281  int i, k, p2k, p2k1;
282  int rank, size, rk, sk;
283  int tag;
284  MPI_Request s_request, r_request;
285  int nbuf, *buf;
286  int **I, *nI;
287  int **J, *nJ;
288 
289  MPI_Comm_size(comm, &size);
290  MPI_Comm_rank(comm, &rank);
291 
292  I = (int **) malloc(steps * sizeof(int *));
293  nI = (int *) malloc(steps * sizeof(int));
294  tag = 0;
295  p2k = size / 2;
296  p2k1 = 2 * p2k;
297 
298  for (k = 0; k < steps;
299  k++) { // butterfly first pass: bottom up (fill arrays nI and I)
300 
301  if (rank % p2k1 < p2k) sk = rk = rank + p2k;
302  else
303  sk = rk = rank - p2k;
304 
305  if (k == 0) { // S^0 := A
306  nS[k] = count;
307  S[k] = (int *) malloc(nS[k] * sizeof(int));
308  memcpy(S[k], indices, nS[k] * sizeof(int));
309  } else { // S^k := S^{k-1} \cup R^{k-1}
310  nS[k] = card_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k]);
311  S[k] = (int *) malloc(nS[k] * sizeof(int));
312  set_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k], S[k]);
313  }
314 
315  MPI_Irecv(&nI[steps - k - 1], 1, MPI_INT, rk, tag, comm,
316  &r_request); // receive number of indices
317  MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm,
318  &s_request); // send number of indices
319  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
320  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
321 
322  I[steps - k - 1] = (int *) malloc(nI[steps - k - 1] * sizeof(int));
323 
324  tag++;
325  MPI_Irecv(I[steps - k - 1], nI[steps - k - 1], MPI_INT, rk, tag, comm,
326  &r_request); // receive indices
327  MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
328  &s_request); // send indices
329  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
330  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
331 
332  p2k /= 2;
333  p2k1 /= 2;
334  tag++;
335  }
336 
337  J = (int **) malloc(steps * sizeof(int *));
338  nJ = (int *) malloc(steps * sizeof(int));
339 
340  tag = 0;
341  p2k = 1;
342  p2k1 = p2k * 2;
343  for (k = 0; k < steps;
344  k++) { // butterfly second pass: top down (fill arrays nJ and J)
345  free(S[k]);
346 
347  if (rank % p2k1 < p2k) sk = rk = rank + p2k;
348  else
349  sk = rk = rank - p2k;
350 
351  if (k == 0) {
352  nJ[k] = count;
353  J[k] = (int *) malloc(nJ[k] * sizeof(int));
354  memcpy(J[k], indices, nJ[k] * sizeof(int));
355  } else {
356  nJ[k] = card_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1]);
357  J[k] = (int *) malloc(nJ[k] * sizeof(int));
358  set_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1],
359  J[k]); // J^k = R^{k-1} \cup J^{k-1}
360  free(R[k - 1]);
361  }
362  if (k != steps - 1) {
363  MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm, &r_request);
364  MPI_Isend(&nJ[k], 1, MPI_INT, sk, tag, comm, &s_request);
365  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
366  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
367 
368  R[k] = (int *) malloc(nR[k] * sizeof(int));
369  tag++;
370 
371  MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm, &r_request);
372  MPI_Isend(J[k], nJ[k], MPI_INT, sk, tag, comm, &s_request);
373  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
374  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
375  }
376  p2k *= 2;
377  p2k1 *= 2;
378  tag++;
379  }
380 
381 
382  tag = 0;
383  p2k = 1;
384  p2k1 = p2k * 2;
385  for (k = 0; k < steps; k++) { // butterfly last pass: the send set is
386  // S = I \cap J, so send S and receive R
387 
388  if (rank % p2k1 < p2k) sk = rk = rank + p2k;
389  else
390  sk = rk = rank - p2k;
391 
392  nS[k] = card_and(I[k], nI[k], J[k], nJ[k]);
393  S[k] = (int *) malloc(nS[k] * sizeof(int));
394  set_and(I[k], nI[k], J[k], nJ[k], S[k]); // S^k=I^k \cap J^k
395 
396  free(I[k]);
397  free(J[k]);
398 
399  MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm,
400  &r_request); // receive size
401  MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm, &s_request); // send size
402  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
403  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
404 
405  R[k] = (int *) malloc(nR[k] * sizeof(int));
406  tag++;
407 
408  MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm,
409  &r_request); // receive indices
410  MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
411  &s_request); // send indices
412  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
413  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
414 
415  p2k *= 2;
416  p2k1 *= 2;
417  tag++;
418  }
419 
420  // Now we work locally
421  int **USR, *nUSR, **U, *nU;
422 
423  USR = (int **) malloc(steps * sizeof(int *));
424  nUSR = (int *) malloc(steps * sizeof(int));
425  U = (int **) malloc(steps * sizeof(int *));
426  nU = (int *) malloc(steps * sizeof(int));
427 
428  for (k = 0; k < steps; k++) {
429  nUSR[k] = card_or(S[k], nS[k], R[k], nR[k]);
430  USR[k] = (int *) malloc(nUSR[k] * sizeof(int));
431  set_or(S[k], nS[k], R[k], nR[k], USR[k]);
432  }
433  for (k = 0; k < steps; k++) {
434  if (k == 0) {
435  nU[k] = nUSR[k];
436  U[k] = (int *) malloc(nU[k] * sizeof(int));
437  memcpy(U[k], USR[k], nU[k] * sizeof(int));
438  } else {
439  nU[k] = card_or(U[k - 1], nU[k - 1], USR[k], nUSR[k]);
440  U[k] = (int *) malloc(nU[k] * sizeof(int));
441  set_or(U[k - 1], nU[k - 1], USR[k], nUSR[k], U[k]);
442  }
443  }
444  *com_count = nU[steps - 1];
445  *com_indices = (int *) malloc(*com_count * sizeof(int));
446  memcpy(*com_indices, U[steps - 1], *com_count * sizeof(int));
447  //====================================================================
448 
449  for (k = 0; k < steps; k++) {
450  subset2map(*com_indices, *com_count, S[k], nS[k]);
451  subset2map(*com_indices, *com_count, R[k], nR[k]);
452  }
453  for (k = 0; k < steps; k++) { free(USR[k]); free(U[k]); } // free local work sets
454  free(USR); free(nUSR); free(U); free(nU);
455  free(I); free(nI); free(J); free(nJ);
456  return 0;
457 }
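
The only difference with respect to butterfly_init is the choice of communication partner: instead of cyclic shifts (rank ± p2k) mod size, each stage pairs the two halves of blocks of width p2k1 = 2*p2k, i.e. the classical recursive-doubling butterfly. The helper below is an illustration of that pairing rule, not library code:

/* Partner of `rank` when blocks of width 2*p2k are split in half and the
   halves exchange with each other (requires a power-of-two communicator). */
int truebutterfly_partner(int rank, int p2k) {
    int p2k1 = 2 * p2k;
    return (rank % p2k1 < p2k) ? rank + p2k : rank - p2k;
}
/* Example with 8 ranks and p2k = 1: pairs (0,1) (2,3) (4,5) (6,7);
   with p2k = 2: pairs (0,2) (1,3) (4,6) (5,7);
   with p2k = 4: pairs (0,4) (1,5) (2,6) (3,7).                     */
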
458 
459 
474 double truebutterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS,
475  int nSmax, double *val, int steps, MPI_Comm comm) {
476  double st, t;
477  t = 0.0;
478  int k, p2k, p2k1, tag;
479  int rank, size, rk, sk;
480  MPI_Status status;
481  MPI_Request s_request, r_request;
482  double *sbuf, *rbuf;
483 
484  MPI_Comm_size(comm, &size);
485  MPI_Comm_rank(comm, &rank);
486 
487  sbuf = (double *) malloc(nSmax * sizeof(double));
488  rbuf = (double *) malloc(nRmax * sizeof(double));
489  tag = 0;
490  p2k = 1;
491  p2k1 = p2k * 2;
492 
493  for (k = 0; k < steps; k++) {
494 
495  if (rank % p2k1 < p2k) {
496 
497  sk = rk = rank + p2k;
498 
499  st = MPI_Wtime();
500 
501  // MPI_Sendrecv(sbuf, nS[k], MPI_DOUBLE, sk, tag, rbuf, nR[k],
502  // MPI_DOUBLE, rk, tag, comm, &status);
503 
504  m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
505  MPI_Isend(sbuf, nS[k], MPI_DOUBLE, sk, tag, comm, &s_request);
506  MPI_Irecv(rbuf, nR[k], MPI_DOUBLE, rk, tag, comm, &r_request);
507 
508  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
509  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
510  s2m_sum(val, rbuf, R[k], nR[k]); // sum receive buffer into values
511 
512 
513  t = t + MPI_Wtime() - st;
514 
515  } else {
516 
517  sk = rk = rank - p2k;
518 
519  st = MPI_Wtime();
520 
521  MPI_Irecv(rbuf, nR[k], MPI_DOUBLE, rk, tag, comm, &r_request);
522  m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
523  MPI_Isend(sbuf, nS[k], MPI_DOUBLE, sk, tag, comm, &s_request);
524 
525  MPI_Wait(&r_request, MPI_STATUS_IGNORE);
526  s2m_sum(val, rbuf, R[k], nR[k]); // sum receive buffer into values
527 
528  MPI_Wait(&s_request, MPI_STATUS_IGNORE);
529 
530  // MPI_Sendrecv(sbuf, nS[k], MPI_DOUBLE, sk, tag, rbuf, nR[k],
531  // MPI_DOUBLE, rk, tag, comm, &status);
532 
533  t = t + MPI_Wtime() - st;
534  }
535 
536  p2k *= 2;
537  p2k1 *= 2;
538  tag++;
539  }
540  free(sbuf);
541  free(rbuf);
542  return t;
543 }
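
Each iteration performs a paired exchange; the commented-out MPI_Sendrecv calls show the blocking alternative. For reference, a single exchange step could be written as the sketch below (exchange_step is a hypothetical name, and m2s/s2m_sum are the helpers from alm.c):

/* Sketch of a single blocking exchange step, equivalent to the nonblocking
   pair used in the loops above (same arguments as one iteration).         */
static void exchange_step(double *val, double *sbuf, double *rbuf,
                          int *S_k, int nS_k, int *R_k, int nR_k,
                          int sk, int rk, int tag, MPI_Comm comm) {
    m2s(val, sbuf, S_k, nS_k);               /* pack the values to send      */
    MPI_Sendrecv(sbuf, nS_k, MPI_DOUBLE, sk, tag,
                 rbuf, nR_k, MPI_DOUBLE, rk, tag, comm, MPI_STATUS_IGNORE);
    s2m_sum(val, rbuf, R_k, nR_k);           /* accumulate what was received */
}
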
544 
545 #endif
void m2s(double *mapval, double *submapval, int *subset, int count)
Definition: alm.c:27
void s2m_sum(double *mapval, double *submapval, int *subset, int count)
Sum the submap values into the map values array.
Definition: alm.c:54
int set_or(int *A1, int n1, int *A2, int n2, int *A1orA2)
Definition: als.c:138
int set_and(int *A1, int n1, int *A2, int n2, int *A1andA2)
Definition: als.c:186
int card_or(int *A1, int n1, int *A2, int n2)
Definition: als.c:72
int card_and(int *A1, int n1, int *A2, int n2)
Definition: als.c:111
void subset2map(int *A, int nA, int *subA, int nsubA)
Definition: als.c:237
double truebutterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
double butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
int truebutterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int **com_indices, int *com_count, int steps, MPI_Comm comm)
int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int **com_indices, int *com_count, int steps, MPI_Comm comm)
Initialize tables for the butterfly-like communication scheme. This routine sets up the tables needed for the butterfly communication.
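
For reference, the two map/buffer helpers used by the reduce routines behave as in the illustrative re-implementations below; the authoritative definitions are those in alm.c, and the _sketch suffix is only used here to avoid clashing with them:

void m2s_sketch(double *mapval, double *submapval, int *subset, int count) {
    for (int i = 0; i < count; i++)
        submapval[i] = mapval[subset[i]];    /* gather map values into a buffer */
}

void s2m_sum_sketch(double *mapval, double *submapval, int *subset, int count) {
    for (int i = 0; i < count; i++)
        mapval[subset[i]] += submapval[i];   /* add buffer values back into map */
}
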