MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
butterfly.c
#ifdef W_MPI
#include "mapmat.h"
#include <mpi.h>
#include <stdlib.h>
#include <string.h>

int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS,
                   int **com_indices, int *com_count, int steps,
                   MPI_Comm comm) {

    int          i, k, p2k;
    int          rank, size, rk, sk;
    int          tag;
    MPI_Request  s_request, r_request;
    int          nbuf, *buf;
    int        **I, *nI;
    int        **J, *nJ;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    I   = (int **) malloc(steps * sizeof(int *));
    nI  = (int *) malloc(steps * sizeof(int));
    tag = 0;
    p2k = size / 2;

    // butterfly first pass: bottom up (fill the arrays nI and I)
    for (k = 0; k < steps; k++) {
        sk = (rank + size - p2k) % size;
        rk = (rank + p2k) % size;

        if (k == 0) { // S^0 := A
            nS[k] = count;
            S[k]  = (int *) malloc(nS[k] * sizeof(int));
            memcpy(S[k], indices, nS[k] * sizeof(int));
        } else { // S^k := S^{k-1} \cup R^{k-1}
            nS[k] = card_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k]);
            S[k]  = (int *) malloc(nS[k] * sizeof(int));
            set_or(S[k - 1], nS[k - 1], I[steps - k], nI[steps - k], S[k]);
        }

        MPI_Irecv(&nI[steps - k - 1], 1, MPI_INT, rk, tag, comm,
                  &r_request); // receive number of indices
        MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm,
                  &s_request); // send number of indices
        MPI_Wait(&r_request, MPI_STATUS_IGNORE);
        MPI_Wait(&s_request, MPI_STATUS_IGNORE);

        I[steps - k - 1] = (int *) malloc(nI[steps - k - 1] * sizeof(int));

        tag++;
        MPI_Irecv(I[steps - k - 1], nI[steps - k - 1], MPI_INT, rk, tag, comm,
                  &r_request); // receive indices
        MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
                  &s_request); // send indices
        MPI_Wait(&r_request, MPI_STATUS_IGNORE);
        MPI_Wait(&s_request, MPI_STATUS_IGNORE);

        p2k /= 2;
        tag++;
    }

    J  = (int **) malloc(steps * sizeof(int *));
    nJ = (int *) malloc(steps * sizeof(int));

    tag = 0;
    p2k = 1;
    // butterfly second pass: top down (fill the arrays nJ and J)
    for (k = 0; k < steps; k++) {
        free(S[k]);
        sk = (rank + p2k) % size;
        rk = (rank + size - p2k) % size;
        if (k == 0) {
            nJ[k] = count;
            J[k]  = (int *) malloc(nJ[k] * sizeof(int));
            memcpy(J[k], indices, nJ[k] * sizeof(int));
        } else { // J^k := J^{k-1} \cup R^{k-1}
            nJ[k] = card_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1]);
            J[k]  = (int *) malloc(nJ[k] * sizeof(int));
            set_or(J[k - 1], nJ[k - 1], R[k - 1], nR[k - 1], J[k]);
            free(R[k - 1]);
        }
        if (k != steps - 1) {
            MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm, &r_request);
            MPI_Isend(&nJ[k], 1, MPI_INT, sk, tag, comm, &s_request);
            MPI_Wait(&r_request, MPI_STATUS_IGNORE);
            MPI_Wait(&s_request, MPI_STATUS_IGNORE);

            R[k] = (int *) malloc(nR[k] * sizeof(int));
            tag++;

            MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm, &r_request);
            MPI_Isend(J[k], nJ[k], MPI_INT, sk, tag, comm, &s_request);
            MPI_Wait(&r_request, MPI_STATUS_IGNORE);
            MPI_Wait(&s_request, MPI_STATUS_IGNORE);
        }
        p2k *= 2;
        tag++;
    }

    // butterfly last pass: the send set is S^k = I^k \cap J^k, so send S and
    // receive the matching R
    tag = 0;
    p2k = 1;
    for (k = 0; k < steps; k++) {
        sk = (rank + p2k) % size;
        rk = (rank + size - p2k) % size;

        nS[k] = card_and(I[k], nI[k], J[k], nJ[k]);
        S[k]  = (int *) malloc(nS[k] * sizeof(int));
        set_and(I[k], nI[k], J[k], nJ[k], S[k]); // S^k := I^k \cap J^k

        free(I[k]);
        free(J[k]);

        MPI_Irecv(&nR[k], 1, MPI_INT, rk, tag, comm,
                  &r_request);                                    // receive size
        MPI_Isend(&nS[k], 1, MPI_INT, sk, tag, comm, &s_request); // send size
        MPI_Wait(&r_request, MPI_STATUS_IGNORE);
        MPI_Wait(&s_request, MPI_STATUS_IGNORE);

        R[k] = (int *) malloc(nR[k] * sizeof(int));
        tag++;

        MPI_Irecv(R[k], nR[k], MPI_INT, rk, tag, comm,
                  &r_request); // receive indices
        MPI_Isend(S[k], nS[k], MPI_INT, sk, tag, comm,
                  &s_request); // send indices
        MPI_Wait(&r_request, MPI_STATUS_IGNORE);
        MPI_Wait(&s_request, MPI_STATUS_IGNORE);

        p2k *= 2;
        tag++;
    }

    // Now we work locally
    int **USR, *nUSR, **U, *nU;

    USR  = (int **) malloc(steps * sizeof(int *));
    nUSR = (int *) malloc(steps * sizeof(int));
    U    = (int **) malloc(steps * sizeof(int *));
    nU   = (int *) malloc(steps * sizeof(int));

    for (k = 0; k < steps; k++) { // USR^k := S^k \cup R^k
        nUSR[k] = card_or(S[k], nS[k], R[k], nR[k]);
        USR[k]  = (int *) malloc(nUSR[k] * sizeof(int));
        set_or(S[k], nS[k], R[k], nR[k], USR[k]);
    }
    for (k = 0; k < steps; k++) { // U^k := U^{k-1} \cup USR^k
        if (k == 0) {
            nU[k] = nUSR[k];
            U[k]  = (int *) malloc(nU[k] * sizeof(int));
            memcpy(U[k], USR[k], nU[k] * sizeof(int));
        } else {
            nU[k] = card_or(U[k - 1], nU[k - 1], USR[k], nUSR[k]);
            U[k]  = (int *) malloc(nU[k] * sizeof(int));
            set_or(U[k - 1], nU[k - 1], USR[k], nUSR[k], U[k]);
        }
    }
    *com_count   = nU[steps - 1];
    *com_indices = (int *) malloc(*com_count * sizeof(int));
    memcpy(*com_indices, U[steps - 1], *com_count * sizeof(int));
    //====================================================================

    // remap the send/receive index sets onto positions within com_indices
    for (k = 0; k < steps; k++) {
        subset2map(*com_indices, *com_count, S[k], nS[k]);
        subset2map(*com_indices, *com_count, R[k], nR[k]);
    }

    // release the local work tables
    for (k = 0; k < steps; k++) {
        free(USR[k]);
        free(U[k]);
    }
    free(USR);
    free(nUSR);
    free(U);
    free(nU);
    free(I);
    free(nI);
    free(J);
    free(nJ);

    return 0;
}
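
For orientation, here is a minimal sketch of how a caller might chain butterfly_init with butterfly_reduce (defined just below): the former builds the per-step send (S) and receive (R) index tables plus the communicated index map, the latter then sums the distributed values. This is a hypothetical illustration, not library code: it assumes the communicator size is a power of two, that steps = log2(size), that each rank's index list is sorted and duplicate-free, and it reuses the headers included at the top of this file; the names sketch_butterfly_sum and local_indices are invented for the example.

// Hypothetical driver (illustration only), assuming size is a power of two
// and local_indices is a sorted, duplicate-free list of global indices.
int sketch_butterfly_sum(int *local_indices, int count, MPI_Comm comm) {
    int size, steps = 0;
    MPI_Comm_size(comm, &size);
    while ((1 << steps) < size) steps++; // steps = log2(size)

    int **R  = (int **) malloc(steps * sizeof(int *));
    int  *nR = (int *) malloc(steps * sizeof(int));
    int **S  = (int **) malloc(steps * sizeof(int *));
    int  *nS = (int *) malloc(steps * sizeof(int));
    int  *com_indices, com_count;

    // build the per-step send (S) and receive (R) index tables and the map
    butterfly_init(local_indices, count, R, nR, S, nS, &com_indices, &com_count,
                   steps, comm);

    // butterfly_reduce sizes its buffers from the largest per-step sets
    int nRmax = 0, nSmax = 0;
    for (int k = 0; k < steps; k++) {
        if (nR[k] > nRmax) nRmax = nR[k];
        if (nS[k] > nSmax) nSmax = nS[k];
    }

    // com_val is indexed like com_indices; fill it from the local values
    // before the call
    double *com_val = (double *) calloc(com_count, sizeof(double));
    /* ... copy local values into com_val here ... */
    butterfly_reduce(R, nR, nRmax, S, nS, nSmax, com_val, steps, comm);
    /* ... com_val now holds the summed values on com_indices ... */

    free(com_val);
    for (int k = 0; k < steps; k++) {
        free(R[k]);
        free(S[k]);
    }
    free(R);
    free(nR);
    free(S);
    free(nS);
    free(com_indices);
    return 0;
}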


int butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax,
                     double *val, int steps, MPI_Comm comm) {
    // double st, t;
    // t=0.0;
    int         k, p2k, tag;
    int         rank, size, rk, sk;
    MPI_Request s_request, r_request;
    double     *sbuf, *rbuf;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    sbuf = (double *) malloc(nSmax * sizeof(double));
    rbuf = (double *) malloc(nRmax * sizeof(double));
    tag  = 0;
    p2k  = 1;

    for (k = 0; k < steps; k++) {
        // st=MPI_Wtime();
        rk = (rank + size - p2k) % size;
        MPI_Irecv(rbuf, nR[k], MPI_DOUBLE, rk, tag, comm, &r_request);
        sk = (rank + p2k) % size;
        m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
        MPI_Isend(sbuf, nS[k], MPI_DOUBLE, sk, tag, comm, &s_request);
        MPI_Wait(&r_request, MPI_STATUS_IGNORE);
        s2m_sum(val, rbuf, R[k],
                nR[k]); // sum the receive buffer into val (nR[k] floating-point adds)
        p2k *= 2;
        tag++;
        MPI_Wait(&s_request, MPI_STATUS_IGNORE);
        // t=t+MPI_Wtime()-st;
    }
    free(sbuf);
    free(rbuf);
    return 0;
}
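
butterfly_reduce relies on two small helpers declared via mapmat.h and defined in alm.c, whose bodies are not shown on this page. The sketch below only records the behaviour the calls above assume: m2s gathers map values into a contiguous send buffer, and s2m_sum scatter-adds a receive buffer back into the map. Treat it as a reading aid, not as the actual implementation; the suffixed names are invented.

// Assumed semantics of the alm.c helpers used above (illustrative sketch only).
void m2s_sketch(double *mapval, double *submapval, int *subset, int count) {
    int i;
    for (i = 0; i < count; i++)
        submapval[i] = mapval[subset[i]]; // gather: map values -> send buffer
}

void s2m_sum_sketch(double *mapval, double *submapval, int *subset, int count) {
    int i;
    for (i = 0; i < count; i++)
        mapval[subset[i]] += submapval[i]; // scatter-add: receive buffer -> map
}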

//=============================================================================
// Modification of the code by Sebastien Cayrols : 01/09/2015 ; Berkeley

int butterfly_blocking_2instr_reduce(int **R, int *nR, int nRmax, int **S,
                                     int *nS, int nSmax, double *val, int steps,
                                     MPI_Comm comm) {
    // double st, t;
    // t=0.0;
    int        k, p2k, tag;
    int        rank, size, rk, sk;
    double    *sbuf, *rbuf;
    MPI_Status status;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    sbuf = (double *) malloc(nSmax * sizeof(double));
    rbuf = (double *) malloc(nRmax * sizeof(double));
    tag  = 0;
    p2k  = 1;

    for (k = 0; k < steps; k++) {
        // st=MPI_Wtime();
        sk = (rank + p2k) % size;
        m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
        MPI_Send(sbuf, nS[k], MPI_DOUBLE, sk, tag, comm);
        rk = (rank + size - p2k) % size;
        MPI_Recv(rbuf, nR[k], MPI_DOUBLE, rk, tag, comm, &status);
        s2m_sum(val, rbuf, R[k],
                nR[k]); // sum the receive buffer into val (nR[k] floating-point adds)
        p2k *= 2;
        tag++;
        // t=t+MPI_Wtime()-st;
    }
    free(sbuf);
    free(rbuf);
    return 0;
}

int butterfly_blocking_1instr_reduce(int **R, int *nR, int nRmax, int **S,
                                     int *nS, int nSmax, double *val, int steps,
                                     MPI_Comm comm) {
    // double st, t;
    // t=0.0;
    int        k, p2k, tag;
    int        rank, size, rk, sk;
    double    *sbuf, *rbuf;
    MPI_Status status;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    sbuf = (double *) malloc(nSmax * sizeof(double));
    rbuf = (double *) malloc(nRmax * sizeof(double));
    tag  = 0;
    p2k  = 1;

    for (k = 0; k < steps; k++) {
        // st=MPI_Wtime();
        sk = (rank + p2k) % size;
        rk = (rank + size - p2k) % size;
        m2s(val, sbuf, S[k], nS[k]); // fill the sending buffer
        MPI_Sendrecv(sbuf, nS[k], MPI_DOUBLE, sk, tag, rbuf, nR[k], MPI_DOUBLE,
                     rk, tag, comm, &status);
        s2m_sum(val, rbuf, R[k],
                nR[k]); // sum the receive buffer into val (nR[k] floating-point adds)
        p2k *= 2;
        tag++;
        // t=t+MPI_Wtime()-st;
    }
    free(sbuf);
    free(rbuf);
    return 0;
}
#endif
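
All three reduce variants run the same communication pattern: at stage k, with p2k = 2^k, a rank sends its S[k] values to (rank + p2k) % size and receives R[k] values from (rank + size - p2k) % size, over steps = log2(size) stages; they differ only in how the exchange is posted (non-blocking Isend/Irecv, blocking Send then Recv, or a single Sendrecv). The standalone snippet below simply prints those partners for an example rank and communicator size so the pattern can be inspected; the chosen values are arbitrary and the snippet is not part of the library.

// Print the per-stage butterfly partners of one rank (illustration only).
#include <stdio.h>

int main(void) {
    int size = 8, rank = 3; // example values; any power-of-two size works
    int steps = 0;
    while ((1 << steps) < size) steps++;

    int p2k = 1;
    for (int k = 0; k < steps; k++) {
        int sk = (rank + p2k) % size;        // partner we send to at stage k
        int rk = (rank + size - p2k) % size; // partner we receive from at stage k
        printf("stage %d: rank %d sends to %d, receives from %d\n", k, rank,
               sk, rk);
        p2k *= 2;
    }
    return 0;
}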
Functions referenced in this file:

void m2s(double *mapval, double *submapval, int *subset, int count)
    Definition: alm.c:27
void s2m_sum(double *mapval, double *submapval, int *subset, int count)
    Sum submap values into the map values array. Definition: alm.c:54
int set_or(int *A1, int n1, int *A2, int n2, int *A1orA2)
    Definition: als.c:138
int set_and(int *A1, int n1, int *A2, int n2, int *A1andA2)
    Definition: als.c:186
int card_or(int *A1, int n1, int *A2, int n2)
    Definition: als.c:72
int card_and(int *A1, int n1, int *A2, int n2)
    Definition: als.c:111
void subset2map(int *A, int nA, int *subA, int nsubA)
    Definition: als.c:237
int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int **com_indices, int *com_count, int steps, MPI_Comm comm)
    Initialize the tables needed for the butterfly-like communication scheme. Definition: butterfly.c:52
int butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
    Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme. Definition: butterfly.c:236
int butterfly_blocking_2instr_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
    Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme. Definition: butterfly.c:290
int butterfly_blocking_1instr_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
    Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme. Definition: butterfly.c:340
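
The index bookkeeping in butterfly_init rests entirely on the als.c set helpers listed above. Their exact contract is documented in als.c; the comment sketch below only records the convention their use in this file appears to assume (sorted, duplicate-free inputs and sorted outputs), with invented example arrays.

// Assumed convention of the als.c set helpers (example values are invented).
int A1[] = {2, 5, 9}; // n1 = 3, sorted, no duplicates
int A2[] = {5, 7};    // n2 = 2, sorted, no duplicates
// card_or(A1, 3, A2, 2)      -> 4, the size of the union
// set_or(A1, 3, A2, 2, out)  -> out = {2, 5, 7, 9}
// card_and(A1, 3, A2, 2)     -> 1, the size of the intersection
// set_and(A1, 3, A2, 2, out) -> out = {5}
// subset2map(A, nA, subA, nsubA) replaces each entry of subA (a subset of A)
// by its position within A, as done for S[k] and R[k] in butterfly_init.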