MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
mapmat.c
Go to the documentation of this file.
1 
21 #ifdef W_MPI
22 #include <mpi.h>
23 #endif
24 #include "mapmat.h"
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
/**
 * Initialize a Mat object: store references to the caller's (indices,
 * values) arrays, compute the local column set, and — when compiled with
 * MPI — build the communication scheme.
 *
 * @param A       matrix to initialize
 * @param m       number of local rows
 * @param nnz     number of non-zero entries per row
 * @param indices flat array of column indices, size m*nnz (referenced, not
 *                copied; caller keeps ownership)
 * @param values  flat array of values, size m*nnz (referenced, not copied)
 * @param flag    communication scheme identifier (BUTTERFLY, RING, ...)
 * @param comm    MPI communicator (parameter exists only under W_MPI)
 * @return error code from MatComShape under W_MPI, otherwise from
 *         MatLocalShape (the earlier code is overwritten — intentional?)
 */
int MatInit(Mat *A, int m, int nnz, int *indices, double *values, int flag
#ifdef W_MPI
            ,
            MPI_Comm comm
#endif
) {
    int err;
    MatSetIndices(A, m, nnz, indices);

    MatSetValues(A, m, nnz, values);

    err = MatLocalShape(
            A,
            3); // compute lindices (local columns) (method 3 = counting sort)

#ifdef W_MPI
    err = MatComShape(A, flag, comm); // build communication scheme
#endif
    return err;
}
74 
83 void MatSetIndices(Mat *A, int m, int nnz, int *indices) {
84  A->m = m; // set number of local rows
85  A->nnz = nnz; // set number of non-zero values per row
86  A->indices = indices; // point to indices
87 }
88 
97 void MatSetValues(Mat *A, int m, int nnz, double *values) {
98  int err;
99  A->m = m; // set number of local rows
100  A->nnz = nnz; // set number of non-zero values per row
101  A->values = values; // point to values
102 }
103 
//===================Part added by Sebastien Cayrols to report the amount of
// memory needed by the communication algorithms
106 void CommInfo(Mat *A) {
107 #if W_MPI
108  int i = 0, size, rank;
109  double maxSizeR = 0.0;
110  double maxSizeS = 0.0;
111  double amountSizeR = 0.0;
112  double amountSizeS = 0.0;
113  double stepSum = 0.0, stepAvg = 0.0;
114  // this value is based on data sent
115  double *amountSizeByStep = NULL;
116  double minStep = 0.0, maxStep = 0.0;
117  double *s = NULL;
118  double *r = NULL;
119  MPI_Comm comm = MPI_COMM_WORLD;
120  MPI_Comm_rank(comm, &rank);
121  MPI_Comm_size(comm, &size);
122  s = (double *) malloc(4 * sizeof(double));
123  r = (double *) malloc(4 * 3 * sizeof(double));
124  amountSizeByStep = (double *) malloc(A->steps * sizeof(double));
125  switch (A->flag) {
126  case NONE:
127  break;
128  case BUTTERFLY:
129  for (i = 0; i < A->steps; i++) {
130  amountSizeR += A->nR[i];
131  amountSizeS += A->nS[i];
132  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
133  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
134  }
135  break;
136  //==========================Modification added by Sebastien Cayrols :
137  // 01/09/2015 , Berkeley
138  case BUTTERFLY_BLOCKING_1:
139  for (i = 0; i < A->steps; i++) {
140  amountSizeR += A->nR[i];
141  amountSizeS += A->nS[i];
142  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
143  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
144  }
145  break;
146  case BUTTERFLY_BLOCKING_2:
147  for (i = 0; i < A->steps; i++) {
148  amountSizeR += A->nR[i];
149  amountSizeS += A->nS[i];
150  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
151  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
152  }
153  break;
154  case NOEMPTYSTEPRING:
155  for (i = 0; i < A->steps; i++) {
156  amountSizeR += A->nR[i];
157  amountSizeS += A->nS[i];
158  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
159  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
160  }
161  break;
162  //==========================End modification
163  case RING:
164  for (i = 0; i < A->steps; i++) {
165  amountSizeR += A->nR[i];
166  amountSizeS += A->nS[i];
167  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
168  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
169  }
170  break;
171  case NONBLOCKING:
172  for (i = 0; i < A->steps; i++) {
173  amountSizeR += A->nR[i];
174  amountSizeS += A->nS[i];
175  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
176  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
177  }
178  break;
179  case NOEMPTY:
180  for (i = 0; i < A->steps; i++) {
181  amountSizeR += A->nR[i];
182  amountSizeS += A->nS[i];
183  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
184  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
185  }
186  break;
187  case ALLTOALLV: // added -- rs 2015/02/04
188  for (i = 0; i < A->steps; i++) {
189  amountSizeR += A->nR[i];
190  amountSizeS += A->nS[i];
191  }
192  break;
193  case ALLREDUCE:
194  amountSizeR = A->com_count;
195  amountSizeS = A->com_count;
196  maxSizeR = A->com_count;
197  maxSizeS = A->com_count;
198  break;
199  }
200 
201  if (A->flag != ALLREDUCE && A->flag != ALLTOALLV) {
202  double *t = NULL;
203 
204  t = (double *) malloc(A->steps * sizeof(double));
205  // Copy int array into double array
206  for (i = 0; i < A->steps; i++) t[i] = A->nS[i];
207 
208  MPI_Reduce(t, amountSizeByStep, A->steps, MPI_DOUBLE, MPI_SUM, 0, comm);
209 
210  free(t);
211 
212  if (rank == 0) {
213  stepSum = minStep = maxStep = amountSizeByStep[0];
214  printf("\n[MEMORY]Step n°%4d, message size : %e", 0,
215  amountSizeByStep[0]);
216  for (i = 1; i < A->steps; i++) {
217  printf("\n[MEMORY]Step n°%4d, message size : %e", i,
218  amountSizeByStep[i]);
219  if (minStep > amountSizeByStep[i])
220  minStep = amountSizeByStep[i];
221  else if (maxStep < amountSizeByStep[i])
222  maxStep = amountSizeByStep[i];
223  stepSum += amountSizeByStep[i];
224  }
225  stepAvg = stepSum / A->steps;
226  }
227  }
228  s[0] = amountSizeR;
229  s[1] = amountSizeS;
230  s[2] = maxSizeR;
231  s[3] = maxSizeS;
232  MPI_Reduce(s, r, 4, MPI_DOUBLE, MPI_SUM, 0, comm);
233  if (rank == 0)
234  for (i = 0; i < 4; i++) r[i] /= size;
235  MPI_Reduce(s, &r[4], 4, MPI_DOUBLE, MPI_MIN, 0, comm);
236  MPI_Reduce(s, &r[8], 4, MPI_DOUBLE, MPI_MAX, 0, comm);
237  if (rank == 0) {
238  printf("\n[MEMORY]Step average : %e\t[%e,%e]", stepAvg,
239  minStep, maxStep);
240  printf("\n[MEMORY]Amount of data received : %e\t[%e,%e]", r[0], r[4],
241  r[8]);
242  printf("\n[MEMORY]Amount of data sent : %e\t[%e,%e]", r[1], r[5],
243  r[9]);
244  printf("\n[MEMORY]Message size received : %e\t[%e,%e]", r[2], r[6],
245  r[10]);
246  printf("\n[MEMORY]Message size sent : %e\t[%e,%e]\n", r[3], r[7],
247  r[11]);
248  }
249  free(s);
250  free(r);
251  free(amountSizeByStep);
252 #endif
253 }
254 
255 void MatReset(Mat *A) {
256 #if W_MPI
257  switch (A->flag) {
258  case NONE:
259  break;
260  case BUTTERFLY:
261  free(A->R); //
262  free(A->nR); //
263  free(A->S); //
264  free(A->nS);
265  break;
266  case BUTTERFLY_BLOCKING_1:
267  free(A->R); //
268  free(A->nR); //
269  free(A->S); //
270  free(A->nS);
271  break;
272  case BUTTERFLY_BLOCKING_2:
273  free(A->R); //
274  free(A->nR); //
275  free(A->S); //
276  free(A->nS);
277  break;
278  case NOEMPTYSTEPRING:
279  free(A->R); //
280  free(A->nR); //
281  free(A->S); //
282  free(A->nS);
283  break;
284  case RING:
285  free(A->R); //
286  free(A->nR); //
287  free(A->S); //
288  free(A->nS);
289  break;
290  case NONBLOCKING:
291  free(A->R); //
292  free(A->nR); //
293  free(A->S); //
294  free(A->nS);
295  break;
296  case NOEMPTY:
297  free(A->R); //
298  free(A->nR); //
299  free(A->S); //
300  free(A->nS);
301  break;
302  case ALLTOALLV: // added -- rs 2015/02/04
303  free(A->R); //
304  free(A->nR); //
305  free(A->S); //
306  free(A->nS);
307  break;
308  case ALLREDUCE:
309  break;
310  }
311 #endif
312 }
313 
314 //===================End
315 
323 void MatFree(Mat *A) {
324 
325  // get information about communication size
326  CommInfo(A);
327 
328  free(A->lindices);
329 #if W_MPI
330  switch (A->flag) {
331  case NONE:
332  break;
333  case BUTTERFLY:
334  free(A->com_indices); //
335  free(A->R); //
336  free(A->nR); //
337  free(A->S); //
338  free(A->nS);
339  break;
340  //==========================Modification added by Sebastien Cayrols :
341  // 01/09/2015 , Berkeley
342  case BUTTERFLY_BLOCKING_1:
343  free(A->com_indices); //
344  free(A->R); //
345  free(A->nR); //
346  free(A->S); //
347  free(A->nS);
348  break;
349  case BUTTERFLY_BLOCKING_2:
350  free(A->com_indices); //
351  free(A->R); //
352  free(A->nR); //
353  free(A->S); //
354  free(A->nS);
355  break;
356  case NOEMPTYSTEPRING:
357  free(A->R); //
358  free(A->nR); //
359  free(A->S); //
360  free(A->nS);
361  break;
362  //==========================End modification
363  case RING:
364  free(A->R); //
365  free(A->nR); //
366  free(A->S); //
367  free(A->nS);
368  break;
369  case NONBLOCKING:
370  free(A->R); //
371  free(A->nR); //
372  free(A->S); //
373  free(A->nS);
374  break;
375  case NOEMPTY:
376  free(A->R); //
377  free(A->nR); //
378  free(A->S); //
379  free(A->nS);
380  break;
381  case ALLTOALLV: // Added: rs 2015/02/04
382  free(A->R); //
383  free(A->nR); //
384  free(A->S); //
385  free(A->nS);
386  break;
387  case ALLREDUCE:
388  free(A->com_indices); //
389  //===================================Modification
390  // from Sebastien Cayrols : comment of these
391  // lines to avoid SEGSIGV
392  // free(A->R); //
393  // free(A->nR); //
394  // free(A->S); //
395  // free(A->nS);
396  //===================================End modif
397  break;
398  }
399 #endif
400 }
401 
414 int MatLoad(Mat *mat, char *filename) {
415  int err;
416  int rank;
417 #if W_MPI
418  MPI_Comm_rank(mat->comm, &rank);
419 #else
420  rank = 0;
421 #endif
422  FILE *in;
423  char fn[100];
424  int i = 0;
425  sprintf(fn, "%s_%d.dat", filename, rank);
426  printf("%s", fn);
427  in = fopen(fn, "r");
428  if (in == NULL) {
429  printf("cannot open file %s", fn);
430  return 1;
431  }
432  while (feof(in) == 0 && i < (mat->m * mat->nnz)) {
433  if (mat->nnz == 1) {
434  fscanf(in, "%d %lf", &(mat->indices[i]), &(mat->values[i]));
435  } else if (mat->nnz == 2) {
436  fscanf(in, "%d %lf %d %lf", &(mat->indices[i]), &(mat->values[i]),
437  &(mat->indices[i + 1]), &(mat->values[i + 1]));
438  } else {
439  return 1; //(nnz > 2) not implement
440  }
441  i += mat->nnz;
442  }
443  if (i != mat->m * mat->nnz) { printf("WARNNING data size doesn't fit\n"); }
444  fclose(in);
445  return 0;
446 }
447 
460 int MatSave(Mat *mat, char *filename) {
461  FILE *out;
462  char fn[100];
463  int i, j;
464  int rank;
465 #if W_MPI
466  MPI_Comm_rank(mat->comm, &rank);
467 #else
468  sprintf(fn, "%s_%d.dat", filename, rank);
469 #endif
470  out = fopen(fn, "w");
471  if (out == NULL) {
472  printf("cannot open file %s", fn);
473  return 1;
474  }
475  for (i = 0; i < (mat->nnz * mat->m); i += mat->nnz) {
476  for (j = 0; j < mat->nnz; j++) {
477  fprintf(out, "%d ", mat->indices[i + j]);
478  fprintf(out, "%f ", mat->values[i + j]);
479  }
480  fprintf(out, "\n");
481  }
482  fclose(out);
483  return 0;
484 }
485 
/**
 * Compute the local column structure of A: build A->lindices, the sorted
 * set of column indices appearing in A->indices (size stored in
 * A->lcount, as returned by ssort), then rewrite A->indices as positions
 * into that set via sindex.
 *
 * @param A     matrix with m, nnz and indices already set
 * @param sflag sorting method forwarded to ssort (MatInit passes 3 =
 *              counting sort)
 * @return always 0
 */
int MatLocalShape(Mat *A, int sflag) {
    int *tmp_indices;

    tmp_indices = (int *) malloc(
            (int64_t) (A->m) * A->nnz
            * sizeof(int)); // allocate a tmp copy of indices tab to sort
    memcpy(tmp_indices, A->indices,
           (int64_t) (A->m) * A->nnz * sizeof(int)); // copy

    // A->lcount = omp_psort(tmp_indices, A->m * A->nnz, sflag);
    // //sequential sort tmp_indices
    A->lcount = ssort(tmp_indices, A->m * A->nnz,
                      sflag); // sequential sort tmp_indices

    A->lindices = (int *) malloc(A->lcount * sizeof(int));
    memcpy(A->lindices, tmp_indices,
           A->lcount * sizeof(int)); // copy tmp_indices into lindices and free
    free(tmp_indices);

    // remap A->indices to positions inside the sorted lindices table
    sindex(A->lindices, A->lcount, A->indices, A->nnz * A->m);

    // check for masked pixels: a negative smallest index marks trash pixels
    if (A->lindices[0] < 0) { A->trash_pix = 1; }

    return 0;
}
522 
#ifdef W_MPI
/* Allocate the per-step send/receive map tables of A (helper for
 * MatComShape). */
static void alloc_step_tabs(Mat *A) {
    A->S = (int **) malloc(A->steps * sizeof(int *)); // sending maps tab
    A->R = (int **) malloc(A->steps * sizeof(int *)); // receiving maps tab
    A->nS = (int *) malloc(A->steps * sizeof(int));   // sending map sizes tab
    A->nR = (int *) malloc(A->steps * sizeof(int));   // receiving map sizes
}

/* Common setup for every ring-like scheme: build the per-step maps and use
 * the unmasked part of the local index table directly as com_indices
 * (which is why MatFree must not free com_indices for these schemes). */
static void ring_shape(Mat *A) {
    ring_init(A->lindices + (A->nnz) * (A->trash_pix),
              A->lcount - (A->nnz) * (A->trash_pix), A->R, A->nR, A->S,
              A->nS, A->steps, A->comm);
    A->com_count = A->lcount - (A->nnz) * (A->trash_pix);
    A->com_indices = A->lindices + (A->nnz) * (A->trash_pix);
}

/**
 * Build the communication scheme of A for the given flag over comm:
 * number of steps, per-step send/receive maps, and the shared index table
 * (com_indices/com_count) used by greedyreduce.
 *
 * Butterfly schemes require a power-of-two communicator size; otherwise
 * the flag silently falls back to RING (original behavior, kept).
 *
 * Fix: the three identical butterfly cases and the five identical
 * ring-like cases are collapsed into fallthrough groups over two static
 * helpers; a `default` label is added.  Behavior is unchanged.
 *
 * @return always 0
 */
int MatComShape(Mat *A, int flag, MPI_Comm comm) {
    int size;
    int i, min, max, j;
    A->comm = comm; // set communicator
    A->flag = flag;
    MPI_Comm_size(A->comm, &size);
    if ((A->flag == BUTTERFLY || A->flag == BUTTERFLY_BLOCKING_1
         || A->flag == BUTTERFLY_BLOCKING_2)
        && is_pow_2(size) != 0)
        A->flag = RING; // fallback when size is not a power of two
    switch (A->flag) {
        case BUTTERFLY:            /* fallthrough: all butterfly variants */
        case BUTTERFLY_BLOCKING_1: /* share the same initialization       */
        case BUTTERFLY_BLOCKING_2:
            A->steps = log_2(size);
            alloc_step_tabs(A);
            butterfly_init(A->lindices + (A->nnz) * (A->trash_pix),
                           A->lcount - (A->nnz) * (A->trash_pix), A->R,
                           A->nR, A->S, A->nS, &(A->com_indices),
                           &(A->com_count), A->steps, A->comm);
            break;
        case NOEMPTYSTEPRING: /* fallthrough: all ring-like variants */
        case RING:            /* share the same initialization       */
        case NONBLOCKING:
        case NOEMPTY:
        case ALLTOALLV:
            A->steps = size;
            alloc_step_tabs(A);
            ring_shape(A);
            break;
        case ALLREDUCE:
            // communicate over the contiguous index range [min, max]
            MPI_Allreduce(&(A->lindices[A->lcount - 1]), &max, 1, MPI_INT,
                          MPI_MAX,
                          A->comm); // maximum index
            MPI_Allreduce(&(A->lindices[(A->nnz) * (A->trash_pix)]), &min, 1,
                          MPI_INT, MPI_MIN,
                          A->comm); // minimum (unmasked) index
            A->com_count = (max - min + 1);
            A->com_indices =
                    (int *) malloc((A->lcount - (A->nnz) * (A->trash_pix))
                                   * sizeof(int)); // warning
            i = (A->nnz) * (A->trash_pix);
            j = 0;
            // same as subsetmap for a contiguous set: map each local index
            // to its offset j inside [min, max]
            while (j < A->com_count && i < A->lcount) {
                if (min + j < A->lindices[i]) {
                    j++;
                } else {
                    A->com_indices[i - (A->nnz) * (A->trash_pix)] = j;
                    i++;
                    j++;
                }
            }
            break;
        default:
            break;
    }
    return 0;
}
#endif
697 
704 int MatVecProd(Mat *A, double *x, double *y, int pflag) {
705  int i, j, e; // indexes
706 
707  // set output vector to zero
708  for (i = 0; i < A->m; i++) y[i] = 0.0;
709 
710  e = 0;
711  if (A->trash_pix) {
712  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
713  if (A->indices[i] != 0) {
714  for (j = 0; j < A->nnz; j++) {
715  y[e] += A->values[i + j] * x[A->indices[i + j] - (A->nnz)];
716  }
717  }
718  e++;
719  }
720  } else {
721  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
722  for (j = 0; j < A->nnz; j++) {
723  y[e] += A->values[i + j] * x[A->indices[i + j]];
724  }
725  e++;
726  }
727  }
728  return 0;
729 }
730 
#ifdef W_MPI
/**
 * Naive transposed matrix-vector product x = A^t * y with a global
 * ring-style reduction: each process local-reduces y into its column
 * values, then exchanges its full index/value tables with every other
 * process in turn and accumulates the received contributions.
 *
 * Fixes: rbuf and rbufvalues were malloc'd but never freed (leak on every
 * call); unused local `e` removed.
 *
 * @param A     pointer to the matrix
 * @param y     input vector of size A->m
 * @param x     output local map of size A->lcount, overwritten
 * @param pflag unused
 * @return always 0
 */
int TrMatVecProd_Naive(Mat *A, double *y, double *x, int pflag) {
    int i, j, rank, size;
    int *rbuf, rbufcount;
    double *rbufvalues, *lvalues;
    int p, rp, sp, tag;
    MPI_Request s_request, r_request;
    MPI_Status status;

    MPI_Comm_rank(A->comm, &rank); // get rank and size of the communicator
    MPI_Comm_size(A->comm, &size); //
    lvalues = (double *) malloc(
            A->lcount * sizeof(double)); // allocate and set local values to 0.0
    for (i = 0; i < A->lcount; i++) //
        lvalues[i] = 0.0; //

    for (i = 0; i < A->m; i++) { // local transform reduces
        for (j = 0; j < A->nnz; j++) { //
            lvalues[A->indices[i * A->nnz + j]] +=
                    (A->values[i * A->nnz + j]) * y[i];
        }
    }

    memcpy(x, lvalues,
           (A->lcount) * sizeof(double)); // copy local values into the result*/
    MPI_Allreduce(
            &(A->lcount), &(rbufcount), 1, MPI_INT, MPI_MAX,
            A->comm); // find the max communication buffer sizes, and allocate

    rbuf = (int *) malloc(rbufcount * sizeof(int));
    rbufvalues = (double *) malloc(rbufcount * sizeof(double));

    tag = 0;
    for (p = 1; p < size;
         p++) { // loop : collective global reduce in ring-like fashion
        rp = (size + rank - p) % size;
        sp = (rank + p) % size;
        MPI_Send(&(A->lcount), 1, MPI_INT, sp, 0, A->comm); // exchange sizes
        MPI_Recv(&rbufcount, 1, MPI_INT, rp, 0, A->comm, &status);
        tag++;
        MPI_Irecv(rbuf, rbufcount, MPI_INT, rp, tag, A->comm,
                  &r_request); // exchange local indices
        MPI_Isend(A->lindices, A->lcount, MPI_INT, sp, tag, A->comm,
                  &s_request);
        MPI_Wait(&r_request, &status);
        MPI_Wait(&s_request, &status);
        tag++;
        MPI_Irecv(rbufvalues, rbufcount, MPI_DOUBLE, rp, tag, A->comm,
                  &r_request); // exchange local values
        MPI_Isend(lvalues, A->lcount, MPI_DOUBLE, sp, tag, A->comm, &s_request);
        tag++;
        MPI_Wait(&r_request, &status);
        m2m_sum(rbufvalues, rbuf, rbufcount, x, A->lindices,
                A->lcount); // sum in the result
        MPI_Wait(&s_request, &status);
    }
    free(rbuf);       // were leaked before this fix
    free(rbufvalues); //
    free(lvalues);
    return 0;
}
#endif
804 
819 int TrMatVecProd(Mat *A, double *y, double *x, int pflag) {
820  // double *sbuf, *rbuf;
821  int i, j, k, e;
822  // int nSmax, nRmax;
823  // double *lvalues;
824 
825  if (A->trash_pix) {
826  // refresh output vector
827  for (i = 0; i < A->lcount - A->nnz; i++) x[i] = 0.0;
828 
829  e = 0;
830  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
831  if (A->indices[i] != 0) {
832  // local transform reduce
833  for (j = 0; j < A->nnz; j++) {
834  x[A->indices[i + j] - (A->nnz)] +=
835  A->values[i + j] * y[e]; //
836  }
837  }
838  e++;
839  }
840  } else {
841  // refresh output vector
842  for (i = 0; i < A->lcount; i++) x[i] = 0.0;
843 
844  e = 0;
845  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
846  // local transform reduce
847  for (j = 0; j < A->nnz; j++) {
848  x[A->indices[i + j]] += A->values[i + j] * y[e];
849  }
850  e++;
851  }
852  }
853 
854 #ifdef W_MPI
855  // perform global reduce
856  greedyreduce(A, x);
857 #endif
858  return 0;
859 }
860 
#ifdef W_MPI
/**
 * Print information about a matrix into "<filename>_info.txt": the
 * communication flag, rows per process, nnz, the communication sparsity
 * (number of empty send steps summed over processes), the total amount of
 * data sent, and the sum over steps of the maximum per-process message
 * size.  Collective over mat->comm; only the master rank writes the file.
 *
 * Fixes: removed the large blocks of commented-out dead code and the
 * locals (n, sr, s, j, k) that only they referenced.
 *
 * @param mat      matrix to describe (communication scheme must be built)
 * @param verbose  currently unused; historically enabled a send/receive
 *                 matrix dump
 * @param filename prefix of the output file
 * @return 0 on success, 1 if the master cannot open the output file
 */
int MatInfo(Mat *mat, int verbose, char *filename) {
    FILE *out = NULL;
    int nnzline, sparsity, maxstep, maxsize, sumline, total;
    int i;
    char fn[100];
    int rank, size;
    int master = 0;

    (void) verbose; // retained for interface compatibility only

    MPI_Comm_rank(mat->comm, &rank);
    MPI_Comm_size(mat->comm, &size);

    if (rank == master) { // master process saves data into filename_info.txt
        sprintf(fn, "%s_%s", filename, "info.txt");
        out = fopen(fn, "w");
        if (out == NULL) {
            printf("cannot open file %s\n", fn);
            // NOTE(review): the other ranks still enter the reduces below;
            // an early master return can leave them blocked — confirm the
            // intended failure behavior (unchanged from the original).
            return 1;
        }
        printf("open file %s ...", fn);
        fprintf(out, "flag %d\n",
                mat->flag); // matrix main description: communication scheme
        fprintf(out, "rows %d\n ", mat->m); // rows per process,
        fprintf(out, "nnz %d\n", mat->nnz); // nnz (number of non zero per row).
        fprintf(out, "\n"); // separator
    }

    // compute communication sparsity (empty send steps) and total data sent
    nnzline = 0;
    sumline = 0;
    for (i = 0; i < mat->steps; i++) {
        sumline += mat->nS[i];
        if (mat->nS[i] == 0) { nnzline += 1; }
    }
    MPI_Reduce(&nnzline, &sparsity, 1, MPI_INT, MPI_SUM, 0,
               mat->comm); // sparsity
    MPI_Reduce(&sumline, &total, 1, MPI_INT, MPI_SUM, 0, mat->comm); // total
    if (rank == master) { // master process saves data into filename_info.txt
        fprintf(out, "sparsity %d\n", sparsity);
        fprintf(out, "total %d\n", total);
    }

    // sum over steps of the per-step maximum message size across processes
    maxsize = 0;
    for (i = 0; i < mat->steps; i++) {
        MPI_Reduce(&(mat->nS[i]), &maxstep, 1, MPI_INT, MPI_MAX, 0,
                   mat->comm); // maximum message size
        maxsize += maxstep;
    }
    if (rank == master) { // master process saves data into filename_info.txt
        fprintf(out, "maxsize %d\n ", maxsize);
        fprintf(out, "\n"); // separator
    }

    if (rank == master) { // master process saves data into filename_info.txt
        fclose(out);
        printf("close %s\n", fn);
    }
    return 0;
}
#endif
1006 
#ifdef W_MPI
/**
 * Combine the local overlapped values of x across all processes according
 * to the communication scheme of A, writing the globally reduced map back
 * into x.
 *
 * Fixes: nRmax/nSmax were read before any assignment in the butterfly and
 * ring branches ("if (A->nR[k] > nRmax)") — undefined behavior; they are
 * now initialized to 0.  BUTTERFLY_BLOCKING_1/2 had byte-identical bodies
 * (both call butterfly_blocking_1instr_reduce) and are collapsed; unused
 * local `j` removed; `#ifdef` used for consistency.
 *
 * @param A matrix whose communication scheme has been built (MatComShape)
 * @param x local values of size A->lcount - nnz*trash_pix, overwritten
 *          with the reduced result
 * @return always 0
 */
int greedyreduce(Mat *A, double *x) {
    int i, k;
    int nSmax = 0, nRmax = 0; // max comm buffer sizes (was uninitialized)
    int nStot, nRtot;
    double *lvalues;
    double *com_val;
    double *out_val;
    int ne = 0;

    lvalues = (double *) malloc(
            (A->lcount - (A->nnz) * (A->trash_pix))
            * sizeof(double)); // local copy of the unmasked input values
    memcpy(lvalues, x,
           (A->lcount - (A->nnz) * (A->trash_pix))
           * sizeof(double)); // copy local values into result values

    switch (A->flag) {
        case BUTTERFLY:
            for (k = 0; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            for (k = 0; k < A->steps; k++)
                if (A->nS[k] > nSmax) nSmax = A->nS[k];
            com_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) com_val[i] = 0.0;
            m2m(lvalues, A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix), com_val,
                A->com_indices, A->com_count);
            butterfly_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax, com_val,
                             A->steps, A->comm);
            m2m(com_val, A->com_indices, A->com_count, x,
                A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix));
            free(com_val);
            break;
        case BUTTERFLY_BLOCKING_1: /* fallthrough: both blocking variants */
        case BUTTERFLY_BLOCKING_2: /* had identical bodies                */
            // NOTE(review): BUTTERFLY_BLOCKING_2 also called the
            // 1-instruction reduce in the original code — confirm intended.
            for (k = 0; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            for (k = 0; k < A->steps; k++)
                if (A->nS[k] > nSmax) nSmax = A->nS[k];
            com_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) com_val[i] = 0.0;
            m2m(lvalues, A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix), com_val,
                A->com_indices, A->com_count);
            butterfly_blocking_1instr_reduce(A->R, A->nR, nRmax, A->S, A->nS,
                                             nSmax, com_val, A->steps,
                                             A->comm);
            m2m(com_val, A->com_indices, A->com_count, x,
                A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix));
            free(com_val);
            break;
        case NOEMPTYSTEPRING:
            for (k = 1; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            nSmax = nRmax;
            ring_noempty_step_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax,
                                     lvalues, x, A->steps, A->comm);
            break;
        case RING:
            for (k = 1; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            nSmax = nRmax;
            ring_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax, lvalues, x,
                        A->steps, A->comm);
            break;
        case NONBLOCKING:
            ring_nonblocking_reduce(A->R, A->nR, A->S, A->nS, lvalues, x,
                                    A->steps, A->comm);
            break;
        case NOEMPTY:
            for (k = 1; k < A->steps; k++)
                if (A->nR[k] != 0) ne++;
            ring_noempty_reduce(A->R, A->nR, ne, A->S, A->nS, ne, lvalues, x,
                                A->steps, A->comm);
            break;
        case ALLREDUCE:
            com_val = (double *) malloc(A->com_count * sizeof(double));
            out_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) {
                com_val[i] = 0.0;
                out_val[i] = 0.0;
            }
            s2m(com_val, lvalues, A->com_indices,
                A->lcount - (A->nnz) * (A->trash_pix));
            MPI_Allreduce(com_val, out_val, A->com_count, MPI_DOUBLE,
                          MPI_SUM, A->comm); // global elementwise sum
            m2s(out_val, x, A->com_indices,
                A->lcount - (A->nnz) * (A->trash_pix)); // sum receive buffer
                                                        // into values
            free(com_val);
            free(out_val);
            break;
        case ALLTOALLV:
            nRtot = nStot = 0;
            for (k = 0; k < A->steps; k++) { // compute buffer sizes
                nRtot += A->nR[k]; // to receive
                nStot += A->nS[k]; // to send
            }
            alltoallv_reduce(A->R, A->nR, nRtot, A->S, A->nS, nStot, lvalues,
                             x, A->steps, A->comm);
            break;
        default:
            break;
    }
    free(lvalues);
    return 0;
}
#endif
void m2s(double *mapval, double *submapval, int *subset, int count)
Definition: alm.c:27
int m2m_sum(double *vA1, int *A1, int n1, double *vA2, int *A2, int n2)
Definition: alm.c:120
int m2m(double *vA1, int *A1, int n1, double *vA2, int *A2, int n2)
Definition: alm.c:97
void s2m(double *mapval, double *submapval, int *subset, int count)
assign submap values the submap values array
Definition: alm.c:64
int is_pow_2(int n)
Definition: bitop.c:13
int log_2(int n)
Definition: bitop.c:26
int butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
Definition: butterfly.c:236
int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int **com_indices, int *com_count, int steps, MPI_Comm comm)
Initialize tables for butterfly-like communication scheme This routine set up needed tables for the b...
Definition: butterfly.c:52
int butterfly_blocking_1instr_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
Definition: butterfly.c:340
int sindex(int *T, int nT, int *A, int nA)
Definition: cindex.c:34
int ssort(int *indices, int count, int flag)
Definition: csort.c:172
int TrMatVecProd(Mat *A, double *y, double *x, int pflag)
Definition: mapmat.c:819
int MatInit(Mat *A, int m, int nnz, int *indices, double *values, int flag #ifdef W_MPI, MPI_Comm comm #endif)
Definition: mapmat.c:54
int TrMatVecProd_Naive(Mat *A, double *y, double *x, int pflag)
Definition: mapmat.c:744
void MatFree(Mat *A)
Definition: mapmat.c:323
int MatLoad(Mat *mat, char *filename)
Definition: mapmat.c:414
int MatComShape(Mat *A, int flag, MPI_Comm comm)
Definition: mapmat.c:530
void MatReset(Mat *A)
Definition: mapmat.c:255
int MatSave(Mat *mat, char *filename)
Definition: mapmat.c:460
int greedyreduce(Mat *A, double *x)
Definition: mapmat.c:1008
int MatInfo(Mat *mat, int verbose, char *filename)
Print information about a matrix. Usefull function to check, debug or bench. It prints matrix array...
Definition: mapmat.c:867
void MatSetIndices(Mat *A, int m, int nnz, int *indices)
Definition: mapmat.c:83
void CommInfo(Mat *A)
Definition: mapmat.c:106
void MatSetValues(Mat *A, int m, int nnz, double *values)
Definition: mapmat.c:97
int MatLocalShape(Mat *A, int sflag)
Definition: mapmat.c:496
int MatVecProd(Mat *A, double *x, double *y, int pflag)
Definition: mapmat.c:704
int ring_noempty_step_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like communication scheme.
Definition: ring.c:380
int alltoallv_reduce(int **R, int *nR, int nRtot, int **S, int *nS, int nStot, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using an MPI-Alltoallv call.
Definition: ring.c:157
int ring_nonblocking_reduce(int **R, int *nR, int **S, int *nS, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like non-blocking communication sch...
Definition: ring.c:234
int ring_noempty_reduce(int **R, int *nR, int nneR, int **S, int *nS, int nneS, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like non-blocking no-empty communic...
Definition: ring.c:298
int ring_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int steps, MPI_Comm comm)
Initialize tables for ring-like communication scheme.
Definition: ring.c:48
int ring_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like communication scheme.
Definition: ring.c:109