MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
mapmat.c
Go to the documentation of this file.
1 
21 #ifdef W_MPI
22 #include <mpi.h>
23 #endif
24 #include "mapmat.h"
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
/**
 * Initialize a Mat object: store references to the caller's (indices,
 * values) arrays, compute the local column set, and — when compiled with
 * MPI — build the communication scheme.
 *
 * @param A       matrix to initialize
 * @param m       number of local rows
 * @param nnz     number of non-zero entries per row
 * @param indices flat array of column indices, size m*nnz (referenced, not
 *                copied; caller keeps ownership)
 * @param values  flat array of values, size m*nnz (referenced, not copied)
 * @param flag    communication scheme identifier (BUTTERFLY, RING, ...)
 * @param comm    MPI communicator (parameter exists only under W_MPI)
 * @return error code from MatComShape under W_MPI, otherwise from
 *         MatLocalShape (the earlier code is overwritten — intentional?)
 */
int MatInit(Mat *A, int m, int nnz, int *indices, double *values, int flag
#ifdef W_MPI
            ,
            MPI_Comm comm
#endif
) {
    int err;
    MatSetIndices(A, m, nnz, indices);

    MatSetValues(A, m, nnz, values);

    err = MatLocalShape(
            A,
            3); // compute lindices (local columns) (method 3 = counting sort)

#ifdef W_MPI
    err = MatComShape(A, flag, comm); // build communication scheme
#endif
    return err;
}
74 
83 void MatSetIndices(Mat *A, int m, int nnz, int *indices) {
84  A->m = m; // set number of local rows
85  A->nnz = nnz; // set number of non-zero values per row
86  A->indices = indices; // point to indices
87 }
88 
97 void MatSetValues(Mat *A, int m, int nnz, double *values) {
98  int err;
99  A->m = m; // set number of local rows
100  A->nnz = nnz; // set number of non-zero values per row
101  A->values = values; // point to values
102 }
103 
//===================Part added by Sebastien Cayrols to report the amount of
// memory needed by the communication algorithms
106 void CommInfo(Mat *A) {
107 #if W_MPI
108  int i = 0, size, rank;
109  double maxSizeR = 0.0;
110  double maxSizeS = 0.0;
111  double amountSizeR = 0.0;
112  double amountSizeS = 0.0;
113  double stepSum = 0.0, stepAvg = 0.0;
114  // this value is based on data sent
115  double *amountSizeByStep = NULL;
116  double minStep = 0.0, maxStep = 0.0;
117  double *s = NULL;
118  double *r = NULL;
119  MPI_Comm comm = MPI_COMM_WORLD;
120  MPI_Comm_rank(comm, &rank);
121  MPI_Comm_size(comm, &size);
122  s = (double *) malloc(4 * sizeof(double));
123  r = (double *) malloc(4 * 3 * sizeof(double));
124  amountSizeByStep = (double *) malloc(A->steps * sizeof(double));
125  switch (A->flag) {
126  case NONE:
127  break;
128  case BUTTERFLY:
129  for (i = 0; i < A->steps; i++) {
130  amountSizeR += A->nR[i];
131  amountSizeS += A->nS[i];
132  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
133  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
134  }
135  break;
136  //==========================Modification added by Sebastien Cayrols :
137  // 01/09/2015 , Berkeley
138  case BUTTERFLY_BLOCKING_1:
139  for (i = 0; i < A->steps; i++) {
140  amountSizeR += A->nR[i];
141  amountSizeS += A->nS[i];
142  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
143  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
144  }
145  break;
146  case BUTTERFLY_BLOCKING_2:
147  for (i = 0; i < A->steps; i++) {
148  amountSizeR += A->nR[i];
149  amountSizeS += A->nS[i];
150  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
151  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
152  }
153  break;
154  case NOEMPTYSTEPRING:
155  for (i = 0; i < A->steps; i++) {
156  amountSizeR += A->nR[i];
157  amountSizeS += A->nS[i];
158  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
159  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
160  }
161  break;
162  //==========================End modification
163  case RING:
164  for (i = 0; i < A->steps; i++) {
165  amountSizeR += A->nR[i];
166  amountSizeS += A->nS[i];
167  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
168  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
169  }
170  break;
171  case NONBLOCKING:
172  for (i = 0; i < A->steps; i++) {
173  amountSizeR += A->nR[i];
174  amountSizeS += A->nS[i];
175  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
176  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
177  }
178  break;
179  case NOEMPTY:
180  for (i = 0; i < A->steps; i++) {
181  amountSizeR += A->nR[i];
182  amountSizeS += A->nS[i];
183  if (A->nR[i] > maxSizeR) maxSizeR = A->nR[i];
184  if (A->nS[i] > maxSizeS) maxSizeS = A->nS[i];
185  }
186  break;
187  case ALLTOALLV: // added -- rs 2015/02/04
188  for (i = 0; i < A->steps; i++) {
189  amountSizeR += A->nR[i];
190  amountSizeS += A->nS[i];
191  }
192  break;
193  case ALLREDUCE:
194  amountSizeR = A->com_count;
195  amountSizeS = A->com_count;
196  maxSizeR = A->com_count;
197  maxSizeS = A->com_count;
198  break;
199  }
200 
201  if (A->flag != ALLREDUCE && A->flag != ALLTOALLV) {
202  double *t = NULL;
203 
204  t = (double *) malloc(A->steps * sizeof(double));
205  // Copy int array into double array
206  for (i = 0; i < A->steps; i++) t[i] = A->nS[i];
207 
208  MPI_Reduce(t, amountSizeByStep, A->steps, MPI_DOUBLE, MPI_SUM, 0, comm);
209 
210  free(t);
211 
212  if (rank == 0) {
213  stepSum = minStep = maxStep = amountSizeByStep[0];
214  printf("\n[MEMORY]Step n°%4d, message size : %e", 0,
215  amountSizeByStep[0]);
216  for (i = 1; i < A->steps; i++) {
217  printf("\n[MEMORY]Step n°%4d, message size : %e", i,
218  amountSizeByStep[i]);
219  if (minStep > amountSizeByStep[i])
220  minStep = amountSizeByStep[i];
221  else if (maxStep < amountSizeByStep[i])
222  maxStep = amountSizeByStep[i];
223  stepSum += amountSizeByStep[i];
224  }
225  stepAvg = stepSum / A->steps;
226  }
227  }
228  s[0] = amountSizeR;
229  s[1] = amountSizeS;
230  s[2] = maxSizeR;
231  s[3] = maxSizeS;
232  MPI_Reduce(s, r, 4, MPI_DOUBLE, MPI_SUM, 0, comm);
233  if (rank == 0)
234  for (i = 0; i < 4; i++) r[i] /= size;
235  MPI_Reduce(s, &r[4], 4, MPI_DOUBLE, MPI_MIN, 0, comm);
236  MPI_Reduce(s, &r[8], 4, MPI_DOUBLE, MPI_MAX, 0, comm);
237  if (rank == 0) {
238  printf("\n[MEMORY]Step average : %e\t[%e,%e]", stepAvg,
239  minStep, maxStep);
240  printf("\n[MEMORY]Amount of data received : %e\t[%e,%e]", r[0], r[4],
241  r[8]);
242  printf("\n[MEMORY]Amount of data sent : %e\t[%e,%e]", r[1], r[5],
243  r[9]);
244  printf("\n[MEMORY]Message size received : %e\t[%e,%e]", r[2], r[6],
245  r[10]);
246  printf("\n[MEMORY]Message size sent : %e\t[%e,%e]\n", r[3], r[7],
247  r[11]);
248  }
249  free(s);
250  free(r);
251  free(amountSizeByStep);
252 #endif
253 }
254 
255 void MatReset(Mat *A) {
256 #if W_MPI
257  switch (A->flag) {
258  case NONE:
259  break;
260  case BUTTERFLY:
261  free(A->R); //
262  free(A->nR); //
263  free(A->S); //
264  free(A->nS);
265  break;
266  case BUTTERFLY_BLOCKING_1:
267  free(A->R); //
268  free(A->nR); //
269  free(A->S); //
270  free(A->nS);
271  break;
272  case BUTTERFLY_BLOCKING_2:
273  free(A->R); //
274  free(A->nR); //
275  free(A->S); //
276  free(A->nS);
277  break;
278  case NOEMPTYSTEPRING:
279  free(A->R); //
280  free(A->nR); //
281  free(A->S); //
282  free(A->nS);
283  break;
284  case RING:
285  free(A->R); //
286  free(A->nR); //
287  free(A->S); //
288  free(A->nS);
289  break;
290  case NONBLOCKING:
291  free(A->R); //
292  free(A->nR); //
293  free(A->S); //
294  free(A->nS);
295  break;
296  case NOEMPTY:
297  free(A->R); //
298  free(A->nR); //
299  free(A->S); //
300  free(A->nS);
301  break;
302  case ALLTOALLV: // added -- rs 2015/02/04
303  free(A->R); //
304  free(A->nR); //
305  free(A->S); //
306  free(A->nS);
307  break;
308  case ALLREDUCE:
309  break;
310  }
311 #endif
312 }
313 
314 //===================End
315 
323 void MatFree(Mat *A) {
324 
325  // get information about communication size
326  CommInfo(A);
327 
328  free(A->lindices);
329 #if W_MPI
330  switch (A->flag) {
331  case NONE:
332  break;
333  case BUTTERFLY:
334  free(A->com_indices); //
335  free(A->R); //
336  free(A->nR); //
337  free(A->S); //
338  free(A->nS);
339  break;
340  //==========================Modification added by Sebastien Cayrols :
341  // 01/09/2015 , Berkeley
342  case BUTTERFLY_BLOCKING_1:
343  free(A->com_indices); //
344  free(A->R); //
345  free(A->nR); //
346  free(A->S); //
347  free(A->nS);
348  break;
349  case BUTTERFLY_BLOCKING_2:
350  free(A->com_indices); //
351  free(A->R); //
352  free(A->nR); //
353  free(A->S); //
354  free(A->nS);
355  break;
356  case NOEMPTYSTEPRING:
357  free(A->R); //
358  free(A->nR); //
359  free(A->S); //
360  free(A->nS);
361  break;
362  //==========================End modification
363  case RING:
364  free(A->R); //
365  free(A->nR); //
366  free(A->S); //
367  free(A->nS);
368  break;
369  case NONBLOCKING:
370  free(A->R); //
371  free(A->nR); //
372  free(A->S); //
373  free(A->nS);
374  break;
375  case NOEMPTY:
376  free(A->R); //
377  free(A->nR); //
378  free(A->S); //
379  free(A->nS);
380  break;
381  case ALLTOALLV: // Added: rs 2015/02/04
382  free(A->R); //
383  free(A->nR); //
384  free(A->S); //
385  free(A->nS);
386  break;
387  case ALLREDUCE:
388  free(A->com_indices); //
389  //===================================Modification
390  // from Sebastien Cayrols : comment of these
391  // lines to avoid SEGSIGV
392  // free(A->R); //
393  // free(A->nR); //
394  // free(A->S); //
395  // free(A->nS);
396  //===================================End modif
397  break;
398  }
399 #endif
400 }
401 
414 int MatLoad(Mat *mat, char *filename) {
415  int err;
416  int rank;
417 #if W_MPI
418  MPI_Comm_rank(mat->comm, &rank);
419 #else
420  rank = 0;
421 #endif
422  FILE *in;
423  char fn[100];
424  int i = 0;
425  sprintf(fn, "%s_%d.dat", filename, rank);
426  printf("%s", fn);
427  in = fopen(fn, "r");
428  if (in == NULL) {
429  printf("cannot open file %s", fn);
430  return 1;
431  }
432  while (feof(in) == 0 && i < (mat->m * mat->nnz)) {
433  if (mat->nnz == 1) {
434  fscanf(in, "%d %lf", &(mat->indices[i]), &(mat->values[i]));
435  } else if (mat->nnz == 2) {
436  fscanf(in, "%d %lf %d %lf", &(mat->indices[i]), &(mat->values[i]),
437  &(mat->indices[i + 1]), &(mat->values[i + 1]));
438  } else {
439  return 1; //(nnz > 2) not implement
440  }
441  i += mat->nnz;
442  }
443  if (i != mat->m * mat->nnz) { printf("WARNNING data size doesn't fit\n"); }
444  fclose(in);
445  return 0;
446 }
447 
460 int MatSave(Mat *mat, char *filename) {
461  FILE *out;
462  char fn[100];
463  int i, j;
464  int rank;
465 #if W_MPI
466  MPI_Comm_rank(mat->comm, &rank);
467 #else
468  sprintf(fn, "%s_%d.dat", filename, rank);
469 #endif
470  out = fopen(fn, "w");
471  if (out == NULL) {
472  printf("cannot open file %s", fn);
473  return 1;
474  }
475  for (i = 0; i < (mat->nnz * mat->m); i += mat->nnz) {
476  for (j = 0; j < mat->nnz; j++) {
477  fprintf(out, "%d ", mat->indices[i + j]);
478  fprintf(out, "%f ", mat->values[i + j]);
479  }
480  fprintf(out, "\n");
481  }
482  fclose(out);
483  return 0;
484 }
485 
/**
 * Compute the local column structure of A: build A->lindices, the sorted
 * set of column indices appearing in A->indices (size stored in
 * A->lcount, as returned by ssort), then rewrite A->indices as positions
 * into that set via sindex.
 *
 * @param A     matrix with m, nnz and indices already set
 * @param sflag sorting method forwarded to ssort (MatInit passes 3 =
 *              counting sort)
 * @return always 0
 */
int MatLocalShape(Mat *A, int sflag) {
    int *tmp_indices;

    tmp_indices = (int *) malloc(
            (int64_t) (A->m) * A->nnz
            * sizeof(int)); // allocate a tmp copy of indices tab to sort
    memcpy(tmp_indices, A->indices,
           (int64_t) (A->m) * A->nnz * sizeof(int)); // copy

    // A->lcount = omp_psort(tmp_indices, A->m * A->nnz, sflag);
    // //sequential sort tmp_indices
    A->lcount = ssort(tmp_indices, A->m * A->nnz,
                      sflag); // sequential sort tmp_indices

    A->lindices = (int *) malloc(A->lcount * sizeof(int));
    memcpy(A->lindices, tmp_indices,
           A->lcount * sizeof(int)); // copy tmp_indices into lindices and free
    free(tmp_indices);

    // remap A->indices to positions inside the sorted lindices table
    sindex(A->lindices, A->lcount, A->indices, A->nnz * A->m);

    // check for masked pixels: a negative smallest index marks trash pixels
    if (A->lindices[0] < 0) { A->trash_pix = 1; }

    return 0;
}
522 
#ifdef W_MPI
/* Allocate the per-step send/receive map tables of A (helper for
 * MatComShape). */
static void alloc_step_tabs(Mat *A) {
    A->S = (int **) malloc(A->steps * sizeof(int *)); // sending maps tab
    A->R = (int **) malloc(A->steps * sizeof(int *)); // receiving maps tab
    A->nS = (int *) malloc(A->steps * sizeof(int));   // sending map sizes tab
    A->nR = (int *) malloc(A->steps * sizeof(int));   // receiving map sizes
}

/* Common setup for every ring-like scheme: build the per-step maps and use
 * the unmasked part of the local index table directly as com_indices
 * (which is why MatFree must not free com_indices for these schemes). */
static void ring_shape(Mat *A) {
    ring_init(A->lindices + (A->nnz) * (A->trash_pix),
              A->lcount - (A->nnz) * (A->trash_pix), A->R, A->nR, A->S,
              A->nS, A->steps, A->comm);
    A->com_count = A->lcount - (A->nnz) * (A->trash_pix);
    A->com_indices = A->lindices + (A->nnz) * (A->trash_pix);
}

/**
 * Build the communication scheme of A for the given flag over comm:
 * number of steps, per-step send/receive maps, and the shared index table
 * (com_indices/com_count) used by greedyreduce.
 *
 * Butterfly schemes require a power-of-two communicator size; otherwise
 * the flag silently falls back to RING (original behavior, kept).
 *
 * Fix: the three identical butterfly cases and the five identical
 * ring-like cases are collapsed into fallthrough groups over two static
 * helpers; a `default` label is added.  Behavior is unchanged.
 *
 * @return always 0
 */
int MatComShape(Mat *A, int flag, MPI_Comm comm) {
    int size;
    int i, min, max, j;
    A->comm = comm; // set communicator
    A->flag = flag;
    MPI_Comm_size(A->comm, &size);
    if ((A->flag == BUTTERFLY || A->flag == BUTTERFLY_BLOCKING_1
         || A->flag == BUTTERFLY_BLOCKING_2)
        && is_pow_2(size) != 0)
        A->flag = RING; // fallback when size is not a power of two
    switch (A->flag) {
        case BUTTERFLY:            /* fallthrough: all butterfly variants */
        case BUTTERFLY_BLOCKING_1: /* share the same initialization       */
        case BUTTERFLY_BLOCKING_2:
            A->steps = log_2(size);
            alloc_step_tabs(A);
            butterfly_init(A->lindices + (A->nnz) * (A->trash_pix),
                           A->lcount - (A->nnz) * (A->trash_pix), A->R,
                           A->nR, A->S, A->nS, &(A->com_indices),
                           &(A->com_count), A->steps, A->comm);
            break;
        case NOEMPTYSTEPRING: /* fallthrough: all ring-like variants */
        case RING:            /* share the same initialization       */
        case NONBLOCKING:
        case NOEMPTY:
        case ALLTOALLV:
            A->steps = size;
            alloc_step_tabs(A);
            ring_shape(A);
            break;
        case ALLREDUCE:
            // communicate over the contiguous index range [min, max]
            MPI_Allreduce(&(A->lindices[A->lcount - 1]), &max, 1, MPI_INT,
                          MPI_MAX,
                          A->comm); // maximum index
            MPI_Allreduce(&(A->lindices[(A->nnz) * (A->trash_pix)]), &min, 1,
                          MPI_INT, MPI_MIN,
                          A->comm); // minimum (unmasked) index
            A->com_count = (max - min + 1);
            A->com_indices =
                    (int *) malloc((A->lcount - (A->nnz) * (A->trash_pix))
                                   * sizeof(int)); // warning
            i = (A->nnz) * (A->trash_pix);
            j = 0;
            // same as subsetmap for a contiguous set: map each local index
            // to its offset j inside [min, max]
            while (j < A->com_count && i < A->lcount) {
                if (min + j < A->lindices[i]) {
                    j++;
                } else {
                    A->com_indices[i - (A->nnz) * (A->trash_pix)] = j;
                    i++;
                    j++;
                }
            }
            break;
        default:
            break;
    }
    return 0;
}
#endif
697 
704 int MatVecProd(Mat *A, double *x, double *y, int pflag) {
705  int i, j, e; // indexes
706 
707  // set output vector to zero
708  for (i = 0; i < A->m; i++) y[i] = 0.0;
709 
710  e = 0;
711  if (A->trash_pix) {
712  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
713  if (A->indices[i] != 0) {
714  for (j = 0; j < A->nnz; j++) {
715  y[e] += A->values[i + j] * x[A->indices[i + j] - (A->nnz)];
716  }
717  }
718  e++;
719  }
720  } else {
721  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
722  for (j = 0; j < A->nnz; j++) {
723  y[e] += A->values[i + j] * x[A->indices[i + j]];
724  }
725  e++;
726  }
727  }
728  return 0;
729 }
730 
#ifdef W_MPI
/**
 * Naive transposed matrix-vector product x = A^t * y with a global
 * ring-style reduction: each process local-reduces y into its column
 * values, then exchanges its full index/value tables with every other
 * process in turn and accumulates the received contributions.
 *
 * Fixes: rbuf and rbufvalues were malloc'd but never freed (leak on every
 * call); unused local `e` removed.
 *
 * @param A     pointer to the matrix
 * @param y     input vector of size A->m
 * @param x     output local map of size A->lcount, overwritten
 * @param pflag unused
 * @return always 0
 */
int TrMatVecProd_Naive(Mat *A, double *y, double *x, int pflag) {
    int i, j, rank, size;
    int *rbuf, rbufcount;
    double *rbufvalues, *lvalues;
    int p, rp, sp, tag;
    MPI_Request s_request, r_request;
    MPI_Status status;

    MPI_Comm_rank(A->comm, &rank); // get rank and size of the communicator
    MPI_Comm_size(A->comm, &size); //
    lvalues = (double *) malloc(
            A->lcount * sizeof(double)); // allocate and set local values to 0.0
    for (i = 0; i < A->lcount; i++) //
        lvalues[i] = 0.0; //

    for (i = 0; i < A->m; i++) { // local transform reduces
        for (j = 0; j < A->nnz; j++) { //
            lvalues[A->indices[i * A->nnz + j]] +=
                    (A->values[i * A->nnz + j]) * y[i];
        }
    }

    memcpy(x, lvalues,
           (A->lcount) * sizeof(double)); // copy local values into the result*/
    MPI_Allreduce(
            &(A->lcount), &(rbufcount), 1, MPI_INT, MPI_MAX,
            A->comm); // find the max communication buffer sizes, and allocate

    rbuf = (int *) malloc(rbufcount * sizeof(int));
    rbufvalues = (double *) malloc(rbufcount * sizeof(double));

    tag = 0;
    for (p = 1; p < size;
         p++) { // loop : collective global reduce in ring-like fashion
        rp = (size + rank - p) % size;
        sp = (rank + p) % size;
        MPI_Send(&(A->lcount), 1, MPI_INT, sp, 0, A->comm); // exchange sizes
        MPI_Recv(&rbufcount, 1, MPI_INT, rp, 0, A->comm, &status);
        tag++;
        MPI_Irecv(rbuf, rbufcount, MPI_INT, rp, tag, A->comm,
                  &r_request); // exchange local indices
        MPI_Isend(A->lindices, A->lcount, MPI_INT, sp, tag, A->comm,
                  &s_request);
        MPI_Wait(&r_request, &status);
        MPI_Wait(&s_request, &status);
        tag++;
        MPI_Irecv(rbufvalues, rbufcount, MPI_DOUBLE, rp, tag, A->comm,
                  &r_request); // exchange local values
        MPI_Isend(lvalues, A->lcount, MPI_DOUBLE, sp, tag, A->comm, &s_request);
        tag++;
        MPI_Wait(&r_request, &status);
        m2m_sum(rbufvalues, rbuf, rbufcount, x, A->lindices,
                A->lcount); // sum in the result
        MPI_Wait(&s_request, &status);
    }
    free(rbuf);       // were leaked before this fix
    free(rbufvalues); //
    free(lvalues);
    return 0;
}
#endif
804 
819 int TrMatVecProd(Mat *A, double *y, double *x, int pflag) {
820  // double *sbuf, *rbuf;
821  int i, j, k, e;
822  // int nSmax, nRmax;
823  // double *lvalues;
824 
825  if (A->trash_pix) {
826  // refresh output vector
827  for (i = 0; i < A->lcount - A->nnz; i++) x[i] = 0.0;
828 
829  e = 0;
830  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
831  if (A->indices[i] != 0) {
832  // local transform reduce
833  for (j = 0; j < A->nnz; j++) {
834  x[A->indices[i + j] - (A->nnz)] +=
835  A->values[i + j] * y[e]; //
836  }
837  }
838  e++;
839  }
840  } else {
841  // refresh output vector
842  for (i = 0; i < A->lcount; i++) x[i] = 0.0;
843 
844  e = 0;
845  for (i = 0; i < A->m * A->nnz; i += A->nnz) {
846  // local transform reduce
847  for (j = 0; j < A->nnz; j++) {
848  x[A->indices[i + j]] += A->values[i + j] * y[e];
849  }
850  e++;
851  }
852  }
853 
854 #ifdef W_MPI
855  // perform global reduce
856  greedyreduce(A, x);
857 #endif
858  return 0;
859 }
860 
#ifdef W_MPI
/**
 * Print information about a matrix into "<filename>_info.txt": the
 * communication flag, rows per process, nnz, the communication sparsity
 * (number of empty send steps summed over processes), the total amount of
 * data sent, and the sum over steps of the maximum per-process message
 * size.  Collective over mat->comm; only the master rank writes the file.
 *
 * Fixes: removed the large blocks of commented-out dead code and the
 * locals (n, sr, s, j, k) that only they referenced.
 *
 * @param mat      matrix to describe (communication scheme must be built)
 * @param verbose  currently unused; historically enabled a send/receive
 *                 matrix dump
 * @param filename prefix of the output file
 * @return 0 on success, 1 if the master cannot open the output file
 */
int MatInfo(Mat *mat, int verbose, char *filename) {
    FILE *out = NULL;
    int nnzline, sparsity, maxstep, maxsize, sumline, total;
    int i;
    char fn[100];
    int rank, size;
    int master = 0;

    (void) verbose; // retained for interface compatibility only

    MPI_Comm_rank(mat->comm, &rank);
    MPI_Comm_size(mat->comm, &size);

    if (rank == master) { // master process saves data into filename_info.txt
        sprintf(fn, "%s_%s", filename, "info.txt");
        out = fopen(fn, "w");
        if (out == NULL) {
            printf("cannot open file %s\n", fn);
            // NOTE(review): the other ranks still enter the reduces below;
            // an early master return can leave them blocked — confirm the
            // intended failure behavior (unchanged from the original).
            return 1;
        }
        printf("open file %s ...", fn);
        fprintf(out, "flag %d\n",
                mat->flag); // matrix main description: communication scheme
        fprintf(out, "rows %d\n ", mat->m); // rows per process,
        fprintf(out, "nnz %d\n", mat->nnz); // nnz (number of non zero per row).
        fprintf(out, "\n"); // separator
    }

    // compute communication sparsity (empty send steps) and total data sent
    nnzline = 0;
    sumline = 0;
    for (i = 0; i < mat->steps; i++) {
        sumline += mat->nS[i];
        if (mat->nS[i] == 0) { nnzline += 1; }
    }
    MPI_Reduce(&nnzline, &sparsity, 1, MPI_INT, MPI_SUM, 0,
               mat->comm); // sparsity
    MPI_Reduce(&sumline, &total, 1, MPI_INT, MPI_SUM, 0, mat->comm); // total
    if (rank == master) { // master process saves data into filename_info.txt
        fprintf(out, "sparsity %d\n", sparsity);
        fprintf(out, "total %d\n", total);
    }

    // sum over steps of the per-step maximum message size across processes
    maxsize = 0;
    for (i = 0; i < mat->steps; i++) {
        MPI_Reduce(&(mat->nS[i]), &maxstep, 1, MPI_INT, MPI_MAX, 0,
                   mat->comm); // maximum message size
        maxsize += maxstep;
    }
    if (rank == master) { // master process saves data into filename_info.txt
        fprintf(out, "maxsize %d\n ", maxsize);
        fprintf(out, "\n"); // separator
    }

    if (rank == master) { // master process saves data into filename_info.txt
        fclose(out);
        printf("close %s\n", fn);
    }
    return 0;
}
#endif
1006 
#ifdef W_MPI
/**
 * Combine the local overlapped values of x across all processes according
 * to the communication scheme of A, writing the globally reduced map back
 * into x.
 *
 * Fixes: nRmax/nSmax were read before any assignment in the butterfly and
 * ring branches ("if (A->nR[k] > nRmax)") — undefined behavior; they are
 * now initialized to 0.  BUTTERFLY_BLOCKING_1/2 had byte-identical bodies
 * (both call butterfly_blocking_1instr_reduce) and are collapsed; unused
 * local `j` removed; `#ifdef` used for consistency.
 *
 * @param A matrix whose communication scheme has been built (MatComShape)
 * @param x local values of size A->lcount - nnz*trash_pix, overwritten
 *          with the reduced result
 * @return always 0
 */
int greedyreduce(Mat *A, double *x) {
    int i, k;
    int nSmax = 0, nRmax = 0; // max comm buffer sizes (was uninitialized)
    int nStot, nRtot;
    double *lvalues;
    double *com_val;
    double *out_val;
    int ne = 0;

    lvalues = (double *) malloc(
            (A->lcount - (A->nnz) * (A->trash_pix))
            * sizeof(double)); // local copy of the unmasked input values
    memcpy(lvalues, x,
           (A->lcount - (A->nnz) * (A->trash_pix))
           * sizeof(double)); // copy local values into result values

    switch (A->flag) {
        case BUTTERFLY:
            for (k = 0; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            for (k = 0; k < A->steps; k++)
                if (A->nS[k] > nSmax) nSmax = A->nS[k];
            com_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) com_val[i] = 0.0;
            m2m(lvalues, A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix), com_val,
                A->com_indices, A->com_count);
            butterfly_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax, com_val,
                             A->steps, A->comm);
            m2m(com_val, A->com_indices, A->com_count, x,
                A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix));
            free(com_val);
            break;
        case BUTTERFLY_BLOCKING_1: /* fallthrough: both blocking variants */
        case BUTTERFLY_BLOCKING_2: /* had identical bodies                */
            // NOTE(review): BUTTERFLY_BLOCKING_2 also called the
            // 1-instruction reduce in the original code — confirm intended.
            for (k = 0; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            for (k = 0; k < A->steps; k++)
                if (A->nS[k] > nSmax) nSmax = A->nS[k];
            com_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) com_val[i] = 0.0;
            m2m(lvalues, A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix), com_val,
                A->com_indices, A->com_count);
            butterfly_blocking_1instr_reduce(A->R, A->nR, nRmax, A->S, A->nS,
                                             nSmax, com_val, A->steps,
                                             A->comm);
            m2m(com_val, A->com_indices, A->com_count, x,
                A->lindices + (A->nnz) * (A->trash_pix),
                A->lcount - (A->nnz) * (A->trash_pix));
            free(com_val);
            break;
        case NOEMPTYSTEPRING:
            for (k = 1; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            nSmax = nRmax;
            ring_noempty_step_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax,
                                     lvalues, x, A->steps, A->comm);
            break;
        case RING:
            for (k = 1; k < A->steps;
                 k++) // compute max communication buffer size
                if (A->nR[k] > nRmax) nRmax = A->nR[k];
            nSmax = nRmax;
            ring_reduce(A->R, A->nR, nRmax, A->S, A->nS, nSmax, lvalues, x,
                        A->steps, A->comm);
            break;
        case NONBLOCKING:
            ring_nonblocking_reduce(A->R, A->nR, A->S, A->nS, lvalues, x,
                                    A->steps, A->comm);
            break;
        case NOEMPTY:
            for (k = 1; k < A->steps; k++)
                if (A->nR[k] != 0) ne++;
            ring_noempty_reduce(A->R, A->nR, ne, A->S, A->nS, ne, lvalues, x,
                                A->steps, A->comm);
            break;
        case ALLREDUCE:
            com_val = (double *) malloc(A->com_count * sizeof(double));
            out_val = (double *) malloc(A->com_count * sizeof(double));
            for (i = 0; i < A->com_count; i++) {
                com_val[i] = 0.0;
                out_val[i] = 0.0;
            }
            s2m(com_val, lvalues, A->com_indices,
                A->lcount - (A->nnz) * (A->trash_pix));
            MPI_Allreduce(com_val, out_val, A->com_count, MPI_DOUBLE,
                          MPI_SUM, A->comm); // global elementwise sum
            m2s(out_val, x, A->com_indices,
                A->lcount - (A->nnz) * (A->trash_pix)); // sum receive buffer
                                                        // into values
            free(com_val);
            free(out_val);
            break;
        case ALLTOALLV:
            nRtot = nStot = 0;
            for (k = 0; k < A->steps; k++) { // compute buffer sizes
                nRtot += A->nR[k]; // to receive
                nStot += A->nS[k]; // to send
            }
            alltoallv_reduce(A->R, A->nR, nRtot, A->S, A->nS, nStot, lvalues,
                             x, A->steps, A->comm);
            break;
        default:
            break;
    }
    free(lvalues);
    return 0;
}
#endif
void m2s(double *mapval, double *submapval, int *subset, int count)
Definition: alm.c:27
int m2m_sum(double *vA1, int *A1, int n1, double *vA2, int *A2, int n2)
Definition: alm.c:120
int m2m(double *vA1, int *A1, int n1, double *vA2, int *A2, int n2)
Definition: alm.c:97
void s2m(double *mapval, double *submapval, int *subset, int count)
assign submap values the submap values array
Definition: alm.c:64
int is_pow_2(int n)
Definition: bitop.c:13
int log_2(int n)
Definition: bitop.c:26
int butterfly_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
Definition: butterfly.c:236
int butterfly_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int **com_indices, int *com_count, int steps, MPI_Comm comm)
Initialize tables for butterfly-like communication scheme This routine set up needed tables for the b...
Definition: butterfly.c:52
int butterfly_blocking_1instr_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a butterfly-like communication scheme.
Definition: butterfly.c:340
int sindex(int *T, int nT, int *A, int nA)
Definition: cindex.c:34
int ssort(int *indices, int count, int flag)
Definition: csort.c:172
int TrMatVecProd(Mat *A, double *y, double *x, int pflag)
Definition: mapmat.c:819
int MatInit(Mat *A, int m, int nnz, int *indices, double *values, int flag #ifdef W_MPI, MPI_Comm comm #endif)
Definition: mapmat.c:54
int TrMatVecProd_Naive(Mat *A, double *y, double *x, int pflag)
Definition: mapmat.c:744
void MatFree(Mat *A)
Definition: mapmat.c:323
int MatLoad(Mat *mat, char *filename)
Definition: mapmat.c:414
int MatComShape(Mat *A, int flag, MPI_Comm comm)
Definition: mapmat.c:530
void MatReset(Mat *A)
Definition: mapmat.c:255
int MatSave(Mat *mat, char *filename)
Definition: mapmat.c:460
int greedyreduce(Mat *A, double *x)
Definition: mapmat.c:1008
int MatInfo(Mat *mat, int verbose, char *filename)
Print information about a matrix. Usefull function to check, debug or bench. It prints matrix array...
Definition: mapmat.c:867
void MatSetIndices(Mat *A, int m, int nnz, int *indices)
Definition: mapmat.c:83
void CommInfo(Mat *A)
Definition: mapmat.c:106
void MatSetValues(Mat *A, int m, int nnz, double *values)
Definition: mapmat.c:97
int MatLocalShape(Mat *A, int sflag)
Definition: mapmat.c:496
int MatVecProd(Mat *A, double *x, double *y, int pflag)
Definition: mapmat.c:704
int ring_noempty_step_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like communication scheme.
Definition: ring.c:380
int alltoallv_reduce(int **R, int *nR, int nRtot, int **S, int *nS, int nStot, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using an MPI-Alltoallv call.
Definition: ring.c:157
int ring_nonblocking_reduce(int **R, int *nR, int **S, int *nS, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like non-blocking communication sch...
Definition: ring.c:234
int ring_noempty_reduce(int **R, int *nR, int nneR, int **S, int *nS, int nneS, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like non-blocking no-empty communic...
Definition: ring.c:298
int ring_init(int *indices, int count, int **R, int *nR, int **S, int *nS, int steps, MPI_Comm comm)
Initialize tables for ring-like communication scheme.
Definition: ring.c:48
int ring_reduce(int **R, int *nR, int nRmax, int **S, int *nS, int nSmax, double *val, double *res_val, int steps, MPI_Comm comm)
Perform a sparse sum reduction (or mapped reduction) using a ring-like communication scheme.
Definition: ring.c:109