cs_parall.h
#ifndef __CS_PARALL_H__
#define __CS_PARALL_H__

/*============================================================================
 * Functions dealing with parallelism
 *============================================================================*/

/*
  This file is part of code_saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2024 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/*----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------
 * Local headers
 *----------------------------------------------------------------------------*/

#include "cs_defs.h"
#include "cs_execution_context.h"

/*----------------------------------------------------------------------------*/

BEGIN_C_DECLS

/*============================================================================
 * General types and macros used throughout code_saturne
 *============================================================================*/

/*----------------------------------------------------------------------------
 * Variable value type.
 *----------------------------------------------------------------------------*/

/*! Algorithm types for indexed (element-to-node) sums */

typedef enum {

  CS_E2N_SUM_SCATTER,         /*!< sum using a scatter (direct add) approach */
  CS_E2N_SUM_SCATTER_ATOMIC,  /*!< sum using a scatter approach with atomic adds */
  CS_E2N_SUM_GATHER           /*!< sum using a gather approach */

} cs_e2n_sum_t;

/*============================================================================
 * Global variables
 *============================================================================*/

/* Preferred indexed sum option, adapted to shared-memory parallelism */

extern cs_e2n_sum_t cs_glob_e2n_sum_type;

/*=============================================================================
 * Public function prototypes
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum values of a counter on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_counter(cs_gnum_t  cpt[],
                  const int  n)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_GNUM, MPI_SUM,
                  cs_glob_mpi_comm);
  }
}

#else

void
cs_parall_counter(cs_gnum_t  cpt[],
                  const int  n);

#endif

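/*
  Example (illustrative sketch, not from the original header): summing a
  local event count over all ranks; "n_clipped" is a hypothetical variable.

    cs_gnum_t n_clipped = 12;          // local count on this rank
    cs_parall_counter(&n_clipped, 1);
    // n_clipped now holds the global sum on every rank
*/
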
/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum values of a counter on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_counter_max(cs_lnum_t  cpt[],
                      const int  n)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_LNUM, MPI_MAX,
                  cs_glob_mpi_comm);
  }
}

#else

void
cs_parall_counter_max(cs_lnum_t  cpt[],
                      const int  n);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum values of a given datatype on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_sum(int            n,
              cs_datatype_t  datatype,
              void          *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_SUM,
                  cs_glob_mpi_comm);
  }
}

#else

void
cs_parall_sum(int            n,
              cs_datatype_t  datatype,
              void          *val);

#endif

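/*
  Example (illustrative sketch): reducing two local double-precision values
  in one call; CS_DOUBLE is assumed to be the matching cs_datatype_t value
  from cs_defs.h, and the summed quantities are hypothetical.

    cs_real_t vals[2] = {cell_volume_sum, cell_mass_sum};
    cs_parall_sum(2, CS_DOUBLE, vals);   // vals now holds the global sums
*/
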
/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum values of a given datatype on all default communicator
 *        processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_max(int            n,
              cs_datatype_t  datatype,
              void          *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MAX,
                  cs_glob_mpi_comm);
  }
}

#else

void
cs_parall_max(int            n,
              cs_datatype_t  datatype,
              void          *val);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum values of a given datatype on all default communicator
 *        processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_min(int            n,
              cs_datatype_t  datatype,
              void          *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MIN,
                  cs_glob_mpi_comm);
  }
}

#else

void
cs_parall_min(int            n,
              cs_datatype_t  datatype,
              void          *val);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Broadcast values of a given datatype to all default communicator
 *        processes.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI)

inline static void
cs_parall_bcast(int            root_rank,
                int            n,
                cs_datatype_t  datatype,
                void          *val)
{
  if (cs_glob_n_ranks > 1)
    MPI_Bcast(val, n, cs_datatype_to_mpi[datatype], root_rank,
              cs_glob_mpi_comm);
}

#else

#define cs_parall_bcast(_root_rank, _n, _datatype, _val);

#endif

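/*
  Example (illustrative sketch): broadcasting a value known on rank 0 only;
  CS_INT_TYPE is assumed to match a plain int here, and read_zone_count()
  is a hypothetical helper.

    int n_zones = 0;
    if (cs_glob_rank_id < 1)
      n_zones = read_zone_count();
    cs_parall_bcast(0, 1, CS_INT_TYPE, &n_zones);
*/
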
/*----------------------------------------------------------------------------*/
/*!
 * \brief Build a global array from each local array in each domain.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_allgather_r(int        n_elts,
                      int        n_g_elts,
                      cs_real_t  array[],
                      cs_real_t  g_array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Build an ordered global array from each local array in each domain.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_allgather_ordered_r(int        n_elts,
                              int        n_g_elts,
                              int        stride,
                              cs_real_t  o_key[],
                              cs_real_t  array[],
                              cs_real_t  g_array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Build a global array on the given root rank from all local arrays.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_gather_r(int              root_rank,
                   int              n_elts,
                   int              n_g_elts,
                   const cs_real_t  array[],
                   cs_real_t        g_array[]);

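/*
  Example (illustrative sketch): gathering per-rank probe values on rank 0;
  n_probes, n_g_probes, vals, and g_vals are hypothetical, with g_vals
  dimensioned to n_g_probes on the root rank.

    cs_parall_gather_r(0, n_probes, n_g_probes, vals, g_vals);
*/
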
/*----------------------------------------------------------------------------*/
/*!
 * \brief Build an ordered global array on the given root rank from all
 *        local arrays.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_gather_ordered_r(int        root_rank,
                           int        n_elts,
                           int        n_g_elts,
                           int        stride,
                           cs_real_t  o_key[],
                           cs_real_t  array[],
                           cs_real_t  g_array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Distribute a global array from a given root rank over all ranks.
 *        Each rank receives the part related to its local elements.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_scatter_r(int              root_rank,
                    int              n_elts,
                    int              n_g_elts,
                    const cs_real_t  g_array[],
                    cs_real_t        array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Build a global array on the given root rank from all local arrays.
 *        Function dealing with single-precision arrays.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_gather_f(int          root_rank,
                   int          n_elts,
                   int          n_g_elts,
                   const float  array[],
                   float        g_array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Distribute a global array from a given root rank over all ranks.
 *        Function dealing with single-precision arrays.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_scatter_f(int          root_rank,
                    int          n_elts,
                    int          n_g_elts,
                    const float  g_array[],
                    float        array[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum value of a real and the value of related array on all
 *        default communicator processes.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_max_loc_vals(int         n,
                       cs_real_t  *max,
                       cs_real_t   max_loc_vals[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum value of a real and the value of related array on all
 *        default communicator processes.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_min_loc_vals(int         n,
                       cs_real_t  *min,
                       cs_real_t   min_loc_vals[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Given an (id, rank, value) tuple, return the local id and rank
 *        corresponding to the global minimum value.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_min_id_rank_r(cs_lnum_t  *elt_id,
                        int        *rank_id,
                        cs_real_t   val);

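/*
  Example (illustrative sketch): locating the rank and local id of the
  globally smallest time step; local_dt_min and local_min_cell_id are
  hypothetical, and rank_id is assumed to be set by the call.

    cs_lnum_t elt_id = local_min_cell_id;
    int       rank_id;
    cs_parall_min_id_rank_r(&elt_id, &rank_id, local_dt_min);
    // elt_id is now the id, local to rank_id, holding the global minimum
*/
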
/*----------------------------------------------------------------------------*/
/*!
 * \brief Return minimum recommended scatter or gather buffer size.
 */
/*----------------------------------------------------------------------------*/

size_t
cs_parall_get_min_coll_buf_size(void);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Define minimum recommended scatter or gather buffer size.
 */
/*----------------------------------------------------------------------------*/

void
cs_parall_set_min_coll_buf_size(size_t  buffer_size);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute recommended number of threads for a section.
 */
/*----------------------------------------------------------------------------*/

inline static int
cs_parall_n_threads(cs_lnum_t  n_elements,
                    cs_lnum_t  min_thread_elements)
{
#if defined(HAVE_OPENMP)
  int n_t = omp_get_max_threads();
  int n_t_l = n_elements / min_thread_elements;
  if (n_t_l < n_t)
    n_t = n_t_l;
  if (n_t < 1)
    n_t = 1;
  return n_t;
#else
  CS_UNUSED(n_elements);  /* avoid compiler warning */
  CS_UNUSED(min_thread_elements);
  return 1;
#endif
}

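/*
  Example (illustrative sketch): capping the thread count for a small loop;
  CS_THR_MIN (from cs_defs.h) is assumed as the minimum chunk size per thread.

    #pragma omp parallel for num_threads(cs_parall_n_threads(n, CS_THR_MIN))
    for (cs_lnum_t i = 0; i < n; i++)
      y[i] += a*x[i];
*/
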
/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute array index bounds for a local thread.
 *
 * When called inside an OpenMP parallel section, this returns the start
 * and past-the-end indexes for the current thread.
 */
/*----------------------------------------------------------------------------*/

inline static void
cs_parall_thread_range(cs_lnum_t   n,
                       size_t      type_size,
                       cs_lnum_t  *s_id,
                       cs_lnum_t  *e_id)
{
#if defined(HAVE_OPENMP)
  const int t_id = omp_get_thread_num();
  const int n_t = omp_get_num_threads();
  const cs_lnum_t t_n = (n + n_t - 1) / n_t;
  const cs_lnum_t cl_m = CS_CL_SIZE / type_size;  /* Cache line multiple */

  *s_id =  t_id    * t_n;
  *e_id = (t_id+1) * t_n;
  *s_id = cs_align(*s_id, cl_m);
  *e_id = cs_align(*e_id, cl_m);
  if (*e_id > n) *e_id = n;
#else
  CS_UNUSED(type_size);  /* avoid compiler warning */
  *s_id = 0;
  *e_id = n;
#endif
}

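/*
  Example (illustrative sketch): cache-line-aligned partitioning of an
  array of cs_real_t values over the current OpenMP team.

    #pragma omp parallel
    {
      cs_lnum_t s_id, e_id;
      cs_parall_thread_range(n, sizeof(cs_real_t), &s_id, &e_id);
      for (cs_lnum_t i = s_id; i < e_id; i++)
        x[i] = 0.;
    }
*/
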
/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute array index bounds for a local thread for upper triangular
 *        matrix elements.
 */
/*----------------------------------------------------------------------------*/

inline static void
cs_parall_thread_range_upper(cs_lnum_t   n,
                             size_t      type_size,
                             cs_lnum_t  *s_id,
                             cs_lnum_t  *e_id)
{
#if defined(HAVE_OPENMP)
  const int t_id = omp_get_thread_num();
  const int n_t = omp_get_num_threads();
  const cs_lnum_t cl_m = CS_CL_SIZE / type_size;  /* Cache line multiple */

  double r0 = (double)t_id / (double)n_t;
  double r1 = (double)(t_id+1) / (double)n_t;

  r0 = r0*r0;
  r1 = r1*r1;

  const cs_lnum_t t_0 = r0*n;
  const cs_lnum_t t_1 = r1*n;

  *s_id = t_0;
  *e_id = t_1;
  *s_id = cs_align(*s_id, cl_m);
  *e_id = cs_align(*e_id, cl_m);
  if (*e_id > n) *e_id = n;
#else
  CS_UNUSED(type_size);  /* avoid compiler warning */
  *s_id = 0;
  *e_id = n;
#endif
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute number of blocks needed for a given array and block sizes.
 */
/*----------------------------------------------------------------------------*/

static inline size_t
cs_parall_block_count(size_t  n,
                      size_t  block_size)
{
  return (n % block_size) ? n/block_size + 1 : n/block_size;
}

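/*
  Example (illustrative sketch): the count rounds up, so 1000 elements in
  blocks of 256 need 4 blocks (3*256 = 768 < 1000 <= 4*256 = 1024).

    size_t n_blocks = cs_parall_block_count(1000, 256);   // n_blocks == 4
*/
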
/*----------------------------------------------------------------------------*/

END_C_DECLS

#if defined(__cplusplus)

/*=============================================================================
 * Public C++ functions
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum values of a counter on all processes associated with the given
 *        execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_counter(const cs_execution_context  *ec,
                  cs_gnum_t                    cpt[],
                  const int                    n)
{
  if (ec->use_mpi()) {
    MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_GNUM, MPI_SUM,
                  ec->comm());
  }
}

#else

void
cs_parall_counter(const cs_execution_context  *ec,
                  cs_gnum_t                    cpt[],
                  const int                    n);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum values of a counter on all processes associated with the
 *        given execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_counter_max(const cs_execution_context  *ec,
                      cs_lnum_t                    cpt[],
                      const int                    n)
{
  if (ec->use_mpi()) {
    MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_LNUM, MPI_MAX,
                  ec->comm());
  }
}

#else

void
cs_parall_counter_max(const cs_execution_context  *ec,
                      cs_lnum_t                    cpt[],
                      const int                    n);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum values of a given datatype on all processes associated with
 *        the given execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_sum(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val)
{
  if (ec->use_mpi()) {
    MPI_Allreduce(MPI_IN_PLACE, val, n,
                  cs_datatype_to_mpi[datatype], MPI_SUM,
                  ec->comm());
  }
}

#else

void
cs_parall_sum(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val);

#endif

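/*
  Example (illustrative sketch): reduction over the communicator of an
  execution context; cs_execution_context_glob_get() is assumed to be
  available from cs_execution_context.h, and CS_DOUBLE to be the matching
  cs_datatype_t value.

    const cs_execution_context *ec = cs_execution_context_glob_get();
    cs_real_t v = local_contribution;   // hypothetical local value
    cs_parall_sum(ec, 1, CS_DOUBLE, &v);
*/
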
/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum values of a given datatype on all processes associated
 *        with the given execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_max(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val)
{
  if (ec->use_mpi()) {
    MPI_Allreduce(MPI_IN_PLACE, val, n,
                  cs_datatype_to_mpi[datatype], MPI_MAX,
                  ec->comm());
  }
}

#else

void
cs_parall_max(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum values of a given datatype on all processes associated
 *        with the given execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI_IN_PLACE)

inline static void
cs_parall_min(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val)
{
  if (ec->use_mpi()) {
    MPI_Allreduce(MPI_IN_PLACE, val, n,
                  cs_datatype_to_mpi[datatype], MPI_MIN,
                  ec->comm());
  }
}

#else

void
cs_parall_min(const cs_execution_context  *ec,
              int                          n,
              cs_datatype_t                datatype,
              void                        *val);

#endif

/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute array index bounds for a local thread, given an explicit
 *        thread id and number of threads.
 */
/*----------------------------------------------------------------------------*/

inline static void
cs_parall_thread_range(cs_lnum_t   n,
                       size_t      type_size,
                       int         t_id,
                       int         n_t,
                       cs_lnum_t  *s_id,
                       cs_lnum_t  *e_id)
{
  const cs_lnum_t t_n = (n + n_t - 1) / n_t;
  const cs_lnum_t cl_m = CS_CL_SIZE / type_size;  /* Cache line multiple */

  *s_id =  t_id    * t_n;
  *e_id = (t_id+1) * t_n;
  *s_id = cs_align(*s_id, cl_m);
  *e_id = cs_align(*e_id, cl_m);
  if (*s_id > n) *s_id = n;
  if (*e_id > n) *e_id = n;
}

/*=============================================================================
 * Public C++ templates
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum scalar values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_sum_scalars
(
  T&        first,
  Vals&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel sum */
  if (n_vals == 0)
    cs_parall_sum(1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i+1] = *(_values[i]);

    cs_parall_sum(n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i+1];
  }

#endif // defined(HAVE_MPI)
}

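/*
  Example (illustrative sketch): the variadic template packs all scalars of
  a same type into a single reduction, so one MPI_Allreduce replaces three.

    cs_real_t a = wa, b = wb, c = wc;   // hypothetical local values
    cs_parall_sum_scalars(a, b, c);     // a, b, c now hold global sums
*/
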
/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum scalar values on all processes associated with the given
 *        execution context.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_sum_scalars
(
  const cs_execution_context  *ec,
  T&                           first,
  Vals&...                     values
)
{
#if defined(HAVE_MPI)

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel sum */
  if (n_vals == 0)
    cs_parall_sum(ec, 1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i+1] = *(_values[i]);

    cs_parall_sum(ec, n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i+1];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum strided values on all processes associated with the given
 *        execution context.
 */
/*----------------------------------------------------------------------------*/

template <int Stride, typename T, typename... Vals>
static void
cs_parall_sum_strided
(
  const cs_execution_context  *ec,
  T                            first[],
  Vals&&...                    values
)
{
#if defined(HAVE_MPI)

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel sum */
  if (n_vals == 0)
    cs_parall_sum(ec, Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_sum(ec, work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++) {
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
    }
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Sum strided values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <int Stride, typename T, typename... Vals>
static void
cs_parall_sum_strided
(
  T          first[],
  Vals&&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel sum */
  if (n_vals == 0)
    cs_parall_sum(Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_sum(work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++) {
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
    }
  }

#endif // defined(HAVE_MPI)
}

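/*
  Example (illustrative sketch): summing two 3-component vectors in one
  reduction; the stride is passed as a template argument and u, v are
  hypothetical local arrays.

    cs_real_t u[3], v[3];
    cs_parall_sum_strided<3>(u, v);   // u and v now hold global sums
*/
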
/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum of scalar values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_max_scalars
(
  T&        first,
  Vals&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel maximum */
  if (n_vals == 0)
    cs_parall_max(1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i+1] = *(_values[i]);

    cs_parall_max(n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i+1];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum of scalar values on all processes associated with the
 *        given execution context.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_max_scalars
(
  const cs_execution_context  *ec,
  T&                           first,
  Vals&...                     values
)
{
#if defined(HAVE_MPI)

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel maximum */
  if (n_vals == 0)
    cs_parall_max(ec, 1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i+1] = *(_values[i]);

    cs_parall_max(ec, n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i+1];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum of strided values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <int Stride, typename T, typename... Vals>
static void
cs_parall_max_strided
(
  T          first[],
  Vals&&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel maximum */
  if (n_vals == 0)
    cs_parall_max(Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_max(work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Maximum of strided values on all processes associated with the
 *        given execution context.
 */
/*----------------------------------------------------------------------------*/

template <int Stride, typename T, typename... Vals>
static void
cs_parall_max_strided
(
  const cs_execution_context  *ec,
  T                            first[],
  Vals&&...                    values
)
{
#if defined(HAVE_MPI)

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel maximum */
  if (n_vals == 0)
    cs_parall_max(ec, Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_max(ec, work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum of scalar values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_min_scalars
(
  T&        first,
  Vals&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel minimum */
  if (n_vals == 0)
    cs_parall_min(1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i + 1] = *(_values[i]);

    cs_parall_min(n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i + 1];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum of scalar values on all processes associated with the
 *        given execution context.
 */
/*----------------------------------------------------------------------------*/

template <typename T, typename... Vals>
static void
cs_parall_min_scalars
(
  const cs_execution_context  *ec,
  T&                           first,
  Vals&...                     values
)
{
#if defined(HAVE_MPI)

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel minimum */
  if (n_vals == 0)
    cs_parall_min(ec, 1, datatype, &first);
  else {
    /* Unpack values */
    T *_values[] = {&values ...};

    T w[n_vals + 1];
    w[0] = first;
    for (size_t i = 0; i < n_vals; i++)
      w[i + 1] = *(_values[i]);

    cs_parall_min(ec, n_vals + 1, datatype, w);

    first = w[0];
    for (size_t i = 0; i < n_vals; i++)
      *(_values[i]) = w[i + 1];
  }

#endif // defined(HAVE_MPI)
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum of strided values on all default communicator processes.
 */
/*----------------------------------------------------------------------------*/

template <int Stride, typename T, typename... Vals>
static void
cs_parall_min_strided
(
  T          first[],
  Vals&&...  values
)
{
#if defined(HAVE_MPI)

  if (cs_glob_n_ranks == 1)
    return;

  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel minimum */
  if (n_vals == 0)
    cs_parall_min(Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_min(work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
  }

#endif
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Minimum of strided values on all processes associated with the
 *        given execution context.
 */
/*----------------------------------------------------------------------------*/

#if defined(HAVE_MPI)

template <int Stride, typename T, typename... Vals>
static void
cs_parall_min_strided
(
  const cs_execution_context  *ec,
  T                            first[],
  Vals&&...                    values
)
{
  /* Count number of values */
  constexpr size_t n_vals = sizeof...(Vals);

  /* Set datatype for global communication */
  cs_datatype_t datatype = cs_datatype_from_type<T>();

  /* Temporary work array and parallel minimum */
  if (n_vals == 0)
    cs_parall_min(ec, Stride, datatype, first);
  else {
    /* Unpack values */
    T *_values[] = {values ...};

    constexpr size_t work_size = (n_vals + 1) * Stride;

    T w[work_size];
    for (int i = 0; i < Stride; i++)
      w[i] = first[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        w[(i+1)*Stride + j] = _values[i][j];

    cs_parall_min(ec, work_size, datatype, w);

    for (int i = 0; i < Stride; i++)
      first[i] = w[i];

    for (size_t i = 0; i < n_vals; i++)
      for (int j = 0; j < Stride; j++)
        _values[i][j] = w[(i+1)*Stride + j];
  }
}

#endif // defined(HAVE_MPI)

#endif // __cplusplus

/*----------------------------------------------------------------------------*/

#endif /* __CS_PARALL_H__ */