8#include "kamping/communicator.hpp"
11#include "kamping/plugin/plugin_helpers.hpp"
13#if not(defined(MPIX_ERR_PROC_FAILED)) or not(defined(MPIX_ERR_PROC_FAILED_PENDING)) or not(defined(MPIX_ERR_REVOKED))
15 "MPIX_ERR_PROC_FAILED, MPIX_ERR_PROC_FAILED_PENDING, or MPIX_ERR_REVOKED not defined. You need a MPI implementation which supports fault-tolerance to enable the FaultTolerance."
25 return "A MPI process failed or the communicator was revoked.";
34 return "A process failure prevented the completetion of the MPI operation.";
43 return "A potential sender matching a non-blocking wildcard source receive has failed.";
52 return "The communicator was revoked.";
57namespace kamping::plugin {
61template <
typename Comm,
template <
typename...>
typename DefaultContainerType>
75 this->to_communicator().mpi_error_hook(
ret,
"MPIX_Comm_revoke");
85 this->to_communicator().mpi_error_hook(
ret,
"MPIX_Comm_ack_failed");
98 return ack_failed(this->mpi_communicator().size());
106 this->to_communicator().mpi_error_hook(
ret,
"MPIX_Comm_shrink");
116 this->to_communicator().mpi_error_hook(
ret,
"MPIX_Comm_agree");
133 this->to_communicator().mpi_error_hook(
ret,
"MPIX_Comm_failure_get_acked");
159 this->to_communicator().mpi_error_default_handler(
ret,
callee);
165 return this->to_communicator().mpi_communicator();
Helper functions that make casts safer.
A group of MPI processes.
Definition group.hpp:36
STL-compatible allocator for requesting memory using the builtin MPI allocator.
Definition allocator.hpp:32
Base class for all exceptions thrown by the FaultTolerance plugin. Indicates that either a process failed or the communicator was revoked.
Definition ulfm.hpp:21
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:24
Thrown when a process failure prevented the completion of the MPI operation.
Definition ulfm.hpp:30
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:33
Thrown when a potential sender matching a non-blocking wildcard source receive has failed.
Definition ulfm.hpp:39
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:42
Thrown when the communicator was revoked.
Definition ulfm.hpp:48
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:51
A plugin implementing a wrapper around the User-Level Failure-Mitigation (ULFM) feature of the upcomi...
Definition ulfm.hpp:62
uint32_t ack_all_failed()
Acknowledge all failures.
Definition ulfm.hpp:97
bool agree(bool flag)
Agrees on a boolean flag from all live processes and distributes the result back to all live processe...
Definition ulfm.hpp:124
Group get_failed()
Obtains the group of currently failed processes.
Definition ulfm.hpp:130
void revoke()
Revokes the current communicator.
Definition ulfm.hpp:73
Comm shrink()
Creates a new communicator from this communicator, excluding the failed processes.
Definition ulfm.hpp:103
void mpi_error_handler(int const ret, std::string const &callee) const
Overrides the on-MPI-error handler to throw the appropriate exceptions when hardware faults happen.
Definition ulfm.hpp:146
int agree(int flag)
Agrees on a flag from all live processes and distributes the result back to all live processes,...
Definition ulfm.hpp:114
bool is_revoked()
Checks if this communicator has been revoked.
Definition ulfm.hpp:139
uint32_t ack_failed(uint32_t const num_to_ack)
Acknowledges that the application intends to ignore the effect of currently known failures on wildcar...
Definition ulfm.hpp:82
UserLevelFailureMitigation()
Default constructor; sets the error handler of MPI_COMM_WORLD (!) to MPI_ERRORS_RETURN....
Definition ulfm.hpp:67
uint32_t num_ack_failed()
Gets the number of acknowledged failures.
Definition ulfm.hpp:91
Wrapper for MPI functions that don't require a communicator.
An abstraction around MPI_Group.
constexpr int light
Assertion level for lightweight assertions.
Definition assertion_levels.hpp:13
Helper class for using CRTP for mixins. Which are used to implement kamping plugins.
Definition plugin_helpers.hpp:32