KaMPIng 0.1.1
Flexible and (near) zero-overhead C++ bindings for MPI
Loading...
Searching...
No Matches
ulfm.hpp
1#include <numeric>
2
3#include <mpi.h>
4// We have to include mpi.h /before/ mpi-ext.h in order for OMPI_DECLSPEC to be defined.
5#include <mpi-ext.h>
6
8#include "kamping/communicator.hpp"
10#include "kamping/group.hpp"
11#include "kamping/plugin/plugin_helpers.hpp"
12
13#if not(defined(MPIX_ERR_PROC_FAILED)) or not(defined(MPIX_ERR_PROC_FAILED_PENDING)) or not(defined(MPIX_ERR_REVOKED))
14 #pragma message \
15 "MPIX_ERR_PROC_FAILED, MPIX_ERR_PROC_FAILED_PENDING, or MPIX_ERR_REVOKED not defined. You need a MPI implementation which supports fault-tolerance to enable the FaultTolerance."
16#endif
17
18namespace kamping {
/// @brief Common base type of every exception raised by the FaultTolerance plugin.
/// An exception of this type signals that a process has failed or that the communicator has been revoked.
class MPIFailureDetected : public std::exception {
public:
    /// @brief Describes the failure.
    /// @return A pointer to a static, null-terminated description of the error.
    auto what() const noexcept -> char const* override {
        static constexpr char const* description = "A MPI process failed or the communicator was revoked.";
        return description;
    }
};
28
29/// @brief Thrown when a process failure prevented the completion of the MPI operation.
31public:
32 /// @brief Returns an explanatory string.
33 char const* what() const noexcept override {
34 return "A process failure prevented the completetion of the MPI operation.";
35 }
36};
37
38/// @brief Thrown when a potential sender matching a non-blocking wildcard source receive has failed.
40public:
41 /// @brief Returns an explanatory string.
42 char const* what() const noexcept override {
43 return "A potential sender matching a non-blocking wildcard source receive has failed.";
44 }
45};
46
47/// @brief Thrown when the communicator was revoked.
49public:
50 /// @brief Returns an explanatory string.
51 char const* what() const noexcept override {
52 return "The communicator was revoked.";
53 }
54};
55} // namespace kamping
56
57namespace kamping::plugin {
58
59/// @brief A plugin implementing a wrapper around the User-Level Failure-Mitigation (ULFM) feature of the upcoming MPI 4
60/// standard. This plugin and the accompanying example is tested with OpenMPI 5.0.2.
61template <typename Comm, template <typename...> typename DefaultContainerType>
62class UserLevelFailureMitigation : public plugin::PluginBase<Comm, DefaultContainerType, UserLevelFailureMitigation> {
63public:
64 /// @brief Default constructor; sets the error handler of MPI_COMM_WORLD (!) to MPI_ERRORS_RETURN.
65 /// Although the standard allows setting the error handler for only a specific communicator; neither MPICH nor
66 /// OpenMPI currently (March 2024) support this.
68 // MPI_Comm_set_errhandler(_comm(), MPI_ERRORS_RETURN);
70 }
71
72 /// @brief Revokes the current communicator.
73 void revoke() {
74 auto const ret = MPIX_Comm_revoke(_comm());
75 this->to_communicator().mpi_error_hook(ret, "MPIX_Comm_revoke");
76 }
77
78 /// @brief Acknowledges that the application intends to ignore the effect of currently known failures on wildcard
79 /// receive completions and agreement return values.
80 /// @param num_to_ack The number of failures to acknowledge.
81 /// @return The <i>overall</i> number of failures acknowledged.
83 int num_acked;
85 this->to_communicator().mpi_error_hook(ret, "MPIX_Comm_ack_failed");
87 }
88
89 /// @brief Gets the number of acknowledged failures.
90 /// @return The number of acknowledged failures.
92 return ack_failed(0);
93 }
94
95 /// @brief Acknowledge all failures.
96 /// @return The <i>overall</i> number of failures acknowledged.
98 return ack_failed(this->mpi_communicator().size());
99 }
100
101 /// @brief Creates a new communicator from this communicator, excluding the failed processes.
102 /// @return The new communicator.
105 auto const ret = MPIX_Comm_shrink(_comm(), &newcomm);
106 this->to_communicator().mpi_error_hook(ret, "MPIX_Comm_shrink");
107 return Comm(newcomm);
108 }
109
110 /// @brief Agrees on a flag from all live processes and distributes the result back to all live processes, even
111 /// after process failures.
112 /// @param flag The flag to agree on.
113 /// @return The bitwise AND over the contributed input values of \c flag.
114 [[nodiscard]] int agree(int flag) {
115 auto const ret = MPIX_Comm_agree(_comm(), &flag);
116 this->to_communicator().mpi_error_hook(ret, "MPIX_Comm_agree");
117 return flag;
118 }
119
/// @brief Agrees on a boolean flag from all live processes and distributes the result back to all live processes,
/// even after process failures.
///
/// The flag is converted to an \c int (\c false -> 0, \c true -> 1) and forwarded to the integer overload of
/// agree(); the agreed integer is then converted back to \c bool.
/// @param flag The flag to agree on.
/// @return The logical AND over the contributed input values of \c flag.
[[nodiscard]] bool agree(bool flag) {
    // The cast must be to int: passing a bool would select this very overload again and recurse without end,
    // never reaching MPIX_Comm_agree.
    int const agreed = agree(static_cast<int>(flag));
    return static_cast<bool>(agreed);
}
127
128 /// @brief Obtains the group of currently failed processes.
129 /// @return The group of currently failed processes.
132 auto const ret = MPIX_Comm_get_failed(_comm(), &failed_group);
133 this->to_communicator().mpi_error_hook(ret, "MPIX_Comm_failure_get_acked");
134 return Group(failed_group);
135 }
136
137 /// @brief Checks if this communicator has been revoked.
138 /// @return True if the communicator has been revoked, false otherwise.
140 int is_revoked;
142 return static_cast<bool>(is_revoked);
143 }
144
145 /// Overrides the on-MPI-error handler to throw the appropriate exception when a hardware fault has occurred.
146 void mpi_error_handler(int const ret, [[maybe_unused]] std::string const& callee) const {
147 KASSERT(ret != MPI_SUCCESS, "MPI error handler called with MPI_SUCCESS", assert::light);
148 switch (ret) {
150 throw MPIProcFailedError();
151 break;
154 break;
155 case MPIX_ERR_REVOKED:
156 throw MPIRevokedError();
157 break;
158 default:
159 this->to_communicator().mpi_error_default_handler(ret, callee);
160 }
161 }
162
163private:
164 auto _comm() {
165 return this->to_communicator().mpi_communicator();
166 }
167};
168
169} // namespace kamping::plugin
Helper functions that make casts safer.
A group of MPI processes.
Definition group.hpp:36
STL-compatible allocator for requesting memory using the builtin MPI allocator.
Definition allocator.hpp:32
Base class for all exceptions thrown by the FaultTolerance plugin. Means, that either a process faile...
Definition ulfm.hpp:21
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:24
Thrown when a process failure prevented the completion of the MPI operation.
Definition ulfm.hpp:30
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:33
Thrown when a potential sender matching a non-blocking wildcard source receive has failed.
Definition ulfm.hpp:39
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:42
Thrown when the communicator was revoked.
Definition ulfm.hpp:48
char const * what() const noexcept override
Returns an explanatory string.
Definition ulfm.hpp:51
A plugin implementing a wrapper around the User-Level Failure-Mitigation (ULFM) feature of the upcomi...
Definition ulfm.hpp:62
uint32_t ack_all_failed()
Acknowledge all failures.
Definition ulfm.hpp:97
bool agree(bool flag)
Agrees on a boolean flag from all live processes and distributes the result back to all live processe...
Definition ulfm.hpp:124
Group get_failed()
Obtains the group of currently failed processes.
Definition ulfm.hpp:130
void revoke()
Revokes the current communicator.
Definition ulfm.hpp:73
Comm shrink()
Creates a new communicator from this communicator, excluding the failed processes.
Definition ulfm.hpp:103
void mpi_error_handler(int const ret, std::string const &callee) const
Overrides the on-MPI-error handler to throw the appropriate exception when a hardware fault has occurred.
Definition ulfm.hpp:146
int agree(int flag)
Agrees on a flag from all live processes and distributes the result back to all live processes,...
Definition ulfm.hpp:114
bool is_revoked()
Checks if this communicator has been revoked.
Definition ulfm.hpp:139
uint32_t ack_failed(uint32_t const num_to_ack)
Acknowledges that the application intends to ignore the effect of currently known failures on wildcar...
Definition ulfm.hpp:82
UserLevelFailureMitigation()
Default constructor; sets the error handler of MPI_COMM_WORLD (!) to MPI_ERRORS_RETURN....
Definition ulfm.hpp:67
uint32_t num_ack_failed()
Gets the number of acknowledged failures.
Definition ulfm.hpp:91
Wrapper for MPI functions that don't require a communicator.
An abstraction around MPI_Group.
constexpr int light
Assertion level for lightweight assertions.
Definition assertion_levels.hpp:13
Helper class for using CRTP for mixins. Which are used to implement kamping plugins.
Definition plugin_helpers.hpp:32