Skip to content

Commit

Permalink
Merge #6330 #6340
Browse files Browse the repository at this point in the history
6330: Adding basic logging to collective operations r=hkaiser a=hkaiser

- flyby: fixing JeMalloc integration for dependent projects


6340: Log alive hpx threads on exit r=hkaiser a=Pansysk75

Adds logging when an HPX program is terminated suddenly (i.e., by a termination signal); this addresses issue #6329.
I'll leave it as draft until I make up my mind on whether I think it is actually a good idea




Co-authored-by: Hartmut Kaiser <[email protected]>
Co-authored-by: Panos Syskakis <[email protected]>
  • Loading branch information
3 people committed Sep 10, 2023
3 parents 2c0345e + 53dd971 + 1ff7a46 commit ea8f049
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 12 deletions.
40 changes: 40 additions & 0 deletions libs/core/runtime_local/src/runtime_local.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,36 @@
#include <thread>
#include <utility>

#if defined(HPX_HAVE_LOGGING)
namespace hpx { namespace detail {
    // Best-effort diagnostic helper: writes one debug-level log entry for
    // every thread enumerated by the runtime's thread manager, listing its
    // pool, scheduler, thread id, description and state.
    //
    // Does nothing if no runtime is available. Any exception raised while
    // enumerating or logging is swallowed on purpose, because this helper is
    // meant to be callable while the runtime is in a non-valid state.
    void try_log_runtime_threads()
    {
        // This may be used in non-valid runtime states, let it fail silently
        try
        {
            auto rt = hpx::get_runtime_ptr();
            if (rt == nullptr)
                return;

            rt->get_thread_manager().enumerate_threads(
                [](hpx::threads::thread_id_type id) -> bool {
                    // A null pointer dereference is not an exception and
                    // would not be intercepted by the catch-all below, so
                    // skip any thread whose data or scheduler is missing.
                    hpx::threads::thread_data* td = get_thread_id_data(id);
                    if (td == nullptr)
                        return true;

                    auto sched = td->get_scheduler_base();
                    if (sched == nullptr)
                        return true;

                    LTM_(debug).format("Logging all runtime threads: pool({}), "
                                       "scheduler({}), "
                                       "thread({}), description({}), state({})",
                        sched->get_parent_pool(), sched, id,
                        td->get_description(), td->get_state().state());

                    // NOTE(review): presumably 'true' asks enumerate_threads
                    // to continue with the next thread -- confirm.
                    return true;
                });
        }
        catch (...)
        {
            // Deliberately ignored: logging must never interfere with the
            // ongoing termination.
        }
    }
}}    // namespace hpx::detail
#endif

///////////////////////////////////////////////////////////////////////////////
// Make sure the system gets properly shut down while handling Ctrl-C and other
// system signals
Expand Down Expand Up @@ -91,6 +121,11 @@ namespace hpx {
}
#endif

#if defined(HPX_HAVE_LOGGING)
LRT_(debug).format("Terminating due to system signal({})", reason);
hpx::detail::try_log_runtime_threads();
#endif

std::cerr << "{what}: " << (reason ? reason : "Unknown reason")
<< "\n";
}
Expand Down Expand Up @@ -164,6 +199,11 @@ namespace hpx {
}
#endif

#if defined(HPX_HAVE_LOGGING)
LRT_(debug).format("Terminating due to system signal({})", signum);
hpx::detail::try_log_runtime_threads();
#endif

std::cerr << "{what}: " << (reason ? reason : "Unknown reason")
<< "\n";
}
Expand Down
6 changes: 6 additions & 0 deletions libs/full/collectives/include/hpx/collectives/all_gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ namespace hpx::traits {

namespace communication {
struct all_gather_tag;

// Specialization of communicator_name for all_gather_tag: yields the
// literal "all_gather".
template <>
constexpr char const* communicator_name<all_gather_tag>() noexcept
{
return "all_gather";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions libs/full/collectives/include/hpx/collectives/all_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,14 @@ namespace hpx { namespace collectives {
namespace hpx::traits {

namespace communication {

struct all_reduce_tag;

// Specialization of communicator_name for all_reduce_tag: yields the
// literal "all_reduce".
template <>
constexpr char const* communicator_name<all_reduce_tag>() noexcept
{
return "all_reduce";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions libs/full/collectives/include/hpx/collectives/all_to_all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,14 @@ namespace hpx { namespace collectives {
namespace hpx::traits {

namespace communication {

struct all_to_all_tag;

// Specialization of communicator_name for all_to_all_tag: yields the
// literal "all_to_all".
template <>
constexpr char const* communicator_name<all_to_all_tag>() noexcept
{
return "all_to_all";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions libs/full/collectives/include/hpx/collectives/broadcast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,14 @@ namespace hpx::traits {
///////////////////////////////////////////////////////////////////////////
// support for broadcast
namespace communication {

struct broadcast_tag;

// Specialization of communicator_name for broadcast_tag: yields the
// literal "broadcast".
template <>
constexpr char const* communicator_name<broadcast_tag>() noexcept
{
return "broadcast";
}
} // namespace communication

template <typename Communicator>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,15 @@ namespace hpx::collectives::detail {
};
} // namespace hpx::collectives::detail

namespace hpx::util {

// This is explicitly specialized to ensure that the id is stable across
// shared libraries.
template <>
struct extra_data_helper<collectives::detail::communicator_data>
{
// Only declared here; the definition is not part of this header.
HPX_EXPORT static extra_data_id_type id() noexcept;
// No-op: the pointed-to communicator_data is left untouched.
static constexpr void reset(
collectives::detail::communicator_data*) noexcept {};
};
} // namespace hpx::util
// This is explicitly specialized to ensure that the id is stable across
// shared libraries.
template <>
struct hpx::util::extra_data_helper<hpx::collectives::detail::communicator_data>
{
// Only declared here; the definition is not part of this header.
HPX_EXPORT static extra_data_id_type id() noexcept;
// No-op: the pointed-to communicator_data is left untouched.
static constexpr void reset(
collectives::detail::communicator_data*) noexcept {};
};

namespace hpx::collectives {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <hpx/datastructures/any.hpp>
#include <hpx/futures/future.hpp>
#include <hpx/lcos_local/and_gate.hpp>
#include <hpx/modules/logging.hpp>
#include <hpx/synchronization/spinlock.hpp>
#include <hpx/thread_support/assert_owns_lock.hpp>
#include <hpx/type_support/unused.hpp>
Expand All @@ -31,6 +32,16 @@ namespace hpx::traits {
// This type can be specialized for a particular collective operation
template <typename Communicator, typename Operation>
struct communication_operation;

namespace communication {

// Retrieve the name of the collective operation identified by the tag
// type Operation. This primary template is the fallback and yields
// "<unknown>"; an explicit specialization for a given tag type can supply
// a different name.
template <typename Operation>
constexpr char const* communicator_name() noexcept
{
return "<unknown>";
}
} // namespace communication
} // namespace hpx::traits

namespace hpx::collectives::detail {
Expand All @@ -55,6 +66,11 @@ namespace hpx::collectives::detail {
using collective_operation =
traits::communication_operation<communicator_server, Operation>;

LHPX_(info, " [COL] ")
.format("get({}): which({}), generation({})",
traits::communication::communicator_name<Operation>(),
which, generation);

return collective_operation::template get<Result>(
*this, which, generation, HPX_MOVE(args)...);
}
Expand All @@ -75,6 +91,11 @@ namespace hpx::collectives::detail {
using collective_operation =
traits::communication_operation<communicator_server, Operation>;

LHPX_(info, " [COL] ")
.format("set({}): which({}), generation({})",
traits::communication::communicator_name<Operation>(),
which, generation);

return collective_operation::template set<Result>(
*this, which, generation, HPX_MOVE(args)...);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,14 @@ namespace hpx { namespace collectives {
namespace hpx::traits {

namespace communication {

struct exclusive_scan_tag;

// Specialization of communicator_name for exclusive_scan_tag: yields the
// literal "exclusive_scan".
template <>
constexpr char const* communicator_name<exclusive_scan_tag>() noexcept
{
return "exclusive_scan";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 6 additions & 0 deletions libs/full/collectives/include/hpx/collectives/gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ namespace hpx::traits {
namespace communication {

struct gather_tag;

// Specialization of communicator_name for gather_tag: yields the literal
// "gather".
template <>
constexpr char const* communicator_name<gather_tag>() noexcept
{
return "gather";
}
} // namespace communication

template <typename Communicator>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,12 @@ namespace hpx::traits {
namespace communication {

struct inclusive_scan_tag;

// Specialization of communicator_name for inclusive_scan_tag: yields the
// literal "inclusive_scan".
template <>
constexpr char const* communicator_name<inclusive_scan_tag>() noexcept
{
return "inclusive_scan";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 6 additions & 0 deletions libs/full/collectives/include/hpx/collectives/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,12 @@ namespace hpx::traits {
namespace communication {

struct reduce_tag;

// Specialization of communicator_name for reduce_tag: yields the literal
// "reduce".
template <>
constexpr char const* communicator_name<reduce_tag>() noexcept
{
return "reduce";
}
} // namespace communication

///////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 6 additions & 0 deletions libs/full/collectives/include/hpx/collectives/scatter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,12 @@ namespace hpx::traits {
namespace communication {

struct scatter_tag;

// Specialization of communicator_name for scatter_tag: yields the literal
// "scatter".
template <>
constexpr char const* communicator_name<scatter_tag>() noexcept
{
return "scatter";
}
} // namespace communication

template <typename Communicator>
Expand Down

0 comments on commit ea8f049

Please sign in to comment.