From 303e2d9c4f0b6ba54f1cb22a0d58f8f66fb0fadf Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 12 Aug 2024 16:27:25 +0200 Subject: [PATCH 01/19] [WIP] Implement communication deadlock detection mechanism --- integration_test/test_cpp_tcp_server.py | 2 +- libmuscle/python/libmuscle/communicator.py | 46 ++++++- libmuscle/python/libmuscle/instance.py | 24 +++- .../libmuscle/manager/deadlock_detector.py | 126 ++++++++++++++++++ libmuscle/python/libmuscle/manager/manager.py | 9 +- .../python/libmuscle/manager/mmp_server.py | 38 +++++- libmuscle/python/libmuscle/mcp/protocol.py | 3 + .../libmuscle/mcp/tcp_transport_client.py | 41 +++++- .../python/libmuscle/mcp/transport_client.py | 29 +++- libmuscle/python/libmuscle/mmp_client.py | 20 +++ libmuscle/python/libmuscle/mpp_client.py | 7 +- .../libmuscle/test/test_communicator.py | 22 +-- .../python/libmuscle/test/test_instance.py | 5 +- 13 files changed, 341 insertions(+), 31 deletions(-) create mode 100644 libmuscle/python/libmuscle/manager/deadlock_detector.py diff --git a/integration_test/test_cpp_tcp_server.py b/integration_test/test_cpp_tcp_server.py index 912b838d..50a60c16 100644 --- a/integration_test/test_cpp_tcp_server.py +++ b/integration_test/test_cpp_tcp_server.py @@ -37,7 +37,7 @@ def test_cpp_tcp_server(log_file_in_tmpdir): assert TcpTransportClient.can_connect_to(location) client = MPPClient([location]) - msg_bytes, _ = client.receive(Reference('test_receiver.port')) + msg_bytes, _ = client.receive(Reference('test_receiver.port'), None) msg = MPPMessage.from_bytes(msg_bytes) client.close() diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 19b8e9c8..74e2a2e2 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -3,10 +3,12 @@ from ymmsl import Identifier, Reference, Settings from libmuscle.endpoint import Endpoint +from libmuscle.mmp_client import MMPClient from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.mpp_client import MPPClient from libmuscle.mpp_server import MPPServer from libmuscle.mcp.tcp_util import SocketClosed +from libmuscle.mcp.transport_client import TimeoutHandler from libmuscle.peer_info import PeerInfo from libmuscle.port_manager import PortManager from libmuscle.profiler import Profiler @@ -60,6 +62,31 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, self.settings = settings +class RecvTimeoutHandler(TimeoutHandler): + def __init__( + self, manager: MMPClient, + peer_instance: str, port_name: str, slot: Optional[int], + timeout: float + ) -> None: + self._manager = manager + self._peer_instance = peer_instance + self._port_name = port_name + self._slot = slot + self._timeout = timeout + + @property + def timeout(self) -> float: + return self._timeout + + def on_timeout(self) -> None: + self._manager.waiting_for_receive( + self._peer_instance, self._port_name, self._slot) + + def on_receive(self) -> None: + self._manager.waiting_for_receive_done( + self._peer_instance, self._port_name, self._slot) + + class Communicator: """Communication engine for MUSCLE3. 
@@ -88,6 +115,8 @@ def __init__( self._index = index self._port_manager = port_manager self._profiler = profiler + # TODO: make this a proper argument of __init__() + self._manager = profiler._manager self._server = MPPServer() @@ -178,7 +207,8 @@ def send_message( self._profiler.record_event(profile_event) def receive_message( - self, port_name: str, slot: Optional[int] = None) -> Tuple[Message, float]: + self, port_name: str, slot: Optional[int], timeout: float + ) -> Tuple[Message, float]: """Receive a message and attached settings overlay. Receiving is a blocking operation. This function will contact @@ -228,8 +258,14 @@ def receive_message( snd_endpoint = self._peer_info.get_peer_endpoints( recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) + timeout_handler = None + if timeout >= 0: + timeout_handler = RecvTimeoutHandler( + self._manager, str(snd_endpoint.instance()), + port_name, slot, timeout) try: - mpp_message_bytes, profile = client.receive(recv_endpoint.ref()) + mpp_message_bytes, profile = client.receive( + recv_endpoint.ref(), timeout_handler) except (ConnectionError, SocketClosed) as exc: raise RuntimeError( "Error while receiving a message: connection with peer" @@ -288,7 +324,7 @@ def receive_message( _logger.debug(f'Discarding received message on {port_and_slot}' ': resuming from weakly consistent snapshot') port.set_resumed(slot) - return self.receive_message(port_name, slot) + return self.receive_message(port_name, slot, timeout) raise RuntimeError(f'Received message on {port_and_slot} with' ' unexpected message number' f' {mpp_message.message_number}. Was expecting' @@ -398,7 +434,7 @@ def _drain_incoming_port(self, port_name: str) -> None: port = self._port_manager.get_port(port_name) while port.is_open(): # TODO: log warning if not a ClosePort - self.receive_message(port_name) + self.receive_message(port_name, None, 10.0) # FIXME: timeout def _drain_incoming_vector_port(self, port_name: str) -> None: """Receives messages until a ClosePort is received. @@ -413,7 +449,7 @@ def _drain_incoming_vector_port(self, port_name: str) -> None: for slot in range(port.get_length())]): for slot in range(port.get_length()): if port.is_open(slot): - self.receive_message(port_name, slot) + self.receive_message(port_name, slot, 10.0) # FIXME: timeout def _close_incoming_ports(self) -> None: """Closes incoming ports. diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index bfca4d06..f6899d27 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -145,6 +145,10 @@ def __init__( self._name, self._index, self._port_manager, self._profiler) """Communicator for this instance.""" + self._receive_timeout = 10.0 + """Timeout in seconds on message receives after which the manager is notified + that we are waiting for a message. Used to detect communication deadlocks.""" + self._declared_ports = ports """Declared ports for this instance.""" @@ -186,6 +190,7 @@ def __init__( self._set_local_log_level() self._set_remote_log_level() self._setup_profiling() + self._setup_receive_timeout() def reuse_instance(self) -> bool: """Decide whether to run this instance again. @@ -809,6 +814,16 @@ def _setup_profiling(self) -> None: self._profiler.set_level(profile_level_str) + def _setup_receive_timeout(self) -> None: + """Configures receive timeout with settings from settings. 
+ """ + try: + self._receive_timeout = self.get_setting( + 'muscle_deadlock_receive_timeout', 'float') + except KeyError: + pass # do nothing and keep the default + _logger.debug("Timeout on receiving messages set to %f", self._receive_timeout) + def _decide_reuse_instance(self) -> bool: """Decide whether and how to reuse the instance. @@ -933,7 +948,8 @@ def __receive_message( return default else: - msg, saved_until = self._communicator.receive_message(port_name, slot) + msg, saved_until = self._communicator.receive_message( + port_name, slot, self._receive_timeout) if not port.is_open(slot): err_msg = (('Port {} was closed while trying to' ' receive on it, did the peer crash?' @@ -1077,7 +1093,8 @@ def __receive_settings(self) -> bool: Returns: False iff the port is connnected and ClosePort was received. """ - message, saved_until = self._communicator.receive_message('muscle_settings_in') + message, saved_until = self._communicator.receive_message( + 'muscle_settings_in', None, self._receive_timeout) if isinstance(message.data, ClosePort): return False @@ -1107,7 +1124,8 @@ def __pre_receive_f_init(self) -> None: apply_overlay = InstanceFlags.DONT_APPLY_OVERLAY not in self._flags def pre_receive(port_name: str, slot: Optional[int]) -> None: - msg, saved_until = self._communicator.receive_message(port_name, slot) + msg, saved_until = self._communicator.receive_message( + port_name, slot, self._receive_timeout) self._f_init_cache[(port_name, slot)] = msg if apply_overlay: self.__apply_overlay(msg) diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py new file mode 100644 index 00000000..b1cf0597 --- /dev/null +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -0,0 +1,126 @@ +import logging +from threading import Thread +from typing import Callable, Dict, List, Optional, Tuple +from queue import Queue + + +_logger = logging.getLogger(__name__) +_QueueItem = Tuple[bool, str, str, str, Optional[int]] + + +class DeadlockDetector(Thread): + """TODO""" + + def __init__( + self, shutdown_callback: Callable[[], None], wait_before_shutdown: float + ) -> None: + super().__init__(name="DeadlockDetector") + + self._shutdown_callback = shutdown_callback + self._wait_before_shutdown = wait_before_shutdown + + self._queue: Queue[Optional[_QueueItem]] = Queue() + self._waiting_instances: Dict[str, str] = {} + self._waiting_instance_ports: Dict[str, Tuple[str, int]] = {} + + self._detected_deadlocks: List[str] = [] + + def run(self) -> None: + """Logic that is executed in the thread.""" + while True: + item = self._queue.get() + if item is None: # Shutdown sentinal + return + # Handle item + self._process_queue_item(item) + + def shutdown(self) -> None: + """Stop the deadlock detector thread.""" + self._queue.put(None) + + def put_waiting( + self, instance_id: str, peer_instance_id: str, + port_name: str, slot: Optional[int] + ) -> None: + """Process a WATING_FOR_RECEIVE message from an instance.""" + self._queue.put((True, instance_id, peer_instance_id, port_name, slot)) + + def put_waiting_done( + self, instance_id: str, peer_instance_id: str, + port_name: str, slot: Optional[int] + ) -> None: + """Process a WATING_FOR_RECEIVE_DONE message from an instance.""" + self._queue.put((False, instance_id, peer_instance_id, port_name, slot)) + + def _process_queue_item(self, item: _QueueItem) -> None: + _logger.info("Processing queue item: %s", item) + is_waiting, instance_id, peer_instance_id, port_name, slot = item + if 
is_waiting: + # Sanity checks, triggering this is a bug in the instance or the manager + if instance_id in self._waiting_instances: + _logger.error( + "Instance %s was already waiting on a receive call. " + "Did we miss a WAITING DONE event?", + instance_id) + # Register that the instance is waiting + self._waiting_instances[instance_id] = peer_instance_id + self._waiting_instance_ports[instance_id] = (port_name, slot) + self._check_for_deadlock(instance_id) + + else: + # Sanity checks, triggering these is a bug in the instance or the manager + if instance_id not in self._waiting_instances: + _logger.error( + "Instance %s is not waiting on a receive call.", instance_id) + elif self._waiting_instances[instance_id] != peer_instance_id: + _logger.error( + "Instance %s was waiting for %s, not for %s.", + instance_id, + self._waiting_instances[instance_id], + peer_instance_id) + elif self._waiting_instance_ports[instance_id] != (port_name, slot): + _logger.error( + "Instance %s was waiting on port[slot] %s[%s], not on %s[%s]", + instance_id, + *self._waiting_instance_ports[instance_id], + port_name, slot) + else: + del self._waiting_instances[instance_id] + del self._waiting_instance_ports[instance_id] + + def _check_for_deadlock(self, instance_id: str) -> None: + """Check if there is a cycle of waiting instances that involves this instance. + """ + deadlock_instances = [instance_id] + cur_instance = instance_id + while cur_instance in self._waiting_instances: + cur_instance = self._waiting_instances[cur_instance] + if cur_instance == instance_id: + break # Found a deadlocked cycle + deadlock_instances.append(cur_instance) + else: # No cycle detected + _logger.info("No deadlock detected") + return + + _logger.warning( + "Potential deadlock detected, aborting run in %d seconds.\n%s", + self._wait_before_shutdown, + self._format_deadlock(deadlock_instances), + ) + # TODO: wait and abort + self._shutdown_callback() + + def _format_deadlock(self, instances: List[str]) -> str: + """Create and return formatted deadlock debug info.""" + num_instances = str(len(instances)) + lines = [f"The following {num_instances} instances are dead-locked:"] + for i, instance in enumerate(instances): + num = str(i+1).rjust(len(num_instances)) + peer_instance = self._waiting_instances[instance] + port, slot = self._waiting_instance_ports[instance] + slot_txt = "" if slot is None else f"[{slot}]" + lines.append( + f"{num}. Instance '{instance}' is waiting on instance '{peer_instance}'" + f" in a receive on port '{port}{slot_txt}'." 
+ ) + return "\n".join(lines) diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index 044a28d7..df4cd0d8 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -14,6 +14,7 @@ from libmuscle.manager.run_dir import RunDir from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore +from libmuscle.manager.deadlock_detector import DeadlockDetector _logger = logging.getLogger(__name__) @@ -79,11 +80,14 @@ def __init__( self._snapshot_registry = SnapshotRegistry( configuration, snapshot_dir, self._topology_store) self._snapshot_registry.start() + # FIXME configure timeout: + self._deadlock_detector = DeadlockDetector(self.stop, 5.0) + self._deadlock_detector.start() self._server = MMPServer( self._logger, self._profile_store, self._configuration, self._instance_registry, self._topology_store, - self._snapshot_registry, run_dir) + self._snapshot_registry, self._deadlock_detector, run_dir) if self._instance_manager: self._instance_manager.set_manager_location( @@ -121,6 +125,9 @@ def stop(self) -> None: """Shuts down the manager.""" if self._instance_manager: self._instance_manager.shutdown() + self._deadlock_detector.shutdown() + # Note: don't join() deadlock detector, as this method may be called from the + # DeadlockDetector thread. join() would (ironically) deadlock the shutdown :) self._server.stop() self._snapshot_registry.shutdown() self._snapshot_registry.join() diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 94847fd9..6eca3131 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -20,6 +20,7 @@ from libmuscle.mcp.tcp_transport_server import TcpTransportServer from libmuscle.mcp.transport_server import RequestHandler from libmuscle.manager.profile_store import ProfileStore +from libmuscle.manager.deadlock_detector import DeadlockDetector from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) from libmuscle.snapshot import SnapshotMetadata @@ -77,6 +78,7 @@ def __init__( instance_registry: InstanceRegistry, topology_store: TopologyStore, snapshot_registry: SnapshotRegistry, + deadlock_detector: DeadlockDetector, run_dir: Optional[RunDir] ) -> None: """Create an MMPRequestHandler. @@ -93,6 +95,7 @@ def __init__( self._instance_registry = instance_registry self._topology_store = topology_store self._snapshot_registry = snapshot_registry + self._deadlock_detector = deadlock_detector self._run_dir = run_dir self._reference_time = time.monotonic() @@ -124,6 +127,10 @@ def handle_request(self, request: bytes) -> bytes: response = self._submit_snapshot(*req_args) elif req_type == RequestType.GET_CHECKPOINT_INFO.value: response = self._get_checkpoint_info(*req_args) + elif req_type == RequestType.WAITING_FOR_RECEIVE.value: + response = self._waiting_for_receive(*req_args) + elif req_type == RequestType.WAITING_FOR_RECEIVE_DONE.value: + response = self._waiting_for_receive_done(*req_args) return cast(bytes, msgpack.packb(response, use_bin_type=True)) @@ -357,6 +364,34 @@ def _get_checkpoint_info(self, instance_id: str) -> Any: resume, snapshot_directory] + def _waiting_for_receive( + self, instance_id: str, peer_instance_id: str, + port_name: str, slot: Optional[int]) -> Any: + """Indicate that the instance is waiting to receive a message. 
+ + Args: + instance_id: The instance that is waiting + port_name: Port name that the instance is waiting on + slot: Slot that the instance is waiting on + """ + self._deadlock_detector.put_waiting( + instance_id, peer_instance_id, port_name, slot) + return [ResponseType.SUCCESS.value] + + def _waiting_for_receive_done( + self, instance_id: str, peer_instance_id: str, + port_name: str, slot: Optional[int]) -> Any: + """Indicate that the instance is done waiting to receive a message. + + Args: + instance_id: The instance that is waiting + port_name: Port name that the instance is waiting on + slot: Slot that the instance is waiting on + """ + self._deadlock_detector.put_waiting_done( + instance_id, peer_instance_id, port_name, slot) + return [ResponseType.SUCCESS.value] + class MMPServer: """The MUSCLE Manager Protocol server. @@ -373,6 +408,7 @@ def __init__( instance_registry: InstanceRegistry, topology_store: TopologyStore, snapshot_registry: SnapshotRegistry, + deadlock_detector: DeadlockDetector, run_dir: Optional[RunDir] ) -> None: """Create an MMPServer. @@ -395,7 +431,7 @@ def __init__( """ self._handler = MMPRequestHandler( logger, profile_store, configuration, instance_registry, - topology_store, snapshot_registry, run_dir) + topology_store, snapshot_registry, deadlock_detector, run_dir) try: self._server = TcpTransportServer(self._handler, 9000) except OSError as e: diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index 5d1217ed..da23b221 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -22,6 +22,9 @@ class RequestType(Enum): SUBMIT_PROFILE_EVENTS = 6 SUBMIT_SNAPSHOT = 7 GET_CHECKPOINT_INFO = 8 + # Connection deadlock detection + WAITING_FOR_RECEIVE = 9 + WAITING_FOR_RECEIVE_DONE = 10 # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py index ed9536d2..bf565e5e 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py @@ -1,8 +1,9 @@ from errno import ENOTCONN +import select import socket from typing import Optional, Tuple -from libmuscle.mcp.transport_client import ProfileData, TransportClient +from libmuscle.mcp.transport_client import ProfileData, TransportClient, TimeoutHandler from libmuscle.mcp.tcp_util import recv_all, recv_int64, send_int64 from libmuscle.profiling import ProfileTimestamp @@ -51,13 +52,22 @@ def __init__(self, location: str) -> None: sock.setsockopt(socket.SOL_TCP, socket.TCP_QUICKACK, 1) self._socket = sock - def call(self, request: bytes) -> Tuple[bytes, ProfileData]: + if hasattr(select, "poll"): + self._poll_obj = select.poll() + self._poll_obj.register(self._socket, select.POLLIN) + else: + self._poll_obj = None # On platforms that don't support select.poll + + def call(self, request: bytes, timeout_handler: Optional[TimeoutHandler] = None + ) -> Tuple[bytes, ProfileData]: """Send a request to the server and receive the response. This is a blocking call. Args: request: The request to send + timeout_handler: Optional timeout handler. This is used for communication + deadlock detection. 
Returns: The received response @@ -66,13 +76,40 @@ def call(self, request: bytes) -> Tuple[bytes, ProfileData]: send_int64(self._socket, len(request)) self._socket.sendall(request) + did_timeout = False + if timeout_handler is not None: + if not self._poll(timeout_handler.timeout): + did_timeout = True + timeout_handler.on_timeout() + length = recv_int64(self._socket) + if did_timeout: + timeout_handler.on_receive() start_transfer = ProfileTimestamp() response = recv_all(self._socket, length) stop_transfer = ProfileTimestamp() return response, (start_wait, start_transfer, stop_transfer) + def _poll(self, timeout: float) -> bool: + """Poll the socket and return whether its ready for receiving. + + This method blocks until the socket is ready for receiving, or :param:`timeout` + seconds have passed (whichever is earlier). + + Args: + timeout: timeout in seconds + + Returns: + True if the socket is ready for receiving data, False otherwise. + """ + if self._poll_obj is not None: + ready = self._poll_obj.poll(timeout * 1000) # poll timeout is in ms + else: + # Fallback to select() + ready, _, _ = select.select([self._socket], (), (), timeout) + return bool(ready) + def close(self) -> None: """Closes this client. diff --git a/libmuscle/python/libmuscle/mcp/transport_client.py b/libmuscle/python/libmuscle/mcp/transport_client.py index 55942dc9..9b1c0ab5 100644 --- a/libmuscle/python/libmuscle/mcp/transport_client.py +++ b/libmuscle/python/libmuscle/mcp/transport_client.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Optional, Tuple from libmuscle.profiling import ProfileTimestamp @@ -6,6 +6,28 @@ ProfileData = Tuple[ProfileTimestamp, ProfileTimestamp, ProfileTimestamp] +class TimeoutHandler: + """Object handling timeouts during :meth:`TransportClient.call`.""" + + @property + def timeout(self) -> float: + """Timeout (in seconds) after which :meth:`on_timeout` is called.""" + raise NotImplementedError() # pragma: no cover + + def on_timeout(self) -> None: + """Callback when :attr:`timeout` seconds have passed without a response from the + peer. + """ + raise NotImplementedError() # pragma: no cover + + def on_receive(self) -> None: + """Callback when receiving a response from the peer. + + Note: this method is only called when the request has timed out. + """ + raise NotImplementedError() # pragma: no cover + + class TransportClient: """A client that connects to an MCP server. @@ -25,7 +47,8 @@ def can_connect_to(location: str) -> bool: """ raise NotImplementedError() # pragma: no cover - def call(self, request: bytes) -> Tuple[bytes, ProfileData]: + def call(self, request: bytes, timeout_handler: Optional[TimeoutHandler] = None + ) -> Tuple[bytes, ProfileData]: """Send a request to the server and receive the response. This is a blocking call. Besides the result, this function @@ -36,6 +59,8 @@ def call(self, request: bytes) -> Tuple[bytes, ProfileData]: Args: request: The request to send + timeout_handler: Optional timeout handler. This is used for communication + deadlock detection. 
Returns: The received response, and the timestamps diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 34238a87..00a2a20c 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -275,6 +275,26 @@ def deregister_instance(self) -> None: raise RuntimeError('Error deregistering instance: {}'.format( response[1])) + def waiting_for_receive( + self, peer_instance_id: str, port_name: str, slot: Optional[int] + ) -> None: + """Notify the manager that we're waiting to receive a message.""" + request = [ + RequestType.WAITING_FOR_RECEIVE.value, + str(self._instance_id), + peer_instance_id, port_name, slot] + self._call_manager(request) + + def waiting_for_receive_done( + self, peer_instance_id: str, port_name: str, slot: Optional[int] + ) -> None: + """Notify the manager that we're done waiting to receive a message.""" + request = [ + RequestType.WAITING_FOR_RECEIVE_DONE.value, + str(self._instance_id), + peer_instance_id, port_name, slot] + self._call_manager(request) + def _call_manager(self, request: Any) -> Any: """Call the manager and do en/decoding. diff --git a/libmuscle/python/libmuscle/mpp_client.py b/libmuscle/python/libmuscle/mpp_client.py index 852ec938..5eedbf97 100644 --- a/libmuscle/python/libmuscle/mpp_client.py +++ b/libmuscle/python/libmuscle/mpp_client.py @@ -4,7 +4,7 @@ from ymmsl import Reference from libmuscle.mcp.protocol import RequestType -from libmuscle.mcp.transport_client import ProfileData, TransportClient +from libmuscle.mcp.transport_client import ProfileData, TransportClient, TimeoutHandler from libmuscle.mcp.type_registry import transport_client_types @@ -40,7 +40,8 @@ def __init__(self, locations: List[str]) -> None: self._transport_client = client - def receive(self, receiver: Reference) -> Tuple[bytes, ProfileData]: + def receive(self, receiver: Reference, timeout_handler: Optional[TimeoutHandler] + ) -> Tuple[bytes, ProfileData]: """Receive a message from a port this client connects to. Args: @@ -51,7 +52,7 @@ def receive(self, receiver: Reference) -> Tuple[bytes, ProfileData]: """ request = [RequestType.GET_NEXT_MESSAGE.value, str(receiver)] encoded_request = msgpack.packb(request, use_bin_type=True) - return self._transport_client.call(encoded_request) + return self._transport_client.call(encoded_request, timeout_handler) def close(self) -> None: """Closes this client. 
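For reference, the notifications added above reuse the ordinary MMP request path: MMPClient.waiting_for_receive() builds a plain list and _call_manager() does the msgpack en/decoding over the connection to the manager. A rough sketch of the encoded request, assuming an instance 'micro' waiting on its peer 'macro' (the literal 9 mirrors RequestType.WAITING_FOR_RECEIVE added in protocol.py above; the instance and port names are only illustrative):

    import msgpack

    # [request type, waiting instance, peer it is waiting on, port name, slot]
    request = [9, 'micro', 'macro', 'f_init', None]
    encoded = msgpack.packb(request, use_bin_type=True)
    # The manager's MMPRequestHandler forwards this to DeadlockDetector.put_waiting()
    # and answers with [ResponseType.SUCCESS.value].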
diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index ff1e08dd..5397a14e 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -115,9 +115,9 @@ def test_receive_message(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in') + recv_msg, saved_until = connected_communicator.receive_message('in', None, -1) - mpp_client.receive.assert_called_with(Ref('component.in')) + mpp_client.receive.assert_called_with(Ref('component.in'), None) assert recv_msg.timestamp == 2.0 assert recv_msg.next_timestamp == 3.0 @@ -135,9 +135,9 @@ def test_receive_message_vector(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) + recv_msg, saved_until = connected_communicator.receive_message('in_v', 5, -1) - mpp_client.receive.assert_called_with(Ref('component.in_v[5]')) + mpp_client.receive.assert_called_with(Ref('component.in_v[5]'), None) assert recv_msg.timestamp == 4.0 assert recv_msg.next_timestamp == 6.0 @@ -155,7 +155,7 @@ def test_receive_close_port(connected_communicator, mpp_client, port_manager): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in') + recv_msg, saved_until = connected_communicator.receive_message('in', None, -1) assert port_manager.get_port('in').is_open() is False @@ -167,7 +167,7 @@ def test_receive_close_port_vector(connected_communicator, mpp_client, port_mana mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) + recv_msg, saved_until = connected_communicator.receive_message('in_v', 5, -1) assert port_manager.get_port('in_v').is_open(5) is False @@ -182,12 +182,12 @@ def test_port_count_validation( mpp_client.receive.return_value = msg.encoded(), MagicMock() - connected_communicator.receive_message('in') + connected_communicator.receive_message('in', None, -1) assert connected_port_manager.get_port('in').get_message_counts() == [1] with pytest.raises(RuntimeError): # the message received has message_number = 0 again - connected_communicator.receive_message('in') + connected_communicator.receive_message('in', None, -1) def test_port_discard_error_on_resume( @@ -212,7 +212,7 @@ def test_port_discard_error_on_resume( # message_number=1 with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): with pytest.raises(RuntimeError): - connected_communicator.receive_message('in') + connected_communicator.receive_message('in', None, -1) assert any([ 'Discarding received message' in rec.message @@ -238,7 +238,7 @@ def test_port_discard_success_on_resume( assert connected_port_manager.get_port(port).is_resuming(None) with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): - msg, _ = connected_communicator.receive_message('in') + msg, _ = connected_communicator.receive_message('in', None, -1) assert any([ 'Discarding received message' in rec.message for rec in caplog.records]) @@ -272,7 +272,7 @@ def test_shutdown( sender, receiver, slot, float('inf'), None, Settings(), 0, 3.5, ClosePort()) - def receive(receiver): + def receive(receiver, timeout_handler): return messages[receiver].encoded(), MagicMock() mpp_client.receive = receive diff --git 
a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 9dc52c52..264a0b9b 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -355,6 +355,7 @@ def test_reuse_set_overlay( instance, port_manager, mock_ports, communicator, settings_manager): port_manager.settings_in_connected.return_value = True mock_ports['in']._is_connected = False + instance._receive_timeout = -1 mock_msg = MagicMock() mock_msg.data = Settings({'s1': 1, 's2': 2}) @@ -363,7 +364,7 @@ def test_reuse_set_overlay( instance.reuse_instance() - communicator.receive_message.assert_called_with('muscle_settings_in') + communicator.receive_message.assert_called_with('muscle_settings_in', None, -1) assert settings_manager.overlay['s0'] == 0 assert settings_manager.overlay['s1'] == 1 assert settings_manager.overlay['s2'] == 2 @@ -478,7 +479,7 @@ def test_receive_no_default(instance): def test_receive_inconsistent_settings( instance, settings_manager, port_manager, communicator): - def receive_message(port, slot=None): + def receive_message(port, slot, timeout): mock_msg = MagicMock() if port == 'muscle_settings_in': mock_msg.data = Settings({'s1': 1}) From 523d1fdf2d5596531f8a1794643c139f84bb1486 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Aug 2024 09:46:20 +0200 Subject: [PATCH 02/19] Mypy and unit test fixes --- .../libmuscle/manager/deadlock_detector.py | 2 +- .../python/libmuscle/manager/test/conftest.py | 18 ++++++++++++------ .../manager/test/test_mmp_request_handler.py | 4 ++-- .../libmuscle/mcp/tcp_transport_client.py | 3 ++- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index b1cf0597..e7f820dc 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -21,7 +21,7 @@ def __init__( self._queue: Queue[Optional[_QueueItem]] = Queue() self._waiting_instances: Dict[str, str] = {} - self._waiting_instance_ports: Dict[str, Tuple[str, int]] = {} + self._waiting_instance_ports: Dict[str, Tuple[str, Optional[int]]] = {} self._detected_deadlocks: List[str] = [] diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index 78a50487..36f7dc52 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -8,6 +8,7 @@ from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore from libmuscle.manager.profile_store import ProfileStore +from libmuscle.manager.deadlock_detector import DeadlockDetector @pytest.fixture @@ -54,13 +55,18 @@ def snapshot_registry(mmp_configuration, topology_store) -> SnapshotRegistry: return SnapshotRegistry(mmp_configuration, None, topology_store) +@pytest.fixture +def deadlock_detector() -> DeadlockDetector: + return DeadlockDetector(lambda: None, 1.0) + + @pytest.fixture def mmp_request_handler( logger, profile_store, mmp_configuration, instance_registry, - topology_store, snapshot_registry): + topology_store, snapshot_registry, deadlock_detector): return MMPRequestHandler( logger, profile_store, mmp_configuration, instance_registry, - topology_store, snapshot_registry, None) + topology_store, snapshot_registry, deadlock_detector, None) @pytest.fixture @@ -77,10 +83,10 @@ def 
loaded_instance_registry(instance_registry): @pytest.fixture def registered_mmp_request_handler( logger, profile_store, mmp_configuration, loaded_instance_registry, - topology_store, snapshot_registry): + topology_store, snapshot_registry, deadlock_detector): return MMPRequestHandler( logger, profile_store, mmp_configuration, loaded_instance_registry, - topology_store, snapshot_registry, None) + topology_store, snapshot_registry, deadlock_detector, None) @pytest.fixture @@ -133,8 +139,8 @@ def loaded_instance_registry2(): @pytest.fixture def registered_mmp_request_handler2( logger, profile_store, mmp_configuration, loaded_instance_registry2, - topology_store2, snapshot_registry2, tmp_path): + topology_store2, snapshot_registry2, deadlock_detector, tmp_path): return MMPRequestHandler( logger, profile_store, mmp_configuration, loaded_instance_registry2, topology_store2, snapshot_registry2, - RunDir(tmp_path)) + deadlock_detector, RunDir(tmp_path)) diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 768d2160..0286649c 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -15,10 +15,10 @@ def test_create_servicer( logger, profile_store, mmp_configuration, instance_registry, - topology_store, snapshot_registry): + topology_store, snapshot_registry, deadlock_detector): MMPRequestHandler( logger, profile_store, mmp_configuration, instance_registry, - topology_store, snapshot_registry, None) + topology_store, snapshot_registry, deadlock_detector, None) def test_log_message(mmp_request_handler, caplog): diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py index bf565e5e..2c3483b3 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py @@ -53,7 +53,7 @@ def __init__(self, location: str) -> None: self._socket = sock if hasattr(select, "poll"): - self._poll_obj = select.poll() + self._poll_obj: Optional[select.poll] = select.poll() self._poll_obj.register(self._socket, select.POLLIN) else: self._poll_obj = None # On platforms that don't support select.poll @@ -84,6 +84,7 @@ def call(self, request: bytes, timeout_handler: Optional[TimeoutHandler] = None length = recv_int64(self._socket) if did_timeout: + assert timeout_handler is not None # mypy timeout_handler.on_receive() start_transfer = ProfileTimestamp() From 1f6de948185c07c790f2b9d701cf469dc89274d2 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Aug 2024 15:49:25 +0200 Subject: [PATCH 03/19] Implement timeout for shutdown after a deadlock Improve documentation of the DeadlockDetector and add unit tests --- .../libmuscle/manager/deadlock_detector.py | 146 ++++++++++++++---- .../manager/test/test_deadlock_detector.py | 106 +++++++++++++ 2 files changed, 226 insertions(+), 26 deletions(-) create mode 100644 libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index e7f820dc..dc3c5111 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -1,7 +1,8 @@ import logging from threading import Thread +import time from typing import Callable, Dict, List, Optional, Tuple -from queue 
import Queue +from queue import Empty, Queue _logger = logging.getLogger(__name__) @@ -9,30 +10,77 @@ class DeadlockDetector(Thread): - """TODO""" + """The DeadlockDetector attempts to detect when multiple instances are stuck waiting + for each other. + + This class is responsible for handling WAITING_FOR_RECEIVE and + WAITING_FOR_RECEIVE_DONE MMP messages, which are submitted by the MMPServer. + + When a deadlock is detected, the cycle of instances that is waiting on each other is + logged with FATAL severity. If this deadlock does not get resoled in + ``wait_before_shutdown`` seconds, the simulation is shut down. + """ def __init__( self, shutdown_callback: Callable[[], None], wait_before_shutdown: float ) -> None: + """Construct a new DeadlockDetector thread. + + Args: + shutdown_callback: function to execute when a deadlock is detected. This + callback (which is executed in this thread!) is responsible for stopping + the simulation when a deadlock is detected. + wait_before_shutdown: Number of seconds to wait before executing + :param:`shutdown_callback` after a deadlock is detected. If the deadlock + is resolved (although this is unlikely), the simulation will not shut + down. + """ super().__init__(name="DeadlockDetector") self._shutdown_callback = shutdown_callback self._wait_before_shutdown = wait_before_shutdown self._queue: Queue[Optional[_QueueItem]] = Queue() + """Queue of incoming messages. Incoming messages can come in any communication + thread and will be consumed and processed in this worker thread. + """ self._waiting_instances: Dict[str, str] = {} + """Maps instance IDs to the peer instance IDs they are waiting for.""" self._waiting_instance_ports: Dict[str, Tuple[str, Optional[int]]] = {} + """Maps instance IDs to the port/slot they are waiting on..""" - self._detected_deadlocks: List[str] = [] + self._detected_deadlocks: List[List[str]] = [] + """List of deadlocked instance cycles. Set by _handle_potential_deadlock. + """ + self._shutdown_time: Optional[float] = None + """Future time when we confirm the potential deadlock and abort the simulation. + """ def run(self) -> None: """Logic that is executed in the thread.""" while True: - item = self._queue.get() - if item is None: # Shutdown sentinal + # Set a timeout when a deadlock was detected + timeout = None + if self._shutdown_time is not None: + timeout = max(0, self._shutdown_time - time.monotonic()) + + # Grab a new item from the queue, this raises Empty when timeout expires: + try: + item = self._queue.get(timeout=timeout) + if item is None: # On shutdown, None is pushed to the queue + return # exit thread + self._process_queue_item(item) + + except Empty: + # timeout expired and queue is empty, call shutdown callback + formatted_deadlocks = "\n\n".join( + self._format_deadlock(instances) + for instances in self._detected_deadlocks) + _logger.fatal( + "Aborting simulation: deadlock detected.\n%s", + formatted_deadlocks) + self._shutdown_callback() return - # Handle item - self._process_queue_item(item) def shutdown(self) -> None: """Stop the deadlock detector thread.""" @@ -42,18 +90,40 @@ def put_waiting( self, instance_id: str, peer_instance_id: str, port_name: str, slot: Optional[int] ) -> None: - """Process a WATING_FOR_RECEIVE message from an instance.""" + """Queue a WAITING_FOR_RECEIVE message from an instance for processing. + + This method can be called from any thread. + + Args: + instance_id: ID of instance that is waiting to receive a message. 
+ peer_instance_id: ID of the peer that the instance is waiting on. + port_name: Name of the input port. + slot: Optional slot number of the input port. + """ self._queue.put((True, instance_id, peer_instance_id, port_name, slot)) def put_waiting_done( self, instance_id: str, peer_instance_id: str, port_name: str, slot: Optional[int] ) -> None: - """Process a WATING_FOR_RECEIVE_DONE message from an instance.""" + """Queue a WAITING_FOR_RECEIVE_DONE message from an instance for processing. + + This method can be called from any thread. + + Args: + instance_id: ID of instance that is waiting to receive a message. + peer_instance_id: ID of the peer that the instance is waiting on. + port_name: Name of the input port. + slot: Optional slot number of the input port. + """ self._queue.put((False, instance_id, peer_instance_id, port_name, slot)) def _process_queue_item(self, item: _QueueItem) -> None: - _logger.info("Processing queue item: %s", item) + """Actually process a WAITING_FOR_RECEIVE[_DONE] request. + + This method should be called inside the worker thread. + """ + _logger.debug("Processing queue item: %s", item) is_waiting, instance_id, peer_instance_id, port_name, slot = item if is_waiting: # Sanity checks, triggering this is a bug in the instance or the manager @@ -88,6 +158,15 @@ def _process_queue_item(self, item: _QueueItem) -> None: del self._waiting_instances[instance_id] del self._waiting_instance_ports[instance_id] + # Check if we were part of a deadlock + for i, instance_list in enumerate(self._detected_deadlocks): + if instance_id in instance_list: + del self._detected_deadlocks[i] + break + if not self._detected_deadlocks: + # There are no deadlocks anymore: cancel shutdown + self._shutdown_time = None + def _check_for_deadlock(self, instance_id: str) -> None: """Check if there is a cycle of waiting instances that involves this instance. """ @@ -96,25 +175,40 @@ def _check_for_deadlock(self, instance_id: str) -> None: while cur_instance in self._waiting_instances: cur_instance = self._waiting_instances[cur_instance] if cur_instance == instance_id: - break # Found a deadlocked cycle + self._handle_potential_deadlock(deadlock_instances) + return deadlock_instances.append(cur_instance) - else: # No cycle detected - _logger.info("No deadlock detected") - return - - _logger.warning( - "Potential deadlock detected, aborting run in %d seconds.\n%s", - self._wait_before_shutdown, - self._format_deadlock(deadlock_instances), - ) - # TODO: wait and abort - self._shutdown_callback() + _logger.debug("No deadlock detected") + + def _handle_potential_deadlock(self, deadlock_instances: List[str]) -> None: + """Handle a potential deadlock. - def _format_deadlock(self, instances: List[str]) -> str: - """Create and return formatted deadlock debug info.""" - num_instances = str(len(instances)) + Args: + deadlock_instances: list of instances waiting on eachother + """ + shutdown_delay = self._wait_before_shutdown + if self._shutdown_time is not None: + # Get time until shutdown + shutdown_delay = self._shutdown_time - time.monotonic() + _logger.fatal( + "Potential deadlock detected, aborting run in %d seconds.\n%s", + shutdown_delay, + self._format_deadlock(deadlock_instances), + ) + + self._detected_deadlocks.append(deadlock_instances) + if self._shutdown_time is None: + self._shutdown_time = time.monotonic() + self._wait_before_shutdown + + def _format_deadlock(self, deadlock_instances: List[str]) -> str: + """Create and return formatted deadlock debug info. 
+ + Args: + deadlock_instances: list of instances waiting on eachother + """ + num_instances = str(len(deadlock_instances)) lines = [f"The following {num_instances} instances are dead-locked:"] - for i, instance in enumerate(instances): + for i, instance in enumerate(deadlock_instances): num = str(i+1).rjust(len(num_instances)) peer_instance = self._waiting_instances[instance] port, slot = self._waiting_instance_ports[instance] diff --git a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py new file mode 100644 index 00000000..718a6569 --- /dev/null +++ b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py @@ -0,0 +1,106 @@ +import logging +import time +from typing import Iterator +from unittest.mock import Mock + +import pytest + +from libmuscle.manager.deadlock_detector import DeadlockDetector + + +@pytest.fixture +def shutdown_callback() -> Mock: + return Mock() + + +@pytest.fixture +def detector(shutdown_callback) -> Iterator[DeadlockDetector]: + # Using a very short delay (10ms) to speed up unit testing + detector = DeadlockDetector(shutdown_callback, 0.01) + detector.start() + yield detector + if detector.is_alive(): + detector.shutdown() + detector.join() + + +def test_no_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", None) + detector.put_waiting_done("macro", "micro", "s", None) + time.sleep(0.05) + detector.shutdown() + detector.join() + shutdown_callback.assert_not_called() + + +def test_double_waiting_log_error( + caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", 0) + detector.put_waiting("macro", "micro", "s", 1) + detector.shutdown() + detector.join() + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][:2] == ( + "libmuscle.manager.deadlock_detector", logging.ERROR) + + +def test_not_waiting_log_error( + caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: + detector.put_waiting_done("macro", "micro", "s", 0) + detector.shutdown() + detector.join() + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][:2] == ( + "libmuscle.manager.deadlock_detector", logging.ERROR) + + +def test_waiting_for_different_instance_log_error( + caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", 0) + detector.put_waiting_done("macro", "meso", "s", 0) + detector.shutdown() + detector.join() + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][:2] == ( + "libmuscle.manager.deadlock_detector", logging.ERROR) + + +def test_waiting_for_different_port_log_error( + caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", 0) + detector.put_waiting_done("macro", "micro", "f_init", 0) + detector.shutdown() + detector.join() + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][:2] == ( + "libmuscle.manager.deadlock_detector", logging.ERROR) + + +def test_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", None) + detector.put_waiting("micro", "macro", "f_init", None) + time.sleep(0.05) + assert not detector.is_alive() + shutdown_callback.assert_called_once() + + +def test_deadlock_cancelled( + shutdown_callback: Mock, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", 
"s", None) + detector.put_waiting("micro", "macro", "f_init", None) + detector.put_waiting_done("macro", "micro", "s", None) + time.sleep(0.05) + detector.shutdown() + detector.join() + shutdown_callback.assert_not_called() + + +def test_double_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: + detector.put_waiting("macro", "micro", "s", None) + detector.put_waiting("micro", "macro", "f_init", None) + detector.put_waiting("cycle2", "peer2", "s", None) + detector.put_waiting("peer2", "cycle2", "f_init", None) + detector.put_waiting_done("macro", "micro", "s", None) + time.sleep(0.05) + assert not detector.is_alive() + shutdown_callback.assert_called() From d37b663503d1f304184996a26f5974d5c88b1093 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Aug 2024 16:34:42 +0200 Subject: [PATCH 04/19] Refactor receive_timeout Move the variable to Communicator to simplify some logic --- libmuscle/python/libmuscle/communicator.py | 25 +++++++++++++------ libmuscle/python/libmuscle/instance.py | 21 ++++++---------- .../libmuscle/test/test_communicator.py | 18 +++++++------ .../python/libmuscle/test/test_instance.py | 5 ++-- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 74e2a2e2..153c194c 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -117,6 +117,8 @@ def __init__( self._profiler = profiler # TODO: make this a proper argument of __init__() self._manager = profiler._manager + # Notify manager, by default, after 10 seconds waiting in receive_message() + self._receive_timeout = 10.0 self._server = MPPServer() @@ -146,6 +148,16 @@ def set_peer_info(self, peer_info: PeerInfo) -> None: """ self._peer_info = peer_info + def set_receive_timeout(self, receive_timeout: float) -> None: + """Update the timeout after which the manager is notified that we are waiting + for a message. + + Args: + receive_timeout: Timeout (seconds). A negative number disables the deadlock + notification mechanism. + """ + self._receive_timeout = receive_timeout + def send_message( self, port_name: str, message: Message, slot: Optional[int] = None, @@ -207,8 +219,7 @@ def send_message( self._profiler.record_event(profile_event) def receive_message( - self, port_name: str, slot: Optional[int], timeout: float - ) -> Tuple[Message, float]: + self, port_name: str, slot: Optional[int] = None) -> Tuple[Message, float]: """Receive a message and attached settings overlay. Receiving is a blocking operation. This function will contact @@ -259,10 +270,10 @@ def receive_message( recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) timeout_handler = None - if timeout >= 0: + if self._receive_timeout >= 0: timeout_handler = RecvTimeoutHandler( self._manager, str(snd_endpoint.instance()), - port_name, slot, timeout) + port_name, slot, self._receive_timeout) try: mpp_message_bytes, profile = client.receive( recv_endpoint.ref(), timeout_handler) @@ -324,7 +335,7 @@ def receive_message( _logger.debug(f'Discarding received message on {port_and_slot}' ': resuming from weakly consistent snapshot') port.set_resumed(slot) - return self.receive_message(port_name, slot, timeout) + return self.receive_message(port_name, slot) raise RuntimeError(f'Received message on {port_and_slot} with' ' unexpected message number' f' {mpp_message.message_number}. 
Was expecting' @@ -434,7 +445,7 @@ def _drain_incoming_port(self, port_name: str) -> None: port = self._port_manager.get_port(port_name) while port.is_open(): # TODO: log warning if not a ClosePort - self.receive_message(port_name, None, 10.0) # FIXME: timeout + self.receive_message(port_name) def _drain_incoming_vector_port(self, port_name: str) -> None: """Receives messages until a ClosePort is received. @@ -449,7 +460,7 @@ def _drain_incoming_vector_port(self, port_name: str) -> None: for slot in range(port.get_length())]): for slot in range(port.get_length()): if port.is_open(slot): - self.receive_message(port_name, slot, 10.0) # FIXME: timeout + self.receive_message(port_name, slot) def _close_incoming_ports(self) -> None: """Closes incoming ports. diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index f6899d27..c38f6aa3 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -145,10 +145,6 @@ def __init__( self._name, self._index, self._port_manager, self._profiler) """Communicator for this instance.""" - self._receive_timeout = 10.0 - """Timeout in seconds on message receives after which the manager is notified - that we are waiting for a message. Used to detect communication deadlocks.""" - self._declared_ports = ports """Declared ports for this instance.""" @@ -818,11 +814,13 @@ def _setup_receive_timeout(self) -> None: """Configures receive timeout with settings from settings. """ try: - self._receive_timeout = self.get_setting( - 'muscle_deadlock_receive_timeout', 'float') + timeout = self.get_setting('muscle_deadlock_receive_timeout', 'float') + self._communicator.set_receive_timeout(timeout) except KeyError: pass # do nothing and keep the default - _logger.debug("Timeout on receiving messages set to %f", self._receive_timeout) + _logger.debug( + "Timeout on receiving messages set to %f", + self._communicator._receive_timeout) def _decide_reuse_instance(self) -> bool: """Decide whether and how to reuse the instance. @@ -948,8 +946,7 @@ def __receive_message( return default else: - msg, saved_until = self._communicator.receive_message( - port_name, slot, self._receive_timeout) + msg, saved_until = self._communicator.receive_message(port_name, slot) if not port.is_open(slot): err_msg = (('Port {} was closed while trying to' ' receive on it, did the peer crash?' @@ -1093,8 +1090,7 @@ def __receive_settings(self) -> bool: Returns: False iff the port is connnected and ClosePort was received. 
""" - message, saved_until = self._communicator.receive_message( - 'muscle_settings_in', None, self._receive_timeout) + message, saved_until = self._communicator.receive_message('muscle_settings_in') if isinstance(message.data, ClosePort): return False @@ -1124,8 +1120,7 @@ def __pre_receive_f_init(self) -> None: apply_overlay = InstanceFlags.DONT_APPLY_OVERLAY not in self._flags def pre_receive(port_name: str, slot: Optional[int]) -> None: - msg, saved_until = self._communicator.receive_message( - port_name, slot, self._receive_timeout) + msg, saved_until = self._communicator.receive_message(port_name, slot) self._f_init_cache[(port_name, slot)] = msg if apply_overlay: self.__apply_overlay(msg) diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 5397a14e..165e3353 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -115,7 +115,8 @@ def test_receive_message(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in', None, -1) + connected_communicator.set_receive_timeout(-1) + recv_msg, saved_until = connected_communicator.receive_message('in') mpp_client.receive.assert_called_with(Ref('component.in'), None) @@ -135,7 +136,8 @@ def test_receive_message_vector(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in_v', 5, -1) + connected_communicator.set_receive_timeout(-1) + recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) mpp_client.receive.assert_called_with(Ref('component.in_v[5]'), None) @@ -155,7 +157,7 @@ def test_receive_close_port(connected_communicator, mpp_client, port_manager): mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in', None, -1) + recv_msg, saved_until = connected_communicator.receive_message('in') assert port_manager.get_port('in').is_open() is False @@ -167,7 +169,7 @@ def test_receive_close_port_vector(connected_communicator, mpp_client, port_mana mpp_client.receive.return_value = msg.encoded(), MagicMock() - recv_msg, saved_until = connected_communicator.receive_message('in_v', 5, -1) + recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) assert port_manager.get_port('in_v').is_open(5) is False @@ -182,12 +184,12 @@ def test_port_count_validation( mpp_client.receive.return_value = msg.encoded(), MagicMock() - connected_communicator.receive_message('in', None, -1) + connected_communicator.receive_message('in') assert connected_port_manager.get_port('in').get_message_counts() == [1] with pytest.raises(RuntimeError): # the message received has message_number = 0 again - connected_communicator.receive_message('in', None, -1) + connected_communicator.receive_message('in') def test_port_discard_error_on_resume( @@ -212,7 +214,7 @@ def test_port_discard_error_on_resume( # message_number=1 with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): with pytest.raises(RuntimeError): - connected_communicator.receive_message('in', None, -1) + connected_communicator.receive_message('in') assert any([ 'Discarding received message' in rec.message @@ -238,7 +240,7 @@ def test_port_discard_success_on_resume( assert connected_port_manager.get_port(port).is_resuming(None) with 
caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): - msg, _ = connected_communicator.receive_message('in', None, -1) + msg, _ = connected_communicator.receive_message('in') assert any([ 'Discarding received message' in rec.message for rec in caplog.records]) diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 264a0b9b..9dc52c52 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -355,7 +355,6 @@ def test_reuse_set_overlay( instance, port_manager, mock_ports, communicator, settings_manager): port_manager.settings_in_connected.return_value = True mock_ports['in']._is_connected = False - instance._receive_timeout = -1 mock_msg = MagicMock() mock_msg.data = Settings({'s1': 1, 's2': 2}) @@ -364,7 +363,7 @@ def test_reuse_set_overlay( instance.reuse_instance() - communicator.receive_message.assert_called_with('muscle_settings_in', None, -1) + communicator.receive_message.assert_called_with('muscle_settings_in') assert settings_manager.overlay['s0'] == 0 assert settings_manager.overlay['s1'] == 1 assert settings_manager.overlay['s2'] == 2 @@ -479,7 +478,7 @@ def test_receive_no_default(instance): def test_receive_inconsistent_settings( instance, settings_manager, port_manager, communicator): - def receive_message(port, slot, timeout): + def receive_message(port, slot=None): mock_msg = MagicMock() if port == 'muscle_settings_in': mock_msg.data = Settings({'s1': 1}) From c1d1d574526af84f34eb9f8a4ec1e0ca3fb107c7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Aug 2024 16:49:25 +0200 Subject: [PATCH 05/19] Update documentation and logging Also pass the manager as a direct argument to the Communicator --- libmuscle/python/libmuscle/communicator.py | 25 ++++++++++++++++--- libmuscle/python/libmuscle/instance.py | 3 ++- .../libmuscle/manager/instance_manager.py | 6 +++++ .../libmuscle/test/test_communicator.py | 4 +-- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 153c194c..a1c886d3 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -63,11 +63,28 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, class RecvTimeoutHandler(TimeoutHandler): + """Timeout handler when receiving messages from peers. + + This handler sends a message to the Muscle Manager when the receive times out (and + another message when the message does arrive). + + This is used by the manager to detect if the simulation is in a deadlock, where a + cycle of instances is waiting on each other. + """ + def __init__( self, manager: MMPClient, peer_instance: str, port_name: str, slot: Optional[int], timeout: float ) -> None: + """Initialize a new timeout handler. + + Args: + manager: Connection to the muscle manager. + peer_instance: the peer instance we try to receive from. + port_name: the name of the port we try to receive on. + slot: the slot we try to receive on. 
+ """ self._manager = manager self._peer_instance = peer_instance self._port_name = port_name @@ -84,7 +101,7 @@ def on_timeout(self) -> None: def on_receive(self) -> None: self._manager.waiting_for_receive_done( - self._peer_instance, self._port_name, self._slot) + self._peer_instance, self._port_name, self._slot) class Communicator: @@ -97,7 +114,8 @@ class Communicator: """ def __init__( self, kernel: Reference, index: List[int], - port_manager: PortManager, profiler: Profiler) -> None: + port_manager: PortManager, profiler: Profiler, + manager: MMPClient) -> None: """Create a Communicator. The instance reference must start with one or more Identifiers, @@ -115,8 +133,7 @@ def __init__( self._index = index self._port_manager = port_manager self._profiler = profiler - # TODO: make this a proper argument of __init__() - self._manager = profiler._manager + self._manager = manager # Notify manager, by default, after 10 seconds waiting in receive_message() self._receive_timeout = 10.0 diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index c38f6aa3..d98777f5 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -142,7 +142,8 @@ def __init__( """PortManager for this instance.""" self._communicator = Communicator( - self._name, self._index, self._port_manager, self._profiler) + self._name, self._index, self._port_manager, self._profiler, + self.__manager) """Communicator for this instance.""" self._declared_ports = ports diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 8d06c45e..58582cda 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -190,6 +190,9 @@ def cancel_all() -> None: _logger.info( f'Instance {result.instance} was shut down by' f' MUSCLE3 because an error occurred elsewhere') + # Ensure we don't see this as a succesful run when shutdown() is called + # by another thread: + all_seemingly_okay = False else: stderr_file = ( self._run_dir.instance_dir(result.instance) / @@ -260,6 +263,9 @@ def cancel_all() -> None: 'More output may be found in' f' {self._run_dir.instance_dir(result.instance)}\n' ) + elif not all_seemingly_okay: + # shutdown() was called by another thread (e.g. 
the DeadlockDetector): + _logger.error('The simulation was aborted.') else: _logger.info('The simulation finished without error.') diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 165e3353..739777fa 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -1,5 +1,5 @@ import logging -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, Mock, patch import pytest @@ -46,7 +46,7 @@ def mpp_client(MPPClient): @pytest.fixture def communicator(connected_port_manager, profiler): - return Communicator(Ref('component'), [], connected_port_manager, profiler) + return Communicator(Ref('component'), [], connected_port_manager, profiler, Mock()) @pytest.fixture From a3ccab2fbe4ecb648ba254ed7f0b41988b77edc8 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 09:14:10 +0200 Subject: [PATCH 06/19] Update waiting times in unit tests (should fix a CI failure) --- .../python/libmuscle/manager/test/test_deadlock_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py index 718a6569..c0055f73 100644 --- a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py @@ -79,7 +79,7 @@ def test_waiting_for_different_port_log_error( def test_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: detector.put_waiting("macro", "micro", "s", None) detector.put_waiting("micro", "macro", "f_init", None) - time.sleep(0.05) + detector.join(1) # wait max 1 second then check that the deadlock was detected assert not detector.is_alive() shutdown_callback.assert_called_once() @@ -101,6 +101,6 @@ def test_double_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> detector.put_waiting("cycle2", "peer2", "s", None) detector.put_waiting("peer2", "cycle2", "f_init", None) detector.put_waiting_done("macro", "micro", "s", None) - time.sleep(0.05) + detector.join(1) # wait max 1 second then check that the deadlock was detected assert not detector.is_alive() shutdown_callback.assert_called() From c3a13b112b3eac5bdc314ca0177a02d4665eaeaa Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 10:37:53 +0200 Subject: [PATCH 07/19] Process review feedback for the manager component of the deadlock detection --- .../libmuscle/manager/deadlock_detector.py | 70 +++++++++---------- libmuscle/python/libmuscle/manager/manager.py | 8 ++- .../manager/test/test_deadlock_detector.py | 58 +++++++-------- setup.cfg | 4 ++ 4 files changed, 73 insertions(+), 67 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index dc3c5111..775e80ff 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -60,19 +60,29 @@ def run(self) -> None: """Logic that is executed in the thread.""" while True: # Set a timeout when a deadlock was detected - timeout = None + seconds_until_abort = None if self._shutdown_time is not None: - timeout = max(0, self._shutdown_time - time.monotonic()) + seconds_until_abort = max(0, self._shutdown_time - time.monotonic()) # Grab a new item from the queue, this raises Empty when timeout expires: try: - 
item = self._queue.get(timeout=timeout) + item = self._queue.get(timeout=seconds_until_abort) if item is None: # On shutdown, None is pushed to the queue return # exit thread self._process_queue_item(item) except Empty: - # timeout expired and queue is empty, call shutdown callback + # Timeout was set and has expired without any new messages: + # - We only set a timeout when there is a deadlock + # - An item is pushed to the queue when any instance sends a + # WAITING_FOR_RECEIVE_DONE message (which may cancel a potential + # deadlock) + # Therefore: + # - We have not received a message cancelling a deadlock cycle + # (otherwise this would have triggered a _process_queue_item, clearing + # self._shutdown_time and we had not set another timeout). And so a + # deadlock is still present. Assert it to be absolutely certain: + assert self._detected_deadlocks formatted_deadlocks = "\n\n".join( self._format_deadlock(instances) for instances in self._detected_deadlocks) @@ -127,11 +137,8 @@ def _process_queue_item(self, item: _QueueItem) -> None: is_waiting, instance_id, peer_instance_id, port_name, slot = item if is_waiting: # Sanity checks, triggering this is a bug in the instance or the manager - if instance_id in self._waiting_instances: - _logger.error( - "Instance %s was already waiting on a receive call. " - "Did we miss a WAITING DONE event?", - instance_id) + assert instance_id not in self._waiting_instances + # Register that the instance is waiting self._waiting_instances[instance_id] = peer_instance_id self._waiting_instance_ports[instance_id] = (port_name, slot) @@ -139,32 +146,23 @@ def _process_queue_item(self, item: _QueueItem) -> None: else: # Sanity checks, triggering these is a bug in the instance or the manager - if instance_id not in self._waiting_instances: - _logger.error( - "Instance %s is not waiting on a receive call.", instance_id) - elif self._waiting_instances[instance_id] != peer_instance_id: - _logger.error( - "Instance %s was waiting for %s, not for %s.", - instance_id, - self._waiting_instances[instance_id], - peer_instance_id) - elif self._waiting_instance_ports[instance_id] != (port_name, slot): - _logger.error( - "Instance %s was waiting on port[slot] %s[%s], not on %s[%s]", - instance_id, - *self._waiting_instance_ports[instance_id], - port_name, slot) - else: - del self._waiting_instances[instance_id] - del self._waiting_instance_ports[instance_id] - - # Check if we were part of a deadlock - for i, instance_list in enumerate(self._detected_deadlocks): - if instance_id in instance_list: - del self._detected_deadlocks[i] - break - if not self._detected_deadlocks: - # There are no deadlocks anymore: cancel shutdown + assert instance_id in self._waiting_instances + assert self._waiting_instances[instance_id] == peer_instance_id + assert self._waiting_instance_ports[instance_id] == (port_name, slot) + + # We're not waiting anymore + del self._waiting_instances[instance_id] + del self._waiting_instance_ports[instance_id] + + # Check if we were part of a deadlock + for i, instance_list in enumerate(self._detected_deadlocks): + if instance_id in instance_list: + del self._detected_deadlocks[i] + break + if not self._detected_deadlocks: + # There are no deadlocks anymore: cancel shutdown + if self._shutdown_time is not None: + _logger.info("Deadlock has resolved, abort is cancelled.") self._shutdown_time = None def _check_for_deadlock(self, instance_id: str) -> None: @@ -207,7 +205,7 @@ def _format_deadlock(self, deadlock_instances: List[str]) -> str: 
deadlock_instances: list of instances waiting on eachother """ num_instances = str(len(deadlock_instances)) - lines = [f"The following {num_instances} instances are dead-locked:"] + lines = [f"The following {num_instances} instances are deadlocked:"] for i, instance in enumerate(deadlock_instances): num = str(i+1).rjust(len(num_instances)) peer_instance = self._waiting_instances[instance] diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index df4cd0d8..4a52787d 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -80,7 +80,10 @@ def __init__( self._snapshot_registry = SnapshotRegistry( configuration, snapshot_dir, self._topology_store) self._snapshot_registry.start() - # FIXME configure timeout: + + # Hard-code grace period to 5 seconds. We may want to do something smarter in + # the future (e.g. set a timeout dependent on the number of instances), but this + # should suffice for now: self._deadlock_detector = DeadlockDetector(self.stop, 5.0) self._deadlock_detector.start() @@ -127,7 +130,8 @@ def stop(self) -> None: self._instance_manager.shutdown() self._deadlock_detector.shutdown() # Note: don't join() deadlock detector, as this method may be called from the - # DeadlockDetector thread. join() would (ironically) deadlock the shutdown :) + # DeadlockDetector thread and calling join() on your own thread raises a + # RuntimeError. self._server.stop() self._snapshot_registry.shutdown() self._snapshot_registry.join() diff --git a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py index c0055f73..24a44435 100644 --- a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py @@ -1,4 +1,3 @@ -import logging import time from typing import Iterator from unittest.mock import Mock @@ -8,6 +7,11 @@ from libmuscle.manager.deadlock_detector import DeadlockDetector +# Decorator for tests that expect an exception in the DeadlockDetector thread +ignore_unhandled_thread_exception = pytest.mark.filterwarnings( + "ignore::pytest.PytestUnhandledThreadExceptionWarning") + + @pytest.fixture def shutdown_callback() -> Mock: return Mock() @@ -33,47 +37,43 @@ def test_no_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> Non shutdown_callback.assert_not_called() -def test_double_waiting_log_error( - caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: +@ignore_unhandled_thread_exception +def test_double_waiting_log_error(detector: DeadlockDetector) -> None: detector.put_waiting("macro", "micro", "s", 0) detector.put_waiting("macro", "micro", "s", 1) - detector.shutdown() - detector.join() - assert len(caplog.record_tuples) == 1 - assert caplog.record_tuples[0][:2] == ( - "libmuscle.manager.deadlock_detector", logging.ERROR) + # This should trigger an AssertionError in the thread and shut it down. 
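
The cycle check behind _check_for_deadlock can be summarised as: starting from the instance that just began waiting, follow the waits-for edges recorded in _waiting_instances; if the walk leads back to the starting instance, every instance on that path is part of a communication deadlock. A condensed, self-contained illustration of the idea follows (the real method records the cycle in _detected_deadlocks rather than returning it):

    from typing import Dict, List, Optional


    def find_deadlock_cycle(
            waiting_instances: Dict[str, str], start: str) -> Optional[List[str]]:
        # waiting_instances maps an instance to the peer it is blocked on.
        cycle = [start]
        current = waiting_instances.get(start)
        while current is not None:
            if current == start:
                return cycle        # we came back around: deadlock
            if current in cycle:
                return None         # a cycle not involving start; reported elsewhere
            cycle.append(current)
            current = waiting_instances.get(current)
        return None                 # chain ends at an instance that is not waiting


    # macro waits for micro and micro waits for macro: a two-instance deadlock.
    assert find_deadlock_cycle(
        {'macro': 'micro', 'micro': 'macro'}, 'micro') == ['micro', 'macro']
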
+ # We cannot test for the exception, so check that the thread is not running anymore: + detector.join(0.1) + assert not detector.is_alive() -def test_not_waiting_log_error( - caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: +@ignore_unhandled_thread_exception +def test_not_waiting_log_error(detector: DeadlockDetector) -> None: detector.put_waiting_done("macro", "micro", "s", 0) - detector.shutdown() - detector.join() - assert len(caplog.record_tuples) == 1 - assert caplog.record_tuples[0][:2] == ( - "libmuscle.manager.deadlock_detector", logging.ERROR) + # This should trigger an AssertionError in the thread and shut it down. + # We cannot test for the exception, so check that the thread is not running anymore: + detector.join(0.1) + assert not detector.is_alive() -def test_waiting_for_different_instance_log_error( - caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: +@ignore_unhandled_thread_exception +def test_waiting_for_different_instance_log_error(detector: DeadlockDetector) -> None: detector.put_waiting("macro", "micro", "s", 0) detector.put_waiting_done("macro", "meso", "s", 0) - detector.shutdown() - detector.join() - assert len(caplog.record_tuples) == 1 - assert caplog.record_tuples[0][:2] == ( - "libmuscle.manager.deadlock_detector", logging.ERROR) + # This should trigger an AssertionError in the thread and shut it down. + # We cannot test for the exception, so check that the thread is not running anymore: + detector.join(0.1) + assert not detector.is_alive() -def test_waiting_for_different_port_log_error( - caplog: pytest.LogCaptureFixture, detector: DeadlockDetector) -> None: +@ignore_unhandled_thread_exception +def test_waiting_for_different_port_log_error(detector: DeadlockDetector) -> None: detector.put_waiting("macro", "micro", "s", 0) detector.put_waiting_done("macro", "micro", "f_init", 0) - detector.shutdown() - detector.join() - assert len(caplog.record_tuples) == 1 - assert caplog.record_tuples[0][:2] == ( - "libmuscle.manager.deadlock_detector", logging.ERROR) + # This should trigger an AssertionError in the thread and shut it down. 
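
These tests drive the detector thread purely through its queue: put_waiting() and put_waiting_done() push items, shutdown() pushes the None sentinel, and the grace period becomes a timeout on Queue.get(). Stripped of the deadlock bookkeeping, the underlying concurrency pattern looks roughly like the sketch below (an illustration only, not the DeadlockDetector itself):

    import queue
    import threading
    import time
    from typing import Callable, Optional


    class GracePeriodWorker(threading.Thread):
        # Processes events from a queue; once a problem is suspected, aborts if
        # nothing new arrives before the grace period runs out.

        def __init__(self, on_abort: Callable[[], None], grace_period: float) -> None:
            super().__init__()
            self._queue: "queue.Queue[Optional[str]]" = queue.Queue()
            self._on_abort = on_abort
            self._grace_period = grace_period
            self._abort_at: Optional[float] = None

        def submit(self, event: Optional[str]) -> None:
            # None is the shutdown sentinel, as in DeadlockDetector.shutdown().
            self._queue.put(event)

        def run(self) -> None:
            while True:
                timeout = None
                if self._abort_at is not None:
                    timeout = max(0.0, self._abort_at - time.monotonic())
                try:
                    event = self._queue.get(timeout=timeout)
                except queue.Empty:
                    self._on_abort()        # grace period expired with no new events
                    return
                if event is None:
                    return                  # clean shutdown requested
                if event == 'suspect':      # analogous to put_waiting() completing a cycle
                    self._abort_at = time.monotonic() + self._grace_period
                elif event == 'resolved':   # analogous to put_waiting_done() breaking it
                    self._abort_at = None
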
+ # We cannot test for the exception, so check that the thread is not running anymore: + detector.join(0.1) + assert not detector.is_alive() def test_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: diff --git a/setup.cfg b/setup.cfg index 7f90b47c..f56bea66 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,6 +2,10 @@ testpaths = libmuscle/python integration_test addopts = --cov --cov-report xml --cov-report term-missing -s # -vv --log-cli-level=DEBUG +filterwarnings = + # raise an error when there are unhandled exceptions in a worker thread + error::pytest.PytestUnhandledThreadExceptionWarning + [mypy] files = libmuscle/python/**/*.py, scripts/*.py, muscle3/*.py From a727b1c855f788602864185b01e091ab2eb46922 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 11:24:31 +0200 Subject: [PATCH 08/19] Refactor receive timeout logic Reduces the complexity of the Communicator --- libmuscle/python/libmuscle/communicator.py | 75 +++--------------- libmuscle/python/libmuscle/instance.py | 13 +-- .../libmuscle/receive_timeout_handler.py | 79 +++++++++++++++++++ .../libmuscle/test/test_communicator.py | 7 +- 4 files changed, 103 insertions(+), 71 deletions(-) create mode 100644 libmuscle/python/libmuscle/receive_timeout_handler.py diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index a1c886d3..b8f952ce 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -3,17 +3,16 @@ from ymmsl import Identifier, Reference, Settings from libmuscle.endpoint import Endpoint -from libmuscle.mmp_client import MMPClient from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.mpp_client import MPPClient from libmuscle.mpp_server import MPPServer from libmuscle.mcp.tcp_util import SocketClosed -from libmuscle.mcp.transport_client import TimeoutHandler from libmuscle.peer_info import PeerInfo from libmuscle.port_manager import PortManager from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) +from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory _logger = logging.getLogger(__name__) @@ -62,48 +61,6 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, self.settings = settings -class RecvTimeoutHandler(TimeoutHandler): - """Timeout handler when receiving messages from peers. - - This handler sends a message to the Muscle Manager when the receive times out (and - another message when the message does arrive). - - This is used by the manager to detect if the simulation is in a deadlock, where a - cycle of instances is waiting on each other. - """ - - def __init__( - self, manager: MMPClient, - peer_instance: str, port_name: str, slot: Optional[int], - timeout: float - ) -> None: - """Initialize a new timeout handler. - - Args: - manager: Connection to the muscle manager. - peer_instance: the peer instance we try to receive from. - port_name: the name of the port we try to receive on. - slot: the slot we try to receive on. 
- """ - self._manager = manager - self._peer_instance = peer_instance - self._port_name = port_name - self._slot = slot - self._timeout = timeout - - @property - def timeout(self) -> float: - return self._timeout - - def on_timeout(self) -> None: - self._manager.waiting_for_receive( - self._peer_instance, self._port_name, self._slot) - - def on_receive(self) -> None: - self._manager.waiting_for_receive_done( - self._peer_instance, self._port_name, self._slot) - - class Communicator: """Communication engine for MUSCLE3. @@ -114,8 +71,7 @@ class Communicator: """ def __init__( self, kernel: Reference, index: List[int], - port_manager: PortManager, profiler: Profiler, - manager: MMPClient) -> None: + port_manager: PortManager, profiler: Profiler) -> None: """Create a Communicator. The instance reference must start with one or more Identifiers, @@ -133,10 +89,10 @@ def __init__( self._index = index self._port_manager = port_manager self._profiler = profiler - self._manager = manager - # Notify manager, by default, after 10 seconds waiting in receive_message() - self._receive_timeout = 10.0 + # Note: Instance will set this attribute with a call to + # set_receive_timeout_factory() before it is used + self._receive_timeout_handler_factory: ReceiveTimeoutHandlerFactory self._server = MPPServer() # indexed by remote instance id @@ -165,15 +121,12 @@ def set_peer_info(self, peer_info: PeerInfo) -> None: """ self._peer_info = peer_info - def set_receive_timeout(self, receive_timeout: float) -> None: - """Update the timeout after which the manager is notified that we are waiting - for a message. - - Args: - receive_timeout: Timeout (seconds). A negative number disables the deadlock - notification mechanism. + def set_receive_timeout_factory( + self, factory: ReceiveTimeoutHandlerFactory) -> None: + """Set the factory function for the receive timeout handler (used for deadlock + detection). 
""" - self._receive_timeout = receive_timeout + self._receive_timeout_handler_factory = factory def send_message( self, port_name: str, message: Message, @@ -286,11 +239,9 @@ def receive_message( snd_endpoint = self._peer_info.get_peer_endpoints( recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) - timeout_handler = None - if self._receive_timeout >= 0: - timeout_handler = RecvTimeoutHandler( - self._manager, str(snd_endpoint.instance()), - port_name, slot, self._receive_timeout) + # Set timeout handler for message receives (deadlock detection) + timeout_handler = self._receive_timeout_handler_factory( + str(snd_endpoint), port_name, slot) try: mpp_message_bytes, profile = client.receive( recv_endpoint.ref(), timeout_handler) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index d98777f5..e1aa2aea 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -23,6 +23,7 @@ from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) +from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory from libmuscle.snapshot_manager import SnapshotManager from libmuscle.util import extract_log_file_location @@ -142,8 +143,7 @@ def __init__( """PortManager for this instance.""" self._communicator = Communicator( - self._name, self._index, self._port_manager, self._profiler, - self.__manager) + self._name, self._index, self._port_manager, self._profiler) """Communicator for this instance.""" self._declared_ports = ports @@ -183,7 +183,7 @@ def __init__( # Note: self._setup_checkpointing() needs to have the ports initialized # so it comes after self._connect() self._setup_checkpointing() - # profiling and logging need settings, so come after register_() + # profiling, logging and receive timeout need settings: come after register_() self._set_local_log_level() self._set_remote_log_level() self._setup_profiling() @@ -816,12 +816,13 @@ def _setup_receive_timeout(self) -> None: """ try: timeout = self.get_setting('muscle_deadlock_receive_timeout', 'float') - self._communicator.set_receive_timeout(timeout) except KeyError: - pass # do nothing and keep the default + timeout = 10.0 # Use 10 seconds as default timeout _logger.debug( "Timeout on receiving messages set to %f", - self._communicator._receive_timeout) + timeout) + factory = ReceiveTimeoutHandlerFactory(self.__manager, timeout) + self._communicator.set_receive_timeout_factory(factory) def _decide_reuse_instance(self) -> bool: """Decide whether and how to reuse the instance. diff --git a/libmuscle/python/libmuscle/receive_timeout_handler.py b/libmuscle/python/libmuscle/receive_timeout_handler.py new file mode 100644 index 00000000..fb62a21e --- /dev/null +++ b/libmuscle/python/libmuscle/receive_timeout_handler.py @@ -0,0 +1,79 @@ +from typing import Optional + +from libmuscle.mcp.transport_client import TimeoutHandler +from libmuscle.mmp_client import MMPClient + + +class ReceiveTimeoutHandler(TimeoutHandler): + """Timeout handler when receiving messages from peers. + + This handler sends a message to the Muscle Manager when the receive times out (and + another message when the message does arrive). + + This is used by the manager to detect if the simulation is in a deadlock, where a + cycle of instances is waiting on each other. 
+ """ + + def __init__( + self, + manager: MMPClient, + peer_instance: str, + port_name: str, + slot: Optional[int], + timeout: float, + ) -> None: + """Initialize a new timeout handler. + + Args: + manager: Connection to the muscle manager. + peer_instance: the peer instance we try to receive from. + port_name: the name of the port we try to receive on. + slot: the slot we try to receive on. + """ + self._manager = manager + self._peer_instance = peer_instance + self._port_name = port_name + self._slot = slot + self._timeout = timeout + + @property + def timeout(self) -> float: + return self._timeout + + def on_timeout(self) -> None: + self._manager.waiting_for_receive( + self._peer_instance, self._port_name, self._slot) + + def on_receive(self) -> None: + self._manager.waiting_for_receive_done( + self._peer_instance, self._port_name, self._slot) + + +class ReceiveTimeoutHandlerFactory: + """Factory class which can be called to create ReceiveTimeoutHandler instances. + """ + + def __init__(self, manager: MMPClient, timeout: float) -> None: + """Initialize factory class. + + Args: + manager: Connection to the muscle manager. + timeout: Timeout in seconds before the manager is notified that we are + waiting for a message. + """ + self.manager = manager + self.timeout = timeout + + def __call__( + self, peer_instance_id: str, port_name: str, slot: Optional[int] + ) -> Optional[TimeoutHandler]: + """Create a new TimeoutHandler. + + Returns: + A new ReceiveTimeoutHandler with the provided parameters, or None when + self.timeout is negative. + """ + if self.timeout < 0: + return None + return ReceiveTimeoutHandler( + self.manager, peer_instance_id, port_name, slot, self.timeout) diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 739777fa..ee5a0791 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -6,6 +6,7 @@ from libmuscle.communicator import Communicator, Message from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.peer_info import PeerInfo +from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory from ymmsl import Conduit, Reference as Ref, Settings @@ -46,7 +47,9 @@ def mpp_client(MPPClient): @pytest.fixture def communicator(connected_port_manager, profiler): - return Communicator(Ref('component'), [], connected_port_manager, profiler, Mock()) + communicator = Communicator(Ref('component'), [], connected_port_manager, profiler) + communicator.set_receive_timeout_factory(ReceiveTimeoutHandlerFactory(Mock(), -1)) + return communicator @pytest.fixture @@ -115,7 +118,6 @@ def test_receive_message(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - connected_communicator.set_receive_timeout(-1) recv_msg, saved_until = connected_communicator.receive_message('in') mpp_client.receive.assert_called_with(Ref('component.in'), None) @@ -136,7 +138,6 @@ def test_receive_message_vector(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() - connected_communicator.set_receive_timeout(-1) recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) mpp_client.receive.assert_called_with(Ref('component.in_v[5]'), None) From cd79b7d7e0d3a22a1694e1d9734233b09fc8a43d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 12:18:43 +0200 Subject: [PATCH 09/19] Revert "Refactor receive timeout logic" This 
reverts commit a727b1c855f788602864185b01e091ab2eb46922. --- libmuscle/python/libmuscle/communicator.py | 75 +++++++++++++++--- libmuscle/python/libmuscle/instance.py | 13 ++- .../libmuscle/receive_timeout_handler.py | 79 ------------------- .../libmuscle/test/test_communicator.py | 7 +- 4 files changed, 71 insertions(+), 103 deletions(-) delete mode 100644 libmuscle/python/libmuscle/receive_timeout_handler.py diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index b8f952ce..a1c886d3 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -3,16 +3,17 @@ from ymmsl import Identifier, Reference, Settings from libmuscle.endpoint import Endpoint +from libmuscle.mmp_client import MMPClient from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.mpp_client import MPPClient from libmuscle.mpp_server import MPPServer from libmuscle.mcp.tcp_util import SocketClosed +from libmuscle.mcp.transport_client import TimeoutHandler from libmuscle.peer_info import PeerInfo from libmuscle.port_manager import PortManager from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) -from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory _logger = logging.getLogger(__name__) @@ -61,6 +62,48 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, self.settings = settings +class RecvTimeoutHandler(TimeoutHandler): + """Timeout handler when receiving messages from peers. + + This handler sends a message to the Muscle Manager when the receive times out (and + another message when the message does arrive). + + This is used by the manager to detect if the simulation is in a deadlock, where a + cycle of instances is waiting on each other. + """ + + def __init__( + self, manager: MMPClient, + peer_instance: str, port_name: str, slot: Optional[int], + timeout: float + ) -> None: + """Initialize a new timeout handler. + + Args: + manager: Connection to the muscle manager. + peer_instance: the peer instance we try to receive from. + port_name: the name of the port we try to receive on. + slot: the slot we try to receive on. + """ + self._manager = manager + self._peer_instance = peer_instance + self._port_name = port_name + self._slot = slot + self._timeout = timeout + + @property + def timeout(self) -> float: + return self._timeout + + def on_timeout(self) -> None: + self._manager.waiting_for_receive( + self._peer_instance, self._port_name, self._slot) + + def on_receive(self) -> None: + self._manager.waiting_for_receive_done( + self._peer_instance, self._port_name, self._slot) + + class Communicator: """Communication engine for MUSCLE3. @@ -71,7 +114,8 @@ class Communicator: """ def __init__( self, kernel: Reference, index: List[int], - port_manager: PortManager, profiler: Profiler) -> None: + port_manager: PortManager, profiler: Profiler, + manager: MMPClient) -> None: """Create a Communicator. 
The instance reference must start with one or more Identifiers, @@ -89,10 +133,10 @@ def __init__( self._index = index self._port_manager = port_manager self._profiler = profiler + self._manager = manager + # Notify manager, by default, after 10 seconds waiting in receive_message() + self._receive_timeout = 10.0 - # Note: Instance will set this attribute with a call to - # set_receive_timeout_factory() before it is used - self._receive_timeout_handler_factory: ReceiveTimeoutHandlerFactory self._server = MPPServer() # indexed by remote instance id @@ -121,12 +165,15 @@ def set_peer_info(self, peer_info: PeerInfo) -> None: """ self._peer_info = peer_info - def set_receive_timeout_factory( - self, factory: ReceiveTimeoutHandlerFactory) -> None: - """Set the factory function for the receive timeout handler (used for deadlock - detection). + def set_receive_timeout(self, receive_timeout: float) -> None: + """Update the timeout after which the manager is notified that we are waiting + for a message. + + Args: + receive_timeout: Timeout (seconds). A negative number disables the deadlock + notification mechanism. """ - self._receive_timeout_handler_factory = factory + self._receive_timeout = receive_timeout def send_message( self, port_name: str, message: Message, @@ -239,9 +286,11 @@ def receive_message( snd_endpoint = self._peer_info.get_peer_endpoints( recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) - # Set timeout handler for message receives (deadlock detection) - timeout_handler = self._receive_timeout_handler_factory( - str(snd_endpoint), port_name, slot) + timeout_handler = None + if self._receive_timeout >= 0: + timeout_handler = RecvTimeoutHandler( + self._manager, str(snd_endpoint.instance()), + port_name, slot, self._receive_timeout) try: mpp_message_bytes, profile = client.receive( recv_endpoint.ref(), timeout_handler) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index e1aa2aea..d98777f5 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -23,7 +23,6 @@ from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) -from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory from libmuscle.snapshot_manager import SnapshotManager from libmuscle.util import extract_log_file_location @@ -143,7 +142,8 @@ def __init__( """PortManager for this instance.""" self._communicator = Communicator( - self._name, self._index, self._port_manager, self._profiler) + self._name, self._index, self._port_manager, self._profiler, + self.__manager) """Communicator for this instance.""" self._declared_ports = ports @@ -183,7 +183,7 @@ def __init__( # Note: self._setup_checkpointing() needs to have the ports initialized # so it comes after self._connect() self._setup_checkpointing() - # profiling, logging and receive timeout need settings: come after register_() + # profiling and logging need settings, so come after register_() self._set_local_log_level() self._set_remote_log_level() self._setup_profiling() @@ -816,13 +816,12 @@ def _setup_receive_timeout(self) -> None: """ try: timeout = self.get_setting('muscle_deadlock_receive_timeout', 'float') + self._communicator.set_receive_timeout(timeout) except KeyError: - timeout = 10.0 # Use 10 seconds as default timeout + pass # do nothing and keep the default _logger.debug( "Timeout on receiving messages set to %f", - timeout) - factory = 
ReceiveTimeoutHandlerFactory(self.__manager, timeout) - self._communicator.set_receive_timeout_factory(factory) + self._communicator._receive_timeout) def _decide_reuse_instance(self) -> bool: """Decide whether and how to reuse the instance. diff --git a/libmuscle/python/libmuscle/receive_timeout_handler.py b/libmuscle/python/libmuscle/receive_timeout_handler.py deleted file mode 100644 index fb62a21e..00000000 --- a/libmuscle/python/libmuscle/receive_timeout_handler.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import Optional - -from libmuscle.mcp.transport_client import TimeoutHandler -from libmuscle.mmp_client import MMPClient - - -class ReceiveTimeoutHandler(TimeoutHandler): - """Timeout handler when receiving messages from peers. - - This handler sends a message to the Muscle Manager when the receive times out (and - another message when the message does arrive). - - This is used by the manager to detect if the simulation is in a deadlock, where a - cycle of instances is waiting on each other. - """ - - def __init__( - self, - manager: MMPClient, - peer_instance: str, - port_name: str, - slot: Optional[int], - timeout: float, - ) -> None: - """Initialize a new timeout handler. - - Args: - manager: Connection to the muscle manager. - peer_instance: the peer instance we try to receive from. - port_name: the name of the port we try to receive on. - slot: the slot we try to receive on. - """ - self._manager = manager - self._peer_instance = peer_instance - self._port_name = port_name - self._slot = slot - self._timeout = timeout - - @property - def timeout(self) -> float: - return self._timeout - - def on_timeout(self) -> None: - self._manager.waiting_for_receive( - self._peer_instance, self._port_name, self._slot) - - def on_receive(self) -> None: - self._manager.waiting_for_receive_done( - self._peer_instance, self._port_name, self._slot) - - -class ReceiveTimeoutHandlerFactory: - """Factory class which can be called to create ReceiveTimeoutHandler instances. - """ - - def __init__(self, manager: MMPClient, timeout: float) -> None: - """Initialize factory class. - - Args: - manager: Connection to the muscle manager. - timeout: Timeout in seconds before the manager is notified that we are - waiting for a message. - """ - self.manager = manager - self.timeout = timeout - - def __call__( - self, peer_instance_id: str, port_name: str, slot: Optional[int] - ) -> Optional[TimeoutHandler]: - """Create a new TimeoutHandler. - - Returns: - A new ReceiveTimeoutHandler with the provided parameters, or None when - self.timeout is negative. 
- """ - if self.timeout < 0: - return None - return ReceiveTimeoutHandler( - self.manager, peer_instance_id, port_name, slot, self.timeout) diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index ee5a0791..739777fa 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -6,7 +6,6 @@ from libmuscle.communicator import Communicator, Message from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.peer_info import PeerInfo -from libmuscle.receive_timeout_handler import ReceiveTimeoutHandlerFactory from ymmsl import Conduit, Reference as Ref, Settings @@ -47,9 +46,7 @@ def mpp_client(MPPClient): @pytest.fixture def communicator(connected_port_manager, profiler): - communicator = Communicator(Ref('component'), [], connected_port_manager, profiler) - communicator.set_receive_timeout_factory(ReceiveTimeoutHandlerFactory(Mock(), -1)) - return communicator + return Communicator(Ref('component'), [], connected_port_manager, profiler, Mock()) @pytest.fixture @@ -118,6 +115,7 @@ def test_receive_message(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() + connected_communicator.set_receive_timeout(-1) recv_msg, saved_until = connected_communicator.receive_message('in') mpp_client.receive.assert_called_with(Ref('component.in'), None) @@ -138,6 +136,7 @@ def test_receive_message_vector(connected_communicator, mpp_client): mpp_client.receive.return_value = msg.encoded(), MagicMock() + connected_communicator.set_receive_timeout(-1) recv_msg, saved_until = connected_communicator.receive_message('in_v', 5) mpp_client.receive.assert_called_with(Ref('component.in_v[5]'), None) From f98e5a328dc6b52a24da9ab005e515018e19a1e0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 12:22:50 +0200 Subject: [PATCH 10/19] Refactor ReceiveTimeoutHandler to a separate file. --- libmuscle/python/libmuscle/communicator.py | 46 +----------------- .../libmuscle/receive_timeout_handler.py | 47 +++++++++++++++++++ 2 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 libmuscle/python/libmuscle/receive_timeout_handler.py diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index a1c886d3..812397be 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -8,12 +8,12 @@ from libmuscle.mpp_client import MPPClient from libmuscle.mpp_server import MPPServer from libmuscle.mcp.tcp_util import SocketClosed -from libmuscle.mcp.transport_client import TimeoutHandler from libmuscle.peer_info import PeerInfo from libmuscle.port_manager import PortManager from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) +from libmuscle.receive_timeout_handler import ReceiveTimeoutHandler _logger = logging.getLogger(__name__) @@ -62,48 +62,6 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, self.settings = settings -class RecvTimeoutHandler(TimeoutHandler): - """Timeout handler when receiving messages from peers. - - This handler sends a message to the Muscle Manager when the receive times out (and - another message when the message does arrive). - - This is used by the manager to detect if the simulation is in a deadlock, where a - cycle of instances is waiting on each other. 
- """ - - def __init__( - self, manager: MMPClient, - peer_instance: str, port_name: str, slot: Optional[int], - timeout: float - ) -> None: - """Initialize a new timeout handler. - - Args: - manager: Connection to the muscle manager. - peer_instance: the peer instance we try to receive from. - port_name: the name of the port we try to receive on. - slot: the slot we try to receive on. - """ - self._manager = manager - self._peer_instance = peer_instance - self._port_name = port_name - self._slot = slot - self._timeout = timeout - - @property - def timeout(self) -> float: - return self._timeout - - def on_timeout(self) -> None: - self._manager.waiting_for_receive( - self._peer_instance, self._port_name, self._slot) - - def on_receive(self) -> None: - self._manager.waiting_for_receive_done( - self._peer_instance, self._port_name, self._slot) - - class Communicator: """Communication engine for MUSCLE3. @@ -288,7 +246,7 @@ def receive_message( client = self.__get_client(snd_endpoint.instance()) timeout_handler = None if self._receive_timeout >= 0: - timeout_handler = RecvTimeoutHandler( + timeout_handler = ReceiveTimeoutHandler( self._manager, str(snd_endpoint.instance()), port_name, slot, self._receive_timeout) try: diff --git a/libmuscle/python/libmuscle/receive_timeout_handler.py b/libmuscle/python/libmuscle/receive_timeout_handler.py new file mode 100644 index 00000000..20196f0f --- /dev/null +++ b/libmuscle/python/libmuscle/receive_timeout_handler.py @@ -0,0 +1,47 @@ +from typing import Optional + +from libmuscle.mcp.transport_client import TimeoutHandler +from libmuscle.mmp_client import MMPClient + + +class ReceiveTimeoutHandler(TimeoutHandler): + """Timeout handler when receiving messages from peers. + + This handler sends a message to the Muscle Manager when the receive times out (and + another message when the message does arrive). + + This is used by the manager to detect if the simulation is in a deadlock, where a + cycle of instances is waiting on each other. + """ + + def __init__( + self, manager: MMPClient, + peer_instance: str, port_name: str, slot: Optional[int], + timeout: float + ) -> None: + """Initialize a new timeout handler. + + Args: + manager: Connection to the muscle manager. + peer_instance: the peer instance we try to receive from. + port_name: the name of the port we try to receive on. + slot: the slot we try to receive on. + timeout: Timeout in seconds. 
+ """ + self._manager = manager + self._peer_instance = peer_instance + self._port_name = port_name + self._slot = slot + self._timeout = timeout + + @property + def timeout(self) -> float: + return self._timeout + + def on_timeout(self) -> None: + self._manager.waiting_for_receive( + self._peer_instance, self._port_name, self._slot) + + def on_receive(self) -> None: + self._manager.waiting_for_receive_done( + self._peer_instance, self._port_name, self._slot) From 94869da3eee3fa2432021b33083c20bd389aedf1 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Aug 2024 16:04:20 +0200 Subject: [PATCH 11/19] Implement C++ components of deadlock detection --- libmuscle/cpp/src/libmuscle/communicator.cpp | 16 +++++-- libmuscle/cpp/src/libmuscle/communicator.hpp | 22 ++++++++- libmuscle/cpp/src/libmuscle/instance.cpp | 22 ++++++++- libmuscle/cpp/src/libmuscle/mcp/protocol.hpp | 3 ++ .../libmuscle/mcp/tcp_transport_client.cpp | 45 ++++++++++++++++++- .../libmuscle/mcp/tcp_transport_client.hpp | 3 +- .../src/libmuscle/mcp/transport_client.hpp | 18 +++++++- libmuscle/cpp/src/libmuscle/mmp_client.cpp | 24 ++++++++++ libmuscle/cpp/src/libmuscle/mmp_client.hpp | 10 +++++ libmuscle/cpp/src/libmuscle/mpp_client.cpp | 4 +- libmuscle/cpp/src/libmuscle/mpp_client.hpp | 3 +- .../src/libmuscle/receive_timeout_handler.cpp | 29 ++++++++++++ .../src/libmuscle/receive_timeout_handler.hpp | 40 +++++++++++++++++ .../tests/mocks/mock_communicator.hpp | 13 ++++-- .../libmuscle/tests/mocks/mock_mmp_client.hpp | 10 +++++ .../libmuscle/tests/mocks/mock_mpp_client.hpp | 15 +++++-- .../src/libmuscle/tests/test_communicator.cpp | 16 ++++--- .../cpp/src/libmuscle/tests/test_instance.cpp | 1 + .../libmuscle/tests/test_snapshot_manager.cpp | 1 + 19 files changed, 270 insertions(+), 25 deletions(-) create mode 100644 libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp create mode 100644 libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 29cd4293..016eecb2 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -34,14 +34,17 @@ Communicator::Communicator( ymmsl::Reference const & kernel, std::vector const & index, PortManager & port_manager, - Logger & logger, Profiler & profiler) + Logger & logger, Profiler & profiler, + MMPClient & manager) : kernel_(kernel) , index_(index) , port_manager_(port_manager) , logger_(logger) , profiler_(profiler) + , manager_(manager) , server_() , clients_() + , receive_timeout_(10.0) // Notify manager, by default, after 10 seconds waiting in receive_message() {} std::vector Communicator::get_locations() const { @@ -138,8 +141,12 @@ std::tuple Communicator::receive_message( Endpoint snd_endpoint = peer_info_.get().get_peer_endpoints( recv_endpoint.port, slot_list).at(0); MPPClient & client = get_client_(snd_endpoint.instance()); + std::string peer_instance = static_cast(snd_endpoint.instance()); + ReceiveTimeoutHandler handler( + manager_, peer_instance, port_name, slot, receive_timeout_); + ReceiveTimeoutHandler *timeout_handler = receive_timeout_ < 0 ? 
nullptr : &handler; auto msg_and_profile = try_receive_( - client, recv_endpoint.ref(), snd_endpoint.kernel); + client, recv_endpoint.ref(), snd_endpoint.kernel, timeout_handler); auto & msg = std::get<0>(msg_and_profile); ProfileEvent recv_decode_event( @@ -289,9 +296,10 @@ Endpoint Communicator::get_endpoint_( } std::tuple, mcp::ProfileData> Communicator::try_receive_( - MPPClient & client, Reference const & receiver, Reference const & peer) { + MPPClient & client, Reference const & receiver, Reference const & peer, + ReceiveTimeoutHandler *timeout_handler) { try { - return client.receive(receiver); + return client.receive(receiver, timeout_handler); } catch(std::runtime_error const & err) { throw std::runtime_error( "Error while receiving a message: connection with peer '" + diff --git a/libmuscle/cpp/src/libmuscle/communicator.hpp b/libmuscle/cpp/src/libmuscle/communicator.hpp index 535c988f..90257c0a 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.hpp +++ b/libmuscle/cpp/src/libmuscle/communicator.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include @@ -53,7 +55,8 @@ class Communicator { ymmsl::Reference const & kernel, std::vector const & index, PortManager & port_manager, - Logger & logger, Profiler & profiler); + Logger & logger, Profiler & profiler, + MMPClient & manager); /** Returns a list of locations that we can be reached at. * @@ -127,6 +130,19 @@ class Communicator { */ void shutdown(); + /** Update the timeout after which the manager is notified that we are + * waiting for a message. + * + * @param receive_timeout Timeout (seconds). A negative number disables + * the deadlock notification mechanism. + */ + void set_receive_timeout(double receive_timeout) { receive_timeout_ = receive_timeout; } + + /** Get the timeout after which the manager is notified that we are + * waiting for a message. 
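
Like the Python implementation, the C++ Communicator defaults to notifying the manager after 10 seconds of waiting, and a negative timeout disables the mechanism (no handler is passed to the client). The value ultimately comes from the optional muscle_deadlock_receive_timeout setting, which can be given in the yMMSL settings of the simulation or constructed programmatically, e.g.:

    from ymmsl import Settings

    # Value in seconds; a negative value disables deadlock notification.
    settings = Settings({'muscle_deadlock_receive_timeout': 1.0})
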
+ */ + double get_receive_timeout() const { return receive_timeout_; } + PRIVATE: using Ports_ = std::unordered_map; @@ -140,7 +156,7 @@ class Communicator { std::tuple, mcp::ProfileData> try_receive_( MPPClient & client, ymmsl::Reference const & receiver, - ymmsl::Reference const & peer); + ymmsl::Reference const & peer, ReceiveTimeoutHandler *handler); void close_port_(std::string const & port_name, Optional slot = {}); @@ -186,9 +202,11 @@ class Communicator { PortManager & port_manager_; Logger & logger_; Profiler & profiler_; + MMPClient & manager_; MPPServer server_; std::unordered_map> clients_; Optional peer_info_; + double receive_timeout_; }; } } diff --git a/libmuscle/cpp/src/libmuscle/instance.cpp b/libmuscle/cpp/src/libmuscle/instance.cpp index 8edc11c1..53bbfb28 100644 --- a/libmuscle/cpp/src/libmuscle/instance.cpp +++ b/libmuscle/cpp/src/libmuscle/instance.cpp @@ -143,6 +143,7 @@ class Instance::Impl { void deregister_(); void setup_checkpointing_(); void setup_profiling_(); + void setup_receive_timeout_(); ::ymmsl::Reference make_full_name_(int argc, char const * const argv[]) const; std::string extract_manager_location_(int argc, char const * const argv[]) const; @@ -226,7 +227,7 @@ Instance::Impl::Impl( port_manager_.reset(new PortManager(index_(), ports)); communicator_.reset( new Communicator( - name_(), index_(), *port_manager_, *logger_, *profiler_)); + name_(), index_(), *port_manager_, *logger_, *profiler_, *manager_)); snapshot_manager_.reset(new SnapshotManager( instance_name_, *manager_, *port_manager_, *logger_)); trigger_manager_.reset(new TriggerManager()); @@ -239,6 +240,7 @@ Instance::Impl::Impl( set_local_log_level_(); set_remote_log_level_(); setup_profiling_(); + setup_receive_timeout_(); #ifdef MUSCLE_ENABLE_MPI auto sbase_data = Data(settings_manager_.base); msgpack::sbuffer sbuf; @@ -556,6 +558,24 @@ void Instance::Impl::setup_profiling_() { profiler_->set_level(profile_level_str); } +void Instance::Impl::setup_receive_timeout_() { + double timeout; + try { + timeout = settings_manager_.get_setting( + instance_name_, "muscle_deadlock_receive_timeout").as(); + communicator_->set_receive_timeout(timeout); + } + catch (std::runtime_error const & e) { + logger_->error(e.what() + std::string(" in muscle_deadlock_receive_timeout")); + } + catch (std::out_of_range const &) { + // muscle_deadlock_receive_timeout not set, do nothing and keep the default + } + logger_->debug( + "Timeout on receiving messages set to ", + communicator_->get_receive_timeout()); +} + Message Instance::Impl::receive_message( std::string const & port_name, Optional slot, diff --git a/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp b/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp index c45df271..25dd142b 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp +++ b/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp @@ -19,6 +19,9 @@ enum class RequestType { submit_profile_events = 6, submit_snapshot = 7, get_checkpoint_info = 8, + // Connection deadlock detection + waiting_for_receive = 9, + waiting_for_receive_done = 10, // MUSCLE Peer Protocol get_next_message = 21 diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp index c2551a3b..dd5b0f14 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include namespace { @@ -128,12 
+130,51 @@ TcpTransportClient::~TcpTransportClient() { } std::tuple, ProfileData> TcpTransportClient::call( - char const * req_buf, std::size_t req_len + char const * req_buf, std::size_t req_len, + TimeoutHandler* timeout_handler ) const { ProfileTimestamp start_wait; send_frame(socket_fd_, req_buf, req_len); - int64_t length = recv_int64(socket_fd_); + int64_t length; + if (timeout_handler == nullptr) { + length = recv_int64(socket_fd_); + } else { + using std::chrono::duration; + using std::chrono::steady_clock; + using std::chrono::milliseconds; + using std::chrono::duration_cast; + + const auto timeout_duration = duration(timeout_handler->get_timeout()); + const auto deadline = steady_clock::now() + timeout_duration; + int poll_result; + pollfd socket_poll_fd; + socket_poll_fd.fd = socket_fd_; + socket_poll_fd.events = POLLIN; + do { + int timeout_ms = duration_cast(deadline - steady_clock::now()).count(); + poll_result = poll(&socket_poll_fd, 1, timeout_ms); + + if (poll_result >= 0) + break; + + if (errno != EINTR) + throw std::runtime_error("Unexpected error during poll(): "+std::to_string(errno)); + + // poll() was interrupted by a signal: retry with re-calculated timeout + } while (1); + + if (poll_result == 0) { + // time limit expired + timeout_handler->on_timeout(); + length = recv_int64(socket_fd_); + timeout_handler->on_receive(); + } else { + // socket is ready for a receive, this call shouldn't block: + length = recv_int64(socket_fd_); + } + } + ProfileTimestamp start_transfer; std::vector result(length); recv_all(socket_fd_, result.data(), result.size()); diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.hpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.hpp index 9da32e4b..4f33f4f7 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.hpp @@ -46,7 +46,8 @@ class TcpTransportClient : public TransportClient { * @return A byte array with the received data. */ virtual std::tuple, ProfileData> call( - char const * req_buf, std::size_t req_len) const override; + char const * req_buf, std::size_t req_len, + TimeoutHandler* timeout_handler=nullptr) const override; /** Closes this client. * diff --git a/libmuscle/cpp/src/libmuscle/mcp/transport_client.hpp b/libmuscle/cpp/src/libmuscle/mcp/transport_client.hpp index 556f74f2..bfe03ee3 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/transport_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mcp/transport_client.hpp @@ -18,6 +18,21 @@ namespace libmuscle { namespace _MUSCLE_IMPL_NS { namespace mcp { using ProfileData = std::tuple< ProfileTimestamp, ProfileTimestamp, ProfileTimestamp>; +class TimeoutHandler { + public: + virtual ~TimeoutHandler() = default; + + /** Timeout (in seconds) after which on_timeout is called. */ + virtual double get_timeout() = 0; + /** Callback when getTimeout seconds have passed without a response from * the peer. + */ + virtual void on_timeout() = 0; + /** Callback when receiving a response from the peer. + * + * Note: this method is only called when the request has timed out. + */ + virtual void on_receive() = 0; +}; /** A client that connects to an MCP transport server. * @@ -73,7 +88,8 @@ class TransportClient { * received data, and the timestamps. */ virtual std::tuple, ProfileData> call( - char const * req_buf, std::size_t req_len) const = 0; + char const * req_buf, std::size_t req_len, + TimeoutHandler* timeout_handler=nullptr) const = 0; /** Closes this client. 
* diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index ecacc2c2..e5d528e8 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -323,6 +323,30 @@ void MMPClient::deregister_instance() { } } +void MMPClient::waiting_for_receive( + std::string const & peer_instance_id, std::string const & port_name, + Optional slot) +{ + auto request = Data::list( + static_cast(RequestType::waiting_for_receive), + static_cast(instance_id_), + peer_instance_id, port_name, encode_optional(slot)); + + auto response = call_manager_(request); +} + +void MMPClient::waiting_for_receive_done( + std::string const & peer_instance_id, std::string const & port_name, + Optional slot) +{ + auto request = Data::list( + static_cast(RequestType::waiting_for_receive_done), + static_cast(instance_id_), + peer_instance_id, port_name, encode_optional(slot)); + + auto response = call_manager_(request); +} + DataConstRef MMPClient::call_manager_(DataConstRef const & request) { std::lock_guard lock(mutex_); diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.hpp b/libmuscle/cpp/src/libmuscle/mmp_client.hpp index c082809b..1de227b4 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.hpp @@ -123,6 +123,16 @@ class MMPClient { void deregister_instance(); + /** Notify the manager that we're waiting to receive a message. */ + void waiting_for_receive( + std::string const & peer_instance_id, std::string const & port_name, + Optional slot); + + /** Notify the manager that we're done waiting to receive a message. */ + void waiting_for_receive_done( + std::string const & peer_instance_id, std::string const & port_name, + Optional slot); + private: ymmsl::Reference instance_id_; mcp::TcpTransportClient transport_client_; diff --git a/libmuscle/cpp/src/libmuscle/mpp_client.cpp b/libmuscle/cpp/src/libmuscle/mpp_client.cpp index feba677f..973fcf7c 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mpp_client.cpp @@ -26,7 +26,7 @@ MPPClient::MPPClient(std::vector const & locations) { } std::tuple, ProfileData> MPPClient::receive( - Reference const & receiver) + Reference const & receiver, mcp::TimeoutHandler *timeout_handler) { auto request = Data::list( static_cast(RequestType::get_next_message), @@ -37,7 +37,7 @@ std::tuple, ProfileData> MPPClient::receive( // can then overwrite after encoding with the length? msgpack::pack(sbuf, request); - return transport_client_->call(sbuf.data(), sbuf.size()); + return transport_client_->call(sbuf.data(), sbuf.size(), timeout_handler); } void MPPClient::close() { diff --git a/libmuscle/cpp/src/libmuscle/mpp_client.hpp b/libmuscle/cpp/src/libmuscle/mpp_client.hpp index 8cc07a5c..3b5c58cb 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mpp_client.hpp @@ -59,7 +59,8 @@ class MPPClient { * @return The received message. */ std::tuple, mcp::ProfileData> receive( - ::ymmsl::Reference const & receiver); + ::ymmsl::Reference const & receiver, + mcp::TimeoutHandler *timeout_handler=nullptr); /** Closes this client. 
* diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp new file mode 100644 index 00000000..87ba2ea5 --- /dev/null +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp @@ -0,0 +1,29 @@ +#include "receive_timeout_handler.hpp" + +namespace libmuscle { namespace _MUSCLE_IMPL_NS { + +ReceiveTimeoutHandler::ReceiveTimeoutHandler( + MMPClient& manager, std::string const& peer_instance, + std::string const& port_name, Optional slot, double timeout) + : manager_(manager) + , peer_instance_(peer_instance) + , port_name_(port_name) + , slot_(slot) + , timeout_(timeout) {} + +double ReceiveTimeoutHandler::get_timeout() +{ + return timeout_; +} + +void ReceiveTimeoutHandler::on_timeout() +{ + manager_.waiting_for_receive(peer_instance_, port_name_, slot_); +} + +void ReceiveTimeoutHandler::on_receive() +{ + manager_.waiting_for_receive_done(peer_instance_, port_name_, slot_); +} + +} } diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp new file mode 100644 index 00000000..294d6127 --- /dev/null +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +namespace libmuscle { namespace _MUSCLE_IMPL_NS { + +/** Timeout handler when receiving messages from peers. + * + * This handler sends a message to the Muscle Manager when the receive times out (and + * another message when the message does arrive). + * + * This is used by the manager to detect if the simulation is in a deadlock, where a + * cycle of instances is waiting on each other. + */ +class ReceiveTimeoutHandler : public mcp::TimeoutHandler { + public: + ReceiveTimeoutHandler( + MMPClient & manager, + std::string const & peer_instance, + std::string const & port_name, + Optional slot, + double timeout); + + virtual ~ReceiveTimeoutHandler() = default; + + double get_timeout() override; + void on_timeout() override; + void on_receive() override; + + private: + MMPClient & manager_; + std::string const & peer_instance_; + std::string const & port_name_; + Optional slot_; + double timeout_; + +}; + +} } diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.hpp index acfc24c9..55eb654e 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.hpp @@ -102,6 +102,8 @@ class MockCommunicator : public MockClass { NAME_MOCK_MEM_FUN(MockCommunicator, send_message); NAME_MOCK_MEM_FUN(MockCommunicator, receive_message); NAME_MOCK_MEM_FUN(MockCommunicator, shutdown); + NAME_MOCK_MEM_FUN(MockCommunicator, set_receive_timeout); + NAME_MOCK_MEM_FUN(MockCommunicator, get_receive_timeout); } MockCommunicator() { @@ -112,15 +114,16 @@ class MockCommunicator : public MockClass { ymmsl::Reference const & kernel, std::vector const & index, PortManager & port_manager, - Logger & logger, Profiler & profiler) + Logger & logger, Profiler & profiler, + MMPClient & manager) { init_from_return_value(); - constructor(kernel, index, port_manager, logger, profiler); + constructor(kernel, index, port_manager, logger, profiler, manager); } MockFun< Void, Val, Val const &>, - Obj, Obj, Obj> + Obj, Obj, Obj, Obj> constructor; MockFun>> get_locations; @@ -136,6 +139,10 @@ class MockCommunicator : public MockClass { ::mock_communicator::CommunicatorReceiveMessageMock receive_message; MockFun shutdown; + + 
MockFun> set_receive_timeout; + + MockFun> get_receive_timeout; }; using Communicator = MockCommunicator; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp index c520b696..26418040 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp @@ -32,6 +32,8 @@ class MockMMPClient : public MockClass { NAME_MOCK_MEM_FUN(MockMMPClient, register_instance); NAME_MOCK_MEM_FUN(MockMMPClient, request_peers); NAME_MOCK_MEM_FUN(MockMMPClient, deregister_instance); + NAME_MOCK_MEM_FUN(MockMMPClient, waiting_for_receive); + NAME_MOCK_MEM_FUN(MockMMPClient, waiting_for_receive_done); // Create some empty return objects for return values with a complex // structure, to make it easier to set them in the tests or fixtures. @@ -95,6 +97,14 @@ class MockMMPClient : public MockClass { >>> request_peers; MockFun deregister_instance; + + MockFun, Val, Val> + > waiting_for_receive; + + MockFun, Val, Val> + > waiting_for_receive_done; }; using MMPClient = MockMMPClient; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.hpp index 12a3daa1..6b6ec7f3 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.hpp @@ -31,10 +31,19 @@ class MockMPPClient : public MockClass { MockFun const &>> constructor; - MockFun< + // Use option 1 of the "Functions with default arguments" section + // (from mock_support.hpp:192) + using BaseMockFun = MockFun< Val, mcp::ProfileData>>, - Val<::ymmsl::Reference const &> - > receive; + Val<::ymmsl::Reference const &>, Obj>; + struct MockOverloadedFun : BaseMockFun { + std::tuple, mcp::ProfileData> operator()( + ::ymmsl::Reference const & receiver, + mcp::TimeoutHandler *timeout_handler=nullptr) { + return BaseMockFun::operator()(receiver, timeout_handler); + } + } receive; + MockFun close; }; diff --git a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp index ccd0db88..012916a7 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include // Test code dependencies @@ -62,6 +63,7 @@ using libmuscle::_MUSCLE_IMPL_NS::PeerInfo; using libmuscle::_MUSCLE_IMPL_NS::PeerLocations; using libmuscle::_MUSCLE_IMPL_NS::PortsDescription; using libmuscle::_MUSCLE_IMPL_NS::mcp::ProfileData; +using libmuscle::_MUSCLE_IMPL_NS::mcp::TimeoutHandler; using libmuscle::_MUSCLE_IMPL_NS::ProfileTimestamp; using ymmsl::Conduit; @@ -79,11 +81,12 @@ struct libmuscle_communicator MockLogger logger_; MockProfiler profiler_; MockPortManager port_manager_; + MockMMPClient manager_; Communicator communicator_; libmuscle_communicator() - : communicator_("component", {}, connected_port_manager_, logger_, profiler_) + : communicator_("component", {}, connected_port_manager_, logger_, profiler_, manager_) { port_manager_.settings_in_connected.return_value = false; } @@ -120,6 +123,9 @@ struct libmuscle_communicator2 : libmuscle_communicator { PeerInfo peer_info("component", {}, conduits, peer_dims, peer_locations); connected_communicator_.set_peer_info(peer_info); + // disable receive timeouts for these tests, so we can check call + // signatures: mpp_client->receive.called_with(..., nullptr) + 
connected_communicator_.set_receive_timeout(-1.0); } }; @@ -169,7 +175,7 @@ TEST_F(libmuscle_communicator2, receive_message) { double saved_until = std::get<1>(recv_msg_saved_until); auto & mpp_client = connected_communicator_.clients_.at("peer"); - ASSERT_TRUE(mpp_client->receive.called_with("component.in")); + ASSERT_TRUE(mpp_client->receive.called_with("component.in", nullptr)); ASSERT_EQ(recv_msg.timestamp(), 2.0); ASSERT_EQ(recv_msg.next_timestamp(), 3.0); @@ -192,7 +198,7 @@ TEST_F(libmuscle_communicator2, receive_message_vector) { double saved_until = std::get<1>(recv_msg_saved_until); auto mpp_client = connected_communicator_.clients_.at("peer2[5]").get(); - ASSERT_TRUE(mpp_client->receive.called_with("component.in_v[5]")); + ASSERT_TRUE(mpp_client->receive.called_with("component.in_v[5]", nullptr)); ASSERT_EQ(recv_msg.timestamp(), 4.0); ASSERT_EQ(recv_msg.next_timestamp(), 6.0); @@ -292,7 +298,7 @@ TEST_F(libmuscle_communicator2, port_discard_success_on_resume) { int count = 0; - MockMPPClient::return_value.receive.side_effect = [&](Reference const &) { + MockMPPClient::return_value.receive.side_effect = [&](Reference const &, TimeoutHandler *) { return std::make_tuple( side_effect.at(count++).encoded(), ProfileData()); }; @@ -355,7 +361,7 @@ TEST_F(libmuscle_communicator2, test_shutdown) { } } - MockMPPClient::return_value.receive.side_effect = [&](Reference const & receiver) { + MockMPPClient::return_value.receive.side_effect = [&](Reference const & receiver, TimeoutHandler*) { return std::make_tuple(messages.at(receiver)->encoded(), ProfileData()); }; diff --git a/libmuscle/cpp/src/libmuscle/tests/test_instance.cpp b/libmuscle/cpp/src/libmuscle/tests/test_instance.cpp index 7ff08ef0..6f8e7aab 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_instance.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_instance.cpp @@ -179,6 +179,7 @@ struct libmuscle_instance_base : ::testing::Test, ConnectedPortManagerFixture { auto & mock_comm = MockCommunicator::return_value; mock_comm.get_locations.return_value = std::vector( {"tcp:test1,test2", "tcp:test3"}); + mock_comm.get_receive_timeout.return_value = 10.0; auto & mock_port_manager = MockPortManager::return_value; mock_port_manager.settings_in_connected.return_value = false; diff --git a/libmuscle/cpp/src/libmuscle/tests/test_snapshot_manager.cpp b/libmuscle/cpp/src/libmuscle/tests/test_snapshot_manager.cpp index 7ad4218a..3c17736e 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_snapshot_manager.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_snapshot_manager.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include From 89349164296babbc3166a7719e4a3e2ca29fcc29 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 19 Aug 2024 15:18:32 +0200 Subject: [PATCH 12/19] Update deadlock detection Keep on `poll()`ing until message to receive is available and check with manager if deadlocked after each timeout. This approach also enables deadlock detection for runs where the instances are not started by `muscle_manager`. 
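In rough Python terms, the receive path now keeps polling instead of giving up after the
first expired timeout. The sketch below is illustrative only: it uses `select` rather than
`poll()`, and the `sock`, `recv_int64` and `receive_length_with_timeouts` names are invented
for the example; the real implementations are in `tcp_transport_client.py` and
`tcp_transport_client.cpp` in this patch.

    import select

    def receive_length_with_timeouts(sock, timeout_handler, recv_int64):
        """Poll until data arrives, notifying the handler on every expired timeout."""
        did_timeout = False
        while True:
            # Block until the socket is readable or the handler's timeout expires.
            readable, _, _ = select.select([sock], [], [], timeout_handler.timeout)
            if readable:
                break
            did_timeout = True
            timeout_handler.on_timeout()   # may raise Deadlock when the manager confirms one
        length = recv_int64(sock)          # socket is ready, so this should not block
        if did_timeout:
            timeout_handler.on_receive()   # tell the manager we are no longer waiting
        return length

Because the loop re-reads `timeout_handler.timeout` on every iteration, the handler is free
to grow the timeout between polls, which keeps traffic to the manager low during long waits.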
--- libmuscle/cpp/src/libmuscle/communicator.cpp | 38 +++++------- libmuscle/cpp/src/libmuscle/communicator.hpp | 3 +- .../libmuscle/mcp/tcp_transport_client.cpp | 59 ++++++++++--------- libmuscle/cpp/src/libmuscle/mmp_client.cpp | 9 +++ libmuscle/cpp/src/libmuscle/mmp_client.hpp | 3 + .../src/libmuscle/receive_timeout_handler.cpp | 14 ++++- .../src/libmuscle/receive_timeout_handler.hpp | 10 +++- .../libmuscle/tests/mocks/mock_mmp_client.hpp | 3 + libmuscle/python/libmuscle/communicator.py | 9 ++- .../libmuscle/manager/deadlock_detector.py | 42 +++++++++---- .../python/libmuscle/manager/mmp_server.py | 8 +++ libmuscle/python/libmuscle/mcp/protocol.py | 1 + .../libmuscle/mcp/tcp_transport_client.py | 2 +- libmuscle/python/libmuscle/mmp_client.py | 7 +++ .../libmuscle/receive_timeout_handler.py | 21 ++++++- 15 files changed, 156 insertions(+), 73 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 016eecb2..41f3fdcc 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -122,10 +122,10 @@ std::tuple Communicator::receive_message( Port & port = (port_name == "muscle_settings_in") ? port_manager_.muscle_settings_in() : port_manager_.get_port(port_name); + std::string port_and_slot = port_name; if (slot.is_set()) - logger_.debug("Waiting for message on ", port_name, "[", slot.get(), "]"); - else - logger_.debug("Waiting for message on ", port_name); + port_and_slot = port_name + "[" + std::to_string(slot.get()) + "]"; + logger_.debug("Waiting for message on ", port_and_slot); std::vector slot_list; if (slot.is_set()) slot_list.emplace_back(slot.get()); @@ -146,7 +146,7 @@ std::tuple Communicator::receive_message( manager_, peer_instance, port_name, slot, receive_timeout_); ReceiveTimeoutHandler *timeout_handler = receive_timeout_ < 0 ? nullptr : &handler; auto msg_and_profile = try_receive_( - client, recv_endpoint.ref(), snd_endpoint.kernel, timeout_handler); + client, recv_endpoint.ref(), snd_endpoint.kernel, port_and_slot, timeout_handler); auto & msg = std::get<0>(msg_and_profile); ProfileEvent recv_decode_event( @@ -210,20 +210,13 @@ std::tuple Communicator::receive_message( if (expected_message_number != mpp_message.message_number) { if (expected_message_number - 1 == mpp_message.message_number and port.is_resuming(slot)) { - if (slot.is_set()) - logger_.debug("Discarding received message on ", port_name, - "[", slot.get(), "]: resuming from weakly", - " consistent snapshot"); - else - logger_.debug("Discarding received message on ", port_name, - ": resuming from weakly consistent snapshot"); + logger_.debug("Discarding received message on ", port_and_slot, + ": resuming from weakly consistent snapshot"); port.set_resumed(slot); return receive_message(port_name, slot, default_msg); } std::ostringstream oss; - oss << "Received message on " << port_name; - if (slot.is_set()) - oss << "[" << slot.get() << "]"; + oss << "Received message on " << port_and_slot; oss << " with unexpected message number " << mpp_message.message_number; oss << ". Was expecting " << expected_message_number; oss << ". 
Are you resuming from an inconsistent snapshot?"; @@ -231,16 +224,10 @@ std::tuple Communicator::receive_message( } port.increment_num_messages(slot); - if (slot.is_set()) - logger_.debug("Received message on ", port_name, "[", slot.get(), "]"); - else - logger_.debug("Received message on ", port_name); + logger_.debug("Received message on ", port_and_slot); if (is_close_port(message.data())) { - if (slot.is_set()) - logger_.debug("Port ", port_name, "[", slot.get(), "] is now closed"); - else - logger_.debug("Port ", port_name, " is now closed"); + logger_.debug("Port ", port_and_slot, " is now closed"); } return std::make_tuple(message, mpp_message.saved_until); } @@ -297,9 +284,14 @@ Endpoint Communicator::get_endpoint_( std::tuple, mcp::ProfileData> Communicator::try_receive_( MPPClient & client, Reference const & receiver, Reference const & peer, - ReceiveTimeoutHandler *timeout_handler) { + std::string const & port_and_slot, ReceiveTimeoutHandler *timeout_handler) { try { return client.receive(receiver, timeout_handler); + } catch(Deadlock const & err) { + throw std::runtime_error( + "Deadlock detected when when receiving a message on '" + + port_and_slot + + "'. See manager logs for more detail."); } catch(std::runtime_error const & err) { throw std::runtime_error( "Error while receiving a message: connection with peer '" + diff --git a/libmuscle/cpp/src/libmuscle/communicator.hpp b/libmuscle/cpp/src/libmuscle/communicator.hpp index 90257c0a..7f472d79 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.hpp +++ b/libmuscle/cpp/src/libmuscle/communicator.hpp @@ -156,7 +156,8 @@ class Communicator { std::tuple, mcp::ProfileData> try_receive_( MPPClient & client, ymmsl::Reference const & receiver, - ymmsl::Reference const & peer, ReceiveTimeoutHandler *handler); + ymmsl::Reference const & peer, std::string const & port_and_slot, + ReceiveTimeoutHandler *handler); void close_port_(std::string const & port_name, Optional slot = {}); diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp index dd5b0f14..2868f1b4 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp @@ -83,6 +83,30 @@ int connect(std::string const & address) { + port); } +/** Poll until timeout (in seconds) is reached. Retry when interrupted with EINTR. 
*/ +inline int poll_retry_eintr(pollfd *fds, nfds_t nfds, double timeout) { + using std::chrono::duration; + using std::chrono::steady_clock; + using std::chrono::milliseconds; + using std::chrono::duration_cast; + + const auto timeout_duration = duration(timeout); + const auto deadline = steady_clock::now() + timeout_duration; + while (1) { + int timeout_ms = duration_cast(deadline - steady_clock::now()).count(); + int poll_result = poll(fds, nfds, timeout_ms); + + if (poll_result >= 0) + return poll_result; + + if (errno != EINTR) + throw std::runtime_error( + "Unexpected error during poll(): " + + std::string(std::strerror(errno))); + // poll() was interrupted by a signal: retry with re-calculated timeout + } +} + } @@ -140,38 +164,19 @@ std::tuple, ProfileData> TcpTransportClient::call( if (timeout_handler == nullptr) { length = recv_int64(socket_fd_); } else { - using std::chrono::duration; - using std::chrono::steady_clock; - using std::chrono::milliseconds; - using std::chrono::duration_cast; - - const auto timeout_duration = duration(timeout_handler->get_timeout()); - const auto deadline = steady_clock::now() + timeout_duration; - int poll_result; + bool did_timeout = false; pollfd socket_poll_fd; socket_poll_fd.fd = socket_fd_; socket_poll_fd.events = POLLIN; - do { - int timeout_ms = duration_cast(deadline - steady_clock::now()).count(); - poll_result = poll(&socket_poll_fd, 1, timeout_ms); - - if (poll_result >= 0) - break; - - if (errno != EINTR) - throw std::runtime_error("Unexpected error during poll(): "+std::to_string(errno)); - - // poll() was interrupted by a signal: retry with re-calculated timeout - } while (1); - - if (poll_result == 0) { - // time limit expired + while (poll_retry_eintr(&socket_poll_fd, 1, timeout_handler->get_timeout()) == 0) { timeout_handler->on_timeout(); - length = recv_int64(socket_fd_); + did_timeout = true; + } + // socket is ready for a receive, this call shouldn't block: + length = recv_int64(socket_fd_); + + if (did_timeout) { timeout_handler->on_receive(); - } else { - // socket is ready for a receive, this call shouldn't block: - length = recv_int64(socket_fd_); } } diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index e5d528e8..2b57640f 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -347,6 +347,15 @@ void MMPClient::waiting_for_receive_done( auto response = call_manager_(request); } +bool MMPClient::is_deadlocked() { + auto request = Data::list( + static_cast(RequestType::waiting_for_receive_done), + static_cast(instance_id_)); + + auto response = call_manager_(request); + return response[1].as(); +} + DataConstRef MMPClient::call_manager_(DataConstRef const & request) { std::lock_guard lock(mutex_); diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.hpp b/libmuscle/cpp/src/libmuscle/mmp_client.hpp index 1de227b4..2bbcd32c 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.hpp @@ -133,6 +133,9 @@ class MMPClient { std::string const & peer_instance_id, std::string const & port_name, Optional slot); + /** Ask the manager if we're part of a deadlock. 
*/ + bool is_deadlocked(); + private: ymmsl::Reference instance_id_; mcp::TcpTransportClient transport_client_; diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp index 87ba2ea5..fa0a58df 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp @@ -1,5 +1,7 @@ #include "receive_timeout_handler.hpp" +#include + namespace libmuscle { namespace _MUSCLE_IMPL_NS { ReceiveTimeoutHandler::ReceiveTimeoutHandler( @@ -9,16 +11,22 @@ ReceiveTimeoutHandler::ReceiveTimeoutHandler( , peer_instance_(peer_instance) , port_name_(port_name) , slot_(slot) - , timeout_(timeout) {} + , timeout_(timeout) + , num_timeout_(0) {} double ReceiveTimeoutHandler::get_timeout() { - return timeout_; + // Increase timeout by a factor 1.5 with every timeout we hit: + return timeout_ * std::pow(1.5, (double)num_timeout_); } void ReceiveTimeoutHandler::on_timeout() { - manager_.waiting_for_receive(peer_instance_, port_name_, slot_); + if (num_timeout_ == 0) + manager_.waiting_for_receive(peer_instance_, port_name_, slot_); + else + if (manager_.is_deadlocked()) + throw Deadlock(); } void ReceiveTimeoutHandler::on_receive() diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp index 294d6127..9581d1f4 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp @@ -5,6 +5,14 @@ namespace libmuscle { namespace _MUSCLE_IMPL_NS { +/** Error thrown when a deadlock is detected. */ +class Deadlock : public std::runtime_error { + public: + Deadlock() : std::runtime_error("Deadlock detected") {}; + virtual ~Deadlock() = default; +}; + + /** Timeout handler when receiving messages from peers. * * This handler sends a message to the Muscle Manager when the receive times out (and @@ -34,7 +42,7 @@ class ReceiveTimeoutHandler : public mcp::TimeoutHandler { std::string const & port_name_; Optional slot_; double timeout_; - + int num_timeout_; }; } } diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp index 26418040..a02e02fd 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp @@ -34,6 +34,7 @@ class MockMMPClient : public MockClass { NAME_MOCK_MEM_FUN(MockMMPClient, deregister_instance); NAME_MOCK_MEM_FUN(MockMMPClient, waiting_for_receive); NAME_MOCK_MEM_FUN(MockMMPClient, waiting_for_receive_done); + NAME_MOCK_MEM_FUN(MockMMPClient, is_deadlocked); // Create some empty return objects for return values with a complex // structure, to make it easier to set them in the tests or fixtures. 
@@ -105,6 +106,8 @@ class MockMMPClient : public MockClass { MockFun, Val, Val> > waiting_for_receive_done; + + MockFun> is_deadlocked; }; using MMPClient = MockMMPClient; diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 812397be..ee42c88d 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -13,7 +13,7 @@ from libmuscle.profiler import Profiler from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) -from libmuscle.receive_timeout_handler import ReceiveTimeoutHandler +from libmuscle.receive_timeout_handler import Deadlock, ReceiveTimeoutHandler _logger = logging.getLogger(__name__) @@ -257,6 +257,13 @@ def receive_message( "Error while receiving a message: connection with peer" f" '{snd_endpoint.kernel}' was lost. Did the peer crash?" ) from exc + except Deadlock: + # Profiler messages may be used for debugging the deadlock + self._profiler.shutdown() + raise RuntimeError( + "Deadlock detected when when receiving a message on " + f"port '{port_and_slot}'. See manager logs for more detail." + ) from None recv_decode_event = ProfileEvent( ProfileEventType.RECEIVE_DECODE, ProfileTimestamp(), None, diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index 775e80ff..558dbeb5 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -1,5 +1,5 @@ import logging -from threading import Thread +from threading import Thread, Lock import time from typing import Callable, Dict, List, Optional, Tuple from queue import Empty, Queue @@ -13,11 +13,11 @@ class DeadlockDetector(Thread): """The DeadlockDetector attempts to detect when multiple instances are stuck waiting for each other. - This class is responsible for handling WAITING_FOR_RECEIVE and + This class is responsible for handling WAITING_FOR_RECEIVE, IS_DEADLOCKED and WAITING_FOR_RECEIVE_DONE MMP messages, which are submitted by the MMPServer. When a deadlock is detected, the cycle of instances that is waiting on each other is - logged with FATAL severity. If this deadlock does not get resoled in + logged with FATAL severity. If this deadlock does not get resolved in ``wait_before_shutdown`` seconds, the simulation is shut down. """ @@ -49,6 +49,9 @@ def __init__( self._waiting_instance_ports: Dict[str, Tuple[str, Optional[int]]] = {} """Maps instance IDs to the port/slot they are waiting on..""" + # detected deadlocks may be accessed from any thread by is_deadlocked, so it + # needs a mutex: + self._detected_deadlocks_mutex = Lock() self._detected_deadlocks: List[List[str]] = [] """List of deadlocked instance cycles. Set by _handle_potential_deadlock. """ @@ -69,7 +72,8 @@ def run(self) -> None: item = self._queue.get(timeout=seconds_until_abort) if item is None: # On shutdown, None is pushed to the queue return # exit thread - self._process_queue_item(item) + with self._detected_deadlocks_mutex: + self._process_queue_item(item) except Empty: # Timeout was set and has expired without any new messages: @@ -82,15 +86,16 @@ def run(self) -> None: # (otherwise this would have triggered a _process_queue_item, clearing # self._shutdown_time and we had not set another timeout). And so a # deadlock is still present. 
Assert it to be absolutely certain: - assert self._detected_deadlocks - formatted_deadlocks = "\n\n".join( - self._format_deadlock(instances) - for instances in self._detected_deadlocks) - _logger.fatal( - "Aborting simulation: deadlock detected.\n%s", - formatted_deadlocks) - self._shutdown_callback() - return + with self._detected_deadlocks_mutex: + assert self._detected_deadlocks + formatted_deadlocks = "\n\n".join( + self._format_deadlock(instances) + for instances in self._detected_deadlocks) + _logger.fatal( + "Aborting simulation: deadlock detected.\n%s", + formatted_deadlocks) + self._shutdown_callback() + return def shutdown(self) -> None: """Stop the deadlock detector thread.""" @@ -128,6 +133,17 @@ def put_waiting_done( """ self._queue.put((False, instance_id, peer_instance_id, port_name, slot)) + def is_deadlocked(self, instance_id: str) -> bool: + """Check if the provided instance is part of a detected deadlock. + + This method can be called from any thread. + """ + with self._detected_deadlocks_mutex: + for deadlock_instances in self._detected_deadlocks: + if instance_id in deadlock_instances: + return True + return False + def _process_queue_item(self, item: _QueueItem) -> None: """Actually process a WAITING_FOR_RECEIVE[_DONE] request. diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 6eca3131..ef1fca51 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -131,6 +131,8 @@ def handle_request(self, request: bytes) -> bytes: response = self._waiting_for_receive(*req_args) elif req_type == RequestType.WAITING_FOR_RECEIVE_DONE.value: response = self._waiting_for_receive_done(*req_args) + elif req_type == RequestType.IS_DEADLOCKED.value: + response = self._is_deadlocked(*req_args) return cast(bytes, msgpack.packb(response, use_bin_type=True)) @@ -392,6 +394,12 @@ def _waiting_for_receive_done( instance_id, peer_instance_id, port_name, slot) return [ResponseType.SUCCESS.value] + def _is_deadlocked(self, instance_id: str) -> Any: + """Check if the provided instance is part of a detected deadlock. + """ + result = self._deadlock_detector.is_deadlocked(instance_id) + return [ResponseType.SUCCESS.value, result] + class MMPServer: """The MUSCLE Manager Protocol server. 
diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index da23b221..d02f95cc 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -25,6 +25,7 @@ class RequestType(Enum): # Connection deadlock detection WAITING_FOR_RECEIVE = 9 WAITING_FOR_RECEIVE_DONE = 10 + IS_DEADLOCKED = 11 # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py index 2c3483b3..78c399b3 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py @@ -78,7 +78,7 @@ def call(self, request: bytes, timeout_handler: Optional[TimeoutHandler] = None did_timeout = False if timeout_handler is not None: - if not self._poll(timeout_handler.timeout): + while not self._poll(timeout_handler.timeout): did_timeout = True timeout_handler.on_timeout() diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 00a2a20c..76a2d8e6 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -295,6 +295,13 @@ def waiting_for_receive_done( peer_instance_id, port_name, slot] self._call_manager(request) + def is_deadlocked(self) -> bool: + """Ask the manager if we're part of a deadlock.""" + request = [ + RequestType.IS_DEADLOCKED.value, str(self._instance_id)] + response = self._call_manager(request) + return bool(response[1]) + def _call_manager(self, request: Any) -> Any: """Call the manager and do en/decoding. diff --git a/libmuscle/python/libmuscle/receive_timeout_handler.py b/libmuscle/python/libmuscle/receive_timeout_handler.py index 20196f0f..951b6586 100644 --- a/libmuscle/python/libmuscle/receive_timeout_handler.py +++ b/libmuscle/python/libmuscle/receive_timeout_handler.py @@ -4,6 +4,10 @@ from libmuscle.mmp_client import MMPClient +class Deadlock(Exception): + """Exception that is raised when the simulation has deadlocked.""" + + class ReceiveTimeoutHandler(TimeoutHandler): """Timeout handler when receiving messages from peers. @@ -33,14 +37,25 @@ def __init__( self._port_name = port_name self._slot = slot self._timeout = timeout + # Counter to keep track of the number of timeouts + self._num_timeouts = 0 @property def timeout(self) -> float: - return self._timeout + # Increase timeout by a factor 1.5 with every timeout we hit: + factor = 1.5 ** self._num_timeouts + return self._timeout * factor def on_timeout(self) -> None: - self._manager.waiting_for_receive( - self._peer_instance, self._port_name, self._slot) + if self._num_timeouts == 0: + # Notify the manager that we're waiting for a receive + self._manager.waiting_for_receive( + self._peer_instance, self._port_name, self._slot) + else: + # Ask the manager if we're part of a detected deadlock + if self._manager.is_deadlocked(): + raise Deadlock() + self._num_timeouts += 1 def on_receive(self) -> None: self._manager.waiting_for_receive_done( From 45a6cd7becb373ff3b7f1471b4dc17932ebec9a6 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 19 Aug 2024 15:50:45 +0200 Subject: [PATCH 13/19] Simplify manager DeadlockDetector - Remove ability to shutdown simulation (components will now crash when deadlocked) - Process messages immediately (in the MMP server thread) instead of creating a custom DeadlockDetector Thread. 
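The core of the simplified detector is a plain mapping from each waiting instance to the peer
it is waiting on; a deadlock is simply a cycle in that mapping. A minimal standalone sketch of
the check follows (the `find_wait_cycle` name and bare dict are illustrative; the real
`DeadlockDetector` in this patch additionally holds a mutex, tracks the port and slot each
instance waits on, and logs the formatted cycle):

    from typing import Dict, List, Optional

    def find_wait_cycle(waiting_on: Dict[str, str], start: str) -> Optional[List[str]]:
        """Follow the 'is waiting on' chain from `start`.

        Returns the cycle of instance IDs if the chain loops back to `start`,
        or None if the chain ends at an instance that is not itself waiting.
        """
        cycle = [start]
        current = start
        while current in waiting_on:
            current = waiting_on[current]
            if current == start:
                return cycle        # start is, directly or indirectly, waiting on itself
            cycle.append(current)
        return None

    # Example: macro waits on micro and micro waits on macro -> deadlock
    assert find_wait_cycle({"macro": "micro", "micro": "macro"}, "micro") == ["micro", "macro"]
    assert find_wait_cycle({"macro": "micro"}, "macro") is None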
--- .../libmuscle/manager/deadlock_detector.py | 169 ++++-------------- libmuscle/python/libmuscle/manager/manager.py | 11 +- .../python/libmuscle/manager/mmp_server.py | 4 +- .../python/libmuscle/manager/test/conftest.py | 2 +- .../manager/test/test_deadlock_detector.py | 137 ++++++-------- 5 files changed, 92 insertions(+), 231 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index 558dbeb5..6776d00b 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -1,15 +1,12 @@ import logging -from threading import Thread, Lock -import time -from typing import Callable, Dict, List, Optional, Tuple -from queue import Empty, Queue +from threading import Lock +from typing import Dict, List, Optional, Tuple _logger = logging.getLogger(__name__) -_QueueItem = Tuple[bool, str, str, str, Optional[int]] -class DeadlockDetector(Thread): +class DeadlockDetector: """The DeadlockDetector attempts to detect when multiple instances are stuck waiting for each other. @@ -17,95 +14,25 @@ class DeadlockDetector(Thread): WAITING_FOR_RECEIVE_DONE MMP messages, which are submitted by the MMPServer. When a deadlock is detected, the cycle of instances that is waiting on each other is - logged with FATAL severity. If this deadlock does not get resolved in - ``wait_before_shutdown`` seconds, the simulation is shut down. + logged with FATAL severity. """ - def __init__( - self, shutdown_callback: Callable[[], None], wait_before_shutdown: float - ) -> None: - """Construct a new DeadlockDetector thread. - - Args: - shutdown_callback: function to execute when a deadlock is detected. This - callback (which is executed in this thread!) is responsible for stopping - the simulation when a deadlock is detected. - wait_before_shutdown: Number of seconds to wait before executing - :param:`shutdown_callback` after a deadlock is detected. If the deadlock - is resolved (although this is unlikely), the simulation will not shut - down. - """ - super().__init__(name="DeadlockDetector") - - self._shutdown_callback = shutdown_callback - self._wait_before_shutdown = wait_before_shutdown - - self._queue: Queue[Optional[_QueueItem]] = Queue() - """Queue of incoming messages. Incoming messages can come in any communication - thread and will be consumed and processed in this worker thread. - """ + def __init__(self) -> None: + """Construct a new DeadlockDetector.""" + self._mutex = Lock() + """Mutex that should be locked before accessing instance variables.""" self._waiting_instances: Dict[str, str] = {} """Maps instance IDs to the peer instance IDs they are waiting for.""" self._waiting_instance_ports: Dict[str, Tuple[str, Optional[int]]] = {} """Maps instance IDs to the port/slot they are waiting on..""" - - # detected deadlocks may be accessed from any thread by is_deadlocked, so it - # needs a mutex: - self._detected_deadlocks_mutex = Lock() self._detected_deadlocks: List[List[str]] = [] - """List of deadlocked instance cycles. Set by _handle_potential_deadlock. - """ - self._shutdown_time: Optional[float] = None - """Future time when we confirm the potential deadlock and abort the simulation. - """ + """List of deadlocked instance cycles. 
Set by _handle_potential_deadlock.""" - def run(self) -> None: - """Logic that is executed in the thread.""" - while True: - # Set a timeout when a deadlock was detected - seconds_until_abort = None - if self._shutdown_time is not None: - seconds_until_abort = max(0, self._shutdown_time - time.monotonic()) - - # Grab a new item from the queue, this raises Empty when timeout expires: - try: - item = self._queue.get(timeout=seconds_until_abort) - if item is None: # On shutdown, None is pushed to the queue - return # exit thread - with self._detected_deadlocks_mutex: - self._process_queue_item(item) - - except Empty: - # Timeout was set and has expired without any new messages: - # - We only set a timeout when there is a deadlock - # - An item is pushed to the queue when any instance sends a - # WAITING_FOR_RECEIVE_DONE message (which may cancel a potential - # deadlock) - # Therefore: - # - We have not received a message cancelling a deadlock cycle - # (otherwise this would have triggered a _process_queue_item, clearing - # self._shutdown_time and we had not set another timeout). And so a - # deadlock is still present. Assert it to be absolutely certain: - with self._detected_deadlocks_mutex: - assert self._detected_deadlocks - formatted_deadlocks = "\n\n".join( - self._format_deadlock(instances) - for instances in self._detected_deadlocks) - _logger.fatal( - "Aborting simulation: deadlock detected.\n%s", - formatted_deadlocks) - self._shutdown_callback() - return - - def shutdown(self) -> None: - """Stop the deadlock detector thread.""" - self._queue.put(None) - - def put_waiting( + def waiting_for_receive( self, instance_id: str, peer_instance_id: str, port_name: str, slot: Optional[int] ) -> None: - """Queue a WAITING_FOR_RECEIVE message from an instance for processing. + """Process a WAITING_FOR_RECEIVE message from an instance. This method can be called from any thread. @@ -115,13 +42,20 @@ def put_waiting( port_name: Name of the input port. slot: Optional slot number of the input port. """ - self._queue.put((True, instance_id, peer_instance_id, port_name, slot)) + with self._mutex: + # Sanity checks, triggering this is a bug in the instance or the manager + assert instance_id not in self._waiting_instances + + # Register that the instance is waiting + self._waiting_instances[instance_id] = peer_instance_id + self._waiting_instance_ports[instance_id] = (port_name, slot) + self._check_for_deadlock(instance_id) - def put_waiting_done( + def waiting_for_receive_done( self, instance_id: str, peer_instance_id: str, port_name: str, slot: Optional[int] ) -> None: - """Queue a WAITING_FOR_RECEIVE_DONE message from an instance for processing. + """Process a WAITING_FOR_RECEIVE_DONE message from an instance. This method can be called from any thread. @@ -131,36 +65,7 @@ def put_waiting_done( port_name: Name of the input port. slot: Optional slot number of the input port. """ - self._queue.put((False, instance_id, peer_instance_id, port_name, slot)) - - def is_deadlocked(self, instance_id: str) -> bool: - """Check if the provided instance is part of a detected deadlock. - - This method can be called from any thread. - """ - with self._detected_deadlocks_mutex: - for deadlock_instances in self._detected_deadlocks: - if instance_id in deadlock_instances: - return True - return False - - def _process_queue_item(self, item: _QueueItem) -> None: - """Actually process a WAITING_FOR_RECEIVE[_DONE] request. - - This method should be called inside the worker thread. 
- """ - _logger.debug("Processing queue item: %s", item) - is_waiting, instance_id, peer_instance_id, port_name, slot = item - if is_waiting: - # Sanity checks, triggering this is a bug in the instance or the manager - assert instance_id not in self._waiting_instances - - # Register that the instance is waiting - self._waiting_instances[instance_id] = peer_instance_id - self._waiting_instance_ports[instance_id] = (port_name, slot) - self._check_for_deadlock(instance_id) - - else: + with self._mutex: # Sanity checks, triggering these is a bug in the instance or the manager assert instance_id in self._waiting_instances assert self._waiting_instances[instance_id] == peer_instance_id @@ -175,14 +80,22 @@ def _process_queue_item(self, item: _QueueItem) -> None: if instance_id in instance_list: del self._detected_deadlocks[i] break - if not self._detected_deadlocks: - # There are no deadlocks anymore: cancel shutdown - if self._shutdown_time is not None: - _logger.info("Deadlock has resolved, abort is cancelled.") - self._shutdown_time = None + + def is_deadlocked(self, instance_id: str) -> bool: + """Check if the provided instance is part of a detected deadlock. + + This method can be called from any thread. + """ + with self._mutex: + for deadlock_instances in self._detected_deadlocks: + if instance_id in deadlock_instances: + return True + return False def _check_for_deadlock(self, instance_id: str) -> None: """Check if there is a cycle of waiting instances that involves this instance. + + Make sure to lock self._mutex before calling this. """ deadlock_instances = [instance_id] cur_instance = instance_id @@ -197,22 +110,16 @@ def _check_for_deadlock(self, instance_id: str) -> None: def _handle_potential_deadlock(self, deadlock_instances: List[str]) -> None: """Handle a potential deadlock. + Make sure to lock self._mutex before calling this. + Args: deadlock_instances: list of instances waiting on eachother """ - shutdown_delay = self._wait_before_shutdown - if self._shutdown_time is not None: - # Get time until shutdown - shutdown_delay = self._shutdown_time - time.monotonic() _logger.fatal( - "Potential deadlock detected, aborting run in %d seconds.\n%s", - shutdown_delay, + "Potential deadlock detected:\n%s", self._format_deadlock(deadlock_instances), ) - self._detected_deadlocks.append(deadlock_instances) - if self._shutdown_time is None: - self._shutdown_time = time.monotonic() + self._wait_before_shutdown def _format_deadlock(self, deadlock_instances: List[str]) -> str: """Create and return formatted deadlock debug info. diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index 4a52787d..f54bf4d5 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -46,6 +46,7 @@ def __init__( self._profile_store = ProfileStore(log_dir) self._topology_store = TopologyStore(configuration) self._instance_registry = InstanceRegistry() + self._deadlock_detector = DeadlockDetector() if run_dir is not None: snapshot_dir = run_dir.snapshot_dir() else: @@ -81,12 +82,6 @@ def __init__( configuration, snapshot_dir, self._topology_store) self._snapshot_registry.start() - # Hard-code grace period to 5 seconds. We may want to do something smarter in - # the future (e.g. 
set a timeout dependent on the number of instances), but this - # should suffice for now: - self._deadlock_detector = DeadlockDetector(self.stop, 5.0) - self._deadlock_detector.start() - self._server = MMPServer( self._logger, self._profile_store, self._configuration, self._instance_registry, self._topology_store, @@ -128,10 +123,6 @@ def stop(self) -> None: """Shuts down the manager.""" if self._instance_manager: self._instance_manager.shutdown() - self._deadlock_detector.shutdown() - # Note: don't join() deadlock detector, as this method may be called from the - # DeadlockDetector thread and calling join() on your own thread raises a - # RuntimeError. self._server.stop() self._snapshot_registry.shutdown() self._snapshot_registry.join() diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index ef1fca51..1d4266e3 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -376,7 +376,7 @@ def _waiting_for_receive( port_name: Port name that the instance is waiting on slot: Slot that the instance is waiting on """ - self._deadlock_detector.put_waiting( + self._deadlock_detector.waiting_for_receive( instance_id, peer_instance_id, port_name, slot) return [ResponseType.SUCCESS.value] @@ -390,7 +390,7 @@ def _waiting_for_receive_done( port_name: Port name that the instance is waiting on slot: Slot that the instance is waiting on """ - self._deadlock_detector.put_waiting_done( + self._deadlock_detector.waiting_for_receive_done( instance_id, peer_instance_id, port_name, slot) return [ResponseType.SUCCESS.value] diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index 36f7dc52..53528b6c 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -57,7 +57,7 @@ def snapshot_registry(mmp_configuration, topology_store) -> SnapshotRegistry: @pytest.fixture def deadlock_detector() -> DeadlockDetector: - return DeadlockDetector(lambda: None, 1.0) + return DeadlockDetector() @pytest.fixture diff --git a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py index 24a44435..671ba22b 100644 --- a/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/test/test_deadlock_detector.py @@ -1,106 +1,69 @@ -import time -from typing import Iterator -from unittest.mock import Mock - import pytest from libmuscle.manager.deadlock_detector import DeadlockDetector -# Decorator for tests that expect an exception in the DeadlockDetector thread -ignore_unhandled_thread_exception = pytest.mark.filterwarnings( - "ignore::pytest.PytestUnhandledThreadExceptionWarning") +@pytest.fixture +def detector() -> DeadlockDetector: + return DeadlockDetector() -@pytest.fixture -def shutdown_callback() -> Mock: - return Mock() +def test_no_deadlock(detector: DeadlockDetector) -> None: + detector.waiting_for_receive("macro", "micro", "s", None) + assert not detector.is_deadlocked("macro") + assert not detector.is_deadlocked("micro") + detector.waiting_for_receive_done("macro", "micro", "s", None) + assert not detector.is_deadlocked("macro") + assert not detector.is_deadlocked("micro") -@pytest.fixture -def detector(shutdown_callback) -> Iterator[DeadlockDetector]: - # Using a very short delay (10ms) to speed up unit testing - detector = 
DeadlockDetector(shutdown_callback, 0.01) - detector.start() - yield detector - if detector.is_alive(): - detector.shutdown() - detector.join() - - -def test_no_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", None) - detector.put_waiting_done("macro", "micro", "s", None) - time.sleep(0.05) - detector.shutdown() - detector.join() - shutdown_callback.assert_not_called() - - -@ignore_unhandled_thread_exception def test_double_waiting_log_error(detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", 0) - detector.put_waiting("macro", "micro", "s", 1) - # This should trigger an AssertionError in the thread and shut it down. - # We cannot test for the exception, so check that the thread is not running anymore: - detector.join(0.1) - assert not detector.is_alive() + detector.waiting_for_receive("macro", "micro", "s", 0) + with pytest.raises(AssertionError): + detector.waiting_for_receive("macro", "micro", "s", 1) -@ignore_unhandled_thread_exception def test_not_waiting_log_error(detector: DeadlockDetector) -> None: - detector.put_waiting_done("macro", "micro", "s", 0) - # This should trigger an AssertionError in the thread and shut it down. - # We cannot test for the exception, so check that the thread is not running anymore: - detector.join(0.1) - assert not detector.is_alive() + with pytest.raises(AssertionError): + detector.waiting_for_receive_done("macro", "micro", "s", 0) -@ignore_unhandled_thread_exception def test_waiting_for_different_instance_log_error(detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", 0) - detector.put_waiting_done("macro", "meso", "s", 0) - # This should trigger an AssertionError in the thread and shut it down. - # We cannot test for the exception, so check that the thread is not running anymore: - detector.join(0.1) - assert not detector.is_alive() + detector.waiting_for_receive("macro", "micro", "s", 0) + with pytest.raises(AssertionError): + detector.waiting_for_receive_done("macro", "meso", "s", 0) -@ignore_unhandled_thread_exception def test_waiting_for_different_port_log_error(detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", 0) - detector.put_waiting_done("macro", "micro", "f_init", 0) - # This should trigger an AssertionError in the thread and shut it down. 
- # We cannot test for the exception, so check that the thread is not running anymore: - detector.join(0.1) - assert not detector.is_alive() - - -def test_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", None) - detector.put_waiting("micro", "macro", "f_init", None) - detector.join(1) # wait max 1 second then check that the deadlock was detected - assert not detector.is_alive() - shutdown_callback.assert_called_once() - - -def test_deadlock_cancelled( - shutdown_callback: Mock, detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", None) - detector.put_waiting("micro", "macro", "f_init", None) - detector.put_waiting_done("macro", "micro", "s", None) - time.sleep(0.05) - detector.shutdown() - detector.join() - shutdown_callback.assert_not_called() - - -def test_double_deadlock(shutdown_callback: Mock, detector: DeadlockDetector) -> None: - detector.put_waiting("macro", "micro", "s", None) - detector.put_waiting("micro", "macro", "f_init", None) - detector.put_waiting("cycle2", "peer2", "s", None) - detector.put_waiting("peer2", "cycle2", "f_init", None) - detector.put_waiting_done("macro", "micro", "s", None) - detector.join(1) # wait max 1 second then check that the deadlock was detected - assert not detector.is_alive() - shutdown_callback.assert_called() + detector.waiting_for_receive("macro", "micro", "s", 0) + with pytest.raises(AssertionError): + detector.waiting_for_receive_done("macro", "micro", "f_init", 0) + + +def test_deadlock(detector: DeadlockDetector) -> None: + detector.waiting_for_receive("macro", "micro", "s", None) + assert not detector.is_deadlocked("macro") + assert not detector.is_deadlocked("micro") + detector.waiting_for_receive("micro", "macro", "f_init", None) + assert detector.is_deadlocked("macro") + assert detector.is_deadlocked("micro") + + +def test_deadlock_cancelled(detector: DeadlockDetector) -> None: + detector.waiting_for_receive("macro", "micro", "s", None) + detector.waiting_for_receive("micro", "macro", "f_init", None) + detector.waiting_for_receive_done("macro", "micro", "s", None) + assert not detector.is_deadlocked("macro") + assert not detector.is_deadlocked("micro") + + +def test_double_deadlock(detector: DeadlockDetector) -> None: + detector.waiting_for_receive("macro", "micro", "s", None) + detector.waiting_for_receive("micro", "macro", "f_init", None) + detector.waiting_for_receive("cycle2", "peer2", "s", None) + detector.waiting_for_receive("peer2", "cycle2", "f_init", None) + detector.waiting_for_receive_done("macro", "micro", "s", None) + assert not detector.is_deadlocked("macro") + assert not detector.is_deadlocked("micro") + assert detector.is_deadlocked("cycle2") + assert detector.is_deadlocked("peer2") From f3f8e317acdb169a57ba398369bc6ef9ad52ec50 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 19 Aug 2024 16:59:20 +0200 Subject: [PATCH 14/19] Add integration tests for deadlock detection And fix bugs with the C++ implementation. 
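The integration tests below set `muscle_deadlock_receive_timeout: 0.1` so that a deadlock
surfaces within a second. With the 10-second default, the schedule follows from the handler's
escalating timeout (a factor of 1.5 per expired timeout, as introduced in the earlier
receive-timeout-handler changes). The small sketch below is illustrative only, not library
code, and assumes those two values:

    def timeout_schedule(initial, factor=1.5, n=4):
        """Elapsed time at which each successive receive timeout expires."""
        elapsed = 0.0
        for i in range(n):
            elapsed += initial * factor ** i
            yield elapsed

    # With the 10 s default: the manager is first notified after ~10 s, and the first
    # is_deadlocked check (which makes a deadlocked instance raise) happens at ~25 s.
    print([round(t, 2) for t in timeout_schedule(10.0)])
    # [10.0, 25.0, 47.5, 81.25]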
--- integration_test/conftest.py | 15 +- integration_test/test_deadlock_detection.py | 184 ++++++++++++++++++ libmuscle/cpp/src/libmuscle/mcp/protocol.hpp | 1 + libmuscle/cpp/src/libmuscle/mmp_client.cpp | 2 +- .../src/libmuscle/receive_timeout_handler.cpp | 1 + 5 files changed, 199 insertions(+), 4 deletions(-) create mode 100644 integration_test/test_deadlock_detection.py diff --git a/integration_test/conftest.py b/integration_test/conftest.py index 18ab5ce4..4cf8701c 100644 --- a/integration_test/conftest.py +++ b/integration_test/conftest.py @@ -72,10 +72,12 @@ def _python_wrapper(instance_name, muscle_manager, callable): callable() -def run_manager_with_actors(ymmsl_text, tmpdir, actors): +def run_manager_with_actors(ymmsl_text, tmpdir, actors, expect_success=True): """Start muscle_manager along with C++ and python actors. Args: + ymmsl_text: YMMSL configuration for the simulation + tmpdir: Temporary folder to use as runpath a actors: a dictionary of lists containing details for each actor: ``{"instance_name": ("language", "details", ...)}``. @@ -155,10 +157,17 @@ def run_manager_with_actors(ymmsl_text, tmpdir, actors): # check results for proc in native_processes: proc.wait() - assert proc.returncode == 0 + if expect_success: + assert proc.returncode == 0 for proc in python_processes: proc.join() - assert proc.exitcode == 0 + if expect_success: + assert proc.exitcode == 0 + if not expect_success: + # Check that at least one process has failed + assert ( + any(proc.returncode != 0 for proc in native_processes) or + any(proc.exitcode != 0 for proc in python_processes)) @pytest.fixture diff --git a/integration_test/test_deadlock_detection.py b/integration_test/test_deadlock_detection.py new file mode 100644 index 00000000..c6d5d55e --- /dev/null +++ b/integration_test/test_deadlock_detection.py @@ -0,0 +1,184 @@ +import functools +import sys + +from ymmsl import Operator + +from libmuscle import Instance, Message + +from .conftest import skip_if_python_only, run_manager_with_actors + + +def suppress_deadlock_exception_output(func): + @functools.wraps(func) + def wrapper(): + try: + func() + except RuntimeError as exc: + exc_str = str(exc).lower() + if "deadlock" not in exc_str and "did the peer crash?" 
not in exc_str: + raise + sys.exit(1) + return wrapper + + +@suppress_deadlock_exception_output +def deadlocking_micro(): + instance = Instance({Operator.F_INIT: ["in"], Operator.O_F: ["out"]}) + + counter = 5 # Deadlock after 5 iterations + while instance.reuse_instance(): + message = instance.receive("in") + counter -= 1 + if counter > 0: + instance.send("out", message) + + +@suppress_deadlock_exception_output +def micro(): + instance = Instance({Operator.F_INIT: ["in"], Operator.O_F: ["out"]}) + + while instance.reuse_instance(): + message = instance.receive("in") + instance.send("out", message) + + +@suppress_deadlock_exception_output +def deadlocking_macro(): + instance = Instance({Operator.O_I: ["out"], Operator.S: ["in"]}) + + while instance.reuse_instance(): + for i in range(10): + message = Message(float(i), data="testing") + instance.send("out", message) + instance.receive("in") + # Deadlock: + instance.receive("in") + + +@suppress_deadlock_exception_output +def macro(): + instance = Instance({Operator.O_I: ["out"], Operator.S: ["in"]}) + + while instance.reuse_instance(): + for i in range(10): + message = Message(float(i), data="testing") + instance.send("out", message) + instance.receive("in") + + +MACRO_MICRO_CONFIG = """ +ymmsl_version: v0.1 +model: + name: test_model + components: + macro: + implementation: macro + micro: + implementation: micro + conduits: + macro.out: micro.in + micro.out: macro.in +settings: + muscle_deadlock_receive_timeout: 0.1 +""" +MACRO_MICRO_WITH_DISPATCH_CONFIG = """ +ymmsl_version: v0.1 +model: + name: test_model + components: + macro: + implementation: macro + micro1: + implementation: micro + micro2: + implementation: micro + micro3: + implementation: micro + conduits: + macro.out: micro1.in + micro1.out: micro2.in + micro2.out: micro3.in + micro3.out: macro.in +settings: + muscle_deadlock_receive_timeout: 0.1 +""" + + +def test_no_deadlock(tmp_path): + run_manager_with_actors( + MACRO_MICRO_CONFIG, tmp_path, + {"macro": ("python", macro), + "micro": ("python", micro)}) + + +def test_deadlock1(tmp_path): + run_manager_with_actors( + MACRO_MICRO_CONFIG, tmp_path, + {"macro": ("python", macro), + "micro": ("python", deadlocking_micro)}, + expect_success=False) + + +def test_deadlock2(tmp_path): + run_manager_with_actors( + MACRO_MICRO_CONFIG, tmp_path, + {"macro": ("python", deadlocking_macro), + "micro": ("python", micro)}, + expect_success=False) + + +def test_no_deadlock_with_dispatch(tmp_path): + run_manager_with_actors( + MACRO_MICRO_WITH_DISPATCH_CONFIG, tmp_path, + {"macro": ("python", macro), + "micro1": ("python", micro), + "micro2": ("python", micro), + "micro3": ("python", micro)}) + + +def test_deadlock1_with_dispatch(tmp_path): + run_manager_with_actors( + MACRO_MICRO_WITH_DISPATCH_CONFIG, tmp_path, + {"macro": ("python", macro), + "micro1": ("python", micro), + "micro2": ("python", deadlocking_micro), + "micro3": ("python", micro)}, + expect_success=False) + + +def test_deadlock2_with_dispatch(tmp_path): + run_manager_with_actors( + MACRO_MICRO_WITH_DISPATCH_CONFIG, tmp_path, + {"macro": ("python", deadlocking_macro), + "micro1": ("python", micro), + "micro2": ("python", micro), + "micro3": ("python", micro)}, + expect_success=False) + + +@skip_if_python_only +def test_no_deadlock_cpp(tmp_path): + run_manager_with_actors( + MACRO_MICRO_CONFIG, tmp_path, + {"macro": ("cpp", "component_test"), + "micro": ("python", micro)}) + + +@skip_if_python_only +def test_deadlock1_cpp(tmp_path): + run_manager_with_actors( + MACRO_MICRO_CONFIG, 
tmp_path, + {"macro": ("cpp", "component_test"), + "micro": ("python", deadlocking_micro)}, + expect_success=False) + + +@skip_if_python_only +def test_deadlock2_cpp(tmp_path): + run_manager_with_actors( + MACRO_MICRO_WITH_DISPATCH_CONFIG, tmp_path, + {"macro": ("cpp", "component_test"), + "micro1": ("python", micro), + "micro2": ("python", deadlocking_micro), + "micro3": ("cpp", "component_test")}, + expect_success=False) diff --git a/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp b/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp index 25dd142b..3f7b5aa9 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp +++ b/libmuscle/cpp/src/libmuscle/mcp/protocol.hpp @@ -22,6 +22,7 @@ enum class RequestType { // Connection deadlock detection waiting_for_receive = 9, waiting_for_receive_done = 10, + is_deadlocked = 11, // MUSCLE Peer Protocol get_next_message = 21 diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index 2b57640f..e2afae55 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -349,7 +349,7 @@ void MMPClient::waiting_for_receive_done( bool MMPClient::is_deadlocked() { auto request = Data::list( - static_cast(RequestType::waiting_for_receive_done), + static_cast(RequestType::is_deadlocked), static_cast(instance_id_)); auto response = call_manager_(request); diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp index fa0a58df..db3a8c07 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp @@ -27,6 +27,7 @@ void ReceiveTimeoutHandler::on_timeout() else if (manager_.is_deadlocked()) throw Deadlock(); + num_timeout_ ++; } void ReceiveTimeoutHandler::on_receive() From f227051b0f35809079d167c2d1fe39f087ac9484 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 20 Aug 2024 14:23:02 +0200 Subject: [PATCH 15/19] Add documentation for deadlock detection --- docs/source/tips.rst | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/source/tips.rst b/docs/source/tips.rst index a68b4983..658c963f 100644 --- a/docs/source/tips.rst +++ b/docs/source/tips.rst @@ -2,6 +2,60 @@ Tips & tricks ============= +Deadlock detection +================== + +.. versionadded:: 0.8 + +MUSCLE3 has a deadlock detection mechanism, which can detect when the simulation +is deadlocked because (part of) the components of the simulation are all waiting +for a message of each other. This could happen, for example, due to a bug in +one of the components, or because the components are not correctly wired +together. + +The simplest deadlock consists of two components, where the first component is +waiting to receive a message from the second component and vice versa. Because +both components are waiting for eachother, the simulation is stuck and will no +longer progress. MUSCLE3 will abort the simulation run and provide an error +message that indicates that the simulation was deadlocked: + +.. code-block:: output + :caption: Example output of a deadlocked simulation + + muscle_manager 2024-08-20 13:57:58,544 CRITICAL libmuscle.manager.deadlock_detector: Potential deadlock detected: + The following 2 instances are deadlocked: + 1. Instance 'micro' is waiting on instance 'macro' in a receive on port 'initial_state'. + 2. Instance 'macro' is waiting on instance 'micro' in a receive on port 'state_in'. + + +.. 
note:: + MUSCLE3 can only detect deadlocks that are the result of components waiting + for messages to receive. "Internal" deadlocks in simulation components (for + example due to bugs in MPI logic) cannot be detected by MUSCLE3. + + +Configuring the deadlock detection +---------------------------------- + +With the default settings, MUSCLE3 will detect a deadlock 10 seconds after it +occurs. The simulation is halted after another 15 seconds have passed. +These default settings are chosen to limit the runtime impact of the deadlock +detection. It may be useful to detect deadlocks faster during development of the +simulation. This can be achieved with the special setting +``muscle_deadlock_receive_timeout``: + +.. code-block:: yaml + :caption: Example configuration setting ``muscle_deadlock_receive_timeout`` + + ymmsl_version: v0.1 + settings: + muscle_deadlock_receive_timeout: 1.0 + +The value provided to this setting is the initial timeout (in seconds) before +MUSCLE3 detects a deadlock. The simulation is halted after 1.5 times that +duration. Deadlock detection is disabled when a negative value is used. + + Running simulation components interactively =========================================== From 637043153a2110a55db26823f8449669b7c1976a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 26 Aug 2024 10:43:16 +0200 Subject: [PATCH 16/19] Pass peer_instance as `ymmsl.Reference` instead of string in ReceiveTimeoutHandler --- libmuscle/cpp/src/libmuscle/communicator.cpp | 3 +-- libmuscle/cpp/src/libmuscle/mmp_client.cpp | 8 ++++---- libmuscle/cpp/src/libmuscle/mmp_client.hpp | 4 ++-- libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp | 2 +- libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp | 5 +++-- .../cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp | 4 ++-- libmuscle/python/libmuscle/communicator.py | 2 +- libmuscle/python/libmuscle/mmp_client.py | 8 ++++---- libmuscle/python/libmuscle/receive_timeout_handler.py | 4 +++- 9 files changed, 21 insertions(+), 19 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 41f3fdcc..5b01de3a 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -141,9 +141,8 @@ std::tuple Communicator::receive_message( Endpoint snd_endpoint = peer_info_.get().get_peer_endpoints( recv_endpoint.port, slot_list).at(0); MPPClient & client = get_client_(snd_endpoint.instance()); - std::string peer_instance = static_cast(snd_endpoint.instance()); ReceiveTimeoutHandler handler( - manager_, peer_instance, port_name, slot, receive_timeout_); + manager_, snd_endpoint.instance(), port_name, slot, receive_timeout_); ReceiveTimeoutHandler *timeout_handler = receive_timeout_ < 0 ? 
nullptr : &handler; auto msg_and_profile = try_receive_( client, recv_endpoint.ref(), snd_endpoint.kernel, port_and_slot, timeout_handler); diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index e2afae55..eb56bcf2 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -324,25 +324,25 @@ void MMPClient::deregister_instance() { } void MMPClient::waiting_for_receive( - std::string const & peer_instance_id, std::string const & port_name, + ::ymmsl::Reference const & peer_instance_id, std::string const & port_name, Optional slot) { auto request = Data::list( static_cast(RequestType::waiting_for_receive), static_cast(instance_id_), - peer_instance_id, port_name, encode_optional(slot)); + static_cast(peer_instance_id), port_name, encode_optional(slot)); auto response = call_manager_(request); } void MMPClient::waiting_for_receive_done( - std::string const & peer_instance_id, std::string const & port_name, + ::ymmsl::Reference const & peer_instance_id, std::string const & port_name, Optional slot) { auto request = Data::list( static_cast(RequestType::waiting_for_receive_done), static_cast(instance_id_), - peer_instance_id, port_name, encode_optional(slot)); + static_cast(peer_instance_id), port_name, encode_optional(slot)); auto response = call_manager_(request); } diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.hpp b/libmuscle/cpp/src/libmuscle/mmp_client.hpp index 2bbcd32c..86f653fc 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.hpp @@ -125,12 +125,12 @@ class MMPClient { /** Notify the manager that we're waiting to receive a message. */ void waiting_for_receive( - std::string const & peer_instance_id, std::string const & port_name, + ::ymmsl::Reference const & peer_instance_id, std::string const & port_name, Optional slot); /** Notify the manager that we're done waiting to receive a message. */ void waiting_for_receive_done( - std::string const & peer_instance_id, std::string const & port_name, + ::ymmsl::Reference const & peer_instance_id, std::string const & port_name, Optional slot); /** Ask the manager if we're part of a deadlock. 
*/ diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp index db3a8c07..9b651f56 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp @@ -5,7 +5,7 @@ namespace libmuscle { namespace _MUSCLE_IMPL_NS { ReceiveTimeoutHandler::ReceiveTimeoutHandler( - MMPClient& manager, std::string const& peer_instance, + MMPClient& manager, ::ymmsl::Reference const peer_instance, std::string const& port_name, Optional slot, double timeout) : manager_(manager) , peer_instance_(peer_instance) diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp index 9581d1f4..1b70456d 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.hpp @@ -2,6 +2,7 @@ #include #include +#include namespace libmuscle { namespace _MUSCLE_IMPL_NS { @@ -25,7 +26,7 @@ class ReceiveTimeoutHandler : public mcp::TimeoutHandler { public: ReceiveTimeoutHandler( MMPClient & manager, - std::string const & peer_instance, + ::ymmsl::Reference const peer_instance, std::string const & port_name, Optional slot, double timeout); @@ -38,7 +39,7 @@ class ReceiveTimeoutHandler : public mcp::TimeoutHandler { private: MMPClient & manager_; - std::string const & peer_instance_; + ::ymmsl::Reference const peer_instance_; std::string const & port_name_; Optional slot_; double timeout_; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp index a02e02fd..d00fde23 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mmp_client.hpp @@ -100,11 +100,11 @@ class MockMMPClient : public MockClass { MockFun deregister_instance; MockFun, Val, Val> + Val<::ymmsl::Reference const &>, Val, Val> > waiting_for_receive; MockFun, Val, Val> + Val<::ymmsl::Reference const &>, Val, Val> > waiting_for_receive_done; MockFun> is_deadlocked; diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index ee42c88d..b4125aaf 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -247,7 +247,7 @@ def receive_message( timeout_handler = None if self._receive_timeout >= 0: timeout_handler = ReceiveTimeoutHandler( - self._manager, str(snd_endpoint.instance()), + self._manager, snd_endpoint.instance(), port_name, slot, self._receive_timeout) try: mpp_message_bytes, profile = client.receive( diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 76a2d8e6..d8d1a8e1 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -276,23 +276,23 @@ def deregister_instance(self) -> None: response[1])) def waiting_for_receive( - self, peer_instance_id: str, port_name: str, slot: Optional[int] + self, peer_instance_id: Reference, port_name: str, slot: Optional[int] ) -> None: """Notify the manager that we're waiting to receive a message.""" request = [ RequestType.WAITING_FOR_RECEIVE.value, str(self._instance_id), - peer_instance_id, port_name, slot] + str(peer_instance_id), port_name, slot] self._call_manager(request) def waiting_for_receive_done( - self, peer_instance_id: str, port_name: str, slot: Optional[int] + self, peer_instance_id: Reference, port_name: str, slot: 
Optional[int] ) -> None: """Notify the manager that we're done waiting to receive a message.""" request = [ RequestType.WAITING_FOR_RECEIVE_DONE.value, str(self._instance_id), - peer_instance_id, port_name, slot] + str(peer_instance_id), port_name, slot] self._call_manager(request) def is_deadlocked(self) -> bool: diff --git a/libmuscle/python/libmuscle/receive_timeout_handler.py b/libmuscle/python/libmuscle/receive_timeout_handler.py index 951b6586..a8ca0cbd 100644 --- a/libmuscle/python/libmuscle/receive_timeout_handler.py +++ b/libmuscle/python/libmuscle/receive_timeout_handler.py @@ -1,5 +1,7 @@ from typing import Optional +from ymmsl import Reference + from libmuscle.mcp.transport_client import TimeoutHandler from libmuscle.mmp_client import MMPClient @@ -20,7 +22,7 @@ class ReceiveTimeoutHandler(TimeoutHandler): def __init__( self, manager: MMPClient, - peer_instance: str, port_name: str, slot: Optional[int], + peer_instance: Reference, port_name: str, slot: Optional[int], timeout: float ) -> None: """Initialize a new timeout handler. From a6bf6f3d4a4543add9aff9c10254f7813091eaf2 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 26 Aug 2024 12:51:33 +0200 Subject: [PATCH 17/19] Add logic for minimum muscle_deadlock_receive_timeout (0.1 seconds) --- libmuscle/cpp/src/libmuscle/instance.cpp | 6 ++++++ libmuscle/python/libmuscle/instance.py | 5 +++++ libmuscle/python/libmuscle/test/test_instance.py | 6 +++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/libmuscle/cpp/src/libmuscle/instance.cpp b/libmuscle/cpp/src/libmuscle/instance.cpp index 53bbfb28..027f2ca8 100644 --- a/libmuscle/cpp/src/libmuscle/instance.cpp +++ b/libmuscle/cpp/src/libmuscle/instance.cpp @@ -563,6 +563,12 @@ void Instance::Impl::setup_receive_timeout_() { try { timeout = settings_manager_.get_setting( instance_name_, "muscle_deadlock_receive_timeout").as(); + if (timeout >= 0 && timeout < 0.1) { + logger_->info( + "Provided muscle_deadlock_receive_timeout (", timeout, + ") was less than the minimum of 0.1 seconds, setting it to 0.1."); + timeout = 0.1; + } communicator_->set_receive_timeout(timeout); } catch (std::runtime_error const & e) { diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index d98777f5..c9498dca 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -816,6 +816,11 @@ def _setup_receive_timeout(self) -> None: """ try: timeout = self.get_setting('muscle_deadlock_receive_timeout', 'float') + if 0 <= timeout < 0.1: + _logger.info( + "Provided muscle_deadlock_receive_timeout (%f) was less than " + "the minimum of 0.1 seconds, setting it to 0.1.", timeout) + timeout = 0.1 self._communicator.set_receive_timeout(timeout) except KeyError: pass # do nothing and keep the default diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 9dc52c52..26e80f17 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -62,7 +62,10 @@ def communicator(): @pytest.fixture def settings_manager(): with patch('libmuscle.instance.SettingsManager') as SettingsManager: - yield SettingsManager.return_value + settings_manager = SettingsManager.return_value + # Emulate no settings available + settings_manager.get_setting.side_effect = KeyError() + yield settings_manager @pytest.fixture(autouse=True) @@ -298,6 +301,7 @@ def test_list_settings(instance, settings_manager): def 
test_get_setting(instance, settings_manager): + settings_manager.get_setting.side_effect = None # don't raise KeyError instance.get_setting('test', 'int') settings_manager.get_setting.assert_called_with( Ref('component'), Ref('test'), 'int') From 3cf5d897106bec785a4e82e78b6dfdee382580fe Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 26 Aug 2024 12:52:15 +0200 Subject: [PATCH 18/19] Process review feedback: fix typo and prettier C++ --- libmuscle/cpp/src/libmuscle/communicator.cpp | 2 +- libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp | 2 +- libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 5b01de3a..33dce3c4 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -288,7 +288,7 @@ std::tuple, mcp::ProfileData> Communicator::try_receive_( return client.receive(receiver, timeout_handler); } catch(Deadlock const & err) { throw std::runtime_error( - "Deadlock detected when when receiving a message on '" + + "Deadlock detected when receiving a message on '" + port_and_slot + "'. See manager logs for more detail."); } catch(std::runtime_error const & err) { diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp index 2868f1b4..821f6d8b 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp @@ -92,7 +92,7 @@ inline int poll_retry_eintr(pollfd *fds, nfds_t nfds, double timeout) { const auto timeout_duration = duration(timeout); const auto deadline = steady_clock::now() + timeout_duration; - while (1) { + while (true) { int timeout_ms = duration_cast(deadline - steady_clock::now()).count(); int poll_result = poll(fds, nfds, timeout_ms); diff --git a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp index 9b651f56..1bb2bd10 100644 --- a/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp +++ b/libmuscle/cpp/src/libmuscle/receive_timeout_handler.cpp @@ -17,7 +17,7 @@ ReceiveTimeoutHandler::ReceiveTimeoutHandler( double ReceiveTimeoutHandler::get_timeout() { // Increase timeout by a factor 1.5 with every timeout we hit: - return timeout_ * std::pow(1.5, (double)num_timeout_); + return timeout_ * std::pow(1.5, num_timeout_); } void ReceiveTimeoutHandler::on_timeout() From b63f047c2403566b58d816c85afca36a727d0207 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 26 Aug 2024 12:57:15 +0200 Subject: [PATCH 19/19] Change moment of logging deadlocks --- libmuscle/python/libmuscle/manager/deadlock_detector.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/deadlock_detector.py b/libmuscle/python/libmuscle/manager/deadlock_detector.py index 6776d00b..677bfaad 100644 --- a/libmuscle/python/libmuscle/manager/deadlock_detector.py +++ b/libmuscle/python/libmuscle/manager/deadlock_detector.py @@ -89,6 +89,9 @@ def is_deadlocked(self, instance_id: str) -> bool: with self._mutex: for deadlock_instances in self._detected_deadlocks: if instance_id in deadlock_instances: + _logger.fatal( + "Deadlock detected, simulation is aborting!\n%s", + self._format_deadlock(deadlock_instances)) return True return False @@ -115,10 +118,6 @@ def _handle_potential_deadlock(self, deadlock_instances: 
List[str]) -> None: Args: deadlock_instances: list of instances waiting on each other """ - _logger.fatal( - "Potential deadlock detected:\n%s", - self._format_deadlock(deadlock_instances), - ) self._detected_deadlocks.append(deadlock_instances) def _format_deadlock(self, deadlock_instances: List[str]) -> str:
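The timing behaviour documented in tips.rst above follows from the 1.5x backoff in ReceiveTimeoutHandler::get_timeout() together with the 0.1-second minimum timeout added in patch 17: on the first timeout the instance reports waiting_for_receive to the manager, and only on a later timeout can it learn via is_deadlocked that the run should be aborted. The following is a minimal sketch of that arithmetic, assuming this two-timeout sequence; wait_schedule and deadlock_timeline are hypothetical helper names for illustration and are not part of libmuscle.

# Illustrative sketch only: approximate deadlock-detection timeline implied by
# the receive-timeout backoff described above.

MINIMUM_TIMEOUT = 0.1  # seconds; smaller configured values are clamped to this


def wait_schedule(initial_timeout: float, num_waits: int) -> list:
    """Successive poll timeouts while a receive stays blocked (1.5x backoff)."""
    timeout = max(initial_timeout, MINIMUM_TIMEOUT)
    return [timeout * 1.5 ** i for i in range(num_waits)]


def deadlock_timeline(initial_timeout: float) -> tuple:
    """Return (time the manager is notified, earliest time the run is halted)."""
    first, second = wait_schedule(initial_timeout, 2)
    # First timeout: the instance notifies the manager that it is waiting.
    # Second timeout: the instance asks the manager whether it is deadlocked
    # and aborts if the answer is yes.
    return first, first + second


print(deadlock_timeline(10.0))   # defaults: notified after ~10 s, halted after ~25 s
print(deadlock_timeline(0.05))   # clamped to the 0.1 s minimum

With the default 10-second timeout this reproduces the numbers quoted in the documentation: detection roughly 10 seconds after the deadlock occurs, and the run halted after another 15 seconds.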