From 994a4882d0c1a037aa8ae8eff913d9bb3922ea4a Mon Sep 17 00:00:00 2001
From: Sean Lynch <42618346+swlynch99@users.noreply.github.com>
Date: Tue, 24 Sep 2024 17:20:22 -0700
Subject: [PATCH] Explicitly handle some recoverable sqlx errors instead of failing

The whole goal of durable is for its workflows to be reliable. While
some errors cannot be handled, there is a subset of sqlx errors that are
not the fault of the workflow itself and should instead be handled by
retrying the workflow.

This commit detects those errors and translates them into either
retries or runtime-level errors.
---
 ...0a48574aa64161b0ccc37be8bb58deaff5783.json |  15 +++
 ...38178092c2ba065b7c7f367ce1cd2a90d2dce.json |  15 +++
 crates/durable-runtime/src/worker.rs          | 119 +++++++++++++++---
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 .sqlx/query-8646787e97a2e9c59fa9a1f2f510a48574aa64161b0ccc37be8bb58deaff5783.json
 create mode 100644 .sqlx/query-cf31973a833c40d4d43aa01102e38178092c2ba065b7c7f367ce1cd2a90d2dce.json

diff --git a/.sqlx/query-8646787e97a2e9c59fa9a1f2f510a48574aa64161b0ccc37be8bb58deaff5783.json b/.sqlx/query-8646787e97a2e9c59fa9a1f2f510a48574aa64161b0ccc37be8bb58deaff5783.json
new file mode 100644
index 0000000..1ad93a8
--- /dev/null
+++ b/.sqlx/query-8646787e97a2e9c59fa9a1f2f510a48574aa64161b0ccc37be8bb58deaff5783.json
@@ -0,0 +1,15 @@
+{
+  "db_name": "PostgreSQL",
+  "query": "\n UPDATE durable.task\n SET state = 'ready',\n running_on = NULL\n WHERE id = $1\n AND running_on = $2\n ",
+  "describe": {
+    "columns": [],
+    "parameters": {
+      "Left": [
+        "Int8",
+        "Int8"
+      ]
+    },
+    "nullable": []
+  },
+  "hash": "8646787e97a2e9c59fa9a1f2f510a48574aa64161b0ccc37be8bb58deaff5783"
+}
diff --git a/.sqlx/query-cf31973a833c40d4d43aa01102e38178092c2ba065b7c7f367ce1cd2a90d2dce.json b/.sqlx/query-cf31973a833c40d4d43aa01102e38178092c2ba065b7c7f367ce1cd2a90d2dce.json
new file mode 100644
index 0000000..a2bfd3a
--- /dev/null
+++ b/.sqlx/query-cf31973a833c40d4d43aa01102e38178092c2ba065b7c7f367ce1cd2a90d2dce.json
@@ -0,0 +1,15 @@
+{
+  "db_name": "PostgreSQL",
+  "query": "\n UPDATE durable.task\n SET state = 'ready',\n running_on = NULL\n WHERE id = ANY($1::bigint[])\n AND running_on = $2\n ",
+  "describe": {
+    "columns": [],
+    "parameters": {
+      "Left": [
+        "Int8Array",
+        "Int8"
+      ]
+    },
+    "nullable": []
+  },
+  "hash": "cf31973a833c40d4d43aa01102e38178092c2ba065b7c7f367ce1cd2a90d2dce"
+}
diff --git a/crates/durable-runtime/src/worker.rs b/crates/durable-runtime/src/worker.rs
index 56e2915..1006f9f 100644
--- a/crates/durable-runtime/src/worker.rs
+++ b/crates/durable-runtime/src/worker.rs
@@ -13,7 +13,7 @@ use rand::Rng;
 use serde_json::value::RawValue;
 use sqlx::postgres::PgNotification;
 use sqlx::types::Json;
-use tokio::sync::{broadcast, Mutex, Notify, Semaphore};
+use tokio::sync::{broadcast, mpsc, Mutex, Notify, Semaphore};
 use tokio::task::JoinSet;
 use tokio::time::Instant;
 use tracing::Instrument;
@@ -344,7 +344,8 @@ impl Worker {
         //
         // How we do things here is that we order workers by id, each worker looks at
         // the one just in front of it and schedules a liveness check for just after
-        // that worker would expire. The exception to this is the
+        // that worker would expire. The worker with the oldest ID is then responsible
+        // for checking the one with the newest ID.

         let _guard = ShutdownGuard::new(&shared.shutdown);
         let mut shutdown = std::pin::pin!(shared.shutdown.wait());
@@ -559,8 +560,9 @@ impl Worker {
         let shutdown = self.shared.shutdown.clone();
         let _guard = ShutdownGuard::new(&shutdown);
         let mut shutdown = std::pin::pin!(shutdown.wait());
+        let (tx, mut rx) = tokio::sync::mpsc::channel::<i64>(1024);

-        self.spawn_new_tasks().await?;
+        self.spawn_new_tasks(&tx).await?;
         self.load_leader_id().await?;

         'outer: loop {
@@ -568,20 +570,50 @@ impl Worker {
                 biased;

                 _ = shutdown.as_mut() => break 'outer,
-                _ = self.tasks.join_next(), if !self.tasks.is_empty() => None,
-                event = self.event_source.next() => Some(event?),
+                _ = self.tasks.join_next(), if !self.tasks.is_empty() => LoopEvent::TaskComplete,
+                id = rx.recv() => LoopEvent::TaskFailed(id.expect("failed task channel closed unexpectedly")),
+                event = self.event_source.next() => LoopEvent::Event(event?),
             };

             // Clean up any tasks that have completed already.
             while self.tasks.try_join_next().is_some() {}

             let event = match event {
-                Some(event) => event,
-                None => {
+                LoopEvent::Event(event) => event,
+                LoopEvent::TaskComplete => {
                     if self.blocked {
-                        self.spawn_new_tasks().await?;
+                        self.spawn_new_tasks(&tx).await?;
+                    }
+
+                    continue;
+                }
+                LoopEvent::TaskFailed(id) => {
+                    let mut failed = vec![id];
+
+                    let mut count = 0;
+                    while let Ok(id) = rx.try_recv() {
+                        failed.push(id);
+
+                        count += 1;
+                        if count > 1024 {
+                            break;
+                        }
                     }

+                    sqlx::query!(
+                        "
+                        UPDATE durable.task
+                        SET state = 'ready',
+                            running_on = NULL
+                        WHERE id = ANY($1::bigint[])
+                          AND running_on = $2
+                        ",
+                        &failed,
+                        self.worker_id
+                    )
+                    .execute(&self.shared.pool)
+                    .await?;
+
                     continue;
                 }
             };
@@ -595,7 +627,7 @@ impl Worker {
                     running_on: Some(id),
                     ..
                 }) if id != self.worker_id => (),
-                Event::Task(_) => self.spawn_new_tasks().await?,
+                Event::Task(_) => self.spawn_new_tasks(&tx).await?,
                 Event::TaskSuspend(_) => {
                     self.shared.suspend.notify_waiters();
                 }
@@ -613,7 +645,7 @@ impl Worker {

                 // We don't know what we missed so do everything.
                 Event::Lagged => {
-                    self.spawn_new_tasks().await?;
+                    self.spawn_new_tasks(&tx).await?;
                     self.load_leader_id().await?;
                     self.shared.suspend.notify_waiters();
                 }
@@ -647,7 +679,7 @@ impl Worker {

     /// Spawn all new tasks that are scheduled on this server and also those
     /// that aren't scheduled on any server.
-    async fn spawn_new_tasks(&mut self) -> anyhow::Result<()> {
+    async fn spawn_new_tasks(&mut self, failure: &mpsc::Sender<i64>) -> anyhow::Result<()> {
         let max_tasks = self.shared.config.max_tasks;
         let allowed = max_tasks.saturating_sub(self.tasks.len());
         if allowed == 0 {
@@ -709,6 +741,7 @@ impl Worker {
             let shared = self.shared.clone();
             let engine = self.engine.clone();
             let worker_id = self.worker_id;
+            let failures = failure.clone();

             tracing::trace!(
                 target: "durable_runtime::worker::spawn_new_tasks",
@@ -723,6 +756,10 @@ impl Worker {
                 .await
                 {
                     tracing::error!(task_id, "worker task exited with an error: {e}");
+
+                    // An error here means we are already shutting down and normal shutdown
+                    // recovery should take care of any remaining tasks.
+                    let _ = failures.send(task_id).await;
                 }
             };

@@ -750,13 +787,52 @@ impl Worker {
     ) -> anyhow::Result<()> {
         let task_id = task.id;

-        let status =
-            match AssertUnwindSafe(Self::run_task_impl(shared.clone(), engine, task, worker_id))
-                .catch_unwind()
-                .await
-            {
+        // We use a loop here only so that we can break out of it early.
+        #[allow(clippy::never_loop)]
+        let status = loop {
+            let future = Self::run_task_impl(shared.clone(), engine, task, worker_id);
+            break match AssertUnwindSafe(future).catch_unwind().await {
                 Ok(Ok(status)) => status,
                 Ok(Err(error)) => {
+                    match find_sqlx_error(&error) {
+                        // These errors are external to the runtime and should usually be resolvable
+                        // if the workflow is retried somewhere else at a later point in time.
+                        //
+                        // This should also help to reduce the number of workflow aborts that are
+                        // "not the fault" of the workflow itself. These will (eventually) instead
+                        // get turned into worker errors, which can be handled at a higher level.
+                        Some(
+                            sqlx::Error::PoolTimedOut
+                            | sqlx::Error::WorkerCrashed
+                            | sqlx::Error::Io(_),
+                        ) => {
+                            // Attempt to reset the task state so it can be picked up again.
+                            //
+                            // If this fails then the task failure gets reported to the main event
+                            // loop which can ensure it gets retried.
+                            sqlx::query!(
+                                "
+                                UPDATE durable.task
+                                SET state = 'ready',
+                                    running_on = NULL
+                                WHERE id = $1
+                                  AND running_on = $2
+                                ",
+                                task_id,
+                                worker_id
+                            )
+                            .execute(&shared.pool)
+                            .await?;
+
+                            break TaskStatus::Suspend;
+                        }
+                        Some(sqlx::Error::PoolClosed) => {
+                            // Nothing we can do, since we can't make database queries.
+                            break TaskStatus::Suspend;
+                        }
+                        _ => (),
+                    }
+
                     let message = format!("{error:?}\n");

                     let result = sqlx::query!(
@@ -803,6 +879,7 @@ impl Worker {
                     TaskStatus::ExitFailure
                 }
             };
+        };

         match status {
             TaskStatus::NotScheduledOnWorker => {
@@ -1087,3 +1164,13 @@ impl EventSource for PgEventSource {
         }
     }
 }
+
+enum LoopEvent {
+    Event(Event),
+    TaskComplete,
+    TaskFailed(i64),
+}
+
+fn find_sqlx_error(error: &anyhow::Error) -> Option<&sqlx::Error> {
+    error.chain().filter_map(|e| e.downcast_ref()).next()
+}
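
The retry classification above hinges on find_sqlx_error walking the anyhow error chain and returning the first sqlx::Error it finds. Below is a minimal test sketch of that behaviour; the module name, test names, and context message are illustrative and not part of this patch, and it assumes the tests would live alongside find_sqlx_error in worker.rs.

#[cfg(test)]
mod find_sqlx_error_tests {
    use super::*;

    // A pool timeout wrapped in extra anyhow context is still found by walking
    // the error chain, so run_task would classify it as retryable instead of
    // aborting the workflow.
    #[test]
    fn finds_sqlx_error_behind_context() {
        let error = anyhow::Error::from(sqlx::Error::PoolTimedOut)
            .context("transaction failed while running workflow");

        assert!(matches!(
            find_sqlx_error(&error),
            Some(sqlx::Error::PoolTimedOut)
        ));
    }

    // An error with no sqlx::Error anywhere in its chain falls through to the
    // existing abort path (the `_ => ()` arm in run_task).
    #[test]
    fn ignores_non_sqlx_errors() {
        let error = anyhow::anyhow!("wasm trap: unreachable");
        assert!(find_sqlx_error(&error).is_none());
    }
}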