fix: pool timeout bug

graphops · Mar 12, 2024 · 9816d7f · 9816d7f
1 parent 4abffbd
commit 9816d7f
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 47 deletions.
diff --git a/.sqlx/query-072603f4f0b17b26cec784a0e210b192cc9b7ddc23323def08c60118df914a47.json b/.sqlx/query-072603f4f0b17b26cec784a0e210b192cc9b7ddc23323def08c60118df914a47.json
diff --git a/.sqlx/query-3ed0545a37d071d142c970f7b0f863faf0fcf652cbded292e4eddca6fb4f1ccb.json b/.sqlx/query-3ed0545a37d071d142c970f7b0f863faf0fcf652cbded292e4eddca6fb4f1ccb.json
diff --git a/src/db/resolver.rs b/src/db/resolver.rs
@@ -1,7 +1,7 @@
 use async_graphql::{OutputType, SimpleObject};
 use chrono::Utc;
 use serde::{de::DeserializeOwned, Serialize};
-use sqlx::{types::Json, FromRow, PgPool, Row as SqliteRow};
+use sqlx::{postgres::PgQueryResult, types::Json, FromRow, PgPool, Row as SqliteRow};
 use std::ops::Deref;
 use tracing::trace;
 
@@ -83,6 +83,23 @@ ORDER BY id
     Ok(rows)
 }
 
+/// Counts every row currently stored in the `messages` table.
+///
+/// # Errors
+/// If the query fails, the database error is logged at trace level and
+/// returned wrapped in an `anyhow::Error`.
+pub async fn count_messages(pool: &PgPool) -> anyhow::Result<i64> {
+    let query = sqlx::query!(
+        r#"
+        SELECT COUNT(*) as "count!: i64"
+        FROM messages
+        "#
+    );
+
+    match query.fetch_one(pool).await {
+        Ok(row) => Ok(row.count),
+        Err(err) => {
+            trace!("Database query error: {:#?}", err);
+            Err(anyhow::Error::new(err))
+        }
+    }
+}
+
 pub async fn list_rows<T>(pool: &PgPool) -> Result<Vec<GraphQLRow<T>>, anyhow::Error>
 where
     T: Clone + Serialize + DeserializeOwned + OutputType + std::marker::Unpin,
@@ -193,32 +210,59 @@ WHERE id NOT IN (SELECT unnest($1::int8[]))
 RETURNING id
         "#,
         &top_ids
-    )
+        )
     .fetch_all(pool)
     .await?
     .len();
 
     Ok(deleted_ids.try_into().unwrap())
 }
 
-/// Function to delete messages older than `retention` minutes
-/// Returns the number of messages deleted
-pub async fn prune_old_messages(pool: &PgPool, retention: i32) -> Result<i64, anyhow::Error> {
+/// Deletes messages older than `retention` minutes, working in batches.
+///
+/// A message is eligible when its JSON `nonce` field, read as a bigint, is
+/// smaller than the current Unix timestamp minus `retention * 60` seconds.
+/// Each round issues one `DELETE` limited to `batch_size` rows (lowest ids
+/// first, skipping rows locked by other transactions) and the loop stops as
+/// soon as a round deletes fewer rows than `batch_size`.
+///
+/// Returns the total number of messages deleted across all batches.
+///
+/// # Arguments
+/// - `pool`: &PgPool - A reference to the PostgreSQL connection pool
+/// - `retention`: i32 - The retention time in minutes
+/// - `batch_size`: i64 - The maximum number of messages to delete in each
+///   batch; must be greater than zero
+///
+/// # Errors
+/// Returns an error if `batch_size` is not positive or if a delete statement
+/// fails.
+pub async fn prune_old_messages(
+    pool: &PgPool,
+    retention: i32,
+    batch_size: i64,
+) -> Result<i64, anyhow::Error> {
+    // A batch size of zero deletes nothing per round and can never satisfy the
+    // `deleted_count < batch_size` exit condition, so the loop would spin
+    // forever. Reject non-positive sizes before touching the database.
+    anyhow::ensure!(
+        batch_size > 0,
+        "batch_size must be positive, got {}",
+        batch_size
+    );
+
     let cutoff_nonce = Utc::now().timestamp() - (retention as i64 * 60);
+    let mut total_deleted = 0i64;
+
+    loop {
+        // Pick the next batch of expired ids in a CTE so the DELETE itself
+        // stays bounded by `batch_size`.
+        let delete_query = sqlx::query(
+            r#"
+            WITH deleted AS (
+                SELECT id
+                FROM messages
+                WHERE (message->>'nonce')::bigint < $1
+                ORDER BY id ASC
+                LIMIT $2
+                FOR UPDATE SKIP LOCKED
+            )
+            DELETE FROM messages
+            WHERE id IN (SELECT id FROM deleted)
+            RETURNING id
+            "#
+        )
+        .bind(cutoff_nonce)
+        .bind(batch_size);
 
-    let deleted_count = sqlx::query!(
-        r#"
-        DELETE FROM messages
-        WHERE (message->>'nonce')::bigint < $1
-        RETURNING id
-        "#,
-        cutoff_nonce
-    )
-    .fetch_all(pool)
-    .await?
-    .len() as i64;
+        let result: PgQueryResult = delete_query.execute(pool).await?;
+        // Checked conversion instead of a silent `as` cast from u64.
+        let deleted_count = i64::try_from(result.rows_affected())?;
+
+        total_deleted += deleted_count;
+
+        // Break the loop if we deleted fewer rows than the batch size, indicating we've processed all eligible messages.
+        if deleted_count < batch_size {
+            break;
+        }
+    }
 
-    Ok(deleted_count)
+    Ok(total_deleted)
 }
 
 pub async fn list_active_indexers(

diff --git a/src/operator/mod.rs b/src/operator/mod.rs
@@ -185,11 +185,12 @@ impl RadioOperator {
                         };
                     }
 
+                    let batch_size = 1000;
+
                     // Always prune old messages based on RETENTION
                     match timeout(
                         update_timeout,
-                        prune_old_messages(&self.db, self.config.retention)
-                    ).await {
+                        prune_old_messages(&self.db, self.config.retention, batch_size)                    ).await {
                         Err(e) => debug!(err = tracing::field::debug(e), "Pruning by retention timed out"),
                         Ok(Ok(num_pruned)) => {
                             total_num_pruned += num_pruned;

diff --git a/src/server/model/mod.rs b/src/server/model/mod.rs
@@ -1,9 +1,9 @@
 use async_graphql::{Context, EmptySubscription, Object, OutputType, Schema, SimpleObject};
 
-use chrono::{Duration, Utc};
+use chrono::Utc;
 use serde::{de::DeserializeOwned, Serialize};
 use sqlx::{Pool, Postgres};
-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};
 use thiserror::Error;
 
 use crate::{
@@ -61,13 +61,13 @@ impl QueryRoot {
         &self,
         ctx: &Context<'_>,
         indexers: Option<Vec<String>>,
-        minutes_ago: Option<i64>,
+        minutes_ago: Option<u64>,
     ) -> Result<Vec<String>, HttpServiceError> {
         let pool = ctx.data_unchecked::<Pool<Postgres>>();
         // Use a default time window if not specified
         // Default to 1440 minutes (24 hours) if not provided
         let minutes_ago = minutes_ago.unwrap_or(1440);
-        let from_timestamp = (Utc::now() - Duration::minutes(minutes_ago)).timestamp();
+        let from_timestamp = (Utc::now() - Duration::from_secs(minutes_ago * 60)).timestamp();
 
         let active_indexers = list_active_indexers(pool, indexers, from_timestamp).await?;
         Ok(active_indexers)
@@ -77,11 +77,11 @@ impl QueryRoot {
         &self,
         ctx: &Context<'_>,
         indexers: Option<Vec<String>>,
-        minutes_ago: Option<i64>,
+        minutes_ago: Option<u64>,
     ) -> Result<Vec<IndexerStats>, HttpServiceError> {
         let pool = ctx.data_unchecked::<Pool<Postgres>>();
         let minutes_ago = minutes_ago.unwrap_or(1440);
-        let from_timestamp = (Utc::now() - Duration::minutes(minutes_ago)).timestamp();
+        let from_timestamp = (Utc::now() - Duration::from_secs(minutes_ago * 60)).timestamp();
 
         let stats = get_indexer_stats(pool, indexers, from_timestamp).await?;
         Ok(stats)