From c66c0f3fe6a7aecb4f31bd930daba5e7e6510fc9 Mon Sep 17 00:00:00 2001 From: Mohit Dhattarwal <48082542+Mohiiit@users.noreply.github.com> Date: Sat, 9 Nov 2024 12:16:36 +0530 Subject: [PATCH] Add panic recovery handling for process_job with test coverage (#178) * refactor: makefile updated as per latest bootstraper commit * feat: handling panic in process job gracefully * changelog: updated * refactor: removed extra files * lint: fixed * refactor: panicking in the process job, test updated accordingly * refactor: process job updated to update panic message in the db * refactor: removed job id from panic message, was redundant --------- Co-authored-by: mohiiit Co-authored-by: Heemank Verma --- CHANGELOG.md | 1 + crates/orchestrator/src/jobs/mod.rs | 23 +++++++++-- crates/orchestrator/src/tests/jobs/mod.rs | 47 +++++++++++++++++++++++ madara | 2 +- pathfinder | 2 +- 5 files changed, 70 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ed57bb6..51a28afb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## Added +- panic handling in process job - upgrade ETH L1 bridge for withdrawals to work - added makefile and submodules - Endpoints for triggering processing and verification jobs diff --git a/crates/orchestrator/src/jobs/mod.rs b/crates/orchestrator/src/jobs/mod.rs index 476a81bd..e86aa984 100644 --- a/crates/orchestrator/src/jobs/mod.rs +++ b/crates/orchestrator/src/jobs/mod.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::panic::AssertUnwindSafe; use std::sync::Arc; use std::time::Duration; @@ -8,6 +9,7 @@ use color_eyre::eyre::{eyre, Context}; use constants::JOB_METADATA_FAILURE_REASON; use conversion::parse_string; use da_job::DaError; +use futures::FutureExt; use mockall::automock; use mockall_double::double; use opentelemetry::KeyValue; @@ -227,18 +229,33 @@ pub async fn process_job(id: Uuid, config: Arc) -> Result<(), JobError> tracing::debug!(job_id = ?id, job_type = ?job.job_type, "Getting job handler"); let job_handler = factory::get_job_handler(&job.job_type).await; - let external_id = match job_handler.process_job(config.clone(), &mut job).await { - Ok(external_id) => { + let external_id = match AssertUnwindSafe(job_handler.process_job(config.clone(), &mut job)).catch_unwind().await { + Ok(Ok(external_id)) => { tracing::debug!(job_id = ?id, "Successfully processed job"); external_id } - Err(e) => { + Ok(Err(e)) => { // TODO: I think most of the times the errors will not be fixed automatically // if we just retry. But for some failures like DB issues, it might be possible // that retrying will work. So we can add a retry logic here to improve robustness. tracing::error!(job_id = ?id, error = ?e, "Failed to process job"); return move_job_to_failed(&job, config.clone(), format!("Processing failed: {}", e)).await; } + Err(panic) => { + let panic_msg = panic + .downcast_ref::() + .map(|s| s.as_str()) + .or_else(|| panic.downcast_ref::<&str>().copied()) + .unwrap_or("Unknown panic message"); + + tracing::error!(job_id = ?id, panic_msg = %panic_msg, "Job handler panicked during processing"); + return move_job_to_failed( + &job, + config.clone(), + format!("Job handler panicked with message: {}", panic_msg), + ) + .await; + } }; tracing::debug!(job_id = ?id, "Incrementing process attempt count in metadata"); let metadata = increment_key_in_metadata(&job.metadata, JOB_PROCESS_ATTEMPT_METADATA_KEY)?; diff --git a/crates/orchestrator/src/tests/jobs/mod.rs b/crates/orchestrator/src/tests/jobs/mod.rs index ecb204b6..b10c66e8 100644 --- a/crates/orchestrator/src/tests/jobs/mod.rs +++ b/crates/orchestrator/src/tests/jobs/mod.rs @@ -192,6 +192,53 @@ async fn process_job_with_job_exists_in_db_and_valid_job_processing_status_works assert_eq!(consumed_message_payload.id, job_item.id); } +/// Tests `process_job` function when job handler panics during execution. +/// This test verifies that: +/// 1. The panic is properly caught and handled +/// 2. The job is moved to failed state +/// 3. Appropriate error message is set in the job metadata +#[rstest] +#[tokio::test] +async fn process_job_handles_panic() { + let job_item = build_job_item_by_type_and_status(JobType::SnosRun, JobStatus::Created, "1".to_string()); + + // Building config + let services = TestConfigBuilder::new() + .configure_database(ConfigType::Actual) + .configure_queue_client(ConfigType::Actual) + .build() + .await; + + let database_client = services.config.database(); + // Creating job in database + database_client.create_job(job_item.clone()).await.unwrap(); + + let mut job_handler = MockJob::new(); + // Setting up mock to panic when process_job is called + job_handler + .expect_process_job() + .times(1) + .returning(|_, _| -> Result { panic!("Simulated panic in process_job") }); + + // Mocking the `get_job_handler` call in process_job function + let job_handler: Arc> = Arc::new(Box::new(job_handler)); + let ctx = mock_factory::get_job_handler_context(); + ctx.expect().times(1).with(eq(JobType::SnosRun)).return_once(move |_| Arc::clone(&job_handler)); + + assert!(process_job(job_item.id, services.config.clone()).await.is_ok()); + + // DB checks - verify the job was moved to failed state + let job_in_db = database_client.get_job_by_id(job_item.id).await.unwrap().unwrap(); + assert_eq!(job_in_db.status, JobStatus::Failed); + assert!( + job_in_db + .metadata + .get(JOB_METADATA_FAILURE_REASON) + .unwrap() + .contains("Job handler panicked with message: Simulated panic in process_job") + ); +} + /// Tests `process_job` function when job is already existing in the db and job status is not /// `Created` or `VerificationFailed`. #[rstest] diff --git a/madara b/madara index d188aa91..93d57632 160000 --- a/madara +++ b/madara @@ -1 +1 @@ -Subproject commit d188aa91efa78bcc54f92aa1035295fd50e068d2 +Subproject commit 93d57632e7a56707d74ab63eb0ca8558df9f0d8e diff --git a/pathfinder b/pathfinder index 29f93d0d..138140e5 160000 --- a/pathfinder +++ b/pathfinder @@ -1 +1 @@ -Subproject commit 29f93d0d6ad8758fdcf5ae3a8bd2faad2a3bc92b +Subproject commit 138140e5fd967ede92806fd62de47c2e6b65712a