Skip to content

Commit

Permalink
Merge pull request #2587 from albinsuresh/fix/2501/restart-agent-afte…
Browse files Browse the repository at this point in the history
…r-update

Restart tedge-agent after self update
  • Loading branch information
albinsuresh authored Jan 18, 2024
2 parents 0463eb5 + 2aac2f8 commit fd91232
Show file tree
Hide file tree
Showing 28 changed files with 246 additions and 36 deletions.
23 changes: 23 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-apt-plugin_0.0.1_amd64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-mapper_0.0.1_amd64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-watchdog_0.0.1_amd64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge_0.0.1_amd64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-agent_0.0.1_amd64.deb filter=lfs diff=lfs merge=lfs -text

tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-agent_0.0.1_arm64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-apt-plugin_0.0.1_arm64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-mapper_0.0.1_arm64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-watchdog_0.0.1_arm64.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge_0.0.1_arm64.deb filter=lfs diff=lfs merge=lfs -text

tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-apt-plugin_0.0.1_armv6.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-agent_0.0.1_armv6.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-mapper_0.0.1_armv6.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-watchdog_0.0.1_armv6.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge_0.0.1_armv6.deb filter=lfs diff=lfs merge=lfs -text

tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-agent_0.0.1_armhf.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-apt-plugin_0.0.1_armhf.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-mapper_0.0.1_armhf.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge-watchdog_0.0.1_armhf.deb filter=lfs diff=lfs merge=lfs -text
tests/RobotFramework/tests/cumulocity/self-update/base-version/tedge_0.0.1_armhf.deb filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ zeroize = "1.5"
codegen-units = 1
lto = true
opt-level = "z"
panic = "abort"
panic = "unwind"
strip = "symbols"
overflow-checks = true

Expand Down
97 changes: 70 additions & 27 deletions crates/core/tedge_actors/src/runtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ pub enum RuntimeEvent {
Error(RuntimeError),
Started { task: String },
Stopped { task: String },
Aborted { task: String, error: RuntimeError },
Aborted { task: String, error: String },
}

/// The actor runtime
pub struct Runtime {
handle: RuntimeHandle,
bg_task: JoinHandle<()>,
bg_task: JoinHandle<Result<(), RuntimeError>>,
}

impl Runtime {
Expand Down Expand Up @@ -88,17 +88,22 @@ impl Runtime {
/// - Or, all the runtime handler clones have been dropped
/// and all the running tasks have reached completion (successfully or not).
pub async fn run_to_completion(self) -> Result<(), RuntimeError> {
Runtime::wait_for_completion(self.bg_task).await
if let Err(err) = Runtime::wait_for_completion(self.bg_task).await {
error!("Aborted due to {err}");
std::process::exit(1)
}

Ok(())
}

async fn wait_for_completion(bg_task: JoinHandle<()>) -> Result<(), RuntimeError> {
bg_task.await.map_err(|err| {
if err.is_panic() {
RuntimeError::RuntimePanic
} else {
RuntimeError::RuntimeCancellation
}
})
async fn wait_for_completion(
bg_task: JoinHandle<Result<(), RuntimeError>>,
) -> Result<(), RuntimeError> {
match bg_task.await {
Ok(result) => result,
Err(err) if err.is_panic() => Err(RuntimeError::RuntimePanic),
Err(_) => Err(RuntimeError::RuntimeCancellation),
}
}
}

Expand Down Expand Up @@ -167,8 +172,9 @@ impl RuntimeActor {
}
}

async fn run(mut self) {
async fn run(mut self) -> Result<(), RuntimeError> {
info!(target: "Runtime", "Started");
let mut aborting_error = None;
let mut actors_count: usize = 0;
loop {
tokio::select! {
Expand Down Expand Up @@ -202,7 +208,12 @@ impl RuntimeActor {
}
},
Some(finished_actor) = self.futures.next() => {
self.handle_actor_finishing(finished_actor).await;
if let Err(error) = self.handle_actor_finishing(finished_actor).await {
info!(target: "Runtime", "Shutting down on error: {error}");
aborting_error = Some(error);
shutdown_actors(&mut self.running_actors).await;
break
}
}
}
}
Expand All @@ -216,30 +227,43 @@ impl RuntimeActor {
}
_ = self.wait_for_actors_to_finish() => info!(target: "Runtime", "All actors have finished")
}

match aborting_error {
None => Ok(()),
Some(error) => Err(error),
}
}

async fn wait_for_actors_to_finish(&mut self) {
while let Some(finished_actor) = self.futures.next().await {
self.handle_actor_finishing(finished_actor).await;
let _ = self.handle_actor_finishing(finished_actor).await;
}
}

async fn handle_actor_finishing(
&mut self,
finished_actor: Result<Result<String, (String, RuntimeError)>, JoinError>,
) {
) -> Result<(), RuntimeError> {
match finished_actor {
Err(e) => error!(target: "Runtime", "Failed to execute actor: {e}"),
Err(e) => {
error!(target: "Runtime", "Failed to execute actor: {e}");
Err(RuntimeError::JoinError(e))
}
Ok(Ok(actor)) => {
self.running_actors.remove(&actor);
info!(target: "Runtime", "Actor has finished: {actor}");
self.send_event(RuntimeEvent::Stopped { task: actor }).await;
Ok(())
}
Ok(Err((actor, error))) => {
self.running_actors.remove(&actor);
error!(target: "Runtime", "Actor {actor} has finished unsuccessfully: {error:?}");
self.send_event(RuntimeEvent::Aborted { task: actor, error })
.await;
self.send_event(RuntimeEvent::Aborted {
task: actor.clone(),
error: format!("{error}"),
})
.await;
Err(error)
}
}
}
Expand Down Expand Up @@ -316,7 +340,15 @@ mod tests {
crate::Sender::send(&mut self.messages, EchoMessage::String(message))
.await?
}
EchoMessage::RuntimeRequest(RuntimeRequest::Shutdown) => break,
EchoMessage::RuntimeRequest(RuntimeRequest::Shutdown) => {
dbg!("shutdown requested");
crate::Sender::send(
&mut self.messages,
EchoMessage::String("Echo stopped".to_string()),
)
.await?;
break;
}
}
}

Expand Down Expand Up @@ -485,8 +517,8 @@ mod tests {
#[tokio::test]
async fn shutdown() {
let (mut actions_sender, mut events_receiver, ra) = init();
let (_, _, actor1) = create_actor(Echo::new);
let (_, _, actor2) = create_actor(Echo::new);
let (_, _sender1, actor1) = create_actor(Echo::new);
let (_, _sender2, actor2) = create_actor(Echo::new);

actions_sender
.send(RuntimeAction::Spawn(actor1))
Expand Down Expand Up @@ -537,26 +569,37 @@ mod tests {

let wait_for_actor_to_panic = async {
while let Some(event) = events_receiver.next().await {
if matches!(event, RuntimeEvent::Aborted { task, .. } if task == "Panic-0") {
break;
match event {
RuntimeEvent::Aborted { task, error } if task == "Panic-0" => {
return Some(error);
}
_ => {}
}
}
None
};

tokio::spawn(ra.run());

tokio::time::timeout(Duration::from_secs(1), wait_for_actor_to_panic)
// The panic is caught by the runtime and an event is sent
let error = tokio::time::timeout(Duration::from_secs(1), wait_for_actor_to_panic)
.await
.expect("Actor to panic in time");
assert_eq!(
error.map(|s| s.replace(char::is_numeric, "")), // ignore the task id
Some("task panicked".to_string())
);

sender
// No more messages can be sent to the actors: they have been shut down
assert!(sender
.send(EchoMessage::String("hello".into()))
.await
.expect("Expected the echo actor to be running and to receive a message");
.is_err());

// The actors have been properly shut down
assert_eq!(
receiver.next().await.unwrap(),
EchoMessage::String("hello".into())
EchoMessage::String("Echo stopped".into())
);
}
}
5 changes: 3 additions & 2 deletions crates/core/tedge_agent/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ use tracing::info;
use tracing::instrument;
use tracing::warn;

const TEDGE_AGENT: &str = "tedge-agent";
pub const TEDGE_AGENT: &str = "tedge-agent";

#[derive(Debug, Clone)]
pub(crate) struct AgentConfig {
Expand Down Expand Up @@ -219,7 +219,8 @@ impl Agent {

#[instrument(skip(self), name = "sm-agent")]
pub async fn start(self) -> Result<(), anyhow::Error> {
info!("Starting tedge agent");
let version = env!("CARGO_PKG_VERSION");
info!("Starting tedge-agent v{}", version);
self.init()?;

// Runtime
Expand Down
51 changes: 48 additions & 3 deletions crates/core/tedge_agent/src/software_manager/actor.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use crate::agent::TEDGE_AGENT;
use crate::software_manager::config::SoftwareManagerConfig;
use crate::software_manager::error::SoftwareManagerError;
use crate::software_manager::error::SoftwareManagerError::NoPlugins;
use crate::state_repository::error::StateError;
use crate::state_repository::state::AgentStateRepository;
use anyhow::anyhow;
use async_trait::async_trait;
use plugin_sm::operation_logs::LogKind;
use plugin_sm::operation_logs::OperationLogs;
Expand All @@ -11,6 +13,7 @@ use plugin_sm::plugin_manager::Plugins;
use serde::Deserialize;
use serde::Serialize;
use std::path::PathBuf;
use std::process::Command;
use tedge_actors::fan_in_message_type;
use tedge_actors::Actor;
use tedge_actors::LoggingReceiver;
Expand Down Expand Up @@ -101,7 +104,12 @@ impl Actor for SoftwareManagerActor {

while let Some(request) = input_receiver.recv().await {
tokio::select! {
_ = self.handle_request(request, &mut plugins, &operation_logs) => {}
_ = self.handle_request(request, &mut plugins, &operation_logs) => {
if let Err(SoftwareManagerError::NotRunningLatestVersion) = Self::detect_self_update() {
error!("Tedge-agent is no more running the latest-version => a restart is required");
return Err(RuntimeError::ActorError(Box::new(SoftwareManagerError::NotRunningLatestVersion)));
}
}

Some(RuntimeRequest::Shutdown) = input_receiver.recv_signal() => {
info!("Received shutdown request from the runtime, exiting...");
Expand Down Expand Up @@ -147,11 +155,12 @@ impl SoftwareManagerActor {
) -> Result<(), SoftwareManagerError> {
match request {
SoftwareCommand::SoftwareUpdateCommand(request) => {
if let Err(err) = self
match self
.handle_software_update_operation(request, plugins, operation_logs)
.await
{
error!("{:?}", err);
Ok(()) => {}
Err(err) => error!("{:?}", err),
}
}
SoftwareCommand::SoftwareListCommand(request) => {
Expand Down Expand Up @@ -232,6 +241,42 @@ impl SoftwareManagerActor {
Ok(())
}

/// Checks whether the `tedge-agent` binary installed next to the running
/// executable reports a different version than the running process.
///
/// The installed binary is invoked with `--version` and its output is expected
/// to be of the form `tedge-agent <version>`. That version is compared with the
/// `CARGO_PKG_VERSION` this binary was compiled with.
///
/// # Errors
///
/// - [`SoftwareManagerError::NotRunningLatestVersion`] when the two versions differ.
/// - An `anyhow` error converted into [`SoftwareManagerError`] when the path of
///   the running executable cannot be determined or has no parent directory,
///   when the installed binary cannot be launched or exits unsuccessfully,
///   or when its output does not have the expected shape.
fn detect_self_update() -> Result<(), SoftwareManagerError> {
    info!("Checking if tedge got self updated");
    let current_running_version = env!("CARGO_PKG_VERSION");
    info!("Current running version: {}", current_running_version);

    let executable_path = std::env::current_exe()
        .map_err(|e| anyhow!("Failed to retrieve running executable path due to {}", e))?;
    // Report a missing parent directory as an error rather than panicking:
    // every other failure of this check is already surfaced through `Result`.
    let agent_binary_path = executable_path
        .parent()
        .ok_or_else(|| {
            anyhow!(
                "Running executable path {} has no parent directory",
                executable_path.display()
            )
        })?
        .join(TEDGE_AGENT);

    let output = Command::new(agent_binary_path)
        .args(["--version"])
        .output()
        .map_err(|e| anyhow!("Failed to fetch version of installed binary due to {}", e))?;
    if !output.status.success() {
        return Err(anyhow!(
            "Fetching version from installed binary failed with {}",
            String::from_utf8_lossy(&output.stderr)
        )
        .into());
    }

    // Expected output: exactly two whitespace-separated tokens, `tedge-agent <version>`.
    let version_output = String::from_utf8_lossy(&output.stdout);
    let version_output_split: Vec<&str> = version_output.split_whitespace().collect();
    if let ["tedge-agent", installed_binary_version] = version_output_split.as_slice() {
        info!("Installed binary version: {}", installed_binary_version);
        if current_running_version != *installed_binary_version {
            info!("Self update detected. Requesting shutdown...");
            return Err(SoftwareManagerError::NotRunningLatestVersion);
        }
    } else {
        return Err(anyhow!("Unexpected version output: {:?}", version_output).into());
    }

    Ok(())
}

async fn handle_software_list_operation(
&mut self,
request: SoftwareListCommand,
Expand Down
3 changes: 3 additions & 0 deletions crates/core/tedge_agent/src/software_manager/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ pub enum SoftwareManagerError {
#[error(transparent)]
FromTedgeConfig(#[from] tedge_config::TEdgeConfigError),

#[error("Tedge-agent is not running the latest version")]
NotRunningLatestVersion,

#[error(transparent)]
Other(#[from] anyhow::Error),
}
Expand Down
6 changes: 6 additions & 0 deletions tests/RobotFramework/bin/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ pushd "$SCRIPT_DIR/.." >/dev/null || exit 1
# Required to prevent dbus errors on raspberry pi
export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring

# Use git lfs for test artefacts
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt install -y git-lfs
git lfs install
git lfs pull

#
# Setup python virtual environment and install dependencies
#
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading

1 comment on commit fd91232

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Robot Results

✅ Passed ❌ Failed ⏭️ Skipped Total Pass % ⏱️ Duration
381 0 3 381 100 1h1m48.287s

Please sign in to comment.