From 27b3c152b86acc020a0362cf79f868ab6a312481 Mon Sep 17 00:00:00 2001 From: Adam Grare Date: Wed, 5 Aug 2020 15:25:33 -0400 Subject: [PATCH] Cleanup failed systemd services during sync_workers It is possible for a systemd or container deployment to fail and no longer be run (e.g. CrashLoopBackOff) but the deployment/service will still exist in the runtime environment. Add the ability for these failed services to be cleaned up during MiqServer's sync_workers loop. --- .../miq_server/worker_management/monitor.rb | 18 ++++ .../worker_management/monitor/quiesce.rb | 2 - .../worker_management/monitor/systemd.rb | 73 +++++++++++++++ .../worker_management/monitor/systemd_spec.rb | 92 +++++++++++++++++++ 4 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 app/models/miq_server/worker_management/monitor/systemd.rb create mode 100644 spec/models/miq_server/worker_management/monitor/systemd_spec.rb diff --git a/app/models/miq_server/worker_management/monitor.rb b/app/models/miq_server/worker_management/monitor.rb index e8ec7879ae1d..825a28914a38 100644 --- a/app/models/miq_server/worker_management/monitor.rb +++ b/app/models/miq_server/worker_management/monitor.rb @@ -8,6 +8,7 @@ module MiqServer::WorkerManagement::Monitor include_concern 'Start' include_concern 'Status' include_concern 'Stop' + include_concern 'Systemd' include_concern 'SystemLimits' include_concern 'Validation' @@ -45,6 +46,8 @@ def worker_not_responding(w) end def sync_workers + cleanup_failed_workers + result = {} MiqWorkerType.worker_class_names.each do |class_name| begin @@ -63,6 +66,21 @@ def sync_workers result end + def cleanup_failed_workers + if podified? + elsif systemd? + cleaup_failed_systemd_services + end + end + + def podified? + MiqEnvironment::Command.is_podified? + end + + def systemd? + MiqEnvironment::Command.supports_systemd? + end + def clean_worker_records worker_deleted = false miq_workers.each do |w| diff --git a/app/models/miq_server/worker_management/monitor/quiesce.rb b/app/models/miq_server/worker_management/monitor/quiesce.rb index a5b201a2dcff..46434d662471 100644 --- a/app/models/miq_server/worker_management/monitor/quiesce.rb +++ b/app/models/miq_server/worker_management/monitor/quiesce.rb @@ -36,8 +36,6 @@ def quiesce_workers_loop miq_workers.each do |w| if w.containerized_worker? w.delete_container_objects - elsif w.systemd_worker? - w.stop_systemd_worker else stop_worker(w) end diff --git a/app/models/miq_server/worker_management/monitor/systemd.rb b/app/models/miq_server/worker_management/monitor/systemd.rb new file mode 100644 index 000000000000..d7e66e445266 --- /dev/null +++ b/app/models/miq_server/worker_management/monitor/systemd.rb @@ -0,0 +1,73 @@ +module MiqServer::WorkerManagement::Monitor::Systemd + extend ActiveSupport::Concern + + def cleaup_failed_systemd_services + failed_service_names = systemd_failed_miq_services.map { |service| service[:name] } + return if failed_service_names.empty? + + _log.info("Disabling failed unit files: [#{failed_service_names.join(", ")}]") + systemd_stop_services(failed_service_names) + end + + def systemd_failed_miq_services + miq_services(systemd_failed_services) + end + + def systemd_all_miq_services + miq_services(systemd_services) + end + + private + + def systemd_manager + @systemd_manager ||= begin + require "dbus/systemd" + DBus::Systemd::Manager.new + end + end + + def systemd_stop_services(service_names) + service_names.each do |service_name| + systemd_manager.StopUnit(service_name, "replace") + + service_settings_dir = systemd_unit_dir.join("#{service_name}.d") + FileUtils.rm_r(service_settings_dir) if service_settings_dir.exist? + end + + systemd_manager.DisableUnitFiles(service_names, false) + end + + def systemd_unit_dir + Pathname.new("/etc/systemd/system") + end + + def miq_services(services) + services.select { |unit| systemd_miq_service_base_names.include?(systemd_service_base_name(unit)) } + end + + def systemd_miq_service_base_names + @systemd_miq_service_base_names ||= begin + MiqWorkerType.worker_class_names.map(&:constantize).map(&:service_base_name) + end + end + + def systemd_service_name(unit) + File.basename(unit[:name], ".*") + end + + def systemd_service_base_name(unit) + systemd_service_name(unit).split("@").first + end + + def systemd_failed_services + systemd_services.select { |service| service[:active_state] == "failed" } + end + + def systemd_services + systemd_units.select { |unit| File.extname(unit[:name]) == ".service" } + end + + def systemd_units + systemd_manager.units + end +end diff --git a/spec/models/miq_server/worker_management/monitor/systemd_spec.rb b/spec/models/miq_server/worker_management/monitor/systemd_spec.rb new file mode 100644 index 000000000000..1e2f5c5b204e --- /dev/null +++ b/spec/models/miq_server/worker_management/monitor/systemd_spec.rb @@ -0,0 +1,92 @@ +RSpec.describe MiqServer::WorkerManagement::Monitor::Systemd do + let(:units) { [] } + let(:server) { EvmSpecHelper.create_guid_miq_server_zone.second } + let(:systemd_manager) { double("DBus::Systemd::Manager") } + + before do + MiqWorkerType.seed + allow(server).to receive(:systemd_manager).and_return(systemd_manager) + allow(systemd_manager).to receive(:units).and_return(units) + end + + context "#cleanup_failed_systemd_services" do + context "with no failed services" do + let(:units) { [{:name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service", :description => "ManageIQ Generic Worker", :load_state => "loaded", :active_state => "active", :sub_state => "plugged", :job_id => 0, :job_type => "", :job_object_path => "/"}] } + + it "doesn't call DisableUnitFiles" do + expect(systemd_manager).not_to receive(:DisableUnitFiles) + server.cleaup_failed_systemd_services + end + end + + context "with failed services" do + let(:units) { [{:name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service", :description => "ManageIQ Generic Worker", :load_state => "loaded", :active_state => "failed", :sub_state => "plugged", :job_id => 0, :job_type => "", :job_object_path => "/"}] } + + it "calls DisableUnitFiles with the service name" do + expect(systemd_manager).to receive(:StopUnit).with("generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service", "replace") + expect(systemd_manager).to receive(:DisableUnitFiles).with(["generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service"], false) + server.cleaup_failed_systemd_services + end + end + end + + context "#systemd_all_miq_services" do + let(:units) do + [ + {:name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service", :active_state => "failed"}, + {:name => "ui@cfe2c489-5c93-4b77-8620-cf6b1d3ec595.service", :active_state => "active"}, + {:name => "ssh.service", :active_state => "active"} + ] + end + + it "filters out non-miq services" do + expect(server.systemd_all_miq_services.count).to eq(2) + end + end + + context "#systemd_failed_miq_services" do + let(:units) do + [ + {:name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service", :active_state => "failed"}, + {:name => "ui@cfe2c489-5c93-4b77-8620-cf6b1d3ec595.service", :active_state => "active"} + ] + end + + it "filters out only failed services" do + expect(server.systemd_failed_miq_services.count).to eq(1) + end + end + + context "#systemd_miq_service_base_names (private)" do + it "returns the minimal_class_name" do + expect(server.send(:systemd_miq_service_base_names)).to include("generic", "ui") + end + end + + context "#systemd_services (private)" do + let(:units) do + [ + {:name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service"}, + {:name => "miq.slice"} + ] + end + + it "filters out non-service files" do + expect(server.send(:systemd_services).count).to eq(1) + end + end + + context "#systemd_service_base_name (private)" do + it "with a non-templated service" do + expect(server.send(:systemd_service_base_name, :name => "miq.slice")).to eq("miq") + end + + it "with a template service" do + expect(server.send(:systemd_service_base_name, :name => "generic@.service")).to eq("generic") + end + + it "with a templated service instance" do + expect(server.send(:systemd_service_base_name, :name => "generic@68400a7e-1747-4f10-be2a-d0fc91b705ca.service")).to eq("generic") + end + end +end