Skip to content

Commit

Permalink
Cleanup failed systemd services during sync_workers
Browse files Browse the repository at this point in the history
It is possible for a systemd or container deployment to fail and no
longer be run (e.g. CrashLoopBackOff) but the deployment/service will
still exist in the runtime environment.

Add the ability for these failed services to be cleaned up during
MiqServer's sync_workers loop.
  • Loading branch information
agrare committed Aug 12, 2020
1 parent 484cf2c commit d79ac6c
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 5 deletions.
24 changes: 21 additions & 3 deletions app/models/miq_server/worker_management/monitor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module MiqServer::WorkerManagement::Monitor
include_concern 'Start'
include_concern 'Status'
include_concern 'Stop'
include_concern 'Systemd'
include_concern 'SystemLimits'
include_concern 'Validation'

Expand All @@ -22,9 +23,7 @@ def monitor_workers

MiqWorker.status_update_all

check_not_responding
check_pending_stop
clean_worker_records
cleanup_failed_workers

# Monitor all remaining current worker records
miq_workers.where(:status => MiqWorker::STATUSES_CURRENT_OR_STARTING).each do |worker|
Expand Down Expand Up @@ -63,6 +62,25 @@ def sync_workers
result
end

def cleanup_failed_workers
check_not_responding
check_pending_stop
clean_worker_records

if podified?
elsif systemd?
cleanup_failed_systemd_services
end
end

def podified?
MiqEnvironment::Command.is_podified?
end

def systemd?
MiqEnvironment::Command.supports_systemd?
end

def clean_worker_records
worker_deleted = false
miq_workers.each do |w|
Expand Down
2 changes: 0 additions & 2 deletions app/models/miq_server/worker_management/monitor/quiesce.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ def quiesce_workers_loop
miq_workers.each do |w|
if w.containerized_worker?
w.delete_container_objects
elsif w.systemd_worker?
w.stop_systemd_worker
else
stop_worker(w)
end
Expand Down
73 changes: 73 additions & 0 deletions app/models/miq_server/worker_management/monitor/systemd.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
module MiqServer::WorkerManagement::Monitor::Systemd
extend ActiveSupport::Concern

def cleanup_failed_systemd_services
failed_service_names = systemd_failed_miq_services.map { |service| service[:name] }
return if failed_service_names.empty?

_log.info("Disabling failed unit files: [#{failed_service_names.join(", ")}]")
systemd_stop_services(failed_service_names)
end

def systemd_failed_miq_services
miq_services(systemd_failed_services)
end

def systemd_all_miq_services
miq_services(systemd_services)
end

private

def systemd_manager
@systemd_manager ||= begin
require "dbus/systemd"
DBus::Systemd::Manager.new
end
end

def systemd_stop_services(service_names)
service_names.each do |service_name|
systemd_manager.StopUnit(service_name, "replace")

service_settings_dir = systemd_unit_dir.join("#{service_name}.d")
FileUtils.rm_r(service_settings_dir) if service_settings_dir.exist?
end

systemd_manager.DisableUnitFiles(service_names, false)
end

def systemd_unit_dir
Pathname.new("/etc/systemd/system")
end

def miq_services(services)
services.select { |unit| systemd_miq_service_base_names.include?(systemd_service_base_name(unit)) }
end

def systemd_miq_service_base_names
@systemd_miq_service_base_names ||= begin
MiqWorkerType.worker_class_names.map(&:constantize).map(&:service_base_name)
end
end

def systemd_service_name(unit)
File.basename(unit[:name], ".*")
end

def systemd_service_base_name(unit)
systemd_service_name(unit).split("@").first
end

def systemd_failed_services
systemd_services.select { |service| service[:active_state] == "failed" }
end

def systemd_services
systemd_units.select { |unit| File.extname(unit[:name]) == ".service" }
end

def systemd_units
systemd_manager.units
end
end
92 changes: 92 additions & 0 deletions spec/models/miq_server/worker_management/monitor/systemd_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
RSpec.describe MiqServer::WorkerManagement::Monitor::Systemd do
let(:units) { [] }
let(:server) { EvmSpecHelper.create_guid_miq_server_zone.second }
let(:systemd_manager) { double("DBus::Systemd::Manager") }

before do
MiqWorkerType.seed
allow(server).to receive(:systemd_manager).and_return(systemd_manager)
allow(systemd_manager).to receive(:units).and_return(units)
end

context "#cleanup_failed_systemd_services" do
context "with no failed services" do
let(:units) { [{:name => "[email protected]", :description => "ManageIQ Generic Worker", :load_state => "loaded", :active_state => "active", :sub_state => "plugged", :job_id => 0, :job_type => "", :job_object_path => "/"}] }

it "doesn't call DisableUnitFiles" do
expect(systemd_manager).not_to receive(:DisableUnitFiles)
server.cleanup_failed_systemd_services
end
end

context "with failed services" do
let(:units) { [{:name => "[email protected]", :description => "ManageIQ Generic Worker", :load_state => "loaded", :active_state => "failed", :sub_state => "plugged", :job_id => 0, :job_type => "", :job_object_path => "/"}] }

it "calls DisableUnitFiles with the service name" do
expect(systemd_manager).to receive(:StopUnit).with("[email protected]", "replace")
expect(systemd_manager).to receive(:DisableUnitFiles).with(["[email protected]"], false)
server.cleanup_failed_systemd_services
end
end
end

context "#systemd_all_miq_services" do
let(:units) do
[
{:name => "[email protected]", :active_state => "failed"},
{:name => "[email protected]", :active_state => "active"},
{:name => "ssh.service", :active_state => "active"}
]
end

it "filters out non-miq services" do
expect(server.systemd_all_miq_services.count).to eq(2)
end
end

context "#systemd_failed_miq_services" do
let(:units) do
[
{:name => "[email protected]", :active_state => "failed"},
{:name => "[email protected]", :active_state => "active"}
]
end

it "filters out only failed services" do
expect(server.systemd_failed_miq_services.count).to eq(1)
end
end

context "#systemd_miq_service_base_names (private)" do
it "returns the minimal_class_name" do
expect(server.send(:systemd_miq_service_base_names)).to include("generic", "ui")
end
end

context "#systemd_services (private)" do
let(:units) do
[
{:name => "[email protected]"},
{:name => "miq.slice"}
]
end

it "filters out non-service files" do
expect(server.send(:systemd_services).count).to eq(1)
end
end

context "#systemd_service_base_name (private)" do
it "with a non-templated service" do
expect(server.send(:systemd_service_base_name, :name => "miq.slice")).to eq("miq")
end

it "with a template service" do
expect(server.send(:systemd_service_base_name, :name => "[email protected]")).to eq("generic")
end

it "with a templated service instance" do
expect(server.send(:systemd_service_base_name, :name => "[email protected]")).to eq("generic")
end
end
end
7 changes: 7 additions & 0 deletions spec/models/miq_server/worker_management/monitor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@
expect(MiqPriorityWorker).to receive(:sync_workers).and_return(:adds => [123])
expect(server.sync_workers).to eq("MiqPriorityWorker"=>{:adds=>[123]})
end

it "calls cleanup_failed_services" do
allow(MiqWorkerType).to receive(:worker_class_names).and_return([])
allow(MiqEnvironment::Command).to receive(:supports_systemd?).and_return(true)
expect(server).to receive(:cleanup_failed_systemd_services)
server.sync_workers
end
end
end
end

0 comments on commit d79ac6c

Please sign in to comment.