Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid blocking openQA jobs on Git updates #6055

Merged
merged 5 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions etc/openqa/openqa.ini
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
## web UI. Default is 'repo'
#hide_asset_types = repo iso

## Recognized referers contains list of hostnames separated by space. When
## openQA detects (via 'Referer' header) that test job was accessed from
## this domain, this job is labeled as linked and considered as important
## space-separated list of domains recognized by job labeling
## If openQA detects (via 'Referer' header) that a test job was accessed from a
## domain on that list, this job is labeled as linked and considered as important.
#recognized_referers = bugzilla.suse.com bugzilla.opensuse.org progress.opensuse.org github.com

## A regex in a string of test settings to ignore for "job investigation"
Expand Down Expand Up @@ -117,6 +117,10 @@
#git_auto_clone = yes
## enable automatic updates of all test code and needles managed via Git (still experimental, currently still breaks scheduling parallel clusters)
#git_auto_update = no
## specifies how to handle errors on automatic updates via git_auto_update
## - when set to "best-effort" openQA jobs are started even if the update failed
## - when set to "strict" openQA jobs will be blocked until the update succeeded or set to incomplete when the retries for updating are exhausted
#git_auto_update_method = best-effort

## Authentication method to use for user management
[auth]
Expand Down Expand Up @@ -215,7 +219,7 @@ concurrent = 0
[minion_task_triggers]
## Specify one or more task names (space-separated), by default these are not enabled.
## Good candidates would be limit_assets or limit_results_and_logs.
## This is analoguous to triggering tasks via systemd timers using
## This is analogous to triggering tasks via systemd timers using
## openqa-enqueue-asset-cleanup or openqa-enqueue-result-cleanup except
## it's triggered whenever a job is done rather than periodically.
#on_job_done = limit_results_and_logs limit_assets
Expand Down
5 changes: 2 additions & 3 deletions lib/OpenQA/Schema/Result/GruTasks.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
package OpenQA::Schema::Result::GruTasks;


use Mojo::Base 'DBIx::Class::Core';
use Mojo::Base 'DBIx::Class::Core', -signatures;

use Mojo::JSON qw(decode_json encode_json);
use OpenQA::Parser::Result::OpenQA;
Expand Down Expand Up @@ -63,8 +63,7 @@ sub encode_json_to_db {
encode_json($args);
}

sub fail {
my ($self, $reason) = @_;
sub fail ($self, $reason = undef) {
$reason //= 'Unknown';
my $deps = $self->jobs->search;
my $detail_text = 'Minion-GRU.txt';
Expand Down
1 change: 1 addition & 0 deletions lib/OpenQA/Setup.pm
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ sub read_config ($app) {
do_cleanup => 'no',
git_auto_clone => 'yes',
git_auto_update => 'no',
git_auto_update_method => 'best-effort',
},
scheduler => {
max_job_scheduled_time => 7,
Expand Down
15 changes: 5 additions & 10 deletions lib/OpenQA/Shared/GruJob.pm
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@ use Mojo::Base 'Minion::Job', -signatures;

use Mojo::Util qw(dumper);

sub execute {
my $self = shift;

sub execute ($self) {
my $gru_id = $self->info->{notes}{gru_id};
if ($gru_id and not $self->info->{finished}) {
# We have a gru_id and this is the first run of the job
Expand All @@ -32,7 +30,7 @@ sub execute {
$self->app->log->error("Gru job error: $err");
$self->fail($err);
}
$self->_fail_gru($gru_id => $err);
$self->_fail_gru($gru_id, $err);
}

# Avoid a possible race condition where the task retries the job and it gets
Expand All @@ -45,14 +43,11 @@ sub execute {
return undef;
}

sub _delete_gru {
my ($self, $id) = @_;
my $gru = $self->minion->app->schema->resultset('GruTasks')->find($id);
$gru->delete() if $gru;
sub _delete_gru ($self, $id) {
$self->minion->app->schema->resultset('GruTasks')->search({id => $id})->delete;
}

sub _fail_gru {
my ($self, $id, $reason) = @_;
sub _fail_gru ($self, $id, $reason) {
my $gru = $self->minion->app->schema->resultset('GruTasks')->find($id);
$gru->fail($reason) if $gru;
}
Expand Down
20 changes: 19 additions & 1 deletion lib/OpenQA/Task/Git/Clone.pm
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use Mojo::Base 'Mojolicious::Plugin', -signatures;
use Mojo::Util 'trim';

use Mojo::File;
use List::Util qw(min);
use OpenQA::Log qw(log_debug);
use Time::Seconds 'ONE_HOUR';

Expand All @@ -27,8 +28,10 @@ sub _git_clone_all ($job, $clones) {
my $retry_delay = {delay => 30 + int(rand(10))};
# Prevent multiple git_clone tasks for the same path to run in parallel
my @guards;
my $is_path_only = 1;
for my $path (sort keys %$clones) {
$path = Mojo::File->new($path)->realpath if -e $path; # resolve symlinks
$is_path_only &&= !(defined $clones->{$path});
my $guard_name = "git_clone_${path}_task";
my $guard = $app->minion->guard($guard_name, 2 * ONE_HOUR);
unless ($guard) {
Expand All @@ -47,8 +50,23 @@ sub _git_clone_all ($job, $clones) {
die "Don't even think about putting '..' into '$path'." if $path =~ /\.\./;
eval { _git_clone($app, $job, $ctx, $path, $url) };
next unless my $error = $@;

# unblock openQA jobs despite network errors under best-effort configuration
my $retries = $job->retries;
my $git_config = $app->config->{'scm git'};
my $max_retries = $ENV{OPENQA_GIT_CLONE_RETRIES} // 10;
return $job->retry($retry_delay) if $job->retries < $max_retries;
my $max_best_effort_retries = min($max_retries, $ENV{OPENQA_GIT_CLONE_RETRIES_BEST_EFFORT} // 2);
my $gru_task_id = $job->info->{notes}->{gru_id};
if ( $is_path_only
&& defined($gru_task_id)
&& ($error =~ m/disconnect|curl|stream.*closed|/i)
&& $git_config->{git_auto_update_method} eq 'best-effort'
&& $retries >= $max_best_effort_retries)
{
$app->schema->resultset('GruDependencies')->search({gru_task_id => $gru_task_id})->delete;
}

return $job->retry($retry_delay) if $retries < $max_retries;
return $job->fail($error);
}
}
Expand Down
48 changes: 46 additions & 2 deletions t/14-grutasks-git.t
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use OpenQA::Task::Git::Clone;
require OpenQA::Test::Database;
use OpenQA::Test::Utils qw(run_gru_job perform_minion_jobs);
use OpenQA::Test::TimeLimit '20';
use Test::Output qw(stderr_like);
use Test::Output qw(combined_like stderr_like);
use Test::MockModule;
use Test::Mojo;
use Test::Warnings qw(:report_warnings);
Expand All @@ -33,6 +33,9 @@ mkdir 't';
dircopy "$FindBin::Bin/$_", "$workdir/t/$_" or BAIL_OUT($!) for qw(data);

my $schema = OpenQA::Test::Database->new->create();
my $gru_tasks = $schema->resultset('GruTasks');
my $gru_dependencies = $schema->resultset('GruDependencies');
my $jobs = $schema->resultset('Jobs');
my $t = Test::Mojo->new('OpenQA::WebAPI');

# launch an additional app to serve some file for testing blocking downloads
Expand Down Expand Up @@ -257,7 +260,11 @@ subtest 'git clone' => sub {
};

subtest 'git_update_all' => sub {
$t->app->config->{'scm git'}->{git_auto_update} = 'yes';
my $clone_mock = Test::MockModule->new('OpenQA::Task::Git::Clone');
$clone_mock->redefine(_git_clone => sub (@args) { die 'fake disconnect' });

my $git_config = $t->app->config->{'scm git'};
$git_config->{git_auto_update} = 'yes';
my $testdir = $workdir->child('openqa/share/tests');
$testdir->make_path;
my @clones;
Expand All @@ -266,11 +273,48 @@ subtest 'git_update_all' => sub {
$testdir->child("$path/.git")->make_path;
}
local $ENV{OPENQA_BASEDIR} = $workdir;
local $ENV{OPENQA_GIT_CLONE_RETRIES} = 4;
local $ENV{OPENQA_GIT_CLONE_RETRIES_BEST_EFFORT} = 2;
my $minion = $t->app->minion;
my $result = $t->app->gru->enqueue_git_update_all;
my $job = $minion->job($result->{minion_id});
my $args = $job->info->{args}->[0];
my $gru_id = $result->{gru_id};
my $gru_task = $gru_tasks->find($gru_id);
is_deeply [sort keys %$args], \@clones, 'job args as expected';
isnt $gru_task, undef, 'gru task created';

subtest 'error handling and retry behavior' => sub {
# assume an openQA job is blocked on the Gru task
my $blocked_job = $jobs->create({TEST => 'blocked-job'});
$gru_task->jobs->create({job_id => $blocked_job->id});

# perform job, it'll fail via the mocked _git_clone function
$minion->foreground($job->id); # already counts as retry
is $job->info->{state}, 'inactive', 'job failed but set to inactive to be retried';
is $gru_task->jobs->count, 1, 'openQA job not unblocked after first try';

# retry the job now
$minion->foreground($job->id);
is $job->info->{state}, 'inactive', 'job failed but again set to inactive to be retried';
is $gru_task->jobs->count, 0, 'openQA job unblocked after second try';

combined_like { $minion->foreground($job->id) } qr/fake disconnect/, 'error logged';
is $job->info->{state}, 'failed', 'job failed for real after retries exhausted';

subtest 'method "strict"' => sub {
local $ENV{OPENQA_GIT_CLONE_RETRIES_BEST_EFFORT} = 1;
$git_config->{git_auto_update_method} = 'strict';
$result = $t->app->gru->enqueue_git_update_all;
$job = $minion->job($result->{minion_id});
$gru_task = $gru_tasks->find($result->{gru_id});
$gru_task->jobs->create({job_id => $blocked_job->id});
$minion->foreground($job->id);
is $job->info->{state}, 'inactive', 'job failed but again set to inactive to be retried';
is $gru_task->jobs->count, 1, 'openQA job not unblocked with method "strict"';
};
}
if $gru_task;
Martchus marked this conversation as resolved.
Show resolved Hide resolved
};

subtest 'enqueue_git_clones' => sub {
Expand Down
1 change: 1 addition & 0 deletions t/config.t
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ subtest 'Test configuration default modes' => sub {
do_cleanup => 'no',
git_auto_clone => 'yes',
git_auto_update => 'no',
git_auto_update_method => 'best-effort',
},
'scheduler' => {
max_job_scheduled_time => 7,
Expand Down
Loading