Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions etc/openqa/openqa.ini
Original file line number Diff line number Diff line change
Expand Up @@ -448,14 +448,22 @@ concurrent = 0
#dynamic_job_limit_enabled = 0
## Minimum effective job limit when dynamic scaling is active. Defaults to 50.
#dynamic_job_limit_min = 50
## Load average above which the effective limit is scaled down. 0 = auto-detect (nproc * 0.85).
## Load average above which the effective limit is scaled down. 0 = auto-detect (nproc * dynamic_job_limit_load_threshold_factor).
#dynamic_job_limit_load_threshold = 0
## Load average for emergency cutback (3x step decrease). 0 = auto-detect (nproc * 1.5).
## Factor for auto-detecting the load threshold. Defaults to 0.85.
#dynamic_job_limit_load_threshold_factor = 0.85
## Load average for emergency cutback (3x step decrease). 0 = auto-detect (nproc * dynamic_job_limit_load_critical_factor).
#dynamic_job_limit_load_critical = 0
## Factor for auto-detecting the critical load threshold. Defaults to 1.5.
#dynamic_job_limit_load_critical_factor = 1.5
## Number of jobs to add/remove per adjustment. Defaults to 10.
#dynamic_job_limit_step = 10
## Minimum seconds between dynamic limit adjustments. Defaults to 60.
#dynamic_job_limit_interval = 60
## Fraction of the load threshold that all load averages must fall below before the limit is scaled up. Defaults to 0.7.
#dynamic_job_limit_scale_up_hysteresis = 0.7
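## Fraction of the load threshold below which the limit is raised by twice the step ("fast ramp-up"). Only has a distinct effect when lower than dynamic_job_limit_scale_up_hysteresis. Defaults to 0.3.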
#dynamic_job_limit_fast_ramp_up_load_factor = 0.3

## Configuration of the label/bugref carry-over
[carry_over]
Expand Down
72 changes: 57 additions & 15 deletions lib/OpenQA/Scheduler/DynamicLimit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,44 @@ use Mojo::File 'path';
use OpenQA::Log qw(log_debug);
use OpenQA::Utils qw(load_avg);
use Time::HiRes 'time';
use Exporter 'import';

# Fraction of threshold below which all load averages must fall to scale up.
use constant SCALE_UP_HYSTERESIS => 0.7;
# Multiplier applied to step on an emergency (critical) cutback.
use constant EMERGENCY_STEP_MULTIPLIER => 3;
# Built-in defaults for the dynamic job limit. DEFAULTS() maps most of them to
# their config keys; EMERGENCY_STEP_MULTIPLIER has no config key there.
use constant {
# Fraction of threshold below which all load averages must fall to scale up.
SCALE_UP_HYSTERESIS => 0.7,
# Fraction of threshold below which the scale-up step is doubled ("fast ramp-up").
FAST_RAMP_UP_LOAD_FACTOR => 0.3,
# Factor applied to nproc when the load threshold is auto-detected.
LOAD_THRESHOLD_FACTOR => 0.85,
# Factor applied to nproc when the critical load threshold is auto-detected.
LOAD_CRITICAL_FACTOR => 1.5,
# Number of jobs to add/remove per adjustment.
STEP => 10,
# Minimum effective job limit when dynamic scaling is active.
MIN => 50,
# Minimum seconds between dynamic limit adjustments.
INTERVAL => 60,
# Multiplier applied to step on an emergency (critical) cutback.
EMERGENCY_STEP_MULTIPLIER => 3,
};

# The constants above can be imported by name on request.
our @EXPORT_OK = qw(
SCALE_UP_HYSTERESIS
FAST_RAMP_UP_LOAD_FACTOR
LOAD_THRESHOLD_FACTOR
LOAD_CRITICAL_FACTOR
STEP
MIN
INTERVAL
EMERGENCY_STEP_MULTIPLIER
);

# Returns a new hash reference holding the default value of every scheduler
# config key this module reads. Can be called as a plain function or as a
# class method; the invocant is ignored.
sub DEFAULTS ($self = undef) {
    my %defaults = (
        # Upper bound on running jobs; a negative value means no upper bound.
        max_running_jobs => -1,

        # Lower bound, step size and pacing of the dynamic adjustments.
        dynamic_job_limit_min      => MIN,
        dynamic_job_limit_step     => STEP,
        dynamic_job_limit_interval => INTERVAL,

        # Load thresholds: 0 means auto-detect using the matching factor.
        dynamic_job_limit_load_threshold        => 0,
        dynamic_job_limit_load_threshold_factor => LOAD_THRESHOLD_FACTOR,
        dynamic_job_limit_load_critical         => 0,
        dynamic_job_limit_load_critical_factor  => LOAD_CRITICAL_FACTOR,

        # Fractions of the load threshold that gate scaling up.
        dynamic_job_limit_scale_up_hysteresis      => SCALE_UP_HYSTERESIS,
        dynamic_job_limit_fast_ramp_up_load_factor => FAST_RAMP_UP_LOAD_FACTOR,
    );
    return \%defaults;
}

has effective_limit => undef;

Expand All @@ -39,42 +72,51 @@ sub _resolve_threshold ($configured, $factor) {
# Returns a hash reference of typed values; this is the single point coupling
# DynamicLimit to the config key names.
# $config: hash reference keyed by the option names listed in DEFAULTS().
# Keys that are missing or undef fall back to the DEFAULTS() value; any
# defined value, including 0, is used as configured.
sub _extract_config ($config) {
my $defaults = DEFAULTS();
# Defined-or merge, restricted to the keys DEFAULTS() knows about.
my %c = map { $_ => $config->{$_} // $defaults->{$_} } keys %$defaults;
return {
threshold => _resolve_threshold($config->{dynamic_job_limit_load_threshold}, 0.85),
critical => _resolve_threshold($config->{dynamic_job_limit_load_critical}, 1.5),
step => $config->{dynamic_job_limit_step},
min => $config->{dynamic_job_limit_min},
max => $config->{max_running_jobs},
interval => $config->{dynamic_job_limit_interval},
# Thresholds are resolved from the configured value and its factor;
# presumably 0 triggers auto-detection (see _resolve_threshold).
threshold =>
_resolve_threshold($c{dynamic_job_limit_load_threshold}, $c{dynamic_job_limit_load_threshold_factor}),
critical => _resolve_threshold($c{dynamic_job_limit_load_critical}, $c{dynamic_job_limit_load_critical_factor}),
step => $c{dynamic_job_limit_step},
min => $c{dynamic_job_limit_min},
max => $c{max_running_jobs},
interval => $c{dynamic_job_limit_interval},
# Fractions of the resolved threshold, passed through unchanged.
scale_up_hysteresis => $c{dynamic_job_limit_scale_up_hysteresis},
fast_ramp_up_load_factor => $c{dynamic_job_limit_fast_ramp_up_load_factor},
};
}


# Adjusts effective_limit based on current load and resolved params, returns the new value.
# Caller must ensure effective_limit is initialised before calling.
# $load: array reference of three load averages (presumably the 1, 5 and 15
#        minute values, in that order).
# $p:    hash reference with the keys threshold, critical, step, min, max,
#        scale_up_hysteresis and fast_ramp_up_load_factor.
# The first matching rule wins: emergency cutback, conservative decrease,
# increase; otherwise the limit is left unchanged.
sub _adjust ($self, $load, $p) {
my $current = $self->effective_limit;
my ($l1, $l5, $l15) = @$load;
# Highest of the three averages: comparing it against a bound checks all of them at once.
my $load_max = max($l1, $l5, $l15);

my $new;
if (max($l1, $l5, $l15) > $p->{critical}) {
if ($load_max > $p->{critical}) {
# Emergency: cut back aggressively
# The result never drops below the configured minimum.
$new = max($p->{min}, $current - $p->{step} * EMERGENCY_STEP_MULTIPLIER);
}
elsif ($l1 > $p->{threshold} && $l1 > $l5) {
# Load rising above threshold: decrease conservatively
$new = max($p->{min}, $current - $p->{step});
}
elsif (max($l1, $l5, $l15) < $p->{threshold} * SCALE_UP_HYSTERESIS) {
elsif ($load_max < $p->{threshold} * $p->{scale_up_hysteresis}) {
# Load well below threshold on all horizons: increase conservatively
$new = $p->{max} >= 0 ? min($p->{max}, $current + $p->{step}) : $current + $p->{step};
# Double step for fast ramp-up if load is very low
my $step = $load_max < $p->{threshold} * $p->{fast_ramp_up_load_factor} ? $p->{step} * 2 : $p->{step};
# A negative max means no upper bound; otherwise the increase is capped at max.
$new = $p->{max} >= 0 ? min($p->{max}, $current + $step) : $current + $step;
}
else {
$new = $current;
}

# Store and log the result even when the limit did not change.
$self->effective_limit($new);
log_debug(sprintf 'Dynamic job limit: %d (load: %.2f/%.2f/%.2f, threshold: %.2f, critical: %.2f)',
$new, $l1, $l5, $l15, $p->{threshold}, $p->{critical});
log_debug(sprintf 'Dynamic job limit: %d (min: %d, max: %d, load: %.2f/%.2f/%.2f, threshold: %.2f, critical: %.2f)',
$new, $p->{min}, $p->{max}, $l1, $l5, $l15, $p->{threshold}, $p->{critical});
return $new;
}

Expand Down
8 changes: 2 additions & 6 deletions lib/OpenQA/Setup.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use List::Util qw(any max);
use OpenQA::Constants qw(DEFAULT_WORKER_TIMEOUT MAX_TIMER);
use OpenQA::JobGroupDefaults;
use OpenQA::Jobs::Constants qw(OK_RESULTS);
use OpenQA::Scheduler::DynamicLimit;
use OpenQA::Task::Job::Limit;
use Feature::Compat::Try;

Expand Down Expand Up @@ -152,13 +153,8 @@ sub default_config () {
},
scheduler => {
max_job_scheduled_time => 7,
max_running_jobs => -1,
dynamic_job_limit_enabled => 0,
dynamic_job_limit_min => 50,
dynamic_job_limit_load_threshold => 0,
dynamic_job_limit_load_critical => 0,
dynamic_job_limit_step => 10,
dynamic_job_limit_interval => 60,
%{OpenQA::Scheduler::DynamicLimit->DEFAULTS},
},
logging => {
level => undef,
Expand Down
15 changes: 10 additions & 5 deletions lib/OpenQA/WebAPI/Controller/Test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,19 @@ sub list_running_ajax ($self) {
$job_data;
} @jobs;
my %response = (data => \@running);
my $config = OpenQA::App->singleton->config->{scheduler};
my $app = OpenQA::App->singleton;
my $config = $app->config->{scheduler};
my $max_running = $config->{max_running_jobs};
my $global_running
= $self->schema->resultset('Jobs')->count({state => [OpenQA::Jobs::Constants::EXECUTION_STATES]});
if ($config->{dynamic_job_limit_enabled}) {
my $effective = OpenQA::App->singleton->dynamic_limit->current_limit($config);
$response{dynamic_job_limit} = $effective if $effective >= 0 && @running >= $effective;
$response{max_running_jobs} = $max_running if $max_running >= 0;
my $effective = $app->dynamic_limit->current_limit($config);
if ($effective >= 0 && $global_running >= $effective) {
$response{dynamic_job_limit} = $effective;
$response{max_running_jobs} = $max_running if $max_running >= 0;
}
}
elsif ($max_running >= 0 && @running >= $max_running) {
elsif ($max_running >= 0 && $global_running >= $max_running) {
$response{max_running_jobs} = $max_running;
}
$self->render(json => \%response);
Expand Down
Loading
Loading