Skip to content

Commit a7d3f72

Browse files
add quotaa
1 parent 10de748 commit a7d3f72

File tree

4 files changed

+154
-12
lines changed

4 files changed

+154
-12
lines changed

Database/JobState.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ public enum JobState
77
InProgress = 2,
88
Completed = 3,
99
Vanished = 4,
10-
Cancelled = 5
10+
Cancelled = 5,
11+
Throttled = 6 // Job is queued but waiting due to runner quota limit
1112
}

GitHub/GithubTargetConfiguration.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@ public class GithubTargetConfiguration
77
{
88
[JsonConverter(typeof(EnvironmentAwareJsonConverter<string>))]
99
public string Name { get; set; }
10-
10+
1111
[JsonConverter(typeof(EnvironmentAwareJsonConverter<string>))]
1212
public string GitHubToken { get; set; }
13-
13+
1414
public List<Pool> Pools { get; set; }
1515
public TargetType Target { get; set; }
16+
17+
/// <summary>
18+
/// Maximum number of runners allowed for this org/repo. If null or not set, unlimited runners are allowed.
19+
/// </summary>
20+
public int? RunnerQuota { get; set; }
1621
}

PoolManager.cs

Lines changed: 96 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ public class PoolManager : BackgroundService
2727
.CreateGauge("github_autoscaler_csp_runners", "Number of runners currently on the CSP", labelNames: ["csp"]);
2828
private static readonly Gauge StuckJobsCount = Metrics
2929
.CreateGauge("github_autoscaler_job_stuck", "Number of jobs not picked up after 15min");
30+
private static readonly Gauge ThrottledJobsCount = Metrics
31+
.CreateGauge("github_autoscaler_job_throttled", "Number of jobs waiting due to runner quota limit");
3032
private static readonly Gauge QueuedJobsCount = Metrics
3133
.CreateGauge("github_autoscaler_job_queued", "Total Number of jobs queued");
3234
private static readonly Gauge CompletedJobsCount = Metrics
@@ -334,11 +336,17 @@ private async Task ProcessStats(List<GithubTargetConfiguration> targetConfig)
334336
// Grab job state counts
335337
var db = new ActionsRunnerContext();
336338
var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(15);
339+
340+
// Count stuck jobs (queued for >15min, excluding throttled jobs)
337341
var stuckJobs = await db.Jobs.CountAsync(x => x.State == JobState.Queued && x.RunnerId == null && x.QueueTime < stuckTime);
338342
StuckJobsCount.Set(stuckJobs);
339343

344+
// Count throttled jobs
345+
var throttledJobs = await db.Jobs.CountAsync(x => x.State == JobState.Throttled);
346+
ThrottledJobsCount.Set(throttledJobs);
347+
340348
var jobsByState = await db.Jobs.GroupBy(x => x.State).Select(x => new { x.Key, Count = x.Count() }).ToListAsync();
341-
349+
342350
QueuedJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.Queued)?.Count ?? 0);
343351
CompletedJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.Completed)?.Count ?? 0);
344352
InProgressJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.InProgress)?.Count ?? 0);
@@ -388,6 +396,40 @@ private async Task ProcessStats(List<GithubTargetConfiguration> targetConfig)
388396
}
389397
}
390398

399+
/// <summary>
/// Determines whether the runner quota configured for <paramref name="owner"/> is already used up.
/// </summary>
/// <remarks>
/// A runner counts against the quota while its <c>LastState</c> compares lower than
/// <see cref="RunnerStatus.DeletionQueued"/>.
/// NOTE(review): this relies on the declaration order of <c>RunnerStatus</c> (every state that still
/// consumes resources must sort before <c>DeletionQueued</c>) - confirm against the enum.
/// The state filter is evaluated in memory after loading the owner's runners together with their
/// lifecycle entries; presumably <c>LastState</c> is derived from <c>Lifecycle</c> and cannot be
/// translated to SQL - verify before moving the filter into the query.
/// </remarks>
/// <param name="owner">The GitHub target configuration; its <c>Name</c> selects the runners and its <c>RunnerQuota</c> is the limit.</param>
/// <param name="db">Database context used to read the runners.</param>
/// <returns>
/// True if the number of counted runners is greater than or equal to the quota and no more runners
/// should be created; false otherwise, including when no quota is configured.
/// </returns>
private async Task<bool> IsQuotaReached(GithubTargetConfiguration owner, ActionsRunnerContext db)
{
    // No quota configured means unlimited runners are allowed.
    if (owner.RunnerQuota is not int quota)
    {
        return false;
    }

    // This is a read-only check that callers invoke repeatedly inside loops on a context they
    // also save through. AsNoTracking keeps the loaded runners and lifecycle rows out of that
    // context's change tracker, so repeated calls do not accumulate tracked entities.
    var ownerRunners = await db.Runners
        .AsNoTracking()
        .Include(x => x.Lifecycle)
        .Where(x => x.Owner == owner.Name)
        .ToListAsync();

    // Count only runners that have not yet been queued for deletion (or moved to a later state).
    int activeRunnerCount = ownerRunners.Count(x => x.LastState < RunnerStatus.DeletionQueued);

    if (activeRunnerCount < quota)
    {
        return false;
    }

    _logger.LogWarning($"Runner quota reached for {owner.Name}: {activeRunnerCount}/{quota} (includes queued/provisioning runners)");
    return true;
}
432+
391433
private async Task StartPoolRunners(List<GithubTargetConfiguration> targetConfig)
392434
{
393435
// Start pool runners
@@ -396,19 +438,33 @@ private async Task StartPoolRunners(List<GithubTargetConfiguration> targetConfig
396438
{
397439
_logger.LogInformation($"Checking pool runners for {owner.Name}");
398440

441+
// Check if quota is reached for this owner
442+
if (await IsQuotaReached(owner, db))
443+
{
444+
_logger.LogWarning($"Skipping pool runner creation for {owner.Name} - quota reached");
445+
continue;
446+
}
447+
399448
List<Runner> existingRunners = await db.Runners.Where(x => x.Owner == owner.Name && x.IsOnline).ToListAsync();
400-
449+
401450
foreach (Pool pool in owner.Pools)
402451
{
403452
int existCt = existingRunners.Count(x => x.Size == pool.Size);
404453
int missingCt = pool.NumRunners - existCt;
405454

406455
string arch = Program.Config.Sizes.FirstOrDefault(x => x.Name == pool.Size)?.Arch;
407-
456+
408457
_logger.LogInformation($"Checking pool {pool.Size} [{arch}]: Existing={existCt} Requested={pool.NumRunners} Missing={missingCt}");
409-
458+
410459
for (int i = 0; i < missingCt; i++)
411460
{
461+
// Check quota again before each runner creation
462+
if (await IsQuotaReached(owner, db))
463+
{
464+
_logger.LogWarning($"Quota reached while creating pool runners for {owner.Name} - stopping at {i}/{missingCt}");
465+
break;
466+
}
467+
412468
// Queue VM creation
413469
var profile = pool.Profile ?? "default";
414470
Runner newRunner = new()
@@ -451,25 +507,56 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
451507
{
452508
var db = new ActionsRunnerContext();
453509
var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(10);
454-
var stuckJobs = await db.Jobs.Where(x => x.State == JobState.Queued && x.RunnerId == null && x.QueueTime < stuckTime).ToListAsync();
510+
511+
// Check both Queued and Throttled jobs that have been waiting >10min without a runner
512+
var stuckJobs = await db.Jobs
513+
.Where(x => (x.State == JobState.Queued || x.State == JobState.Throttled) && x.RunnerId == null && x.QueueTime < stuckTime)
514+
.ToListAsync();
515+
455516
foreach (var stuckJob in stuckJobs)
456517
{
457-
_logger.LogWarning($"Found stuck Job: {stuckJob.JobId} in {stuckJob.Repository}. Starting new runner to compensate...");
458-
459518
var owner = targetConfig.FirstOrDefault(x => x.Name == stuckJob.Owner);
460519
if (owner == null)
461520
{
462521
_logger.LogError($"Unable to get owner for stuck job. {stuckJob.JobId}");
463522
continue;
464523
}
465-
524+
525+
// Check if quota is reached
526+
bool quotaReached = await IsQuotaReached(owner, db);
527+
528+
if (quotaReached)
529+
{
530+
// Mark job as Throttled if it isn't already
531+
if (stuckJob.State != JobState.Throttled)
532+
{
533+
_logger.LogInformation($"Job {stuckJob.JobId} in {stuckJob.Repository} is throttled due to runner quota limit.");
534+
stuckJob.State = JobState.Throttled;
535+
await db.SaveChangesAsync();
536+
}
537+
continue;
538+
}
539+
else
540+
{
541+
// Quota is available - if job was throttled, move it back to queued
542+
if (stuckJob.State == JobState.Throttled)
543+
{
544+
_logger.LogInformation($"Quota now available for throttled job {stuckJob.JobId}. Moving to queued state.");
545+
stuckJob.State = JobState.Queued;
546+
await db.SaveChangesAsync();
547+
}
548+
}
549+
550+
// Job is genuinely stuck (not due to quota) - create replacement runner
551+
_logger.LogWarning($"Found stuck Job: {stuckJob.JobId} in {stuckJob.Repository}. Starting new runner to compensate...");
552+
466553
// Check if there is already a runner in queue to unstuck
467554
if (_queues.CreateTasks.Any(x => x.IsStuckReplacement && x.StuckJobId == stuckJob.JobId))
468555
{
469556
_logger.LogWarning($"Creating queue already has a task for jobs {stuckJob.JobId}");
470557
continue;
471558
}
472-
559+
473560
int replacementsInQueue = _queues.CreateTasks.Count(x => x.IsStuckReplacement);
474561
if (replacementsInQueue > 25)
475562
{

Program.cs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,8 +702,57 @@ private static async Task JobQueued(ILogger<Program> logger, string repoName, Li
702702
TargetType.Repository => repoName,
703703
_ => throw new ArgumentOutOfRangeException(nameof(targetType), targetType, null)
704704
};
705+
706+
// Get the target configuration to check quota
707+
var targetConfig = targetType switch
708+
{
709+
TargetType.Organization => Config.TargetConfigs.FirstOrDefault(x => x.Name == orgName && x.Target == TargetType.Organization),
710+
TargetType.Repository => Config.TargetConfigs.FirstOrDefault(x => x.Name == repoName && x.Target == TargetType.Repository),
711+
_ => throw new ArgumentOutOfRangeException(nameof(targetType), targetType, null)
712+
};
713+
705714
// Record runner to database
706715
await using var db = new ActionsRunnerContext();
716+
717+
// Check if quota is reached before creating runner
718+
if (targetConfig?.RunnerQuota.HasValue == true)
719+
{
720+
// Count all runners that are actively consuming resources (not deleted/failed/cancelled)
721+
// This includes: CreationQueued, Created, Provisioned, Processing (states 1-4)
722+
var runners = await db.Runners
723+
.Include(x => x.Lifecycle)
724+
.Where(x => x.Owner == owner)
725+
.ToListAsync();
726+
727+
int currentRunnerCount = runners.Count(x => x.LastState < RunnerStatus.DeletionQueued);
728+
729+
if (currentRunnerCount >= targetConfig.RunnerQuota.Value)
730+
{
731+
logger.LogWarning($"Runner quota reached for {owner}: {currentRunnerCount}/{targetConfig.RunnerQuota.Value} (includes queued/provisioning runners). Job will be throttled.");
732+
733+
// Record the job in the database as Throttled
734+
if (jobId > 0)
735+
{
736+
Job throttledJob = new()
737+
{
738+
GithubJobId = jobId,
739+
Repository = repoName,
740+
Owner = owner,
741+
State = JobState.Throttled,
742+
QueueTime = DateTime.UtcNow,
743+
JobUrl = jobUrl,
744+
Orphan = false,
745+
RequestedProfile = profileName,
746+
RequestedSize = size
747+
};
748+
await db.Jobs.AddAsync(throttledJob);
749+
await db.SaveChangesAsync();
750+
}
751+
752+
return;
753+
}
754+
}
755+
707756
if (jobId > 0)
708757
{
709758
Job queuedJob = new()

0 commit comments

Comments
 (0)