@@ -27,6 +27,8 @@ public class PoolManager : BackgroundService
2727 . CreateGauge ( "github_autoscaler_csp_runners" , "Number of runners currently on the CSP" , labelNames : [ "csp" ] ) ;
2828 private static readonly Gauge StuckJobsCount = Metrics
2929 . CreateGauge ( "github_autoscaler_job_stuck" , "Number of jobs not picked up after 15min" ) ;
30+ private static readonly Gauge ThrottledJobsCount = Metrics
31+ . CreateGauge ( "github_autoscaler_job_throttled" , "Number of jobs waiting due to runner quota limit" ) ;
3032 private static readonly Gauge QueuedJobsCount = Metrics
3133 . CreateGauge ( "github_autoscaler_job_queued" , "Total Number of jobs queued" ) ;
3234 private static readonly Gauge CompletedJobsCount = Metrics
@@ -334,11 +336,17 @@ private async Task ProcessStats(List<GithubTargetConfiguration> targetConfig)
334336 // Grab job state counts
335337 var db = new ActionsRunnerContext ( ) ;
336338 var stuckTime = DateTime . UtcNow - TimeSpan . FromMinutes ( 15 ) ;
339+
340+ // Count stuck jobs (queued for >15min, excluding throttled jobs)
337341 var stuckJobs = await db . Jobs . CountAsync ( x => x . State == JobState . Queued && x . RunnerId == null && x . QueueTime < stuckTime ) ;
338342 StuckJobsCount . Set ( stuckJobs ) ;
339343
344+ // Count throttled jobs
345+ var throttledJobs = await db . Jobs . CountAsync ( x => x . State == JobState . Throttled ) ;
346+ ThrottledJobsCount . Set ( throttledJobs ) ;
347+
340348 var jobsByState = await db . Jobs . GroupBy ( x => x . State ) . Select ( x => new { x . Key , Count = x . Count ( ) } ) . ToListAsync ( ) ;
341-
349+
342350 QueuedJobsCount . Set ( jobsByState . FirstOrDefault ( x => x . Key == JobState . Queued ) ? . Count ?? 0 ) ;
343351 CompletedJobsCount . Set ( jobsByState . FirstOrDefault ( x => x . Key == JobState . Completed ) ? . Count ?? 0 ) ;
344352 InProgressJobsCount . Set ( jobsByState . FirstOrDefault ( x => x . Key == JobState . InProgress ) ? . Count ?? 0 ) ;
@@ -388,6 +396,40 @@ private async Task ProcessStats(List<GithubTargetConfiguration> targetConfig)
388396 }
389397 }
390398
/// <summary>
/// Determines whether the given owner has used up its configured runner quota.
/// </summary>
/// <param name="owner">Target configuration whose <c>RunnerQuota</c> and <c>Name</c> are evaluated.</param>
/// <param name="db">Database context used to look up the owner's runners.</param>
/// <returns>
/// <c>true</c> when the number of runners counted as active is greater than or equal to the quota;
/// <c>false</c> when no quota is configured or the count is still below it.
/// </returns>
private async Task<bool> IsQuotaReached(GithubTargetConfiguration owner, ActionsRunnerContext db)
{
    // An unset quota means the owner is not limited at all.
    if (owner.RunnerQuota is null)
    {
        return false;
    }

    // Load every runner of this owner together with its lifecycle entries.
    // NOTE(review): LastState is evaluated in memory below; presumably it is derived
    // from Lifecycle and cannot be translated to SQL - confirm before moving the
    // filter into the query.
    var ownedRunners = await db.Runners
        .Include(runner => runner.Lifecycle)
        .Where(runner => runner.Owner == owner.Name)
        .ToListAsync();

    // A runner counts against the quota while its last state sorts before
    // DeletionQueued in the RunnerStatus enum. The original author's note lists these
    // as CreationQueued, Created, Provisioned and Processing, with DeletionQueued,
    // Deleted, Failure, VanishedOnCloud, Cleanup and Cancelled excluded - verify
    // against the enum declaration.
    int activeRunners = 0;
    foreach (var runner in ownedRunners)
    {
        if (runner.LastState < RunnerStatus.DeletionQueued)
        {
            activeRunners++;
        }
    }

    var limit = owner.RunnerQuota.Value;
    if (activeRunners < limit)
    {
        return false;
    }

    _logger.LogWarning($"Runner quota reached for {owner.Name}: {activeRunners}/{limit} (includes queued/provisioning runners)");
    return true;
}
432+
391433 private async Task StartPoolRunners ( List < GithubTargetConfiguration > targetConfig )
392434 {
393435 // Start pool runners
@@ -396,19 +438,33 @@ private async Task StartPoolRunners(List<GithubTargetConfiguration> targetConfig
396438 {
397439 _logger . LogInformation ( $ "Checking pool runners for { owner . Name } ") ;
398440
441+ // Check if quota is reached for this owner
442+ if ( await IsQuotaReached ( owner , db ) )
443+ {
444+ _logger . LogWarning ( $ "Skipping pool runner creation for { owner . Name } - quota reached") ;
445+ continue ;
446+ }
447+
399448 List < Runner > existingRunners = await db . Runners . Where ( x => x . Owner == owner . Name && x . IsOnline ) . ToListAsync ( ) ;
400-
449+
401450 foreach ( Pool pool in owner . Pools )
402451 {
403452 int existCt = existingRunners . Count ( x => x . Size == pool . Size ) ;
404453 int missingCt = pool . NumRunners - existCt ;
405454
406455 string arch = Program . Config . Sizes . FirstOrDefault ( x => x . Name == pool . Size ) ? . Arch ;
407-
456+
408457 _logger . LogInformation ( $ "Checking pool { pool . Size } [{ arch } ]: Existing={ existCt } Requested={ pool . NumRunners } Missing={ missingCt } ") ;
409-
458+
410459 for ( int i = 0 ; i < missingCt ; i ++ )
411460 {
461+ // Check quota again before each runner creation
462+ if ( await IsQuotaReached ( owner , db ) )
463+ {
464+ _logger . LogWarning ( $ "Quota reached while creating pool runners for { owner . Name } - stopping at { i } /{ missingCt } ") ;
465+ break ;
466+ }
467+
412468 // Queue VM creation
413469 var profile = pool . Profile ?? "default" ;
414470 Runner newRunner = new ( )
@@ -451,25 +507,56 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
451507 {
452508 var db = new ActionsRunnerContext ( ) ;
453509 var stuckTime = DateTime . UtcNow - TimeSpan . FromMinutes ( 10 ) ;
454- var stuckJobs = await db . Jobs . Where ( x => x . State == JobState . Queued && x . RunnerId == null && x . QueueTime < stuckTime ) . ToListAsync ( ) ;
510+
511+ // Check both Queued and Throttled jobs that have been waiting >10min without a runner
512+ var stuckJobs = await db . Jobs
513+ . Where ( x => ( x . State == JobState . Queued || x . State == JobState . Throttled ) && x . RunnerId == null && x . QueueTime < stuckTime )
514+ . ToListAsync ( ) ;
515+
455516 foreach ( var stuckJob in stuckJobs )
456517 {
457- _logger . LogWarning ( $ "Found stuck Job: { stuckJob . JobId } in { stuckJob . Repository } . Starting new runner to compensate...") ;
458-
459518 var owner = targetConfig . FirstOrDefault ( x => x . Name == stuckJob . Owner ) ;
460519 if ( owner == null )
461520 {
462521 _logger . LogError ( $ "Unable to get owner for stuck job. { stuckJob . JobId } ") ;
463522 continue ;
464523 }
465-
524+
525+ // Check if quota is reached
526+ bool quotaReached = await IsQuotaReached ( owner , db ) ;
527+
528+ if ( quotaReached )
529+ {
530+ // Mark job as Throttled if it isn't already
531+ if ( stuckJob . State != JobState . Throttled )
532+ {
533+ _logger . LogInformation ( $ "Job { stuckJob . JobId } in { stuckJob . Repository } is throttled due to runner quota limit.") ;
534+ stuckJob . State = JobState . Throttled ;
535+ await db . SaveChangesAsync ( ) ;
536+ }
537+ continue ;
538+ }
539+ else
540+ {
541+ // Quota is available - if job was throttled, move it back to queued
542+ if ( stuckJob . State == JobState . Throttled )
543+ {
544+ _logger . LogInformation ( $ "Quota now available for throttled job { stuckJob . JobId } . Moving to queued state.") ;
545+ stuckJob . State = JobState . Queued ;
546+ await db . SaveChangesAsync ( ) ;
547+ }
548+ }
549+
550+ // Job is genuinely stuck (not due to quota) - create replacement runner
551+ _logger . LogWarning ( $ "Found stuck Job: { stuckJob . JobId } in { stuckJob . Repository } . Starting new runner to compensate...") ;
552+
466553 // Check if there is already a runner in queue to unstuck
467554 if ( _queues . CreateTasks . Any ( x => x . IsStuckReplacement && x . StuckJobId == stuckJob . JobId ) )
468555 {
469556 _logger . LogWarning ( $ "Creating queue already has a task for jobs { stuckJob . JobId } ") ;
470557 continue ;
471558 }
472-
559+
473560 int replacementsInQueue = _queues . CreateTasks . Count ( x => x . IsStuckReplacement ) ;
474561 if ( replacementsInQueue > 25 )
475562 {
0 commit comments