From 6415a61b449eefed783d08cf1aee8172934f14aa Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 1 Nov 2024 16:48:15 +0100 Subject: [PATCH 01/28] extracted resource watching logic into a separate class; implemented logic for observer in a RecourceWatcher class; added method to stop a thread gracefully --- scrapyd_k8s/k8s_resource_watcher.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scrapyd_k8s/k8s_resource_watcher.py b/scrapyd_k8s/k8s_resource_watcher.py index 7e9991e..1567806 100644 --- a/scrapyd_k8s/k8s_resource_watcher.py +++ b/scrapyd_k8s/k8s_resource_watcher.py @@ -4,7 +4,6 @@ from kubernetes import client, watch from typing import Callable, List import urllib3 - logger = logging.getLogger(__name__) class ResourceWatcher: @@ -51,7 +50,6 @@ def subscribe(self, callback: Callable): if callback not in self.subscribers: self.subscribers.append(callback) logger.debug(f"Subscriber {callback.__name__} added.") - def unsubscribe(self, callback: Callable): """ Removes a subscriber callback. @@ -65,7 +63,6 @@ def unsubscribe(self, callback: Callable): if callback in self.subscribers: self.subscribers.remove(callback) logger.debug(f"Subscriber {callback.__name__} removed.") - def notify_subscribers(self, event: dict): """ Notifies all subscribers about an event. @@ -141,7 +138,6 @@ def watch_pods(self): time.sleep(backoff_time) backoff_time = min(backoff_time*self.backoff_coefficient, 900) - def stop(self): """ Stops the watcher thread gracefully. 
From 3064c3a9dd9643a812449da7fad36761325952d7 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 6 Nov 2024 09:04:25 +0100 Subject: [PATCH 02/28] add kubernetes scheduler class that subscribes to the event observer that handles the logic to unsuspend jobs and get the next in order according to the creation timestamp; modify schedule endpoint to start jobs suspended if there is already enogh jobs running; modify corresponding function in k8s launcher; add to k8s launcher methods to unsuspend job, to get current number of running jobs, to list suspended jobs and a private method to get job name to be used for unsuspend function --- kubernetes.yaml | 2 +- scrapyd_k8s.sample-k8s.conf | 2 +- scrapyd_k8s/api.py | 15 ++++- scrapyd_k8s/k8s_resource_watcher.py | 3 +- scrapyd_k8s/k8s_scheduler.py | 97 +++++++++++++++++++++++++++++ scrapyd_k8s/launcher/k8s.py | 72 ++++++++++++++++++++- 6 files changed, 184 insertions(+), 7 deletions(-) create mode 100644 scrapyd_k8s/k8s_scheduler.py diff --git a/kubernetes.yaml b/kubernetes.yaml index e430b91..213ada9 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -176,7 +176,7 @@ rules: verbs: ["get"] - apiGroups: ["batch"] resources: ["jobs"] - verbs: ["get", "list", "create", "delete"] + verbs: ["get", "list", "create", "patch", "update", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index 9ab3536..835b82a 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -20,7 +20,7 @@ namespace = default #pull_secret = ghcr-registry # Maximum number of jobs running in parallel -max_proc = 10 +max_proc = 1 # For each project, define a project section. # This contains a repository that points to the remote container repository. 
diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 98207ed..8ffbc90 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -5,11 +5,20 @@ from flask_basicauth import BasicAuth from natsort import natsort_keygen, ns +from .k8s_resource_watcher import ResourceWatcher +from .k8s_scheduler import KubernetesScheduler from .config import Config config = Config() app = Flask(__name__) +# Initialize ResourceWatcher and KubernetesScheduler +namespace = config.namespace() +resource_watcher = ResourceWatcher(namespace) +max_proc = int(scrapyd_config.get('max_proc', 4)) +logging.debug(f"MAX PROC IS SET TO: {max_proc}") +k8s_scheduler = KubernetesScheduler(config, launcher, resource_watcher, max_proc) + @app.get("/") def home(): return "

scrapyd-k8s

" @@ -46,8 +55,12 @@ def api_schedule(): _version = request.form.get('_version', 'latest') # TODO allow customizing latest tag # any other parameter is passed as spider argument args = { k: v for k, v in request.form.items() if k not in ('project', 'spider', 'setting', 'jobid', 'priority', '_version') } + running_jobs = launcher.get_running_jobs_count() + start_suspended = running_jobs >= k8s_scheduler.max_proc + logger.info( + f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {k8s_scheduler.max_proc}") env_config, env_secret = project.env_config(), project.env_secret() - jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args) + jobid = config.launcher.schedule(project, _version, spider, job_id, settings, args, start_suspended=start_suspended) return { 'status': 'ok', 'jobid': job_id } @app.post("/cancel.json") diff --git a/scrapyd_k8s/k8s_resource_watcher.py b/scrapyd_k8s/k8s_resource_watcher.py index 1567806..60c51f1 100644 --- a/scrapyd_k8s/k8s_resource_watcher.py +++ b/scrapyd_k8s/k8s_resource_watcher.py @@ -18,8 +18,7 @@ class ResourceWatcher: List of subscriber callback functions to notify on events. """ - def __init__(self, namespace, config): - """ + def __init__(self, namespace, config): """ Initializes the ResourceWatcher. Parameters diff --git a/scrapyd_k8s/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler.py new file mode 100644 index 0000000..9256360 --- /dev/null +++ b/scrapyd_k8s/k8s_scheduler.py @@ -0,0 +1,97 @@ +import logging + +logger = logging.getLogger(__name__) + +class KubernetesScheduler: + """ + Manages scheduling of Kubernetes jobs to limit the number of concurrently running jobs. + """ + + def __init__(self, config, launcher, resource_watcher, max_proc): + """ + Initializes the KubernetesScheduler. + + Parameters + ---------- + config : Config + The configuration object. + launcher : K8s + The launcher instance with initialized Kubernetes clients. 
+ resource_watcher : ResourceWatcher + The resource watcher to subscribe to pod events. + max_proc : int + Maximum number of concurrent jobs. + """ + self.config = config + self.launcher = launcher + self.max_proc = max_proc + self.namespace = config.scrapyd().get('namespace', 'default') + + # Subscribe to the ResourceWatcher + resource_watcher.subscribe(self.handle_pod_event) + logger.info(f"KubernetesScheduler initialized with max_proc={self.max_proc}.") + + def handle_pod_event(self, event: dict): + """ + Handles pod events from the ResourceWatcher. + """ + pod = event['object'] + pod_phase = pod.status.phase + pod_name = pod.metadata.name + event_type = event['type'] + + logger.debug(f"KubernetesScheduler received event: {event_type}, pod: {pod_name}, phase: {pod_phase}") + + # Check if this pod is related to our jobs + if not pod.metadata.labels.get(self.launcher.LABEL_JOB_ID): + logger.debug(f"Pod {pod_name} does not have our job label; ignoring.") + return + + # If a pod has terminated (Succeeded or Failed), we may have capacity to unsuspend jobs + if pod_phase in ('Succeeded', 'Failed') and event_type in ('MODIFIED', 'DELETED'): + logger.info(f"Pod {pod_name} has completed with phase {pod_phase}. Checking for suspended jobs.") + self.check_and_unsuspend_jobs() + else: + logger.debug(f"Pod {pod_name} event not relevant for unsuspension.") + + def check_and_unsuspend_jobs(self): + """ + Checks if there is capacity to unsuspend jobs and unsuspends them if possible. + """ + running_jobs_count = self.launcher.get_running_jobs_count() + logger.debug(f"Current running jobs: {running_jobs_count}, max_proc: {self.max_proc}") + + while running_jobs_count < self.max_proc: + job_id = self.get_next_suspended_job_id() + if not job_id: + logger.info("No suspended jobs to unsuspend.") + break + success = self.launcher.unsuspend_job(job_id) + if success: + running_jobs_count += 1 + logger.info(f"Unsuspended job {job_id}. 
Total running jobs now: {running_jobs_count}") + else: + logger.error(f"Failed to unsuspend job {job_id}") + break + + def get_next_suspended_job_id(self): + """ + Retrieves the ID of the next suspended job from Kubernetes, + sorting by creation timestamp to ensure FIFO order. + + Returns + ------- + str or None + The job ID of the next suspended job, or None if none are found. + """ + label_selector = f"{self.launcher.LABEL_JOB_ID}" + jobs = self.launcher.list_suspended_jobs(label_selector=label_selector) + if not jobs: + logger.debug("No suspended jobs found.") + return None + # Sort jobs by creation timestamp to ensure FIFO order + jobs.sort(key=lambda job: job.metadata.creation_timestamp) + job = jobs[0] + job_id = job.metadata.labels.get(self.launcher.LABEL_JOB_ID) + logger.debug(f"Next suspended job to unsuspend: {job_id}") + return job_id diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 00ed30a..9ec9f8a 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -3,6 +3,7 @@ import kubernetes import kubernetes.stream +import logging from signal import Signals from ..k8s_resource_watcher import ResourceWatcher @@ -11,6 +12,8 @@ logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) + class K8s: LABEL_PROJECT = 'org.scrapy.project' @@ -50,7 +53,7 @@ def listjobs(self, project=None): jobs = [self._parse_job(j) for j in jobs.items] return jobs - def schedule(self, project, version, spider, job_id, settings, args): + def schedule(self, project, version, spider, job_id, settings, args, start_suspended=False): job_name = self._k8s_job_name(project.id(), job_id) _settings = [i for k, v in native_stringify_dict(settings, keys_only=False).items() for i in ['-s', f"{k}={v}"]] _args = [i for k, v in native_stringify_dict(args, keys_only=False).items() for i in ['-a', f"{k}={v}"]] @@ -98,7 +101,7 @@ def schedule(self, project, version, spider, job_id, settings, args): ) job_spec = 
kubernetes.client.V1JobSpec( template=pod_template, - # suspend=True, # TODO implement scheduler with suspend + suspend=start_suspended, completions=1, backoff_limit=0 # don't retry (TODO reconsider) ) @@ -145,6 +148,63 @@ def enable_joblogs(self, config): logger.warning("No storage provider configured; job logs will not be uploaded.") + def unsuspend_job(self, job_id: str): + job_name = self._get_job_name(job_id) + if not job_name: + logger.error(f"Cannot unsuspend job {job_id}: job name not found.") + return False + try: + self._k8s_batch.patch_namespaced_job( + name=job_name, + namespace=self._namespace, + body={'spec': {'suspend': False}} + ) + logger.info(f"Job {job_id} unsuspended.") + return True + except Exception as e: + logger.exception(f"Error unsuspending job {job_id}: {e}") + return False + + def get_running_jobs_count(self) -> int: + """ + Returns the number of currently active (unsuspended, not completed, not failed) jobs. + """ + label_selector = f"{self.LABEL_JOB_ID}" + jobs = self._k8s_batch.list_namespaced_job( + namespace=self._namespace, + label_selector=label_selector + ) + + active_jobs = [] + for job in jobs.items: + job_name = job.metadata.name + is_suspended = job.spec.suspend + is_completed = job.status.completion_time is not None + has_failed = job.status.failed is not None and job.status.failed > 0 + logger.debug(f"Job {job_name}: suspended={is_suspended}, completed={is_completed}, failed={has_failed}") + + if not is_suspended and not is_completed and not has_failed: + active_jobs.append(job) + + logger.debug(f"Active jobs: {[job.metadata.name for job in active_jobs]}") + logger.debug(f"Found {len(active_jobs)} active jobs.") + + return len(active_jobs) + + def list_suspended_jobs(self, label_selector: str = ""): + try: + jobs = self._k8s_batch.list_namespaced_job( + namespace=self._namespace, + label_selector=label_selector + ) + # Filter jobs where spec.suspend == True + suspended_jobs = [job for job in jobs.items if job.spec.suspend] 
+ logger.debug(f"Found {len(suspended_jobs)} suspended jobs.") + return suspended_jobs + except Exception as e: + logger.exception(f"Error listing suspended jobs: {e}") + return [] + def _parse_job(self, job): state = self._k8s_job_to_scrapyd_status(job) return { @@ -169,6 +229,14 @@ def _get_job(self, project, job_id): return job + def _get_job_name(self, job_id: str): + label_selector = f"{self.LABEL_JOB_ID}={job_id}" + jobs = self._k8s_batch.list_namespaced_job(namespace=self._namespace, label_selector=label_selector) + if not jobs.items: + logger.error(f"No job found with job_id={job_id}") + return None + return jobs.items[0].metadata.name + def _get_pod(self, project, job_id): label = self.LABEL_JOB_ID + '=' + job_id r = self._k8s.list_namespaced_pod(namespace=self._namespace, label_selector=label) From 7bee3ac349a2fb29a934d29334b33de22296570b Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Thu, 7 Nov 2024 17:48:30 +0100 Subject: [PATCH 03/28] finish the observer logic extraction; merge changes from main; add resource watcher instance to enable_joblogs to subscribe to the event watcher if the log feature is configured; delete logic about event watcher from main; pass container for list objects function instead of container name; remove start methon from log handler class; modify joblogs init to subscribe to event watcher --- scrapyd_k8s/api.py | 1 - scrapyd_k8s/k8s_scheduler.py | 1 - scrapyd_k8s/launcher/k8s.py | 1 - 3 files changed, 3 deletions(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 8ffbc90..93f734a 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -170,6 +170,5 @@ def run(): config_password = scrapyd_config.get('password') if config_username is not None and config_password is not None: enable_authentication(app, config_username, config_password) - # run server app.run(host=host, port=port) diff --git a/scrapyd_k8s/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler.py index 9256360..1cb13e4 100644 --- 
a/scrapyd_k8s/k8s_scheduler.py +++ b/scrapyd_k8s/k8s_scheduler.py @@ -27,7 +27,6 @@ def __init__(self, config, launcher, resource_watcher, max_proc): self.max_proc = max_proc self.namespace = config.scrapyd().get('namespace', 'default') - # Subscribe to the ResourceWatcher resource_watcher.subscribe(self.handle_pod_event) logger.info(f"KubernetesScheduler initialized with max_proc={self.max_proc}.") diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 9ec9f8a..91328fd 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -147,7 +147,6 @@ def enable_joblogs(self, config): else: logger.warning("No storage provider configured; job logs will not be uploaded.") - def unsuspend_job(self, job_id: str): job_name = self._get_job_name(job_id) if not job_name: From 7c39fc84be3cad2cf26a99f2b02c3043b662b492 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Thu, 7 Nov 2024 18:29:23 +0100 Subject: [PATCH 04/28] change enable_joblogs method signature in docker --- scrapyd_k8s/launcher/docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index e6f919e..40042f8 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -68,7 +68,7 @@ def cancel(self, project_id, job_id, signal): c.kill(signal='SIG' + signal) return prevstate - def enable_joblogs(self, config): + def enable_joblogs(self, config, resource_watcher): logger.warning("Job logs are not supported when using the Docker launcher.") def _parse_job(self, c): From ec7560009a2c49104abb16e6ff403348a5061e4f Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Mon, 11 Nov 2024 14:50:26 +0100 Subject: [PATCH 05/28] refactor api.py and launcher/k8s.py to keep api.py launcher agnostic --- scrapyd_k8s/api.py | 17 +++++------------ scrapyd_k8s/launcher/docker.py | 8 +++++++- scrapyd_k8s/launcher/k8s.py | 20 ++++++++++++++++---- 3 files changed, 28 insertions(+), 17 
deletions(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 93f734a..5afa45d 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -12,13 +12,6 @@ app = Flask(__name__) -# Initialize ResourceWatcher and KubernetesScheduler -namespace = config.namespace() -resource_watcher = ResourceWatcher(namespace) -max_proc = int(scrapyd_config.get('max_proc', 4)) -logging.debug(f"MAX PROC IS SET TO: {max_proc}") -k8s_scheduler = KubernetesScheduler(config, launcher, resource_watcher, max_proc) - @app.get("/") def home(): return "

scrapyd-k8s

" @@ -55,12 +48,12 @@ def api_schedule(): _version = request.form.get('_version', 'latest') # TODO allow customizing latest tag # any other parameter is passed as spider argument args = { k: v for k, v in request.form.items() if k not in ('project', 'spider', 'setting', 'jobid', 'priority', '_version') } - running_jobs = launcher.get_running_jobs_count() - start_suspended = running_jobs >= k8s_scheduler.max_proc - logger.info( - f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {k8s_scheduler.max_proc}") + # running_jobs = launcher.get_running_jobs_count() + # start_suspended = running_jobs >= k8s_scheduler.max_proc + # logger.info( + # f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {k8s_scheduler.max_proc}") env_config, env_secret = project.env_config(), project.env_secret() - jobid = config.launcher.schedule(project, _version, spider, job_id, settings, args, start_suspended=start_suspended) + jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args, start_suspended=start_suspended) return { 'status': 'ok', 'jobid': job_id } @app.post("/cancel.json") diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 40042f8..177482b 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -41,7 +41,7 @@ def schedule(self, project, version, spider, job_id, settings, args): 'SCRAPYD_JOB': job_id, } # TODO env_source handling resources = project.resources(spider) - c = self._docker.containers.run( + c = self._docker.containers.create( image=project.repository() + ':' + version, command=['scrapy', 'crawl', spider, *_args, *_settings], environment=env, @@ -71,6 +71,12 @@ def cancel(self, project_id, job_id, signal): def enable_joblogs(self, config, resource_watcher): logger.warning("Job logs are not supported when using the Docker launcher.") + def get_running_jobs_count(self): + # 
Return the number of running Docker containers matching the job labels + label = self.LABEL_PROJECT + running_jobs = self._docker.containers.list(filters={'label': label, 'status': 'running'}) + return len(running_jobs) + def _parse_job(self, c): state = self._docker_to_scrapyd_status(c.status) return { diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 91328fd..b8fb886 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -6,12 +6,15 @@ import logging from signal import Signals -from ..k8s_resource_watcher import ResourceWatcher -from ..utils import format_datetime_object, native_stringify_dict +from ..utils import format_datetime_object from scrapyd_k8s.joblogs import KubernetesJobLogHandler logger = logging.getLogger(__name__) +from ..utils import native_stringify_dict +from ..k8s_resource_watcher import ResourceWatcher +from ..k8s_scheduler import KubernetesScheduler + logger = logging.getLogger(__name__) class K8s: @@ -31,7 +34,6 @@ def __init__(self, config): self._k8s = kubernetes.client.CoreV1Api() self._k8s_batch = kubernetes.client.BatchV1Api() - self._init_resource_watcher(config) def _init_resource_watcher(self, config): @@ -42,6 +44,12 @@ def _init_resource_watcher(self, config): else: logger.debug("Job logs handling not enabled; 'joblogs' configuration section is missing.") + # Initialize KubernetesScheduler + self.max_proc = int(config.scrapyd().get('max_proc', 4)) + self.k8s_scheduler = KubernetesScheduler(config, self, self.resource_watcher, self.max_proc) + logger.debug(f"KubernetesLauncher initialized with max_proc={self.max_proc}.") + + def get_node_name(self): deployment = os.getenv('MY_DEPLOYMENT_NAME', 'default') namespace = os.getenv('MY_NAMESPACE') @@ -53,7 +61,11 @@ def listjobs(self, project=None): jobs = [self._parse_job(j) for j in jobs.items] return jobs - def schedule(self, project, version, spider, job_id, settings, args, start_suspended=False): + def schedule(self, project, version, 
spider, job_id, settings, args): + running_jobs = self.get_running_jobs_count() + start_suspended = running_jobs >= self.max_proc + logger.info( + f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {self.max_proc}") job_name = self._k8s_job_name(project.id(), job_id) _settings = [i for k, v in native_stringify_dict(settings, keys_only=False).items() for i in ['-s', f"{k}={v}"]] _args = [i for k, v in native_stringify_dict(args, keys_only=False).items() for i in ['-a', f"{k}={v}"]] From 7641ddb15229ddb69bbd3ce72f051d60ac8e0d83 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 12 Nov 2024 10:59:30 +0100 Subject: [PATCH 06/28] add implementation for docker to check the number of running containers and run more from the queue of created jobs when capacity is available; add backgroung thread that sleeps for 5 sec and triggers the function that starts additional containers up to capacity; add a method to gracefully stop the background thread that might be used in the future to stop the thread when app stops; encapsulate k8s and docker related schedule functionality in corresponding launchers and keep api.py launcher agnostic; add max_proc to config for docker --- scrapyd_k8s.sample-docker.conf | 3 ++ scrapyd_k8s/launcher/docker.py | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/scrapyd_k8s.sample-docker.conf b/scrapyd_k8s.sample-docker.conf index 542ab34..7c4c11d 100644 --- a/scrapyd_k8s.sample-docker.conf +++ b/scrapyd_k8s.sample-docker.conf @@ -16,6 +16,9 @@ repository = scrapyd_k8s.repository.Local # Since this is the Docker example, we choose Docker here. launcher = scrapyd_k8s.launcher.Docker +# Maximum number of jobs running in parallel +max_proc = 1 + # For each project, define a project section. # This contains a repository, with the container label to use. 
[project.example] diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 177482b..28e6801 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -2,6 +2,9 @@ import re import socket +import threading +import time + import docker from ..utils import format_iso_date_string, native_stringify_dict @@ -22,6 +25,31 @@ class Docker: def __init__(self, config): self._docker = docker.from_env() + self.max_proc = int(config.scrapyd().get('max_proc', 4)) + self._stop_event = threading.Event() + self._lock = threading.Lock() + + self._thread = threading.Thread(target=self._background_task, daemon=True) + self._thread.start() + logger.info("Background thread for managing Docker containers started.") + + def _background_task(self): + """ + Background thread that periodically checks and starts pending containers. + """ + check_interval = 5 # seconds + while not self._stop_event.is_set(): + with self._lock: + self.start_pending_containers() + time.sleep(check_interval) + + def shutdown(self): + """ + Cleanly shutdown the background thread. + """ + self._stop_event.set() + self._thread.join() + logger.info("Background thread for managing Docker containers stopped.") def get_node_name(self): return socket.gethostname() @@ -55,6 +83,41 @@ def schedule(self, project, version, spider, job_id, settings, args): mem_limit=resources.get('limits', {}).get('memory'), cpu_quota=_str_to_micro(resources.get('limits', {}).get('cpu')) ) + running_jobs_count = self.get_running_jobs_count() + if running_jobs_count < self.max_proc: + self.start_pending_containers() + + def start_pending_containers(self): + """ + Checks if there is capacity to start pending containers and starts them if possible. 
+ """ + running_jobs_count = self.get_running_jobs_count() + logger.debug(f"Current running jobs: {running_jobs_count}, max_proc: {self.max_proc}") + + while running_jobs_count < self.max_proc: + pending_container = self.get_next_pending_container() + if not pending_container: + logger.info("No pending containers to start.") + break + try: + pending_container.start() + running_jobs_count += 1 + logger.info( + f"Started pending container {pending_container.name}. Total running jobs now: {running_jobs_count}") + except Exception as e: + logger.error(f"Failed to start container {pending_container.name}: {e}") + break + + def get_next_pending_container(self): + pending_containers = self._docker.containers.list(all=True, filters={ + 'label': self.LABEL_PROJECT, + 'status': 'created', + }) + if not pending_containers: + return None + # Sort by creation time to ensure FIFO order + pending_containers.sort(key=lambda c: c.attrs['Created']) + return pending_containers[0] def cancel(self, project_id, job_id, signal): c = self._get_container(project_id, job_id) @@ -64,8 +127,12 @@ def cancel(self, project_id, job_id, signal): prevstate = self._docker_to_scrapyd_status(c.status) if c.status == 'created' or c.status == 'scheduled': c.remove() + logger.info(f"Removed pending container {c.name}.") elif c.status == 'running': c.kill(signal='SIG' + signal) + logger.info(f"Killed and removed running container {c.name}.") + # After cancelling, try to start pending containers since we might have capacity + self.start_pending_containers() return prevstate def enable_joblogs(self, config, resource_watcher): From fdce0c0992aa4d80ef9835f499f745ce6a668001 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 12 Nov 2024 11:16:06 +0100 Subject: [PATCH 07/28] remove update from RBAC because we perform patching of the resource and patch is sufficient --- kubernetes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes.yaml b/kubernetes.yaml index 
213ada9..b4c2376 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -176,7 +176,7 @@ rules: verbs: ["get"] - apiGroups: ["batch"] resources: ["jobs"] - verbs: ["get", "list", "create", "patch", "update", "delete"] + verbs: ["get", "list", "create", "patch", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding From 0b3d19a0ce6a1784deb6eaa7b2007459c4e2dce2 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Thu, 14 Nov 2024 17:00:02 +0100 Subject: [PATCH 08/28] add number of retry attempts and exponential backoff time to the reconnect loop for event watcher; make number of reconnect attempts, backoff time and a coefficient for exponential growth configurable via config; add backoff_time, reconnection_attempts and backoff_coefficient as attributes to the resource watcher init; add resource_version as a param to w.stream so a failed stream can read from the last resource it was able to catch; add urllib3.exceptions.ProtocolError and handle reconnection after some exponential backoff time to avoid api flooding; add config as a param for init for resource watcher; modify config in kubernetes.yaml and k8s config to contain add backoff_time, reconnection_attempts and backoff_coefficient --- kubernetes.yaml | 3 +++ scrapyd_k8s.sample-k8s.conf | 10 ++++++++++ scrapyd_k8s/k8s_resource_watcher.py | 4 ++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kubernetes.yaml b/kubernetes.yaml index b4c2376..a72d392 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -91,6 +91,9 @@ data: namespace = default max_proc = 2 + reconnection_attempts = 5 + backoff_time = 5 + backoff_coefficient = 2 # This is an example spider that should work out of the box. # Adapt the spider config to your use-case. 
diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index 835b82a..7fc6674 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -22,6 +22,16 @@ namespace = default # Maximum number of jobs running in parallel max_proc = 1 +# Number of attempts to reconnect with k8s API to watch events, default is 5 +reconnection_attempts = 5 + +# Minimum time in seconds to wait before reconnecting to k8s API to watch events, default is 5 +backoff_time = 5 + +# Coefficient that is multiplied by backoff_time to provide exponential backoff to prevent k8s API from being overwhelmed +# default is 2, every reconnection attempt will take backoff_time*backoff_coefficient +backoff_coefficient = 2 + # For each project, define a project section. # This contains a repository that points to the remote container repository. # An optional env_secret is the name of a secret with additional environment diff --git a/scrapyd_k8s/k8s_resource_watcher.py b/scrapyd_k8s/k8s_resource_watcher.py index 60c51f1..7342f81 100644 --- a/scrapyd_k8s/k8s_resource_watcher.py +++ b/scrapyd_k8s/k8s_resource_watcher.py @@ -17,8 +17,8 @@ class ResourceWatcher: subscribers : List[Callable] List of subscriber callback functions to notify on events. """ - - def __init__(self, namespace, config): """ + def __init__(self, namespace, config): + """ Initializes the ResourceWatcher. 
Parameters From 62cf9ed800d33b23fa6d18d95969391ae33578cd Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 20 Nov 2024 20:20:07 +0100 Subject: [PATCH 09/28] create a helper finction _filter_jobs that accepts a filter function and a label selector to make the code in listjobs, get_running_jobs and list_suspended_jobs DRY; refactor listjobs to use the helper function with the existing _parse_job as a filter_func parameter --- scrapyd_k8s/launcher/k8s.py | 111 +++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 28 deletions(-) diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index b8fb886..c4473a3 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -11,6 +11,8 @@ logger = logging.getLogger(__name__) +from kubernetes.client import ApiException + from ..utils import native_stringify_dict from ..k8s_resource_watcher import ResourceWatcher from ..k8s_scheduler import KubernetesScheduler @@ -56,10 +58,11 @@ def get_node_name(self): return ".".join([n for n in [namespace, deployment] if n]) def listjobs(self, project=None): - label = self.LABEL_PROJECT + ('=%s'%(project) if project else '') - jobs = self._k8s_batch.list_namespaced_job(namespace=self._namespace, label_selector=label) - jobs = [self._parse_job(j) for j in jobs.items] - return jobs + label_selector = self.LABEL_PROJECT + ('=%s' % project if project else '') + return self._filter_jobs( + label_selector=label_selector, + filter_func=self._parse_job + ) def schedule(self, project, version, spider, job_id, settings, args): running_jobs = self.get_running_jobs_count() @@ -181,41 +184,93 @@ def get_running_jobs_count(self) -> int: Returns the number of currently active (unsuspended, not completed, not failed) jobs. 
""" label_selector = f"{self.LABEL_JOB_ID}" - jobs = self._k8s_batch.list_namespaced_job( - namespace=self._namespace, - label_selector=label_selector + active_jobs = self._filter_jobs( + label_selector=label_selector, + filter_func=self._is_active_job ) - - active_jobs = [] - for job in jobs.items: - job_name = job.metadata.name - is_suspended = job.spec.suspend - is_completed = job.status.completion_time is not None - has_failed = job.status.failed is not None and job.status.failed > 0 - logger.debug(f"Job {job_name}: suspended={is_suspended}, completed={is_completed}, failed={has_failed}") - - if not is_suspended and not is_completed and not has_failed: - active_jobs.append(job) - - logger.debug(f"Active jobs: {[job.metadata.name for job in active_jobs]}") logger.debug(f"Found {len(active_jobs)} active jobs.") - return len(active_jobs) def list_suspended_jobs(self, label_selector: str = ""): + """ + Retrieves a list of suspended jobs. + """ + suspended_jobs = self._filter_jobs( + label_selector=label_selector, + filter_func=self._is_suspended_job + ) + logger.debug(f"Found {len(suspended_jobs)} suspended jobs.") + return suspended_jobs + + def _filter_jobs(self, label_selector: str, filter_func): + """ + Helper method to fetch jobs and filter them based on a provided function. + + Parameters + ---------- + label_selector : str + Kubernetes label selector to filter jobs. + filter_func : function + A function that takes a job and returns True if the job matches the condition. + + Returns + ------- + list + A list of Kubernetes Job objects that match the filter condition. 
+ """ try: - jobs = self._k8s_batch.list_namespaced_job( + jobs_response = self._k8s_batch.list_namespaced_job( namespace=self._namespace, label_selector=label_selector ) - # Filter jobs where spec.suspend == True - suspended_jobs = [job for job in jobs.items if job.spec.suspend] - logger.debug(f"Found {len(suspended_jobs)} suspended jobs.") - return suspended_jobs - except Exception as e: - logger.exception(f"Error listing suspended jobs: {e}") + jobs = jobs_response.items + filtered_jobs = [] + for job in jobs: + if filter_func(job): + filtered_jobs.append(job) + return filtered_jobs + except ApiException as e: + logger.exception(f"API call failed while listing jobs: {e}") return [] + def _is_active_job(self, job): + """ + Determines if a job is active (running) based on your specific filtering criteria. + + Parameters + ---------- + job : V1Job + A Kubernetes Job object. + + Returns + ------- + bool + True if the job is active, False otherwise. + """ + job_name = job.metadata.name + is_suspended = job.spec.suspend + is_completed = job.status.completion_time is not None + has_failed = job.status.failed is not None and job.status.failed > 0 + logger.debug( + f"Job {job_name}: suspended={is_suspended}, completed={is_completed}, failed={has_failed}") + return not is_suspended and not is_completed and not has_failed + + def _is_suspended_job(self, job): + """ + Determines if a job is suspended. + + Parameters + ---------- + job : V1Job + A Kubernetes Job object. + + Returns + ------- + bool + True if the job is suspended, False otherwise. 
+ """ + return job.spec.suspend + def _parse_job(self, job): state = self._k8s_job_to_scrapyd_status(job) return { From 8a499d8e3cc872424e3050582aadf71710374790 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 20 Nov 2024 21:00:59 +0100 Subject: [PATCH 10/28] change the signature of the filtering method to also accept a parse function because list jobs uses a different logic --- scrapyd_k8s/launcher/k8s.py | 67 ++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index c4473a3..7e0fcd9 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -61,7 +61,8 @@ def listjobs(self, project=None): label_selector = self.LABEL_PROJECT + ('=%s' % project if project else '') return self._filter_jobs( label_selector=label_selector, - filter_func=self._parse_job + filter_func=None, # No additional filtering + parse_func=self._parse_job ) def schedule(self, project, version, spider, job_id, settings, args): @@ -202,21 +203,25 @@ def list_suspended_jobs(self, label_selector: str = ""): logger.debug(f"Found {len(suspended_jobs)} suspended jobs.") return suspended_jobs - def _filter_jobs(self, label_selector: str, filter_func): + def _filter_jobs(self, label_selector, filter_func=None, parse_func=None): """ - Helper method to fetch jobs and filter them based on a provided function. + Helper method to fetch jobs and optionally filter and parse them. Parameters ---------- label_selector : str Kubernetes label selector to filter jobs. - filter_func : function + filter_func : function, optional A function that takes a job and returns True if the job matches the condition. + If None, all jobs are included. + parse_func : function, optional + A function that takes a job and returns a transformed job. + If None, the raw job object is returned. Returns ------- list - A list of Kubernetes Job objects that match the filter condition. 
+ A list of Kubernetes Job objects that match the filter condition and are parsed if parse_func is provided. """ try: jobs_response = self._k8s_batch.list_namespaced_job( @@ -224,11 +229,14 @@ def _filter_jobs(self, label_selector: str, filter_func): label_selector=label_selector ) jobs = jobs_response.items - filtered_jobs = [] - for job in jobs: - if filter_func(job): - filtered_jobs.append(job) - return filtered_jobs + + if filter_func is not None: + jobs = [job for job in jobs if filter_func(job)] + + if parse_func is not None: + jobs = [parse_func(job) for job in jobs] + + return jobs except ApiException as e: logger.exception(f"API call failed while listing jobs: {e}") return [] @@ -283,25 +291,44 @@ def _parse_job(self, job): } def _get_job(self, project, job_id): - label = self.LABEL_JOB_ID + '=' + job_id - r = self._k8s_batch.list_namespaced_job(namespace=self._namespace, label_selector=label) - if not r or not r.items: + label_selector = f"{self.LABEL_JOB_ID}={job_id}" + jobs = self._filter_jobs( + label_selector=label_selector, + filter_func=None + ) + if not jobs: + logger.error(f"No job found with job_id={job_id}") return None - job = r.items[0] - + job = jobs[0] if job.metadata.labels.get(self.LABEL_PROJECT) != project: - # TODO log error + logger.error(f"Job {job_id} does not belong to project {project}") return None - return job def _get_job_name(self, job_id: str): + """ + Retrieves the Kubernetes job name for the given job ID. + + Parameters + ---------- + job_id : str + The job ID to look up. + + Returns + ------- + str or None + The name of the Kubernetes job, or None if not found. 
+ """ label_selector = f"{self.LABEL_JOB_ID}={job_id}" - jobs = self._k8s_batch.list_namespaced_job(namespace=self._namespace, label_selector=label_selector) - if not jobs.items: + jobs = self._filter_jobs( + label_selector=label_selector, + filter_func=None + ) + + if not jobs: logger.error(f"No job found with job_id={job_id}") return None - return jobs.items[0].metadata.name + return jobs[0].metadata.name def _get_pod(self, project, job_id): label = self.LABEL_JOB_ID + '=' + job_id From 7471b07c8000fff9761bc6ba394bd6343ab476fd Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Thu, 14 Nov 2024 17:00:02 +0100 Subject: [PATCH 11/28] add number of retry attempts and exponential backoff time to the reconnect loop for event watcher; make number of reconnect attempts, backoff time and a coefficient for exponential growth configurable via config; add backoff_time, reconnection_attempts and backoff_coefficient as attributes to the resource watcher init; add resource_version as a param to w.stream so a failed stream can read from the last resource it was able to catch; add urllib3.exceptions.ProtocolError and handle reconnection after some exponential backoff time to avoid api flooding; add config as a param for init for resource watcher; modify config in kubernetes.yaml and k8s config to contain add backoff_time, reconnection_attempts and backoff_coefficient --- scrapyd_k8s.sample-k8s.conf | 10 ++++++++++ scrapyd_k8s/tests/integration/test_api.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index 7fc6674..9fb8142 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -32,6 +32,16 @@ backoff_time = 5 # default is 2, every reconnection attempt will take backoff_time*backoff_coefficient backoff_coefficient = 2 +# Number of attempts to reconnect with k8s API to watch events, default is 5 +reconnection_attempts = 5 + +# Minimum time in seconds to wait before 
reconnecting to k8s API to watch events, default is 5 +backoff_time = 5 + +# Coefficient that is multiplied by backoff_time to provide exponential backoff to prevent k8s API from being overwhelmed +# default is 2, every reconnection attempt will take backoff_time*backoff_coefficient +backoff_coefficient = 2 + # For each project, define a project section. # This contains a repository that points to the remote container repository. # An optional env_secret is the name of a secret with additional environment diff --git a/scrapyd_k8s/tests/integration/test_api.py b/scrapyd_k8s/tests/integration/test_api.py index 45f51b2..4d34b5a 100644 --- a/scrapyd_k8s/tests/integration/test_api.py +++ b/scrapyd_k8s/tests/integration/test_api.py @@ -44,7 +44,7 @@ def test_listversions_ok(): assert_response_ok(response) json = response.json() - assert json['versions'] == AVAIL_VERSIONS + assert set(AVAIL_VERSIONS).issubset(set(json['versions'])) assert 'node_name' in json def test_listversions_project_missing(): From aca31a5bdccad73e283c6bd2f0696c587a7eed4b Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 15 Nov 2024 14:37:12 +0100 Subject: [PATCH 12/28] add logic to reset number of reconnect attempts and backoff time when connection to the k8s was achieved so only sequential failures detected; add exception handling to watch_pods to handle failure in urllib3, when source version is old and not available anymore, and when stream is ended; remove k8s resource watcher initialization from run function in api.py and move it to k8s.py launcher as _init_resource_watcher; refactor existing logic from joblogs/__init__.py to keep it in _init_resource_watcher and enable_joblogs in k8s launcher --- scrapyd_k8s/launcher/k8s.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 7e0fcd9..6374998 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -6,16 +6,16 @@ import 
logging from signal import Signals -from ..utils import format_datetime_object +from ..utils import format_datetime_object, native_stringify_dict from scrapyd_k8s.joblogs import KubernetesJobLogHandler logger = logging.getLogger(__name__) from kubernetes.client import ApiException - -from ..utils import native_stringify_dict -from ..k8s_resource_watcher import ResourceWatcher from ..k8s_scheduler import KubernetesScheduler +from ..k8s_resource_watcher import ResourceWatcher +from ..utils import native_stringify_dict +from scrapyd_k8s.joblogs import KubernetesJobLogHandler logger = logging.getLogger(__name__) @@ -37,6 +37,7 @@ def __init__(self, config): self._k8s = kubernetes.client.CoreV1Api() self._k8s_batch = kubernetes.client.BatchV1Api() self._init_resource_watcher(config) + self.max_proc = int(config.scrapyd().get('max_proc', 4)) def _init_resource_watcher(self, config): self.resource_watcher = ResourceWatcher(self._namespace, config) @@ -51,7 +52,6 @@ def _init_resource_watcher(self, config): self.k8s_scheduler = KubernetesScheduler(config, self, self.resource_watcher, self.max_proc) logger.debug(f"KubernetesLauncher initialized with max_proc={self.max_proc}.") - def get_node_name(self): deployment = os.getenv('MY_DEPLOYMENT_NAME', 'default') namespace = os.getenv('MY_NAMESPACE') From c4d424271abb79bec64fa6bf851b1cea5e77f9ce Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 19 Nov 2024 11:53:54 +0100 Subject: [PATCH 13/28] added a CONFIG.md file with detailed explanations about parameters used to re-establish connection to the Kubernetes wather --- CONFIG.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index c1edc02..a5a8351 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -59,12 +59,11 @@ choose to use them. The event watcher establishes a connection to the Kubernetes API and receives a stream of events from it. 
However, the nature of this long-lived connection is unstable; it can be interrupted by network issues, proxies configured to terminate long-lived connections, and other factors. For this reason, a mechanism was implemented to re-establish the long-lived -connection to the Kubernetes API. To achieve this, three parameters were introduced: `reconnection_attempts`, +connection to the Kubernetes API. To achieve this, three parameters were introduced: `backoff_time` and `backoff_coefficient`. #### What are these parameters about? -* `reconnection_attempts` - defines how many consecutive attempts will be made to reconnect if the connection fails; * `backoff_time`, `backoff_coefficient` - are used to gradually slow down each subsequent attempt to establish a connection with the Kubernetes API, preventing the API from becoming overloaded with requests. The `backoff_time` increases exponentially and is calculated as `backoff_time *= self.backoff_coefficient`. @@ -72,5 +71,4 @@ connection to the Kubernetes API. To achieve this, three parameters were introdu #### When do I need to change it in the config file? Default values for these parameters are provided in the code and are tuned to an "average" cluster setting. If your network -requirements or other conditions are unusual, you may need to adjust these values to better suit your specific setup. - +requirements or other conditions are unusual, you may need to adjust these values to better suit your specific setup. \ No newline at end of file From 27a310f48eaa3abe0f3128efb49039f466fe5ad8 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 19 Nov 2024 13:28:20 +0100 Subject: [PATCH 14/28] move section about config file from README.md to CONFIG.md; add a link to the CONFIG.md in the README.md; remove variables for reconnection_attempts, backoff_time and backoff_coefficient from the sample config since default values are provided in the code. 
--- kubernetes.yaml | 3 --- scrapyd_k8s.sample-k8s.conf | 10 ---------- 2 files changed, 13 deletions(-) diff --git a/kubernetes.yaml b/kubernetes.yaml index a72d392..b4c2376 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -91,9 +91,6 @@ data: namespace = default max_proc = 2 - reconnection_attempts = 5 - backoff_time = 5 - backoff_coefficient = 2 # This is an example spider that should work out of the box. # Adapt the spider config to your use-case. diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index 9fb8142..7fc6674 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -32,16 +32,6 @@ backoff_time = 5 # default is 2, every reconnection attempt will take backoff_time*backoff_coefficient backoff_coefficient = 2 -# Number of attempts to reconnect with k8s API to watch events, default is 5 -reconnection_attempts = 5 - -# Minimum time in seconds to wait before reconnecting to k8s API to watch events, default is 5 -backoff_time = 5 - -# Coefficient that is multiplied by backoff_time to provide exponential backoff to prevent k8s API from being overwhelmed -# default is 2, every reconnection attempt will take backoff_time*backoff_coefficient -backoff_coefficient = 2 - # For each project, define a project section. # This contains a repository that points to the remote container repository. 
# An optional env_secret is the name of a secret with additional environment From 340f242a3fd0cb73fa56e341f82e699bf16a6e7d Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 22 Nov 2024 12:06:19 +0100 Subject: [PATCH 15/28] make number of job limitation optional by separating k8s_scheduler in a package that has an enable function in launcher/k8s.py which also part of resource watcher initialization; initialize the scheduler if max_proc was provided in the scrapyd section of the config file; refactor related methods in the launcher to use extra functionality for job number limiting only if max_proc is provided --- kubernetes.yaml | 2 -- scrapyd_k8s/k8s_scheduler/__init__.py | 1 + .../{ => k8s_scheduler}/k8s_scheduler.py | 3 -- scrapyd_k8s/launcher/k8s.py | 35 +++++++++++++++---- 4 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 scrapyd_k8s/k8s_scheduler/__init__.py rename scrapyd_k8s/{ => k8s_scheduler}/k8s_scheduler.py (96%) diff --git a/kubernetes.yaml b/kubernetes.yaml index b4c2376..e4d59a6 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -89,8 +89,6 @@ data: launcher = scrapyd_k8s.launcher.K8s namespace = default - - max_proc = 2 # This is an example spider that should work out of the box. # Adapt the spider config to your use-case. 
diff --git a/scrapyd_k8s/k8s_scheduler/__init__.py b/scrapyd_k8s/k8s_scheduler/__init__.py new file mode 100644 index 0000000..73dfe4d --- /dev/null +++ b/scrapyd_k8s/k8s_scheduler/__init__.py @@ -0,0 +1 @@ +from scrapyd_k8s.k8s_scheduler.k8s_scheduler import KubernetesScheduler \ No newline at end of file diff --git a/scrapyd_k8s/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py similarity index 96% rename from scrapyd_k8s/k8s_scheduler.py rename to scrapyd_k8s/k8s_scheduler/k8s_scheduler.py index 1cb13e4..f2704e0 100644 --- a/scrapyd_k8s/k8s_scheduler.py +++ b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py @@ -27,9 +27,6 @@ def __init__(self, config, launcher, resource_watcher, max_proc): self.max_proc = max_proc self.namespace = config.scrapyd().get('namespace', 'default') - resource_watcher.subscribe(self.handle_pod_event) - logger.info(f"KubernetesScheduler initialized with max_proc={self.max_proc}.") - def handle_pod_event(self, event: dict): """ Handles pod events from the ResourceWatcher. 
diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 6374998..ef0216c 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -36,16 +36,21 @@ def __init__(self, config): self._k8s = kubernetes.client.CoreV1Api() self._k8s_batch = kubernetes.client.BatchV1Api() + + self.scheduler = None self._init_resource_watcher(config) self.max_proc = int(config.scrapyd().get('max_proc', 4)) def _init_resource_watcher(self, config): self.resource_watcher = ResourceWatcher(self._namespace, config) - if config.joblogs() is not None: self.enable_joblogs(config) else: logger.debug("Job logs handling not enabled; 'joblogs' configuration section is missing.") + if self.max_proc is not None: + self.enable_k8s_scheduler(config) + else: + logger.debug("k8s scheduler not enabled; 'max_proc' configuration is missing in the scrapyd section.") # Initialize KubernetesScheduler self.max_proc = int(config.scrapyd().get('max_proc', 4)) @@ -66,10 +71,14 @@ def listjobs(self, project=None): ) def schedule(self, project, version, spider, job_id, settings, args): - running_jobs = self.get_running_jobs_count() - start_suspended = running_jobs >= self.max_proc - logger.info( - f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {self.max_proc}") + if self.scheduler: + running_jobs = self.get_running_jobs_count() + start_suspended = running_jobs >= self.scheduler.max_proc + logger.debug( + f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {self.scheduler.max_proc}") + else: + start_suspended = False + logger.debug(f"Scheduling job {job_id} without suspension. 
Scheduler not enabled.") job_name = self._k8s_job_name(project.id(), job_id) _settings = [i for k, v in native_stringify_dict(settings, keys_only=False).items() for i in ['-s', f"{k}={v}"]] _args = [i for k, v in native_stringify_dict(args, keys_only=False).items() for i in ['-a', f"{k}={v}"]] @@ -163,7 +172,21 @@ def enable_joblogs(self, config): else: logger.warning("No storage provider configured; job logs will not be uploaded.") + def enable_k8s_scheduler(self, config): + try: + max_proc = int(self.max_proc) + self.scheduler = KubernetesScheduler(config, self, self.resource_watcher, max_proc) + logger.debug(f"KubernetesLauncher initialized with max_proc={max_proc}.") + self.resource_watcher.subscribe(self.scheduler.handle_pod_event) + logger.info("K8s scheduler started.") + except ValueError: + logger.error(f"Invalid max_proc value: {self.max_proc}. Scheduler not enabled.") + self.scheduler = None + def unsuspend_job(self, job_id: str): + if not self.scheduler: + logger.error("Scheduler is not enabled. 
Cannot unsuspend jobs.")
+            return False
         job_name = self._get_job_name(job_id)
         if not job_name:
             logger.error(f"Cannot unsuspend job {job_id}: job name not found.")
@@ -176,7 +199,7 @@ def unsuspend_job(self, job_id: str):
             )
             logger.info(f"Job {job_id} unsuspended.")
             return True
-        except Exception as e:
+        except ApiException as e:
             logger.exception(f"Error unsuspending job {job_id}: {e}")
             return False
 
From bc9983ea36dfc32faa7400619880c60b23e357dc Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 22 Nov 2024 13:16:03 +0100 Subject: [PATCH 16/28] implement the feature to limit running jobs to be optional for Docker; remove max_proc from config file since by default we want to run all scheduled jobs in parallel; add a section about max_proc to the CONFIG.md --- CONFIG.md | 12 ++++++++ scrapyd_k8s.sample-docker.conf | 3 -- scrapyd_k8s.sample-k8s.conf | 3 -- scrapyd_k8s/launcher/docker.py | 53 ++++++++++++++++++++++------------ 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index a5a8351..5b618d1 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -68,6 +68,18 @@ connection to the Kubernetes API. To achieve this, three parameters were introdu connection with the Kubernetes API, preventing the API from becoming overloaded with requests. The `backoff_time` increases exponentially and is calculated as `backoff_time *= self.backoff_coefficient`. +### max_proc + +By default `max_proc` is not present in the config file but you can add it under the scrapyd section to limit the number +of jobs that run in parallel, while other scheduled job wait for their turn to run whenever currently running job(s) +complete the run, or are cancelled, or are failed. This feature is available for both Kubernetes and Docker. + +For example, you have a cluster with 0 running jobs, you schedule 20 jobs and provide `max_proc = 5` in the scrapyd section. +Then 5 jobs start running immediately and 15 others are suspended. 
Whenever at least one of the jobs finishes running, the new +job is added to run. The order in which jobs were scheduled is preserved. + +`max_proc` - a parameter you can set to limit the number of running jobs at a given moment + #### When do I need to change it in the config file? Default values for these parameters are provided in the code and are tuned to an "average" cluster setting. If your network diff --git a/scrapyd_k8s.sample-docker.conf b/scrapyd_k8s.sample-docker.conf index 7c4c11d..542ab34 100644 --- a/scrapyd_k8s.sample-docker.conf +++ b/scrapyd_k8s.sample-docker.conf @@ -16,9 +16,6 @@ repository = scrapyd_k8s.repository.Local # Since this is the Docker example, we choose Docker here. launcher = scrapyd_k8s.launcher.Docker -# Maximum number of jobs running in parallel -max_proc = 1 - # For each project, define a project section. # This contains a repository, with the container label to use. [project.example] diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index 7fc6674..df32508 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -19,9 +19,6 @@ namespace = default # Optional pull secret, in case you have private spiders. 
#pull_secret = ghcr-registry -# Maximum number of jobs running in parallel -max_proc = 1 - # Number of attempts to reconnect with k8s API to watch events, default is 5 reconnection_attempts = 5 diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 28e6801..39e4f8c 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -25,13 +25,18 @@ class Docker: def __init__(self, config): self._docker = docker.from_env() - self.max_proc = int(config.scrapyd().get('max_proc', 4)) - self._stop_event = threading.Event() - self._lock = threading.Lock() - - self._thread = threading.Thread(target=self._background_task, daemon=True) - self._thread.start() - logger.info("Background thread for managing Docker containers started.") + self.max_proc = config.scrapyd().get('max_proc') + if self.max_proc is not None: + self.max_proc = int(self.max_proc) + self._stop_event = threading.Event() + self._lock = threading.Lock() + + self._thread = threading.Thread(target=self._background_task, daemon=True) + self._thread.start() + logger.info("Background thread for managing Docker containers started.") + else: + self._thread = None + logger.info("Job limit not set; Docker launcher will not limit running jobs.") def _background_task(self): """ @@ -47,9 +52,10 @@ def shutdown(self): """ Cleanly shutdown the background thread. 
""" - self._stop_event.set() - self._thread.join() - logger.info("Background thread for managing Docker containers stopped.") + if self._thread is not None: + self._stop_event.set() + self._thread.join() + logger.info("Background thread for managing Docker containers stopped.") def get_node_name(self): return socket.gethostname() @@ -83,9 +89,15 @@ def schedule(self, project, version, spider, job_id, settings, args): mem_limit=resources.get('limits', {}).get('memory'), cpu_quota=_str_to_micro(resources.get('limits', {}).get('cpu')) ) - running_jobs_count = self.get_running_jobs_count() - if running_jobs_count < self.max_proc: - self.start_pending_containers() + if self.max_proc is not None: + running_jobs_count = self.get_running_jobs_count() + if running_jobs_count < self.max_proc: + self.start_pending_containers() + else: + logger.info(f"Job {job_id} is pending due to max_proc limit.") + else: + c.start() + logger.info(f"Job {job_id} started without suspension.") def start_pending_containers(self): """ @@ -132,17 +144,22 @@ def cancel(self, project_id, job_id, signal): c.kill(signal='SIG' + signal) logger.info(f"Killed and removed running container {c.name}.") # After cancelling, try to start pending containers since we might have capacity - self.start_pending_containers() + if self.max_proc is not None: + self.start_pending_containers() return prevstate def enable_joblogs(self, config, resource_watcher): logger.warning("Job logs are not supported when using the Docker launcher.") def get_running_jobs_count(self): - # Return the number of running Docker containers matching the job labels - label = self.LABEL_PROJECT - running_jobs = self._docker.containers.list(filters={'label': label, 'status': 'running'}) - return len(running_jobs) + if self.max_proc is not None: + # Return the number of running Docker containers matching the job labels + label = self.LABEL_PROJECT + running_jobs = self._docker.containers.list(filters={'label': label, 'status': 'running'}) + 
return len(running_jobs) + else: + # If job limiting is not enabled, return 0 to avoid unnecessary processing + return 0 def _parse_job(self, c): state = self._docker_to_scrapyd_status(c.status) From 303262813bfe11602f5a08a587a4c4f07f7c39b8 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Mon, 25 Nov 2024 15:41:30 +0100 Subject: [PATCH 17/28] add exceptions to the KubernetesScheduler class, for the init method, propagate the exception --- scrapyd_k8s/k8s_scheduler/k8s_scheduler.py | 192 +++++++++++++++------ scrapyd_k8s/launcher/k8s.py | 2 +- 2 files changed, 144 insertions(+), 50 deletions(-) diff --git a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py index f2704e0..747bc87 100644 --- a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py +++ b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py @@ -1,5 +1,7 @@ import logging +from kubernetes.client import ApiException + logger = logging.getLogger(__name__) class KubernetesScheduler: @@ -7,7 +9,7 @@ class KubernetesScheduler: Manages scheduling of Kubernetes jobs to limit the number of concurrently running jobs. """ - def __init__(self, config, launcher, resource_watcher, max_proc): + def __init__(self, config, launcher, max_proc): """ Initializes the KubernetesScheduler. @@ -17,58 +19,126 @@ def __init__(self, config, launcher, resource_watcher, max_proc): The configuration object. launcher : K8s The launcher instance with initialized Kubernetes clients. - resource_watcher : ResourceWatcher - The resource watcher to subscribe to pod events. max_proc : int Maximum number of concurrent jobs. + + Raises + ------ + TypeError + If `max_proc` is not an integer. 
""" - self.config = config - self.launcher = launcher - self.max_proc = max_proc - self.namespace = config.scrapyd().get('namespace', 'default') + try: + self.config = config + self.launcher = launcher + if not isinstance(max_proc, int): + raise TypeError(f"max_proc must be an integer, got {type(max_proc).__name__}") + self.max_proc = max_proc + self.namespace = config.namespace() + except TypeError as e: + logger.exception(f"TypeError during KubernetesScheduler initialization: {e}") + raise - def handle_pod_event(self, event: dict): + def handle_pod_event(self, event): """ Handles pod events from the ResourceWatcher. + + Processes events related to pod lifecycle changes. If a pod associated with a job + has terminated (either succeeded or failed), it triggers the scheduler to check + and potentially unsuspend other suspended jobs to utilize available capacity. + + Parameters + ---------- + event : dict + A dictionary representing the pod event received from Kubernetes. + + Notes + ----- + This method handles and logs the following exceptions internally: + - KeyError: If the event dictionary lacks the 'object' or 'type' keys. + - AttributeError: If the pod object lacks attributes like `status.phase` or `metadata.name`. + - TypeError: If `event` is not a dictionary or does not have the expected structure. + + Exceptions are not propagated further. 
""" - pod = event['object'] - pod_phase = pod.status.phase - pod_name = pod.metadata.name - event_type = event['type'] + try: + if not isinstance(event, dict): + raise TypeError(f"Event must be a dictionary, got {type(event).__name__}") + pod = event['object'] + pod_phase = pod.status.phase + pod_name = pod.metadata.name + event_type = event['type'] - logger.debug(f"KubernetesScheduler received event: {event_type}, pod: {pod_name}, phase: {pod_phase}") + if not hasattr(pod, 'status') or not hasattr(pod.status, 'phase'): + raise AttributeError("Pod object missing 'status.phase' attribute") + if not hasattr(pod, 'metadata') or not hasattr(pod.metadata, 'name'): + raise AttributeError("Pod object missing 'metadata.name' attribute") - # Check if this pod is related to our jobs - if not pod.metadata.labels.get(self.launcher.LABEL_JOB_ID): - logger.debug(f"Pod {pod_name} does not have our job label; ignoring.") - return + logger.debug(f"KubernetesScheduler received event: {event_type}, pod: {pod_name}, phase: {pod_phase}") - # If a pod has terminated (Succeeded or Failed), we may have capacity to unsuspend jobs - if pod_phase in ('Succeeded', 'Failed') and event_type in ('MODIFIED', 'DELETED'): - logger.info(f"Pod {pod_name} has completed with phase {pod_phase}. Checking for suspended jobs.") - self.check_and_unsuspend_jobs() - else: - logger.debug(f"Pod {pod_name} event not relevant for unsuspension.") + # Check if this pod is related to our jobs + if not pod.metadata.labels.get(self.launcher.LABEL_JOB_ID): + logger.debug(f"Pod {pod_name} does not have our job label; ignoring.") + return + + # If a pod has terminated (Succeeded or Failed), we may have capacity to unsuspend jobs + if pod_phase in ('Succeeded', 'Failed') and event_type in ('MODIFIED', 'DELETED'): + logger.info(f"Pod {pod_name} has completed with phase {pod_phase}. 
Checking for suspended jobs.") + self.check_and_unsuspend_jobs() + else: + logger.debug(f"Pod {pod_name} event not relevant for unsuspension.") + except KeyError as e: + logger.error(f"KeyError in handle_pod_event: Missing key {e} in event: {event}") + except AttributeError as e: + logger.error(f"AttributeError in handle_pod_event: {e} in event: {event}") + except TypeError as e: + logger.error(f"TypeError in handle_pod_event: {e} in event: {event}") def check_and_unsuspend_jobs(self): """ Checks if there is capacity to unsuspend jobs and unsuspends them if possible. + + Continuously unsuspends suspended jobs until the number of running jobs reaches + the maximum allowed (`max_proc`) or there are no more suspended jobs available. + + Notes + ----- + This method handles and logs the following exceptions internally: + - ApiException: If there are issues interacting with the Kubernetes API during job count retrieval + or while unsuspending a job. + - AttributeError: If the launcher object lacks required methods. + - TypeError: If `max_proc` is not an integer. + + Exceptions are not propagated further. """ - running_jobs_count = self.launcher.get_running_jobs_count() - logger.debug(f"Current running jobs: {running_jobs_count}, max_proc: {self.max_proc}") - - while running_jobs_count < self.max_proc: - job_id = self.get_next_suspended_job_id() - if not job_id: - logger.info("No suspended jobs to unsuspend.") - break - success = self.launcher.unsuspend_job(job_id) - if success: - running_jobs_count += 1 - logger.info(f"Unsuspended job {job_id}. 
Total running jobs now: {running_jobs_count}") - else: - logger.error(f"Failed to unsuspend job {job_id}") - break + try: + running_jobs_count = self.launcher.get_running_jobs_count() + logger.debug(f"Current running jobs: {running_jobs_count}, max_proc: {self.max_proc}") + + while running_jobs_count < self.max_proc: + job_id = self.get_next_suspended_job_id() + if not job_id: + logger.info("No suspended jobs to unsuspend.") + break + try: + success = self.launcher.unsuspend_job(job_id) + if success: + running_jobs_count += 1 + logger.info(f"Unsuspended job {job_id}. Total running jobs now: {running_jobs_count}") + else: + logger.error(f"Failed to unsuspend job {job_id}") + break + except ApiException as e: + logger.error(f"Kubernetes API exception while unsuspending job {job_id}: {e}") + break + except AttributeError as e: + logger.error(f"AttributeError while unsuspending job {job_id}: {e}") + break + except ApiException as e: + logger.error(f"Kubernetes API exception in check_and_unsuspend_jobs: {e}") + except AttributeError as e: + logger.error(f"AttributeError in check_and_unsuspend_jobs: {e}") + except TypeError as e: + logger.error(f"TypeError in check_and_unsuspend_jobs: {e}") def get_next_suspended_job_id(self): """ @@ -79,15 +149,39 @@ def get_next_suspended_job_id(self): ------- str or None The job ID of the next suspended job, or None if none are found. + + Notes + ----- + This method handles and logs the following exceptions internally: + - ApiException: If there are issues interacting with the Kubernetes API during job retrieval. + - AttributeError: If job objects lack `metadata.creation_timestamp` or `metadata.labels`. + - TypeError: If the returned jobs list is not a list. + + Exceptions are not propagated further. 
""" - label_selector = f"{self.launcher.LABEL_JOB_ID}" - jobs = self.launcher.list_suspended_jobs(label_selector=label_selector) - if not jobs: - logger.debug("No suspended jobs found.") - return None - # Sort jobs by creation timestamp to ensure FIFO order - jobs.sort(key=lambda job: job.metadata.creation_timestamp) - job = jobs[0] - job_id = job.metadata.labels.get(self.launcher.LABEL_JOB_ID) - logger.debug(f"Next suspended job to unsuspend: {job_id}") - return job_id + try: + label_selector = f"{self.launcher.LABEL_JOB_ID}" + jobs = self.launcher.list_suspended_jobs(label_selector=label_selector) + + if not isinstance(jobs, list): + raise TypeError(f"list_suspended_jobs should return a list, got {type(jobs).__name__}") + + if not jobs: + logger.debug("No suspended jobs found.") + return None + # Sort jobs by creation timestamp to ensure FIFO order + jobs.sort(key=lambda job: job.metadata.creation_timestamp) + job = jobs[0] + if not hasattr(job, 'metadata') or not hasattr(job.metadata, 'creation_timestamp'): + raise AttributeError(f"Job object missing 'metadata.creation_timestamp': {job}") + if not hasattr(job.metadata, 'labels'): + raise AttributeError(f"Job object missing 'metadata.labels': {job}") + job_id = job.metadata.labels.get(self.launcher.LABEL_JOB_ID) + logger.debug(f"Next suspended job to unsuspend: {job_id}") + return job_id + except ApiException as api_e: + logger.error(f"Kubernetes API exception in get_next_suspended_job_id: {api_e}") + except AttributeError as attr_e: + logger.error(f"AttributeError in get_next_suspended_job_id: {attr_e}") + except TypeError as type_e: + logger.error(f"TypeError in get_next_suspended_job_id: {type_e}") diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index ef0216c..f3d0bc5 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -175,7 +175,7 @@ def enable_joblogs(self, config): def enable_k8s_scheduler(self, config): try: max_proc = int(self.max_proc) - self.scheduler = 
KubernetesScheduler(config, self, self.resource_watcher, max_proc) + self.scheduler = KubernetesScheduler(config, self, max_proc) logger.debug(f"KubernetesLauncher initialized with max_proc={max_proc}.") self.resource_watcher.subscribe(self.scheduler.handle_pod_event) logger.info("K8s scheduler started.") From 57630ef503746d8839318182b4d742475a8ba809 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Mon, 25 Nov 2024 16:24:06 +0100 Subject: [PATCH 18/28] add custom exceptions to the class for log handling; add exceptions to handle and propagate the critical ones --- scrapyd_k8s/joblogs/log_handler_k8s.py | 52 +++++++++++++++++++++----- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/scrapyd_k8s/joblogs/log_handler_k8s.py b/scrapyd_k8s/joblogs/log_handler_k8s.py index 61c4592..71643a0 100644 --- a/scrapyd_k8s/joblogs/log_handler_k8s.py +++ b/scrapyd_k8s/joblogs/log_handler_k8s.py @@ -8,10 +8,21 @@ logger = logging.getLogger(__name__) +# Custom Exceptions +class KubernetesJobLogHandlerError(Exception): + """Base exception class for KubernetesJobLogHandler.""" + +class LogUploadError(KubernetesJobLogHandlerError): + """Raised when uploading logs to object storage fails.""" + +class PodStreamingError(KubernetesJobLogHandlerError): + """Raised when streaming logs from a pod fails.""" + class KubernetesJobLogHandler: """ A class to handle Kubernetes job logs by watching pods, streaming logs, and uploading them to object storage. + This class: - Observes Kubernetes pods for job-related events. - Streams logs from running pods, storing them locally. @@ -48,10 +59,9 @@ class KubernetesJobLogHandler: Ensures a log file exists for a given job and returns its path. stream_logs(job_name): - Streams logs from a Kubernetes pod corresponding to the given job name and writes them to a file. - + Streams logs from a Kubernetes pod and writes them to a file. 
handle_events(event): - Processes Kubernetes pod events to start log streaming or upload logs when pods complete. + Watches Kubernetes pod events and handles actions such as starting log streaming or uploading logs. """ # The value was chosen to provide a balance between memory usage and the number of I/O operations DEFAULT_BLOCK_SIZE = 6144 @@ -168,6 +178,7 @@ def concatenate_and_delete_files(self, main_file_path, temp_file_path, block_siz logger.debug(f"Concatenated '{temp_file_path}' into '{main_file_path}' and deleted temporary file.") except (IOError, OSError) as e: logger.error(f"Failed to concatenate and delete files for job: {e}") + raise KubernetesJobLogHandlerError(f"Failed to concatenate and delete files: {e}") from e def make_log_filename_for_job(self, job_id): """ @@ -213,6 +224,11 @@ def stream_logs(self, job_id, pod_name): Returns ------- None + + Raises + ------ + PodStreamingError + If an I/O error occurs while streaming logs or processing them. """ log_lines_counter = 0 v1 = client.CoreV1Api() @@ -250,17 +266,32 @@ def stream_logs(self, job_id, pod_name): self.concatenate_and_delete_files(log_file_path, temp_file_path) else: os.remove(temp_file_path) - logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_id}'.") - except Exception as e: - logger.exception(f"Error streaming logs for job '{job_id}': {e}") - + logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_name}'.") + except (IOError, OSError) as e: + logger.error(f"I/O error while streaming logs for job '{job_name}': {e}") + raise PodStreamingError(f"I/O error while streaming logs for job '{job_name}': {e}") from e + except KubernetesJobLogHandlerError as e: + logger.error(f"Error processing logs for job '{job_name}': {e}") + raise PodStreamingError(f"Error processing logs for job '{job_name}': {e}") from e + finally: + w.stop() def handle_events(self, event): """ - Watches Kubernetes pods and handles events such as 
starting log streaming or uploading logs.
+        Watches Kubernetes pod events and handles actions such as starting log streaming or uploading logs.
+
+        Parameters
+        ----------
+        event : dict
+            The event dictionary containing information about the pod event.
 
         Returns
         -------
         None
+
+        Raises
+        ------
+        KubernetesJobLogHandlerError
+            If an error occurs while handling pod events.
         """
         try:
@@ -297,5 +328,6 @@ def handle_events(self, event):
                         logger.info(f"Logfile not found for job '{job_id}'")
                 else:
                     logger.debug(f"Other pod event type '{event['type']}' for pod '{pod.metadata.name}' - Phase: '{pod.status.phase}'")
-        except Exception as e:
-            logger.exception(f"Error watching pods in namespace '{self.namespace}': {e}")
+        except KubernetesJobLogHandlerError as e:
+            logger.error(f"Handled error in handle_events: {e}")
+            raise e

From d2210da38ea26b5a3ff7b03f64cc51cf4d37cf82 Mon Sep 17 00:00:00 2001
From: Valeriia Klestova
Date: Wed, 27 Nov 2024 11:32:07 +0100
Subject: [PATCH 19/28] modify method that fetches next job id to be unsuspended
 to add creation_timestamps to the jobs that do not have them, so they are
 processed at the end of the queue; add unit tests for k8s_scheduler class

---
 scrapyd_k8s/k8s_scheduler/k8s_scheduler.py | 11 +-
 scrapyd_k8s/tests/unit/test_k8s_scheduler.py | 232 +++++++++++++++++++
 2 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 scrapyd_k8s/tests/unit/test_k8s_scheduler.py

diff --git a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py
index 747bc87..a9866be 100644
--- a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py
+++ b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py
@@ -1,4 +1,5 @@
 import logging
+import datetime
 
 from kubernetes.client import ApiException
 
@@ -169,11 +170,17 @@ def get_next_suspended_job_id(self):
             if not jobs:
                 logger.debug("No suspended jobs found.")
                 return None
+
+            # Assign default timestamp to jobs missing creation_timestamp
+            for job in jobs:
+                if not hasattr(job, 'metadata') or not 
hasattr(job.metadata, 'creation_timestamp') or not job.metadata.creation_timestamp: + job.metadata.creation_timestamp = datetime.datetime.max + logger.warning( + f"Job {job} missing 'metadata.creation_timestamp'; assigned max timestamp.") + # Sort jobs by creation timestamp to ensure FIFO order jobs.sort(key=lambda job: job.metadata.creation_timestamp) job = jobs[0] - if not hasattr(job, 'metadata') or not hasattr(job.metadata, 'creation_timestamp'): - raise AttributeError(f"Job object missing 'metadata.creation_timestamp': {job}") if not hasattr(job.metadata, 'labels'): raise AttributeError(f"Job object missing 'metadata.labels': {job}") job_id = job.metadata.labels.get(self.launcher.LABEL_JOB_ID) diff --git a/scrapyd_k8s/tests/unit/test_k8s_scheduler.py b/scrapyd_k8s/tests/unit/test_k8s_scheduler.py new file mode 100644 index 0000000..8589cd5 --- /dev/null +++ b/scrapyd_k8s/tests/unit/test_k8s_scheduler.py @@ -0,0 +1,232 @@ +import pytest +from unittest.mock import Mock, patch +from kubernetes.client.rest import ApiException +from scrapyd_k8s.k8s_scheduler.k8s_scheduler import KubernetesScheduler + +@pytest.fixture +def mock_config(): + mock_config = Mock() + mock_config.namespace.return_value = 'default' + return mock_config + +@pytest.fixture +def mock_launcher(): + mock_launcher = Mock() + mock_launcher.LABEL_JOB_ID = 'org.scrapy.job_id' + return mock_launcher + +def test_k8s_scheduler_init(mock_config, mock_launcher): + max_proc = 5 + scheduler = KubernetesScheduler(mock_config, mock_launcher, max_proc) + assert scheduler.config == mock_config + assert scheduler.launcher == mock_launcher + assert scheduler.max_proc == max_proc + assert scheduler.namespace == 'default' + +def test_k8s_scheduler_init_invalid_max_proc(mock_config, mock_launcher): + max_proc = 'five' # Not an integer + with pytest.raises(TypeError) as excinfo: + KubernetesScheduler(mock_config, mock_launcher, max_proc) + assert "max_proc must be an integer" in str(excinfo.value) + +def 
test_handle_pod_event_with_non_dict_event(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + event = ['not', 'a', 'dict'] + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.error.assert_called_with( + f"TypeError in handle_pod_event: Event must be a dictionary, got {type(event).__name__} in event: {event}" + ) + +def test_handle_pod_event_missing_keys(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + event = {'wrong_key': 'value'} + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.error.assert_called_with( + f"KeyError in handle_pod_event: Missing key 'object' in event: {event}" + ) + +def test_handle_pod_event_pod_missing_status(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + pod = Mock() + pod.status = None + pod.metadata = Mock() + pod.metadata.name = 'pod-name' + event = {'object': pod, 'type': 'MODIFIED'} + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.error.assert_called_with( + f"AttributeError in handle_pod_event: 'NoneType' object has no attribute 'phase' in event: {event}" + ) + +def test_handle_pod_event_pod_missing_metadata(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + pod = Mock() + pod.status = Mock() + pod.status.phase = 'Running' + pod.metadata = None + event = {'object': pod, 'type': 'MODIFIED'} + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.error.assert_called_with( + f"AttributeError in handle_pod_event: 'NoneType' object has no attribute 'name' in event: {event}" + ) + +def test_handle_pod_event_pod_not_related(mock_config, mock_launcher): + 
scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + pod = Mock() + pod.status = Mock() + pod.status.phase = 'Running' + pod.metadata = Mock() + pod.metadata.name = 'pod-name' + pod.metadata.labels = {} + event = {'object': pod, 'type': 'MODIFIED'} + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.debug.assert_called_with("Pod pod-name does not have our job label; ignoring.") + +def test_handle_pod_event_pod_terminated(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + pod = Mock() + pod.status = Mock() + pod.status.phase = 'Succeeded' + pod.metadata = Mock() + pod.metadata.name = 'pod-name' + pod.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job-id'} + event = {'object': pod, 'type': 'MODIFIED'} + + with patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs, \ + patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.info.assert_called_with( + "Pod pod-name has completed with phase Succeeded. Checking for suspended jobs." 
+ ) + mock_check_and_unsuspend_jobs.assert_called_once() + +def test_handle_pod_event_pod_not_terminated(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + pod = Mock() + pod.status = Mock() + pod.status.phase = 'Running' + pod.metadata = Mock() + pod.metadata.name = 'pod-name' + pod.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job-id'} + event = {'object': pod, 'type': 'MODIFIED'} + + with patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs, \ + patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + mock_logger.debug.assert_called_with("Pod pod-name event not relevant for unsuspension.") + mock_check_and_unsuspend_jobs.assert_not_called() + +def test_check_and_unsuspend_jobs_with_capacity_and_suspended_jobs(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = 3 + scheduler.get_next_suspended_job_id = Mock(side_effect=['job1', 'job2', None]) + mock_launcher.unsuspend_job.side_effect = [True, True] + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + assert mock_launcher.unsuspend_job.call_count == 2 + mock_launcher.unsuspend_job.assert_any_call('job1') + mock_launcher.unsuspend_job.assert_any_call('job2') + mock_logger.info.assert_any_call("Unsuspended job job1. Total running jobs now: 4") + mock_logger.info.assert_any_call("Unsuspended job job2. 
Total running jobs now: 5") + +def test_check_and_unsuspend_jobs_no_suspended_jobs(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = 3 + scheduler.get_next_suspended_job_id = Mock(return_value=None) + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + mock_launcher.unsuspend_job.assert_not_called() + mock_logger.info.assert_called_with("No suspended jobs to unsuspend.") + +def test_check_and_unsuspend_jobs_unsuspend_fails(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = 3 + scheduler.get_next_suspended_job_id = Mock(return_value='job1') + mock_launcher.unsuspend_job.return_value = False + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + mock_launcher.unsuspend_job.assert_called_once_with('job1') + mock_logger.error.assert_called_with("Failed to unsuspend job job1") + +def test_check_and_unsuspend_jobs_unsuspend_api_exception(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = 3 + scheduler.get_next_suspended_job_id = Mock(return_value='job1') + mock_launcher.unsuspend_job.side_effect = ApiException("API Error") + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + mock_launcher.unsuspend_job.assert_called_once_with('job1') + mock_logger.error.assert_called_with( + f"Kubernetes API exception while unsuspending job job1: (API Error)\nReason: None\n" + ) + +def test_get_next_suspended_job_id_with_suspended_jobs(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + job1 = Mock() + job1.metadata = Mock() + 
job1.metadata.creation_timestamp = '2021-01-01T00:00:00Z' + job1.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job1'} + + job2 = Mock() + job2.metadata = Mock() + job2.metadata.creation_timestamp = '2021-01-02T00:00:00Z' + job2.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job2'} + + mock_launcher.list_suspended_jobs.return_value = [job2, job1] + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + job_id = scheduler.get_next_suspended_job_id() + assert job_id == 'job1' + mock_logger.debug.assert_called_with("Next suspended job to unsuspend: job1") + +def test_get_next_suspended_job_id_no_suspended_jobs(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.list_suspended_jobs.return_value = [] + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + job_id = scheduler.get_next_suspended_job_id() + assert job_id is None + mock_logger.debug.assert_called_with("No suspended jobs found.") + +def test_get_next_suspended_job_id_list_suspended_jobs_returns_non_list(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.list_suspended_jobs.return_value = 'not a list' + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + job_id = scheduler.get_next_suspended_job_id() + assert job_id is None + mock_logger.error.assert_called_with( + "TypeError in get_next_suspended_job_id: list_suspended_jobs should return a list, got str" + ) + +def test_get_next_suspended_job_id_job_missing_creation_timestamp(mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + job = Mock() + job.metadata = Mock() + job.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job1'} + job.metadata.creation_timestamp = None # Simulate missing creation_timestamp + + mock_launcher.list_suspended_jobs.return_value = [job] + + with 
patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + job_id = scheduler.get_next_suspended_job_id() + assert job_id == 'job1' + mock_logger.warning.assert_called_with( + f"Job {job} missing 'metadata.creation_timestamp'; assigned max timestamp." + ) From 4da7e29c863d627cc72c27750010b93629e87f2e Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 27 Nov 2024 12:16:29 +0100 Subject: [PATCH 20/28] make the proper test structure according to the standards; add tests for docker implementation of the scheduler --- .../tests/unit/k8s_scheduler/__init__.py | 0 .../{ => k8s_scheduler}/test_k8s_scheduler.py | 0 scrapyd_k8s/tests/unit/launcher/__init__.py | 0 .../tests/unit/launcher/test_docker.py | 182 ++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 scrapyd_k8s/tests/unit/k8s_scheduler/__init__.py rename scrapyd_k8s/tests/unit/{ => k8s_scheduler}/test_k8s_scheduler.py (100%) create mode 100644 scrapyd_k8s/tests/unit/launcher/__init__.py create mode 100644 scrapyd_k8s/tests/unit/launcher/test_docker.py diff --git a/scrapyd_k8s/tests/unit/k8s_scheduler/__init__.py b/scrapyd_k8s/tests/unit/k8s_scheduler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapyd_k8s/tests/unit/test_k8s_scheduler.py b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py similarity index 100% rename from scrapyd_k8s/tests/unit/test_k8s_scheduler.py rename to scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py diff --git a/scrapyd_k8s/tests/unit/launcher/__init__.py b/scrapyd_k8s/tests/unit/launcher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapyd_k8s/tests/unit/launcher/test_docker.py b/scrapyd_k8s/tests/unit/launcher/test_docker.py new file mode 100644 index 0000000..eeec2ba --- /dev/null +++ b/scrapyd_k8s/tests/unit/launcher/test_docker.py @@ -0,0 +1,182 @@ +import pytest +from unittest.mock import MagicMock, patch +import time +from scrapyd_k8s.launcher.docker import Docker + 
+@pytest.fixture +def config_with_max_proc(): + config = MagicMock() + config.scrapyd.return_value.get.return_value = '2' # max_proc set to 2 + return config + +@pytest.fixture +def config_without_max_proc(): + config = MagicMock() + config.scrapyd.return_value.get.return_value = None # max_proc not set + return config + +@pytest.fixture +def mock_docker_client(): + with patch('scrapyd_k8s.launcher.docker.docker') as mock_docker_module: + mock_client = MagicMock() + mock_docker_module.from_env.return_value = mock_client + yield mock_client + +@pytest.fixture +def docker_launcher_with_max_proc(config_with_max_proc, mock_docker_client): + return Docker(config_with_max_proc) + +@pytest.fixture +def docker_launcher_without_max_proc(config_without_max_proc, mock_docker_client): + return Docker(config_without_max_proc) + +def test_docker_init_with_max_proc(config_with_max_proc, mock_docker_client): + docker_launcher = Docker(config_with_max_proc) + assert docker_launcher.max_proc == 2 + assert docker_launcher._thread is not None + assert docker_launcher._thread.is_alive() + +def test_docker_init_without_max_proc(config_without_max_proc, mock_docker_client): + docker_launcher = Docker(config_without_max_proc) + assert docker_launcher.max_proc is None + assert docker_launcher._thread is None + +def test_schedule_with_capacity(docker_launcher_with_max_proc, mock_docker_client): + # Mock methods + docker_launcher_with_max_proc.get_running_jobs_count = MagicMock(return_value=1) + docker_launcher_with_max_proc.start_pending_containers = MagicMock() + + # Mock container creation + mock_container = MagicMock() + mock_docker_client.containers.create.return_value = mock_container + + # Prepare parameters for schedule + project = MagicMock() + project.id.return_value = 'test_project' + project.repository.return_value = 'test_repo' + project.resources.return_value = {} + version = 'v1' + spider = 'test_spider' + job_id = 'job_123' + settings = {} + args = {} + + # Call schedule + 
docker_launcher_with_max_proc.schedule(project, version, spider, job_id, settings, args) + + # Verify that container is created + mock_docker_client.containers.create.assert_called_once() + + # Since running jobs count is less than max_proc, start_pending_containers should be called + docker_launcher_with_max_proc.start_pending_containers.assert_called_once() + +def test_schedule_no_capacity(docker_launcher_with_max_proc, mock_docker_client): + # Mock methods + docker_launcher_with_max_proc.get_running_jobs_count = MagicMock(return_value=2) # At max_proc + docker_launcher_with_max_proc.start_pending_containers = MagicMock() + + # Mock container creation + mock_container = MagicMock() + mock_docker_client.containers.create.return_value = mock_container + + # Prepare parameters for schedule + project = MagicMock() + project.id.return_value = 'test_project' + project.repository.return_value = 'test_repo' + project.resources.return_value = {} + version = 'v1' + spider = 'test_spider' + job_id = 'job_456' + settings = {} + args = {} + + # Patch the logger to check log outputs + with patch('scrapyd_k8s.launcher.docker.logger') as mock_logger: + # Call schedule + docker_launcher_with_max_proc.schedule(project, version, spider, job_id, settings, args) + + # Verify that container is created + mock_docker_client.containers.create.assert_called_once() + + # start_pending_containers should not be called since we're at capacity + docker_launcher_with_max_proc.start_pending_containers.assert_not_called() + + # Verify that container.start() is not called immediately + mock_container.start.assert_not_called() + + # Check that the correct log message was output + mock_logger.info.assert_called_with(f"Job {job_id} is pending due to max_proc limit.") + +def test_schedule_no_max_proc(docker_launcher_without_max_proc, mock_docker_client): + # Mock container creation + mock_container = MagicMock() + mock_docker_client.containers.create.return_value = mock_container + + # Prepare 
parameters for schedule + project = MagicMock() + project.id.return_value = 'test_project' + project.repository.return_value = 'test_repo' + project.resources.return_value = {} + version = 'v1' + spider = 'test_spider' + job_id = 'job_789' + settings = {} + args = {} + + # Call schedule + docker_launcher_without_max_proc.schedule(project, version, spider, job_id, settings, args) + + # Verify that container is created + mock_docker_client.containers.create.assert_called_once() + + # Since max_proc is not set, container.start() should be called immediately + mock_container.start.assert_called_once() + +def test_get_running_jobs_count(docker_launcher_with_max_proc, mock_docker_client): + # Mock the list of running containers + mock_container_list = [MagicMock(), MagicMock()] + mock_docker_client.containers.list.return_value = mock_container_list + + count = docker_launcher_with_max_proc.get_running_jobs_count() + + # Verify that the count matches the number of mock containers + assert count == 2 + mock_docker_client.containers.list.assert_called_with( + filters={'label': docker_launcher_with_max_proc.LABEL_PROJECT, 'status': 'running'}) + +def test_start_pending_containers(docker_launcher_with_max_proc, mock_docker_client): + # Mock the get_running_jobs_count method + docker_launcher_with_max_proc.get_running_jobs_count = MagicMock(return_value=1) + + # Mock pending containers + mock_pending_container = MagicMock() + mock_pending_container.name = 'pending_container' + mock_docker_client.containers.list.return_value = [mock_pending_container] + + # Patch logger to check log outputs + with patch('scrapyd_k8s.launcher.docker.logger') as mock_logger: + # Call start_pending_containers + docker_launcher_with_max_proc.start_pending_containers() + + # Verify that the pending container's start method was called + mock_pending_container.start.assert_called_once() + + # Verify that the correct log message was output + mock_logger.info.assert_called_with( + f"Started pending 
container {mock_pending_container.name}. Total running jobs now: 2" + ) + +def test_background_task_starts_pending_containers(config_with_max_proc, mock_docker_client): + # Mock start_pending_containers before initializing the Docker class + with patch.object(Docker, 'start_pending_containers', autospec=True) as mock_start_pending: + # Initialize Docker instance + docker_launcher = Docker(config_with_max_proc) + + # Wait for slightly more than check_interval to ensure the background task runs + time.sleep(5.1) # Wait for the background thread to execute + + # Verify that start_pending_containers was called by the background thread + assert mock_start_pending.call_count > 0 + + # Clean up by shutting down the background thread + docker_launcher.shutdown() From 3fa4922384d02e069485a06a554df21255716dc9 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 28 Jan 2025 13:33:04 +0100 Subject: [PATCH 21/28] rebased to main --- scrapyd_k8s/api.py | 4 ---- scrapyd_k8s/joblogs/log_handler_k8s.py | 13 +++++------ scrapyd_k8s/k8s_scheduler/k8s_scheduler.py | 4 +++- scrapyd_k8s/launcher/k8s.py | 26 +++++++++------------- 4 files changed, 20 insertions(+), 27 deletions(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 5afa45d..8c5b9b3 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -48,10 +48,6 @@ def api_schedule(): _version = request.form.get('_version', 'latest') # TODO allow customizing latest tag # any other parameter is passed as spider argument args = { k: v for k, v in request.form.items() if k not in ('project', 'spider', 'setting', 'jobid', 'priority', '_version') } - # running_jobs = launcher.get_running_jobs_count() - # start_suspended = running_jobs >= k8s_scheduler.max_proc - # logger.info( - # f"Scheduling job {job_id} with start_suspended={start_suspended}. 
Running jobs: {running_jobs}, Max procs: {k8s_scheduler.max_proc}") env_config, env_secret = project.env_config(), project.env_secret() jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args, start_suspended=start_suspended) return { 'status': 'ok', 'jobid': job_id } diff --git a/scrapyd_k8s/joblogs/log_handler_k8s.py b/scrapyd_k8s/joblogs/log_handler_k8s.py index 71643a0..b9d8dca 100644 --- a/scrapyd_k8s/joblogs/log_handler_k8s.py +++ b/scrapyd_k8s/joblogs/log_handler_k8s.py @@ -266,15 +266,14 @@ def stream_logs(self, job_id, pod_name): self.concatenate_and_delete_files(log_file_path, temp_file_path) else: os.remove(temp_file_path) - logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_name}'.") + logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_id}'.") except (IOError, OSError) as e: - logger.error(f"I/O error while streaming logs for job '{job_name}': {e}") - raise PodStreamingError(f"I/O error while streaming logs for job '{job_name}': {e}") from e + logger.error(f"I/O error while streaming logs for job '{job_id}': {e}") + raise PodStreamingError(f"I/O error while streaming logs for job '{job_id}': {e}") from e except KubernetesJobLogHandlerError as e: - logger.error(f"Error processing logs for job '{job_name}': {e}") - raise PodStreamingError(f"Error processing logs for job '{job_name}': {e}") from e - finally: - w.stop() + logger.error(f"Error processing logs for job '{job_id}': {e}") + raise PodStreamingError(f"Error processing logs for job '{job_id}': {e}") from e + def handle_events(self, event): """ Watches Kubernetes pod events and handles actions such as starting log streaming or uploading logs. 
diff --git a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py index a9866be..34e9f2e 100644 --- a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py +++ b/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py @@ -4,6 +4,7 @@ from kubernetes.client import ApiException logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) class KubernetesScheduler: """ @@ -35,6 +36,7 @@ def __init__(self, config, launcher, max_proc): raise TypeError(f"max_proc must be an integer, got {type(max_proc).__name__}") self.max_proc = max_proc self.namespace = config.namespace() + logger.info("Scheduler feature is initialized") except TypeError as e: logger.exception(f"TypeError during KubernetesScheduler initialization: {e}") raise @@ -82,7 +84,7 @@ def handle_pod_event(self, event): return # If a pod has terminated (Succeeded or Failed), we may have capacity to unsuspend jobs - if pod_phase in ('Succeeded', 'Failed') and event_type in ('MODIFIED', 'DELETED'): + if pod_phase in ('Succeeded', 'Failed'): logger.info(f"Pod {pod_name} has completed with phase {pod_phase}. 
Checking for suspended jobs.") self.check_and_unsuspend_jobs() else: diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index f3d0bc5..46a7a19 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -18,6 +18,7 @@ from scrapyd_k8s.joblogs import KubernetesJobLogHandler logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) class K8s: @@ -27,6 +28,7 @@ class K8s: def __init__(self, config): self._namespace = config.scrapyd().get('namespace', 'default') + self.max_proc = config.scrapyd().get('max_proc') self._pull_secret = config.scrapyd().get('pull_secret') # TODO figure out where to put Kubernetes initialisation try: @@ -37,9 +39,8 @@ def __init__(self, config): self._k8s = kubernetes.client.CoreV1Api() self._k8s_batch = kubernetes.client.BatchV1Api() - self.scheduler = None + self.k8s_scheduler = None self._init_resource_watcher(config) - self.max_proc = int(config.scrapyd().get('max_proc', 4)) def _init_resource_watcher(self, config): self.resource_watcher = ResourceWatcher(self._namespace, config) @@ -50,12 +51,7 @@ def _init_resource_watcher(self, config): if self.max_proc is not None: self.enable_k8s_scheduler(config) else: - logger.debug("k8s scheduler not enabled; 'max_proc' configuration is missing in the scrapyd section.") - - # Initialize KubernetesScheduler - self.max_proc = int(config.scrapyd().get('max_proc', 4)) - self.k8s_scheduler = KubernetesScheduler(config, self, self.resource_watcher, self.max_proc) - logger.debug(f"KubernetesLauncher initialized with max_proc={self.max_proc}.") + logger.debug("k8s scheduler not enabled; jobs run directly after scheduling.") def get_node_name(self): deployment = os.getenv('MY_DEPLOYMENT_NAME', 'default') @@ -71,11 +67,11 @@ def listjobs(self, project=None): ) def schedule(self, project, version, spider, job_id, settings, args): - if self.scheduler: + if self.k8s_scheduler: running_jobs = self.get_running_jobs_count() - start_suspended = running_jobs >= 
self.scheduler.max_proc + start_suspended = running_jobs >= self.k8s_scheduler.max_proc logger.debug( - f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {self.scheduler.max_proc}") + f"Scheduling job {job_id} with start_suspended={start_suspended}. Running jobs: {running_jobs}, Max procs: {self.k8s_scheduler.max_proc}") else: start_suspended = False logger.debug(f"Scheduling job {job_id} without suspension. Scheduler not enabled.") @@ -175,16 +171,16 @@ def enable_joblogs(self, config): def enable_k8s_scheduler(self, config): try: max_proc = int(self.max_proc) - self.scheduler = KubernetesScheduler(config, self, max_proc) + self.k8s_scheduler = KubernetesScheduler(config, self, max_proc) logger.debug(f"KubernetesLauncher initialized with max_proc={max_proc}.") - self.resource_watcher.subscribe(self.scheduler.handle_pod_event) + self.resource_watcher.subscribe(self.k8s_scheduler.handle_pod_event) logger.info("K8s scheduler started.") except ValueError: logger.error(f"Invalid max_proc value: {self.max_proc}. Scheduler not enabled.") - self.scheduler = None + self.k8s_scheduler = None def unsuspend_job(self, job_id: str): - if not self.scheduler: + if not self.k8s_scheduler: logger.error("Scheduler is not enabled. 
Cannot unsuspend jobs.") return False job_name = self._get_job_name(job_id) From 7925d37af98b8bd5525d748f431d0fa3c4c37c07 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 29 Jan 2025 10:13:39 +0100 Subject: [PATCH 22/28] change the test test_listversions_ok to check the exact list --- scrapyd_k8s/tests/integration/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapyd_k8s/tests/integration/test_api.py b/scrapyd_k8s/tests/integration/test_api.py index 4d34b5a..45f51b2 100644 --- a/scrapyd_k8s/tests/integration/test_api.py +++ b/scrapyd_k8s/tests/integration/test_api.py @@ -44,7 +44,7 @@ def test_listversions_ok(): assert_response_ok(response) json = response.json() - assert set(AVAIL_VERSIONS).issubset(set(json['versions'])) + assert json['versions'] == AVAIL_VERSIONS assert 'node_name' in json def test_listversions_project_missing(): From 0f89754801f9e0604c8eda82e88d8fd4a7cee133 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 25 Feb 2025 10:25:22 +0100 Subject: [PATCH 23/28] add integration tests for scheduler with two different configs --- kubernetes-0.yaml | 191 +++++++++ kubernetes-1.yaml | 191 +++++++++ .../tests/integration/test_k8s_scheduler.py | 402 ++++++++++++++++++ 3 files changed, 784 insertions(+) create mode 100644 kubernetes-0.yaml create mode 100644 kubernetes-1.yaml create mode 100644 scrapyd_k8s/tests/integration/test_k8s_scheduler.py diff --git a/kubernetes-0.yaml b/kubernetes-0.yaml new file mode 100644 index 0000000..b63e32c --- /dev/null +++ b/kubernetes-0.yaml @@ -0,0 +1,191 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: scrapyd-k8s + name: scrapyd-k8s +spec: + selector: + matchLabels: + app.kubernetes.io/name: scrapyd-k8s + template: + metadata: + labels: + app.kubernetes.io/name: scrapyd-k8s + spec: + securityContext: + fsGroup: 1000 + serviceAccountName: scrapyd-k8s + containers: + - image: scrapyd-k8s-test:latest + imagePullPolicy: Never + name: 
scrapyd-k8s + ports: + - containerPort: 6800 + name: http + protocol: TCP + env: + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app.kubernetes.io/name'] + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: http + livenessProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + resources: + limits: + memory: 128Mi # TODO check + requests: + memory: 64Mi # TODO check + volumeMounts: + - name: scrapyd-k8s-config + mountPath: /opt/app/scrapyd_k8s.conf + readOnly: true + subPath: scrapyd_k8s.conf + #- name: joblogs + # mountPath: /data + # Enable if your spider repository needs a pull secret + # - name: scrapyd-k8s-pull-secret + # mountPath: /opt/app/.docker + # readOnly: true + volumes: + - configMap: + name: scrapyd-k8s-config + name: scrapyd-k8s-config + #- name: joblogs + # persistentVolumeClaim: + # claimName: pv-claim + # Enable if your spider repository needs a pull secret + # - secret: + # secretName: pull-secret + # items: + # - key: .dockerconfigjson + # path: config.json +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: scrapyd-k8s-config + labels: + app.kubernetes.io/name: scrapyd-k8s +data: + scrapyd_k8s.conf: |- + [scrapyd] + bind_address = 0.0.0.0 + http_port = 6800 + + repository = scrapyd_k8s.repository.Remote + launcher = scrapyd_k8s.launcher.K8s + + namespace = default + + max_proc = 0 + + # This is an example spider that should work out of the box. + # Adapt the spider config to your use-case. + [project.example] + env_secret = spider-example-env + env_config = spider-example-env + repository = ghcr.io/q-m/scrapyd-k8s-spider-example + + # It is strongly recommended to set resource requests and limits on production. + # They can be overridden on the project and spider level. 
+ [default.resources] + requests_cpu = 0.2 + requests_memory = 0.2G + limits_cpu = 0.8 + limits_memory = 0.5G + + [joblogs] + logs_dir = /data/joblogs +--- +apiVersion: v1 +kind: Secret +metadata: + name: spider-example-env + labels: + app.kubernetes.io/name: spider-example +stringData: + FOO_API_KEY: "1234567890abcdef" +#--- +#apiVersion: v1 +#kind: PersistentVolumeClaim +#metadata: +# name: pv-claim +#spec: +# accessModes: +# - ReadWriteOnce +# resources: +# requests: +# storage: 5Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spider-example-env + labels: + app.kubernetes.io/name: spider-example +data: + BAR_VALUE: "baz" +--- +apiVersion: v1 +kind: Service +metadata: + name: scrapyd-k8s + labels: + app.kubernetes.io/name: scrapyd-k8s +spec: + type: ClusterIP + ports: + - name: http + port: 6800 + protocol: TCP + targetPort: http + selector: + app.kubernetes.io/name: scrapyd-k8s +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scrapyd-k8s +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: scrapyd-k8s +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/exec"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "create", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: scrapyd-k8s +subjects: + - kind: ServiceAccount + name: scrapyd-k8s +roleRef: + kind: Role + name: scrapyd-k8s + apiGroup: rbac.authorization.k8s.io diff --git a/kubernetes-1.yaml b/kubernetes-1.yaml new file mode 100644 index 0000000..9343e74 --- /dev/null +++ b/kubernetes-1.yaml @@ -0,0 +1,191 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: scrapyd-k8s + name: scrapyd-k8s +spec: + selector: + matchLabels: + app.kubernetes.io/name: scrapyd-k8s + template: + metadata: + labels: + 
app.kubernetes.io/name: scrapyd-k8s + spec: + securityContext: + fsGroup: 1000 + serviceAccountName: scrapyd-k8s + containers: + - image: scrapyd-k8s-test:latest + imagePullPolicy: Never + name: scrapyd-k8s + ports: + - containerPort: 6800 + name: http + protocol: TCP + env: + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app.kubernetes.io/name'] + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: http + livenessProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + resources: + limits: + memory: 128Mi # TODO check + requests: + memory: 64Mi # TODO check + volumeMounts: + - name: scrapyd-k8s-config + mountPath: /opt/app/scrapyd_k8s.conf + readOnly: true + subPath: scrapyd_k8s.conf + #- name: joblogs + # mountPath: /data + # Enable if your spider repository needs a pull secret + # - name: scrapyd-k8s-pull-secret + # mountPath: /opt/app/.docker + # readOnly: true + volumes: + - configMap: + name: scrapyd-k8s-config + name: scrapyd-k8s-config + #- name: joblogs + # persistentVolumeClaim: + # claimName: pv-claim + # Enable if your spider repository needs a pull secret + # - secret: + # secretName: pull-secret + # items: + # - key: .dockerconfigjson + # path: config.json +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: scrapyd-k8s-config + labels: + app.kubernetes.io/name: scrapyd-k8s +data: + scrapyd_k8s.conf: |- + [scrapyd] + bind_address = 0.0.0.0 + http_port = 6800 + + repository = scrapyd_k8s.repository.Remote + launcher = scrapyd_k8s.launcher.K8s + + namespace = default + + max_proc = 1 + + # This is an example spider that should work out of the box. + # Adapt the spider config to your use-case. 
+ [project.example] + env_secret = spider-example-env + env_config = spider-example-env + repository = ghcr.io/q-m/scrapyd-k8s-spider-example + + # It is strongly recommended to set resource requests and limits on production. + # They can be overridden on the project and spider level. + [default.resources] + requests_cpu = 0.2 + requests_memory = 0.2G + limits_cpu = 0.8 + limits_memory = 0.5G + + [joblogs] + logs_dir = /data/joblogs +--- +apiVersion: v1 +kind: Secret +metadata: + name: spider-example-env + labels: + app.kubernetes.io/name: spider-example +stringData: + FOO_API_KEY: "1234567890abcdef" +#--- +#apiVersion: v1 +#kind: PersistentVolumeClaim +#metadata: +# name: pv-claim +#spec: +# accessModes: +# - ReadWriteOnce +# resources: +# requests: +# storage: 5Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spider-example-env + labels: + app.kubernetes.io/name: spider-example +data: + BAR_VALUE: "baz" +--- +apiVersion: v1 +kind: Service +metadata: + name: scrapyd-k8s + labels: + app.kubernetes.io/name: scrapyd-k8s +spec: + type: ClusterIP + ports: + - name: http + port: 6800 + protocol: TCP + targetPort: http + selector: + app.kubernetes.io/name: scrapyd-k8s +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scrapyd-k8s +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: scrapyd-k8s +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/exec"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "create", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: scrapyd-k8s +subjects: + - kind: ServiceAccount + name: scrapyd-k8s +roleRef: + kind: Role + name: scrapyd-k8s + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/scrapyd_k8s/tests/integration/test_k8s_scheduler.py 
b/scrapyd_k8s/tests/integration/test_k8s_scheduler.py new file mode 100644 index 0000000..0379186 --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_k8s_scheduler.py @@ -0,0 +1,402 @@ +import os +import pytest +import subprocess +import time +import requests +import uuid +import shutil + +USE_REAL_MINIKUBE = os.environ.get("USE_REAL_MINIKUBE", "false").lower() == "true" + + +def is_minikube_running(): + try: + proc = subprocess.run( + ["minikube", "status", "--output=json"], + capture_output=True, text=True, check=True, timeout=10 + ) + return "Running" in proc.stdout + except Exception: + return False + + +@pytest.fixture(scope="session") +def ensure_minikube(): + """If USE_REAL_MINIKUBE is true, ensure minikube is up and context set.""" + if not USE_REAL_MINIKUBE: + pytest.skip("Skipping real k8s tests unless USE_REAL_MINIKUBE=true is set.") + + if not shutil.which("minikube"): + pytest.skip("Minikube not installed, skipping.") + + if not is_minikube_running(): + try: + proc = subprocess.run( + ["minikube", "start", "--driver=docker"], + check=True, capture_output=True, text=True, timeout=180 + ) + print(proc.stdout) + except subprocess.CalledProcessError as e: + pytest.skip(f"Failed to start minikube: {e.stderr}") + + # Force context to minikube + try: + subprocess.run(["kubectl", "config", "use-context", "minikube"], check=True) + except subprocess.CalledProcessError: + pytest.skip("Failed to use minikube context.") + + # Check cluster + try: + subprocess.run(["kubectl", "cluster-info"], check=True) + except subprocess.CalledProcessError: + pytest.skip("kubectl cluster-info failed.") + + yield + + # Clean up all Kubernetes resources after all tests + print("Final cleanup of all resources...") + subprocess.run(["kubectl", "delete", "all", "--all"], check=False) + # Don't delete minikube itself - leave that to the user + + +@pytest.fixture(scope="session") +def minikube_env(): + result = subprocess.run(["minikube", "docker-env"], check=True, 
capture_output=True, text=True) + + env = os.environ.copy() + for line in result.stdout.split('\n'): + if line.startswith("export "): + key, value = line[len("export "):].split("=", 1) + env[key] = value.strip('"') + + yield env + + +@pytest.fixture(scope="session", autouse=True) +def ensure_scrapyd_image(minikube_env): + print("Building test container...") + subprocess.run(["docker", "build", "-t", "scrapyd-k8s-test:latest", "."], check=True, env=minikube_env) + + yield + + print("Removing test container...") + subprocess.run(["docker", "rmi", "scrapyd-k8s-test:latest"], check=False, env=minikube_env) + + +def wait_for_scrapyd_ready(label_selector="app.kubernetes.io/name=scrapyd-k8s", timeout=40): + """ + Wait until a pod matching label_selector is Running, then sleep 5s. + """ + import time + start = time.time() + ready = False + + print(f"Waiting for pod with label {label_selector} to be ready (timeout: {timeout}s)...") + + while time.time() - start < timeout: + get_pods = subprocess.run( + ["kubectl", "get", "pods", "--selector", label_selector], + capture_output=True, text=True + ) + print(f"Current pods: {get_pods.stdout}") + + if "Running" in get_pods.stdout: + print(f"Pod is now Running. Waiting a moment for internal initialization...") + # give it a few seconds for internal init + time.sleep(3) + ready = True + break + + if not ready: + # Get deployment events for debugging + print("Pod not ready in time. Checking events:") + events = subprocess.run( + ["kubectl", "get", "events", "--sort-by=.metadata.creationTimestamp"], + capture_output=True, text=True + ) + print(events.stdout) + pytest.fail(f"No 'Running' pods found with label {label_selector} after {timeout}s") + + return True + + +@pytest.fixture +def scrapyd_service(ensure_minikube, minikube_env, request): + """ + Parameterized fixture that sets up scrapyd with specified max_proc. 
+ """ + # Get max_proc from parameter or default to 0 + max_proc = getattr(request, "param", 0) + + # Retrieve a temporary kubernetes yaml with the specified max_proc + yaml_file = "kubernetes-%d.yaml" % max_proc + + # Now apply the new configuration + print(f"Applying {yaml_file} with max_proc={max_proc}...") + subprocess.run(["kubectl", "apply", "-f", yaml_file], check=True, env=minikube_env) + + # Wait for the deployment to be ready + wait_for_scrapyd_ready() + + # Even after pod is ready, give it a few more seconds to fully initialize + print("Pod is ready, waiting for internal initialization...") + time.sleep(3) + + # Port-forward + pf_proc = subprocess.Popen( + ["kubectl", "port-forward", "service/scrapyd-k8s", "6800:6800"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + time.sleep(3) + + # Quick health check + base_url = "http://127.0.0.1:6800" + try: + resp = requests.get(f"{base_url}/healthz", timeout=5) + if resp.status_code != 200: + pytest.fail("Scrapyd /healthz not OK.") + except Exception as e: + pf_proc.terminate() + pytest.fail(f"Couldn't connect to scrapyd at {base_url}: {e}") + + # Log diagnostics - check the actual configuration + print("Checking actual config in pod...") + config_check = subprocess.run( + ["kubectl", "exec", "deployment/scrapyd-k8s", "--", "cat", "/opt/app/scrapyd_k8s.conf"], + capture_output=True, text=True + ) + if config_check.returncode == 0: + print("Current config in pod:") + print(config_check.stdout) + + # Highlight the max_proc setting + if f"max_proc = {max_proc}" in config_check.stdout: + print(f"✅ max_proc={max_proc} found in config") + else: + print(f"❌ max_proc={max_proc} NOT found in config") + + # Check logs for scheduler initialization + print("Checking for scheduler initialization in logs...") + logs = subprocess.run( + ["kubectl", "logs", "deployment/scrapyd-k8s"], + capture_output=True, text=True + ) + if "k8s scheduler started" in logs.stdout.lower(): + print("✅ Scheduler initialization found in 
logs") + else: + print("❌ Scheduler initialization NOT found in logs") + + # Additional diagnostic information + print("Full pod logs:") + print(logs.stdout) + + # Print out the environment variables + print("Checking environment variables:") + env_vars = subprocess.run( + ["kubectl", "exec", "deployment/scrapyd-k8s", "--", "env"], + capture_output=True, text=True + ) + print(env_vars.stdout) + + yield base_url, max_proc + + # Cleanup + pf_proc.terminate() + print(f"Deleting {yaml_file} and cleaning up resources...") + subprocess.run(["kubectl", "delete", "-f", yaml_file], check=False, env=minikube_env) + + +def wait_for_job_state(service_url, job_id, target_state, timeout=25): + """ + Wait until job `job_id` is in `target_state` (one of 'pending', 'running', 'finished') according to listjobs.json. + Raise TimeoutError if not. + """ + start = time.time() + while time.time() - start < timeout: + r = requests.get(f"{service_url}/listjobs.json") + data = r.json() + # data has keys: "pending", "running", "finished" + for key in ["pending", "running", "finished"]: + if any(j["id"] == job_id for j in data[key]): + if key == target_state: + return + time.sleep(2) + raise TimeoutError(f"Job {job_id} not in {target_state} after {timeout}s") + + +@pytest.mark.parametrize("scrapyd_service", [0], indirect=True) +def test_zero_allowed_jobs(scrapyd_service): + """ + With max_proc=0, any scheduled job should remain in the 'pending' state in /listjobs.json + (i.e., it never goes to 'running' or 'finished'). 
+ """ + service_url, max_proc = scrapyd_service + print(f"Testing with max_proc={max_proc}") + + # First check the max_proc setting in the pod + pods = subprocess.run( + ["kubectl", "get", "pods", "--selector=app.kubernetes.io/name=scrapyd-k8s", "-o", "name"], + capture_output=True, text=True + ) + pod_name = pods.stdout.strip() + + if pod_name: + print(f"Found pod: {pod_name}") + # Check for the max_proc in logs again + pod_logs = subprocess.run( + ["kubectl", "logs", pod_name], + capture_output=True, text=True + ) + print("Pod logs related to scheduler:") + for line in pod_logs.stdout.split('\n'): + if 'sched' in line.lower() or 'max_proc' in line.lower(): + print(f" {line.strip()}") + else: + print("No pods found for scrapyd-k8s") + + # Schedule a job + job_id = uuid.uuid4().hex + resp = requests.post(f"{service_url}/schedule.json", data={ + "project": "example", + "spider": "quotes", + "_version": "latest", + "jobid": job_id + }) + assert resp.status_code == 200 + + # Wait some time, ensure it never leaves "pending" + # (If your job finishes super fast, you'd never see it running anyway. 
+ # So the best we can do is "still pending after X seconds.") + wait_time = 10 + print(f"Waiting {wait_time}s to confirm job stays pending...") + time.sleep(wait_time) + + # Check the job is still in 'pending' + r = requests.get(f"{service_url}/listjobs.json").json() + pending = [j for j in r["pending"] if j["id"] == job_id] + running = [j for j in r["running"] if j["id"] == job_id] + finished = [j for j in r["finished"] if j["id"] == job_id] + + # Check Kubernetes job state + check_k8s = subprocess.run( + ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job_id, "-o", "wide"], + capture_output=True, text=True + ) + print(f"Job {job_id} Kubernetes status: {check_k8s.stdout}") + + # Check if the job is suspended + check_suspended = subprocess.run( + ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job_id, "-o", + "jsonpath='{.items[0].spec.suspend}'"], + capture_output=True, text=True + ) + print(f"Job {job_id} suspended state: {check_suspended.stdout}") + + assert len(pending) == 1, ( + f"Expected job {job_id} to remain pending (max_proc=0), got state: {r}" + ) + assert not running, f"Job {job_id} unexpectedly running: {r}" + assert not finished, f"Job {job_id} unexpectedly finished: {r}" + + print("test_zero_allowed_jobs passed: job remained pending.") + + +@pytest.mark.parametrize("scrapyd_service", [1], indirect=True) +def test_one_allowed_job(scrapyd_service): + """ + With max_proc=1, scheduling two jobs back to back => + - The first should move into 'running' + - The second should remain 'pending' until the first finishes + """ + service_url, max_proc = scrapyd_service + print(f"Testing with max_proc={max_proc}") + + # Check for the max_proc in pod + pods = subprocess.run( + ["kubectl", "get", "pods", "--selector=app.kubernetes.io/name=scrapyd-k8s", "-o", "name"], + capture_output=True, text=True + ) + pod_name = pods.stdout.strip() + + if pod_name: + print(f"Found pod: {pod_name}") + # Check for the max_proc in logs again + pod_logs = 
subprocess.run( + ["kubectl", "logs", pod_name], + capture_output=True, text=True + ) + print("Pod logs related to scheduler:") + for line in pod_logs.stdout.split('\n'): + if 'sched' in line.lower() or 'max_proc' in line.lower(): + print(f" {line.strip()}") + else: + print("No pods found for scrapyd-k8s") + + job1_id = uuid.uuid4().hex + job2_id = uuid.uuid4().hex + + # Schedule job1 + r1 = requests.post(f"{service_url}/schedule.json", data={ + "project": "example", + "spider": "quotes", + "_version": "latest", + "jobid": job1_id + }) + assert r1.status_code == 200 + + # Schedule job2 + r2 = requests.post(f"{service_url}/schedule.json", data={ + "project": "example", + "spider": "quotes", + "_version": "latest", + "jobid": job2_id + }) + assert r2.status_code == 200 + + # job1 should eventually be 'running' + print("Waiting for job1 to appear running...") + wait_for_job_state(service_url, job1_id, 'running', timeout=20) + + # Once job1 is running, job2 should remain 'pending' + print("Checking job2 is pending while job1 runs...") + r = requests.get(f"{service_url}/listjobs.json").json() + pending2 = any(j["id"] == job2_id for j in r["pending"]) + running2 = any(j["id"] == job2_id for j in r["running"]) + + # Check Kubernetes job states + check_job1 = subprocess.run( + ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job1_id, "-o", "wide"], + capture_output=True, text=True + ) + print(f"Job1 {job1_id} Kubernetes status: {check_job1.stdout}") + + check_job2 = subprocess.run( + ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job2_id, "-o", "wide"], + capture_output=True, text=True + ) + print(f"Job2 {job2_id} Kubernetes status: {check_job2.stdout}") + + # Check if job2 is suspended + check_job2_suspended = subprocess.run( + ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job2_id, "-o", + "jsonpath='{.items[0].spec.suspend}'"], + capture_output=True, text=True + ) + print(f"Job2 {job2_id} suspended state: 
{check_job2_suspended.stdout}") + + assert pending2, f"job2 expected pending, got {r}" + assert not running2, f"job2 is unexpectedly running: {r}" + + # Wait for job1 to finish + try: + print("Waiting for job1 to finish...") + wait_for_job_state(service_url, job1_id, 'finished', timeout=20) + except TimeoutError as e: + # if job1 is extremely slow or never finishes, fail + pytest.fail(str(e)) + + # After job1 finishes, job2 should become running + print("Waiting up to 30s for job2 to become running now that job1 finished...") + wait_for_job_state(service_url, job2_id, 'running', timeout=25) + + print("test_one_allowed_job passed: concurrency=1 logic is correct.") From eb7173f4f11d5e2a86bbfd44e15744d624d4ae71 Mon Sep 17 00:00:00 2001 From: wvengen Date: Tue, 4 Mar 2025 14:57:58 +0100 Subject: [PATCH 24/28] Adapt integration testing setup --- .github/workflows/test.yml | 5 +- kubernetes-0.yaml | 191 --------- kubernetes-1.yaml | 191 --------- kubernetes.yaml | 4 +- scrapyd_k8s/api.py | 2 +- .../tests/integration/test_k8s_scheduler.py | 402 ------------------ .../tests/integration/test_maxproc_one.conf | 3 + .../tests/integration/test_maxproc_one.k8s.sh | 26 ++ .../tests/integration/test_maxproc_one.py | 75 ++++ .../tests/integration/test_maxproc_zero.conf | 3 + .../integration/test_maxproc_zero.k8s.sh | 26 ++ .../tests/integration/test_maxproc_zero.py | 50 +++ 12 files changed, 191 insertions(+), 787 deletions(-) delete mode 100644 kubernetes-0.yaml delete mode 100644 kubernetes-1.yaml delete mode 100644 scrapyd_k8s/tests/integration/test_k8s_scheduler.py create mode 100644 scrapyd_k8s/tests/integration/test_maxproc_one.conf create mode 100755 scrapyd_k8s/tests/integration/test_maxproc_one.k8s.sh create mode 100644 scrapyd_k8s/tests/integration/test_maxproc_one.py create mode 100644 scrapyd_k8s/tests/integration/test_maxproc_zero.conf create mode 100755 scrapyd_k8s/tests/integration/test_maxproc_zero.k8s.sh create mode 100644 
scrapyd_k8s/tests/integration/test_maxproc_zero.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6bde8f6..2de8204 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -132,9 +132,11 @@ jobs: # for each integration test file for test in scrapyd_k8s/tests/integration/test_*.py; do echo; echo "# $test" - # run scrapyd-k8s with test-specific configuration file + # run scrapyd-k8s with test-specific configuration file, run k8s patch if available cfg=`echo "$test" | sed 's/\.py$/.conf/'` kubectl create cm scrapyd-k8s-testcfg --from-file=scrapyd_k8s.test.conf="$cfg" + k8sconfig=`echo "$test" | sed 's/\.py$/\.k8s.sh/'` + [ -x "$k8sconfig" ] && "$k8sconfig" up kubectl scale --replicas=1 deploy/scrapyd-k8s # wait for scrapyd-k8s to become ready kubectl wait --for=condition=Available deploy/scrapyd-k8s --timeout=60s @@ -151,6 +153,7 @@ jobs: kubectl scale --replicas=0 deploy/scrapyd-k8s kubectl wait --for=delete pod -l app.kubernetes.io/name=scrapyd-k8s --timeout=90s kubectl delete cm scrapyd-k8s-testcfg --wait + [ -x "$k8sconfig" ] && "$k8sconfig" down done test-k8s: diff --git a/kubernetes-0.yaml b/kubernetes-0.yaml deleted file mode 100644 index b63e32c..0000000 --- a/kubernetes-0.yaml +++ /dev/null @@ -1,191 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/name: scrapyd-k8s - name: scrapyd-k8s -spec: - selector: - matchLabels: - app.kubernetes.io/name: scrapyd-k8s - template: - metadata: - labels: - app.kubernetes.io/name: scrapyd-k8s - spec: - securityContext: - fsGroup: 1000 - serviceAccountName: scrapyd-k8s - containers: - - image: scrapyd-k8s-test:latest - imagePullPolicy: Never - name: scrapyd-k8s - ports: - - containerPort: 6800 - name: http - protocol: TCP - env: - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app.kubernetes.io/name'] - readinessProbe: - 
failureThreshold: 3 - httpGet: - path: /healthz - port: http - livenessProbe: - failureThreshold: 30 - httpGet: - path: /healthz - port: http - resources: - limits: - memory: 128Mi # TODO check - requests: - memory: 64Mi # TODO check - volumeMounts: - - name: scrapyd-k8s-config - mountPath: /opt/app/scrapyd_k8s.conf - readOnly: true - subPath: scrapyd_k8s.conf - #- name: joblogs - # mountPath: /data - # Enable if your spider repository needs a pull secret - # - name: scrapyd-k8s-pull-secret - # mountPath: /opt/app/.docker - # readOnly: true - volumes: - - configMap: - name: scrapyd-k8s-config - name: scrapyd-k8s-config - #- name: joblogs - # persistentVolumeClaim: - # claimName: pv-claim - # Enable if your spider repository needs a pull secret - # - secret: - # secretName: pull-secret - # items: - # - key: .dockerconfigjson - # path: config.json ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: scrapyd-k8s-config - labels: - app.kubernetes.io/name: scrapyd-k8s -data: - scrapyd_k8s.conf: |- - [scrapyd] - bind_address = 0.0.0.0 - http_port = 6800 - - repository = scrapyd_k8s.repository.Remote - launcher = scrapyd_k8s.launcher.K8s - - namespace = default - - max_proc = 0 - - # This is an example spider that should work out of the box. - # Adapt the spider config to your use-case. - [project.example] - env_secret = spider-example-env - env_config = spider-example-env - repository = ghcr.io/q-m/scrapyd-k8s-spider-example - - # It is strongly recomended to set resource requests and limits on production. - # They can be overridden on the project and spider level. 
- [default.resources] - requests_cpu = 0.2 - requests_memory = 0.2G - limits_cpu = 0.8 - limits_memory = 0.5G - - [joblogs] - logs_dir = /data/joblogs ---- -apiVersion: v1 -kind: Secret -metadata: - name: spider-example-env - labels: - app.kubernetes.io/name: spider-example -stringData: - FOO_API_KEY: "1234567890abcdef" -#--- -#apiVersion: v1 -#kind: PersistentVolumeClaim -#metadata: -# name: pv-claim -#spec: -# accessModes: -# - ReadWriteOnce -# resources: -# requests: -# storage: 5Gi ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: spider-example-env - labels: - app.kubernetes.io/name: spider-example -data: - BAR_VALUE: "baz" ---- -apiVersion: v1 -kind: Service -metadata: - name: scrapyd-k8s - labels: - app.kubernetes.io/name: scrapyd-k8s -spec: - type: ClusterIP - ports: - - name: http - port: 6800 - protocol: TCP - targetPort: http - selector: - app.kubernetes.io/name: scrapyd-k8s ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: scrapyd-k8s ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: scrapyd-k8s -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["pods/exec"] - verbs: ["get"] - - apiGroups: [""] - resources: ["pods/log"] - verbs: ["get"] - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "create", "patch", "delete"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: scrapyd-k8s -subjects: - - kind: ServiceAccount - name: scrapyd-k8s -roleRef: - kind: Role - name: scrapyd-k8s - apiGroup: rbac.authorization.k8s.io diff --git a/kubernetes-1.yaml b/kubernetes-1.yaml deleted file mode 100644 index 9343e74..0000000 --- a/kubernetes-1.yaml +++ /dev/null @@ -1,191 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/name: scrapyd-k8s - name: scrapyd-k8s -spec: - selector: - matchLabels: - app.kubernetes.io/name: scrapyd-k8s - template: - metadata: - labels: - 
app.kubernetes.io/name: scrapyd-k8s - spec: - securityContext: - fsGroup: 1000 - serviceAccountName: scrapyd-k8s - containers: - - image: scrapyd-k8s-test:latest - imagePullPolicy: Never - name: scrapyd-k8s - ports: - - containerPort: 6800 - name: http - protocol: TCP - env: - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app.kubernetes.io/name'] - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: http - livenessProbe: - failureThreshold: 30 - httpGet: - path: /healthz - port: http - resources: - limits: - memory: 128Mi # TODO check - requests: - memory: 64Mi # TODO check - volumeMounts: - - name: scrapyd-k8s-config - mountPath: /opt/app/scrapyd_k8s.conf - readOnly: true - subPath: scrapyd_k8s.conf - #- name: joblogs - # mountPath: /data - # Enable if your spider repository needs a pull secret - # - name: scrapyd-k8s-pull-secret - # mountPath: /opt/app/.docker - # readOnly: true - volumes: - - configMap: - name: scrapyd-k8s-config - name: scrapyd-k8s-config - #- name: joblogs - # persistentVolumeClaim: - # claimName: pv-claim - # Enable if your spider repository needs a pull secret - # - secret: - # secretName: pull-secret - # items: - # - key: .dockerconfigjson - # path: config.json ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: scrapyd-k8s-config - labels: - app.kubernetes.io/name: scrapyd-k8s -data: - scrapyd_k8s.conf: |- - [scrapyd] - bind_address = 0.0.0.0 - http_port = 6800 - - repository = scrapyd_k8s.repository.Remote - launcher = scrapyd_k8s.launcher.K8s - - namespace = default - - max_proc = 1 - - # This is an example spider that should work out of the box. - # Adapt the spider config to your use-case. 
- [project.example] - env_secret = spider-example-env - env_config = spider-example-env - repository = ghcr.io/q-m/scrapyd-k8s-spider-example - - # It is strongly recomended to set resource requests and limits on production. - # They can be overridden on the project and spider level. - [default.resources] - requests_cpu = 0.2 - requests_memory = 0.2G - limits_cpu = 0.8 - limits_memory = 0.5G - - [joblogs] - logs_dir = /data/joblogs ---- -apiVersion: v1 -kind: Secret -metadata: - name: spider-example-env - labels: - app.kubernetes.io/name: spider-example -stringData: - FOO_API_KEY: "1234567890abcdef" -#--- -#apiVersion: v1 -#kind: PersistentVolumeClaim -#metadata: -# name: pv-claim -#spec: -# accessModes: -# - ReadWriteOnce -# resources: -# requests: -# storage: 5Gi ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: spider-example-env - labels: - app.kubernetes.io/name: spider-example -data: - BAR_VALUE: "baz" ---- -apiVersion: v1 -kind: Service -metadata: - name: scrapyd-k8s - labels: - app.kubernetes.io/name: scrapyd-k8s -spec: - type: ClusterIP - ports: - - name: http - port: 6800 - protocol: TCP - targetPort: http - selector: - app.kubernetes.io/name: scrapyd-k8s ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: scrapyd-k8s ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: scrapyd-k8s -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["pods/exec"] - verbs: ["get"] - - apiGroups: [""] - resources: ["pods/log"] - verbs: ["get"] - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "create", "patch", "delete"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: scrapyd-k8s -subjects: - - kind: ServiceAccount - name: scrapyd-k8s -roleRef: - kind: Role - name: scrapyd-k8s - apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/kubernetes.yaml b/kubernetes.yaml index e4d59a6..6a58a97 
100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -162,6 +162,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: scrapyd-k8s +# review scrapyd_k8s/test/integration/*.k8s.sh when modifying this in the repository rules: - apiGroups: [""] resources: ["pods"] @@ -174,7 +175,8 @@ rules: verbs: ["get"] - apiGroups: ["batch"] resources: ["jobs"] - verbs: ["get", "list", "create", "patch", "delete"] + # add "patch" if you use scheduling, i.e. if you use max_proc + verbs: ["get", "list", "create", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 8c5b9b3..ecbd16f 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -49,7 +49,7 @@ def api_schedule(): # any other parameter is passed as spider argument args = { k: v for k, v in request.form.items() if k not in ('project', 'spider', 'setting', 'jobid', 'priority', '_version') } env_config, env_secret = project.env_config(), project.env_secret() - jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args, start_suspended=start_suspended) + jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args) return { 'status': 'ok', 'jobid': job_id } @app.post("/cancel.json") diff --git a/scrapyd_k8s/tests/integration/test_k8s_scheduler.py b/scrapyd_k8s/tests/integration/test_k8s_scheduler.py deleted file mode 100644 index 0379186..0000000 --- a/scrapyd_k8s/tests/integration/test_k8s_scheduler.py +++ /dev/null @@ -1,402 +0,0 @@ -import os -import pytest -import subprocess -import time -import requests -import uuid -import shutil - -USE_REAL_MINIKUBE = os.environ.get("USE_REAL_MINIKUBE", "false").lower() == "true" - - -def is_minikube_running(): - try: - proc = subprocess.run( - ["minikube", "status", "--output=json"], - capture_output=True, text=True, check=True, timeout=10 - ) - return "Running" in proc.stdout - except Exception: - return False - - 
-@pytest.fixture(scope="session") -def ensure_minikube(): - """If USE_REAL_MINIKUBE is true, ensure minikube is up and context set.""" - if not USE_REAL_MINIKUBE: - pytest.skip("Skipping real k8s tests unless USE_REAL_MINIKUBE=true is set.") - - if not shutil.which("minikube"): - pytest.skip("Minikube not installed, skipping.") - - if not is_minikube_running(): - try: - proc = subprocess.run( - ["minikube", "start", "--driver=docker"], - check=True, capture_output=True, text=True, timeout=180 - ) - print(proc.stdout) - except subprocess.CalledProcessError as e: - pytest.skip(f"Failed to start minikube: {e.stderr}") - - # Force context to minikube - try: - subprocess.run(["kubectl", "config", "use-context", "minikube"], check=True) - except subprocess.CalledProcessError: - pytest.skip("Failed to use minikube context.") - - # Check cluster - try: - subprocess.run(["kubectl", "cluster-info"], check=True) - except subprocess.CalledProcessError: - pytest.skip("kubectl cluster-info failed.") - - yield - - # Clean up all Kubernetes resources after all tests - print("Final cleanup of all resources...") - subprocess.run(["kubectl", "delete", "all", "--all"], check=False) - # Don't delete minikube itself - leave that to the user - - -@pytest.fixture(scope="session") -def minikube_env(): - result = subprocess.run(["minikube", "docker-env"], check=True, capture_output=True, text=True) - - env = os.environ.copy() - for line in result.stdout.split('\n'): - if line.startswith("export "): - key, value = line[len("export "):].split("=", 1) - env[key] = value.strip('"') - - yield env - - -@pytest.fixture(scope="session", autouse=True) -def ensure_scrapyd_image(minikube_env): - print("Building test container...") - subprocess.run(["docker", "build", "-t", "scrapyd-k8s-test:latest", "."], check=True, env=minikube_env) - - yield - - print("Removing test container...") - subprocess.run(["docker", "rmi", "scrapyd-k8s-test:latest"], check=False, env=minikube_env) - - -def 
wait_for_scrapyd_ready(label_selector="app.kubernetes.io/name=scrapyd-k8s", timeout=40): - """ - Wait until a pod matching label_selector is Running, then sleep 5s. - """ - import time - start = time.time() - ready = False - - print(f"Waiting for pod with label {label_selector} to be ready (timeout: {timeout}s)...") - - while time.time() - start < timeout: - get_pods = subprocess.run( - ["kubectl", "get", "pods", "--selector", label_selector], - capture_output=True, text=True - ) - print(f"Current pods: {get_pods.stdout}") - - if "Running" in get_pods.stdout: - print(f"Pod is now Running. Waiting a moment for internal initialization...") - # give it a few seconds for internal init - time.sleep(3) - ready = True - break - - if not ready: - # Get deployment events for debugging - print("Pod not ready in time. Checking events:") - events = subprocess.run( - ["kubectl", "get", "events", "--sort-by=.metadata.creationTimestamp"], - capture_output=True, text=True - ) - print(events.stdout) - pytest.fail(f"No 'Running' pods found with label {label_selector} after {timeout}s") - - return True - - -@pytest.fixture -def scrapyd_service(ensure_minikube, minikube_env, request): - """ - Parameterized fixture that sets up scrapyd with specified max_proc. 
- """ - # Get max_proc from parameter or default to 0 - max_proc = getattr(request, "param", 0) - - # Retrieve a temporary kubernetes yaml with the specified max_proc - yaml_file = "kubernetes-%d.yaml" % max_proc - - # Now apply the new configuration - print(f"Applying {yaml_file} with max_proc={max_proc}...") - subprocess.run(["kubectl", "apply", "-f", yaml_file], check=True, env=minikube_env) - - # Wait for the deployment to be ready - wait_for_scrapyd_ready() - - # Even after pod is ready, give it a few more seconds to fully initialize - print("Pod is ready, waiting for internal initialization...") - time.sleep(3) - - # Port-forward - pf_proc = subprocess.Popen( - ["kubectl", "port-forward", "service/scrapyd-k8s", "6800:6800"], - stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - time.sleep(3) - - # Quick health check - base_url = "http://127.0.0.1:6800" - try: - resp = requests.get(f"{base_url}/healthz", timeout=5) - if resp.status_code != 200: - pytest.fail("Scrapyd /healthz not OK.") - except Exception as e: - pf_proc.terminate() - pytest.fail(f"Couldn't connect to scrapyd at {base_url}: {e}") - - # Log diagnostics - check the actual configuration - print("Checking actual config in pod...") - config_check = subprocess.run( - ["kubectl", "exec", "deployment/scrapyd-k8s", "--", "cat", "/opt/app/scrapyd_k8s.conf"], - capture_output=True, text=True - ) - if config_check.returncode == 0: - print("Current config in pod:") - print(config_check.stdout) - - # Highlight the max_proc setting - if f"max_proc = {max_proc}" in config_check.stdout: - print(f"✅ max_proc={max_proc} found in config") - else: - print(f"❌ max_proc={max_proc} NOT found in config") - - # Check logs for scheduler initialization - print("Checking for scheduler initialization in logs...") - logs = subprocess.run( - ["kubectl", "logs", "deployment/scrapyd-k8s"], - capture_output=True, text=True - ) - if "k8s scheduler started" in logs.stdout.lower(): - print("✅ Scheduler initialization found in 
logs") - else: - print("❌ Scheduler initialization NOT found in logs") - - # Additional diagnostic information - print("Full pod logs:") - print(logs.stdout) - - # Print out the environment variables - print("Checking environment variables:") - env_vars = subprocess.run( - ["kubectl", "exec", "deployment/scrapyd-k8s", "--", "env"], - capture_output=True, text=True - ) - print(env_vars.stdout) - - yield base_url, max_proc - - # Cleanup - pf_proc.terminate() - print(f"Deleting {yaml_file} and cleaning up resources...") - subprocess.run(["kubectl", "delete", "-f", yaml_file], check=False, env=minikube_env) - - -def wait_for_job_state(service_url, job_id, target_state, timeout=25): - """ - Wait until job `job_id` is in `target_state` (one of 'pending', 'running', 'finished') according to listjobs.json. - Raise TimeoutError if not. - """ - start = time.time() - while time.time() - start < timeout: - r = requests.get(f"{service_url}/listjobs.json") - data = r.json() - # data has keys: "pending", "running", "finished" - for key in ["pending", "running", "finished"]: - if any(j["id"] == job_id for j in data[key]): - if key == target_state: - return - time.sleep(2) - raise TimeoutError(f"Job {job_id} not in {target_state} after {timeout}s") - - -@pytest.mark.parametrize("scrapyd_service", [0], indirect=True) -def test_zero_allowed_jobs(scrapyd_service): - """ - With max_proc=0, any scheduled job should remain in the 'pending' state in /listjobs.json - (i.e., it never goes to 'running' or 'finished'). 
- """ - service_url, max_proc = scrapyd_service - print(f"Testing with max_proc={max_proc}") - - # First check the max_proc setting in the pod - pods = subprocess.run( - ["kubectl", "get", "pods", "--selector=app.kubernetes.io/name=scrapyd-k8s", "-o", "name"], - capture_output=True, text=True - ) - pod_name = pods.stdout.strip() - - if pod_name: - print(f"Found pod: {pod_name}") - # Check for the max_proc in logs again - pod_logs = subprocess.run( - ["kubectl", "logs", pod_name], - capture_output=True, text=True - ) - print("Pod logs related to scheduler:") - for line in pod_logs.stdout.split('\n'): - if 'sched' in line.lower() or 'max_proc' in line.lower(): - print(f" {line.strip()}") - else: - print("No pods found for scrapyd-k8s") - - # Schedule a job - job_id = uuid.uuid4().hex - resp = requests.post(f"{service_url}/schedule.json", data={ - "project": "example", - "spider": "quotes", - "_version": "latest", - "jobid": job_id - }) - assert resp.status_code == 200 - - # Wait some time, ensure it never leaves "pending" - # (If your job finishes super fast, you'd never see it running anyway. 
- # So the best we can do is "still pending after X seconds.") - wait_time = 10 - print(f"Waiting {wait_time}s to confirm job stays pending...") - time.sleep(wait_time) - - # Check the job is still in 'pending' - r = requests.get(f"{service_url}/listjobs.json").json() - pending = [j for j in r["pending"] if j["id"] == job_id] - running = [j for j in r["running"] if j["id"] == job_id] - finished = [j for j in r["finished"] if j["id"] == job_id] - - # Check Kubernetes job state - check_k8s = subprocess.run( - ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job_id, "-o", "wide"], - capture_output=True, text=True - ) - print(f"Job {job_id} Kubernetes status: {check_k8s.stdout}") - - # Check if the job is suspended - check_suspended = subprocess.run( - ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job_id, "-o", - "jsonpath='{.items[0].spec.suspend}'"], - capture_output=True, text=True - ) - print(f"Job {job_id} suspended state: {check_suspended.stdout}") - - assert len(pending) == 1, ( - f"Expected job {job_id} to remain pending (max_proc=0), got state: {r}" - ) - assert not running, f"Job {job_id} unexpectedly running: {r}" - assert not finished, f"Job {job_id} unexpectedly finished: {r}" - - print("test_zero_allowed_jobs passed: job remained pending.") - - -@pytest.mark.parametrize("scrapyd_service", [1], indirect=True) -def test_one_allowed_job(scrapyd_service): - """ - With max_proc=1, scheduling two jobs back to back => - - The first should move into 'running' - - The second should remain 'pending' until the first finishes - """ - service_url, max_proc = scrapyd_service - print(f"Testing with max_proc={max_proc}") - - # Check for the max_proc in pod - pods = subprocess.run( - ["kubectl", "get", "pods", "--selector=app.kubernetes.io/name=scrapyd-k8s", "-o", "name"], - capture_output=True, text=True - ) - pod_name = pods.stdout.strip() - - if pod_name: - print(f"Found pod: {pod_name}") - # Check for the max_proc in logs again - pod_logs = 
subprocess.run( - ["kubectl", "logs", pod_name], - capture_output=True, text=True - ) - print("Pod logs related to scheduler:") - for line in pod_logs.stdout.split('\n'): - if 'sched' in line.lower() or 'max_proc' in line.lower(): - print(f" {line.strip()}") - else: - print("No pods found for scrapyd-k8s") - - job1_id = uuid.uuid4().hex - job2_id = uuid.uuid4().hex - - # Schedule job1 - r1 = requests.post(f"{service_url}/schedule.json", data={ - "project": "example", - "spider": "quotes", - "_version": "latest", - "jobid": job1_id - }) - assert r1.status_code == 200 - - # Schedule job2 - r2 = requests.post(f"{service_url}/schedule.json", data={ - "project": "example", - "spider": "quotes", - "_version": "latest", - "jobid": job2_id - }) - assert r2.status_code == 200 - - # job1 should eventually be 'running' - print("Waiting for job1 to appear running...") - wait_for_job_state(service_url, job1_id, 'running', timeout=20) - - # Once job1 is running, job2 should remain 'pending' - print("Checking job2 is pending while job1 runs...") - r = requests.get(f"{service_url}/listjobs.json").json() - pending2 = any(j["id"] == job2_id for j in r["pending"]) - running2 = any(j["id"] == job2_id for j in r["running"]) - - # Check Kubernetes job states - check_job1 = subprocess.run( - ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job1_id, "-o", "wide"], - capture_output=True, text=True - ) - print(f"Job1 {job1_id} Kubernetes status: {check_job1.stdout}") - - check_job2 = subprocess.run( - ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job2_id, "-o", "wide"], - capture_output=True, text=True - ) - print(f"Job2 {job2_id} Kubernetes status: {check_job2.stdout}") - - # Check if job2 is suspended - check_job2_suspended = subprocess.run( - ["kubectl", "get", "jobs", "--selector=org.scrapy.job_id=" + job2_id, "-o", - "jsonpath='{.items[0].spec.suspend}'"], - capture_output=True, text=True - ) - print(f"Job2 {job2_id} suspended state: 
{check_job2_suspended.stdout}") - - assert pending2, f"job2 expected pending, got {r}" - assert not running2, f"job2 is unexpectedly running: {r}" - - # Wait for job1 to finish - try: - print("Waiting for job1 to finish...") - wait_for_job_state(service_url, job1_id, 'finished', timeout=20) - except TimeoutError as e: - # if job1 is extremely slow or never finishes, fail - pytest.fail(str(e)) - - # After job1 finishes, job2 should become running - print("Waiting up to 30s for job2 to become running now that job1 finished...") - wait_for_job_state(service_url, job2_id, 'running', timeout=25) - - print("test_one_allowed_job passed: concurrency=1 logic is correct.") diff --git a/scrapyd_k8s/tests/integration/test_maxproc_one.conf b/scrapyd_k8s/tests/integration/test_maxproc_one.conf new file mode 100644 index 0000000..ea56c53 --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_one.conf @@ -0,0 +1,3 @@ +# additional scrapyd-k8s configuration for test_maxproc_one.py +[scrapyd] +max_proc = 1 diff --git a/scrapyd_k8s/tests/integration/test_maxproc_one.k8s.sh b/scrapyd_k8s/tests/integration/test_maxproc_one.k8s.sh new file mode 100755 index 0000000..16f66b7 --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_one.k8s.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# Kubernetes cluster preparation for test_maxproc_one.py +# Called with "up" argument on setup, with "down" argument afterwards. +# +# Adds "patch" to job role permissions. 
+# Tightly bound to kubernetes.yaml + +if [ "$1" = "up" ]; then + kubectl patch role scrapyd-k8s --type=json -p='[ + { + "op": "add", + "path": "/rules/3/verbs/0", + "value": "patch" + } + ]' +elif [ "$1" = "down" ]; then + kubectl patch role scrapyd-k8s --type=json -p='[ + { + "op": "remove", + "path": "/rules/3/verbs/0" + } + ]' +else + echo "Usage: $0 up|down" + exit 1 +fi diff --git a/scrapyd_k8s/tests/integration/test_maxproc_one.py b/scrapyd_k8s/tests/integration/test_maxproc_one.py new file mode 100644 index 0000000..7fa9e1c --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_one.py @@ -0,0 +1,75 @@ +import os +import requests +import time + +BASE_URL = os.getenv('TEST_BASE_URL', 'http://localhost:6800') +RUN_PROJECT = os.getenv('TEST_RUN_PROJECT', 'example') +RUN_VERSION = os.getenv('TEST_RUN_VERSION', 'latest') +RUN_SPIDER = os.getenv('TEST_RUN_SPIDER', 'static') +MAX_WAIT = int(os.getenv('TEST_MAX_WAIT', '6')) +STATIC_SLEEP = float(os.getenv('TEST_STATIC_SLEEP', '2')) + +def test_max_proc_one(): + """With max_proc=1, two jobs are scheduled, and are run after each other. 
""" + # Schedule a job + response = requests.post(BASE_URL + '/schedule.json', data={ + 'project': RUN_PROJECT, 'spider': RUN_SPIDER, '_version': RUN_VERSION, + 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP + }) + assert_response_ok(response) + jobid1 = response.json()['jobid'] + assert jobid1 is not None + # Schedule another job right away, which remains queued + response = requests.post(BASE_URL + '/schedule.json', data={ + 'project': RUN_PROJECT, 'spider': RUN_SPIDER, '_version': RUN_VERSION, + 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP + }) + assert_response_ok(response) + jobid2 = response.json()['jobid'] + assert jobid2 is not None + + # Wait and make sure the job remains in the pending state + listjobs_wait(jobid1, 'running') + assert_listjobs(pending=jobid2, running=jobid1) + # Wait until the first is finished and the second starts + listjobs_wait(jobid1, 'finished', max_wait=STATIC_SLEEP+MAX_WAIT) + listjobs_wait(jobid2, 'running') + # Wait until the second is finished too + listjobs_wait(jobid2, 'finished', max_wait=STATIC_SLEEP+MAX_WAIT) + +def assert_response_ok(response): + assert response.status_code == 200 + assert response.headers['Content-Type'] == 'application/json' + assert response.json()['status'] == 'ok' + +def assert_listjobs(pending=None, running=None, finished=None): + response = requests.get(BASE_URL + '/listjobs.json') + assert_response_ok(response) + if pending: + assert len(response.json()['pending']) == 1 + assert response.json()['pending'][0]['id'] == pending + return response.json()['pending'][0] + else: + assert response.json()['pending'] == [] + if running: + assert len(response.json()['running']) == 1 + assert response.json()['running'][0]['id'] == running + return response.json()['running'][0] + else: + assert response.json()['running'] == [] + # finished may contain other jobs + if finished: + matches = [j for j in response.json()['finished'] if j['id'] == finished] + assert len(matches) == 1 + return matches[0] + +def 
listjobs_wait(jobid, state, max_wait=MAX_WAIT): + started = time.monotonic() + while time.monotonic() - started < max_wait: + response = requests.get(BASE_URL + '/listjobs.json') + assert_response_ok(response) + for j in response.json()[state]: + if j['id'] == jobid: + return True + time.sleep(0.5) + assert False, 'Timeout waiting for job state change' diff --git a/scrapyd_k8s/tests/integration/test_maxproc_zero.conf b/scrapyd_k8s/tests/integration/test_maxproc_zero.conf new file mode 100644 index 0000000..4d0fd31 --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_zero.conf @@ -0,0 +1,3 @@ +# additional scrapyd-k8s configuration for test_maxproc_zero.py +[scrapyd] +max_proc = 0 diff --git a/scrapyd_k8s/tests/integration/test_maxproc_zero.k8s.sh b/scrapyd_k8s/tests/integration/test_maxproc_zero.k8s.sh new file mode 100755 index 0000000..4cd69ad --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_zero.k8s.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# Kubernetes cluster preparation for test_maxproc_zero.py +# Called with "up" argument on setup, with "down" argument afterwards. +# +# Adds "patch" to job role permissions. 
+# Tightly bound to kubernetes.yaml + +if [ "$1" = "up" ]; then + kubectl patch role scrapyd-k8s --type=json -p='[ + { + "op": "add", + "path": "/rules/3/verbs/0", + "value": "patch" + } + ]' +elif [ "$1" = "down" ]; then + kubectl patch role scrapyd-k8s --type=json -p='[ + { + "op": "remove", + "path": "/rules/3/verbs/0" + } + ]' +else + echo "Usage: $0 up|down" + exit 1 +fi diff --git a/scrapyd_k8s/tests/integration/test_maxproc_zero.py b/scrapyd_k8s/tests/integration/test_maxproc_zero.py new file mode 100644 index 0000000..9976a90 --- /dev/null +++ b/scrapyd_k8s/tests/integration/test_maxproc_zero.py @@ -0,0 +1,50 @@ +import os +import requests +import time + +BASE_URL = os.getenv('TEST_BASE_URL', 'http://localhost:6800') +RUN_PROJECT = os.getenv('TEST_RUN_PROJECT', 'example') +RUN_VERSION = os.getenv('TEST_RUN_VERSION', 'latest') +RUN_SPIDER = os.getenv('TEST_RUN_SPIDER', 'static') +MAX_WAIT = int(os.getenv('TEST_MAX_WAIT', '6')) + +def test_max_proc_zero(): + """With max_proc=0, any scheduled job remains in the 'pending' state""" + # Schedule a job + response = requests.post(BASE_URL + '/schedule.json', data={ + 'project': RUN_PROJECT, 'spider': RUN_SPIDER, '_version': RUN_VERSION + }) + assert_response_ok(response) + jobid = response.json()['jobid'] + assert jobid is not None + + # Wait and make sure the job remains in the pending state + assert_listjobs(pending=jobid) + time.sleep(MAX_WAIT) + assert_listjobs(pending=jobid) + +def assert_response_ok(response): + assert response.status_code == 200 + assert response.headers['Content-Type'] == 'application/json' + assert response.json()['status'] == 'ok' + +def assert_listjobs(pending=None, running=None, finished=None): + response = requests.get(BASE_URL + '/listjobs.json') + assert_response_ok(response) + if pending: + assert len(response.json()['pending']) == 1 + assert response.json()['pending'][0]['id'] == pending + return response.json()['pending'][0] + else: + assert response.json()['pending'] == [] + if 
running: + assert len(response.json()['running']) == 1 + assert response.json()['running'][0]['id'] == running + return response.json()['running'][0] + else: + assert response.json()['running'] == [] + # finished may contain other jobs + if finished: + matches = [j for j in response.json()['finished'] if j['id'] == finished] + assert len(matches) == 1 + return matches[0] From d0501d55c72937cef961715d55d0175afdf103e7 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 11 Mar 2025 14:27:22 +0100 Subject: [PATCH 25/28] grouped unit tests under classes; added parametrization to avoide douplicate code --- .../unit/k8s_scheduler/test_k8s_scheduler.py | 471 ++++++++++-------- 1 file changed, 257 insertions(+), 214 deletions(-) diff --git a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py index 8589cd5..4a985de 100644 --- a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py +++ b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py @@ -15,218 +15,261 @@ def mock_launcher(): mock_launcher.LABEL_JOB_ID = 'org.scrapy.job_id' return mock_launcher -def test_k8s_scheduler_init(mock_config, mock_launcher): - max_proc = 5 - scheduler = KubernetesScheduler(mock_config, mock_launcher, max_proc) - assert scheduler.config == mock_config - assert scheduler.launcher == mock_launcher - assert scheduler.max_proc == max_proc - assert scheduler.namespace == 'default' - -def test_k8s_scheduler_init_invalid_max_proc(mock_config, mock_launcher): - max_proc = 'five' # Not an integer - with pytest.raises(TypeError) as excinfo: - KubernetesScheduler(mock_config, mock_launcher, max_proc) - assert "max_proc must be an integer" in str(excinfo.value) - -def test_handle_pod_event_with_non_dict_event(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - event = ['not', 'a', 'dict'] - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - 
scheduler.handle_pod_event(event) - mock_logger.error.assert_called_with( - f"TypeError in handle_pod_event: Event must be a dictionary, got {type(event).__name__} in event: {event}" - ) - -def test_handle_pod_event_missing_keys(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - event = {'wrong_key': 'value'} - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.error.assert_called_with( - f"KeyError in handle_pod_event: Missing key 'object' in event: {event}" - ) - -def test_handle_pod_event_pod_missing_status(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - pod = Mock() - pod.status = None - pod.metadata = Mock() - pod.metadata.name = 'pod-name' - event = {'object': pod, 'type': 'MODIFIED'} - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.error.assert_called_with( - f"AttributeError in handle_pod_event: 'NoneType' object has no attribute 'phase' in event: {event}" - ) - -def test_handle_pod_event_pod_missing_metadata(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - pod = Mock() - pod.status = Mock() - pod.status.phase = 'Running' - pod.metadata = None - event = {'object': pod, 'type': 'MODIFIED'} - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.error.assert_called_with( - f"AttributeError in handle_pod_event: 'NoneType' object has no attribute 'name' in event: {event}" - ) - -def test_handle_pod_event_pod_not_related(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - pod = Mock() - pod.status = Mock() - pod.status.phase = 'Running' - pod.metadata = Mock() - pod.metadata.name = 'pod-name' - pod.metadata.labels = {} - event = {'object': pod, 'type': 
'MODIFIED'} - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.debug.assert_called_with("Pod pod-name does not have our job label; ignoring.") - -def test_handle_pod_event_pod_terminated(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - pod = Mock() - pod.status = Mock() - pod.status.phase = 'Succeeded' - pod.metadata = Mock() - pod.metadata.name = 'pod-name' - pod.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job-id'} - event = {'object': pod, 'type': 'MODIFIED'} - - with patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs, \ - patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.info.assert_called_with( - "Pod pod-name has completed with phase Succeeded. Checking for suspended jobs." - ) - mock_check_and_unsuspend_jobs.assert_called_once() - -def test_handle_pod_event_pod_not_terminated(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - pod = Mock() - pod.status = Mock() - pod.status.phase = 'Running' - pod.metadata = Mock() - pod.metadata.name = 'pod-name' - pod.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job-id'} - event = {'object': pod, 'type': 'MODIFIED'} - - with patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs, \ - patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.handle_pod_event(event) - mock_logger.debug.assert_called_with("Pod pod-name event not relevant for unsuspension.") - mock_check_and_unsuspend_jobs.assert_not_called() - -def test_check_and_unsuspend_jobs_with_capacity_and_suspended_jobs(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.get_running_jobs_count.return_value = 3 - scheduler.get_next_suspended_job_id = Mock(side_effect=['job1', 
'job2', None]) - mock_launcher.unsuspend_job.side_effect = [True, True] - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.check_and_unsuspend_jobs() - assert mock_launcher.unsuspend_job.call_count == 2 - mock_launcher.unsuspend_job.assert_any_call('job1') - mock_launcher.unsuspend_job.assert_any_call('job2') - mock_logger.info.assert_any_call("Unsuspended job job1. Total running jobs now: 4") - mock_logger.info.assert_any_call("Unsuspended job job2. Total running jobs now: 5") - -def test_check_and_unsuspend_jobs_no_suspended_jobs(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.get_running_jobs_count.return_value = 3 - scheduler.get_next_suspended_job_id = Mock(return_value=None) - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.check_and_unsuspend_jobs() - mock_launcher.unsuspend_job.assert_not_called() - mock_logger.info.assert_called_with("No suspended jobs to unsuspend.") - -def test_check_and_unsuspend_jobs_unsuspend_fails(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.get_running_jobs_count.return_value = 3 - scheduler.get_next_suspended_job_id = Mock(return_value='job1') - mock_launcher.unsuspend_job.return_value = False - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.check_and_unsuspend_jobs() - mock_launcher.unsuspend_job.assert_called_once_with('job1') - mock_logger.error.assert_called_with("Failed to unsuspend job job1") - -def test_check_and_unsuspend_jobs_unsuspend_api_exception(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.get_running_jobs_count.return_value = 3 - scheduler.get_next_suspended_job_id = Mock(return_value='job1') - mock_launcher.unsuspend_job.side_effect = ApiException("API Error") - - with 
patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - scheduler.check_and_unsuspend_jobs() - mock_launcher.unsuspend_job.assert_called_once_with('job1') - mock_logger.error.assert_called_with( - f"Kubernetes API exception while unsuspending job job1: (API Error)\nReason: None\n" - ) - -def test_get_next_suspended_job_id_with_suspended_jobs(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - job1 = Mock() - job1.metadata = Mock() - job1.metadata.creation_timestamp = '2021-01-01T00:00:00Z' - job1.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job1'} - - job2 = Mock() - job2.metadata = Mock() - job2.metadata.creation_timestamp = '2021-01-02T00:00:00Z' - job2.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job2'} - - mock_launcher.list_suspended_jobs.return_value = [job2, job1] - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - job_id = scheduler.get_next_suspended_job_id() - assert job_id == 'job1' - mock_logger.debug.assert_called_with("Next suspended job to unsuspend: job1") - -def test_get_next_suspended_job_id_no_suspended_jobs(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.list_suspended_jobs.return_value = [] - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - job_id = scheduler.get_next_suspended_job_id() - assert job_id is None - mock_logger.debug.assert_called_with("No suspended jobs found.") - -def test_get_next_suspended_job_id_list_suspended_jobs_returns_non_list(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - mock_launcher.list_suspended_jobs.return_value = 'not a list' - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - job_id = scheduler.get_next_suspended_job_id() - assert job_id is None - mock_logger.error.assert_called_with( - "TypeError in get_next_suspended_job_id: 
list_suspended_jobs should return a list, got str" - ) - -def test_get_next_suspended_job_id_job_missing_creation_timestamp(mock_config, mock_launcher): - scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - - job = Mock() - job.metadata = Mock() - job.metadata.labels = {mock_launcher.LABEL_JOB_ID: 'job1'} - job.metadata.creation_timestamp = None # Simulate missing creation_timestamp - - mock_launcher.list_suspended_jobs.return_value = [job] - - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: - job_id = scheduler.get_next_suspended_job_id() - assert job_id == 'job1' - mock_logger.warning.assert_called_with( - f"Job {job} missing 'metadata.creation_timestamp'; assigned max timestamp." +class TestKubernetesSchedulerInitialization: + def test_k8s_scheduler_init(self, mock_config, mock_launcher): + max_proc = 5 + scheduler = KubernetesScheduler(mock_config, mock_launcher, max_proc) + assert scheduler.config == mock_config + assert scheduler.launcher == mock_launcher + assert scheduler.max_proc == max_proc + assert scheduler.namespace == 'default' + + def test_k8s_scheduler_init_invalid_max_proc(self, mock_config, mock_launcher): + max_proc = 'five' # Not an integer + with pytest.raises(TypeError) as excinfo: + KubernetesScheduler(mock_config, mock_launcher, max_proc) + assert "max_proc must be an integer" in str(excinfo.value) + + +class TestPodEventHandling: + @pytest.mark.parametrize("event, expected_log, log_type", [ + ( + ['not', 'a', 'dict'], + "TypeError in handle_pod_event: Event must be a dictionary, got list in event: ['not', 'a', 'dict']", + 'error' + ), + ( + {'wrong_key': 'value'}, + "KeyError in handle_pod_event: Missing key 'object' in event: {'wrong_key': 'value'}", + 'error' + ), + ]) + def test_handle_pod_event_input_validation(self, mock_config, mock_launcher, event, expected_log, log_type): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + with 
patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.handle_pod_event(event) + if log_type == 'error': + mock_logger.error.assert_called_with(expected_log) + elif log_type == 'debug': + mock_logger.debug.assert_called_with(expected_log) + elif log_type == 'info': + mock_logger.info.assert_called_with(expected_log) + + @pytest.mark.parametrize("pod_config, expected_log, log_type", [ + # Pod with missing status + ( + {"status": None, "metadata_name": "pod-name", "metadata_labels": {}}, + "AttributeError in handle_pod_event: 'NoneType' object has no attribute 'phase' in event: ", + 'error' + ), + # Pod with missing metadata + ( + {"status_phase": "Running", "metadata": None}, + "AttributeError in handle_pod_event: 'NoneType' object has no attribute 'name' in event: ", + 'error' + ), + # Pod not related to our jobs + ( + {"status_phase": "Running", "metadata_name": "pod-name", "metadata_labels": {}}, + "Pod pod-name does not have our job label; ignoring.", + 'debug' + ), + # Pod terminated successfully + ( + {"status_phase": "Succeeded", "metadata_name": "pod-name", + "metadata_labels": {"org.scrapy.job_id": "job-id"}}, + "Pod pod-name has completed with phase Succeeded. 
Checking for suspended jobs.", + 'info' + ), + # Pod not terminated + ( + {"status_phase": "Running", "metadata_name": "pod-name", + "metadata_labels": {"org.scrapy.job_id": "job-id"}}, + "Pod pod-name event not relevant for unsuspension.", + 'debug' + ), + ]) + def test_handle_pod_event_pod_scenarios(self, mock_config, mock_launcher, pod_config, expected_log, log_type): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + # Create a mock pod based on the configuration + pod = Mock() + + if "status" in pod_config: + pod.status = pod_config["status"] + else: + pod.status = Mock() + if "status_phase" in pod_config: + pod.status.phase = pod_config["status_phase"] + + if "metadata" in pod_config: + pod.metadata = pod_config["metadata"] + else: + pod.metadata = Mock() + if "metadata_name" in pod_config: + pod.metadata.name = pod_config["metadata_name"] + if "metadata_labels" in pod_config: + pod.metadata.labels = pod_config["metadata_labels"] + + event = {'object': pod, 'type': 'MODIFIED'} + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger, \ + patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs: + scheduler.handle_pod_event(event) + + if log_type == 'error': + # For error logs, we need to check that it contains the expected text since + # the full event will be appended + assert mock_logger.error.call_count == 1 + assert expected_log in mock_logger.error.call_args[0][0] + elif log_type == 'debug': + mock_logger.debug.assert_called_with(expected_log) + elif log_type == 'info': + mock_logger.info.assert_called_with(expected_log) + + # Check if check_and_unsuspend_jobs should be called + if pod_config.get("status_phase") in ["Succeeded", "Failed"] and \ + pod_config.get("metadata_labels", {}).get(mock_launcher.LABEL_JOB_ID): + mock_check_and_unsuspend_jobs.assert_called_once() + else: + mock_check_and_unsuspend_jobs.assert_not_called() + + +class TestJobSuspensionManagement: + 
@pytest.mark.parametrize("running_count, suspended_jobs, unsuspend_results, expected_logs", [ + # Case with capacity and suspended jobs + ( + 3, # running_count + ['job1', 'job2', None], # suspended_jobs + [True, True], # unsuspend_results + [ + "Unsuspended job job1. Total running jobs now: 4", + "Unsuspended job job2. Total running jobs now: 5" + ] # expected_logs + ), + # Case with no suspended jobs + ( + 3, # running_count + [None], # suspended_jobs + [], # unsuspend_results + ["No suspended jobs to unsuspend."] # expected_logs + ), + # Case where unsuspension fails + ( + 3, # running_count + ['job1'], # suspended_jobs + [False], # unsuspend_results + ["Failed to unsuspend job job1"] # expected_logs ) + ]) + def test_check_and_unsuspend_jobs_scenarios(self, mock_config, mock_launcher, + running_count, suspended_jobs, + unsuspend_results, expected_logs): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = running_count + scheduler.get_next_suspended_job_id = Mock(side_effect=suspended_jobs) + mock_launcher.unsuspend_job.side_effect = unsuspend_results + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + + assert mock_launcher.unsuspend_job.call_count == len(unsuspend_results) + + # Verify the logs + log_methods = { + "Failed to unsuspend job": mock_logger.error, + "No suspended jobs to unsuspend": mock_logger.info, + "Unsuspended job": mock_logger.info + } + + for expected_log in expected_logs: + for prefix, log_method in log_methods.items(): + if expected_log.startswith(prefix): + assert any( + expected_log in call_args[0][0] + for call_args in log_method.call_args_list + ) + + def test_check_and_unsuspend_jobs_unsuspend_api_exception(self, mock_config, mock_launcher): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + mock_launcher.get_running_jobs_count.return_value = 3 + 
scheduler.get_next_suspended_job_id = Mock(return_value='job1') + mock_launcher.unsuspend_job.side_effect = ApiException("API Error") + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + scheduler.check_and_unsuspend_jobs() + mock_launcher.unsuspend_job.assert_called_once_with('job1') + mock_logger.error.assert_called_with( + f"Kubernetes API exception while unsuspending job job1: (API Error)\nReason: None\n" + ) + + +class TestSuspendedJobSelection: + @pytest.mark.parametrize("suspended_jobs, expected_job_id, expected_log, log_type", [ + # Case with suspended jobs - should return the oldest job (job1) + ( + [ + # job2 is newer + {'id': 'job2', 'creation_timestamp': '2021-01-02T00:00:00Z'}, + # job1 is older + {'id': 'job1', 'creation_timestamp': '2021-01-01T00:00:00Z'} + ], + 'job1', + "Next suspended job to unsuspend: job1", + 'debug' + ), + # Case with no suspended jobs + ( + [], + None, + "No suspended jobs found.", + 'debug' + ), + # Case with non-list return value + ( + 'not a list', + None, + "TypeError in get_next_suspended_job_id: list_suspended_jobs should return a list, got str", + 'error' + ), + # Case with job missing creation timestamp + ( + [{'id': 'job1', 'creation_timestamp': None}], + 'job1', + "Job .* missing 'metadata.creation_timestamp'; assigned max timestamp.", + 'warning' + ), + ]) + def test_get_next_suspended_job_id_scenarios(self, mock_config, mock_launcher, + suspended_jobs, expected_job_id, + expected_log, log_type): + scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) + + # Handle special case for non-list + if suspended_jobs == 'not a list': + mock_launcher.list_suspended_jobs.return_value = suspended_jobs + else: + # Create proper mock jobs based on the test data + mock_jobs = [] + for job_data in suspended_jobs: + job = Mock() + job.metadata = Mock() + job.metadata.creation_timestamp = job_data['creation_timestamp'] + job.metadata.labels = {mock_launcher.LABEL_JOB_ID: job_data['id']} + 
mock_jobs.append(job) + + mock_launcher.list_suspended_jobs.return_value = mock_jobs + + with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + job_id = scheduler.get_next_suspended_job_id() + assert job_id == expected_job_id + + if log_type == 'debug': + mock_logger.debug.assert_called_with(expected_log) + elif log_type == 'error': + mock_logger.error.assert_called_with(expected_log) + elif log_type == 'warning': + import re + assert any(re.match(expected_log, args[0]) for args, _ in mock_logger.warning.call_args_list) From a482e496cef81b4a0a79bbe1644a42d2d557e230 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 28 Mar 2025 09:18:23 +0100 Subject: [PATCH 26/28] remove num of attempts from sample config; add empty line in the watcher file; move the dir with the k8s scheduler to the launcher --- scrapyd_k8s.sample-k8s.conf | 3 --- scrapyd_k8s/api.py | 3 --- scrapyd_k8s/k8s_resource_watcher.py | 1 + scrapyd_k8s/k8s_scheduler/__init__.py | 1 - scrapyd_k8s/launcher/k8s.py | 6 ++---- scrapyd_k8s/launcher/k8s_scheduler/__init__.py | 1 + scrapyd_k8s/{ => launcher}/k8s_scheduler/k8s_scheduler.py | 0 scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py | 2 +- 8 files changed, 5 insertions(+), 12 deletions(-) delete mode 100644 scrapyd_k8s/k8s_scheduler/__init__.py create mode 100644 scrapyd_k8s/launcher/k8s_scheduler/__init__.py rename scrapyd_k8s/{ => launcher}/k8s_scheduler/k8s_scheduler.py (100%) diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index df32508..b9e8457 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -19,9 +19,6 @@ namespace = default # Optional pull secret, in case you have private spiders. 
#pull_secret = ghcr-registry -# Number of attempts to reconnect with k8s API to watch events, default is 5 -reconnection_attempts = 5 - # Minimum time in seconds to wait before reconnecting to k8s API to watch events, default is 5 backoff_time = 5 diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index ecbd16f..0e8a0c0 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -1,12 +1,9 @@ #!/usr/bin/env python3 -import logging import uuid from flask import Flask, request, Response, jsonify from flask_basicauth import BasicAuth from natsort import natsort_keygen, ns -from .k8s_resource_watcher import ResourceWatcher -from .k8s_scheduler import KubernetesScheduler from .config import Config config = Config() diff --git a/scrapyd_k8s/k8s_resource_watcher.py b/scrapyd_k8s/k8s_resource_watcher.py index 7342f81..ed44fd1 100644 --- a/scrapyd_k8s/k8s_resource_watcher.py +++ b/scrapyd_k8s/k8s_resource_watcher.py @@ -4,6 +4,7 @@ from kubernetes import client, watch from typing import Callable, List import urllib3 + logger = logging.getLogger(__name__) class ResourceWatcher: diff --git a/scrapyd_k8s/k8s_scheduler/__init__.py b/scrapyd_k8s/k8s_scheduler/__init__.py deleted file mode 100644 index 73dfe4d..0000000 --- a/scrapyd_k8s/k8s_scheduler/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from scrapyd_k8s.k8s_scheduler.k8s_scheduler import KubernetesScheduler \ No newline at end of file diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 46a7a19..b1cd9d9 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -1,18 +1,16 @@ import os -import logging import kubernetes import kubernetes.stream import logging from signal import Signals -from ..utils import format_datetime_object, native_stringify_dict -from scrapyd_k8s.joblogs import KubernetesJobLogHandler +from ..utils import format_datetime_object logger = logging.getLogger(__name__) from kubernetes.client import ApiException -from ..k8s_scheduler import KubernetesScheduler +from 
scrapyd_k8s.launcher.k8s_scheduler import KubernetesScheduler from ..k8s_resource_watcher import ResourceWatcher from ..utils import native_stringify_dict from scrapyd_k8s.joblogs import KubernetesJobLogHandler diff --git a/scrapyd_k8s/launcher/k8s_scheduler/__init__.py b/scrapyd_k8s/launcher/k8s_scheduler/__init__.py new file mode 100644 index 0000000..1444439 --- /dev/null +++ b/scrapyd_k8s/launcher/k8s_scheduler/__init__.py @@ -0,0 +1 @@ +from scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler import KubernetesScheduler \ No newline at end of file diff --git a/scrapyd_k8s/k8s_scheduler/k8s_scheduler.py b/scrapyd_k8s/launcher/k8s_scheduler/k8s_scheduler.py similarity index 100% rename from scrapyd_k8s/k8s_scheduler/k8s_scheduler.py rename to scrapyd_k8s/launcher/k8s_scheduler/k8s_scheduler.py diff --git a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py index 4a985de..91cde93 100644 --- a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py +++ b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py @@ -1,7 +1,7 @@ import pytest from unittest.mock import Mock, patch from kubernetes.client.rest import ApiException -from scrapyd_k8s.k8s_scheduler.k8s_scheduler import KubernetesScheduler +from scrapyd_k8s.launcher.k8s_scheduler import KubernetesScheduler @pytest.fixture def mock_config(): From f9427ec93657386698131a3ca06f0ed5d9290628 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Fri, 28 Mar 2025 14:22:10 +0100 Subject: [PATCH 27/28] modify path to mock scheduler --- .../tests/unit/k8s_scheduler/test_k8s_scheduler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py index 91cde93..c29485c 100644 --- a/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py +++ b/scrapyd_k8s/tests/unit/k8s_scheduler/test_k8s_scheduler.py @@ -46,7 
+46,7 @@ class TestPodEventHandling: ]) def test_handle_pod_event_input_validation(self, mock_config, mock_launcher, event, expected_log, log_type): scheduler = KubernetesScheduler(mock_config, mock_launcher, 5) - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + with patch('scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler.logger') as mock_logger: scheduler.handle_pod_event(event) if log_type == 'error': mock_logger.error.assert_called_with(expected_log) @@ -113,7 +113,7 @@ def test_handle_pod_event_pod_scenarios(self, mock_config, mock_launcher, pod_co event = {'object': pod, 'type': 'MODIFIED'} - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger, \ + with patch('scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler.logger') as mock_logger, \ patch.object(scheduler, 'check_and_unsuspend_jobs') as mock_check_and_unsuspend_jobs: scheduler.handle_pod_event(event) @@ -171,7 +171,7 @@ def test_check_and_unsuspend_jobs_scenarios(self, mock_config, mock_launcher, scheduler.get_next_suspended_job_id = Mock(side_effect=suspended_jobs) mock_launcher.unsuspend_job.side_effect = unsuspend_results - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + with patch('scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler.logger') as mock_logger: scheduler.check_and_unsuspend_jobs() assert mock_launcher.unsuspend_job.call_count == len(unsuspend_results) @@ -198,7 +198,7 @@ def test_check_and_unsuspend_jobs_unsuspend_api_exception(self, mock_config, moc scheduler.get_next_suspended_job_id = Mock(return_value='job1') mock_launcher.unsuspend_job.side_effect = ApiException("API Error") - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + with patch('scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler.logger') as mock_logger: scheduler.check_and_unsuspend_jobs() mock_launcher.unsuspend_job.assert_called_once_with('job1') mock_logger.error.assert_called_with( @@ -262,7 +262,7 @@ def 
test_get_next_suspended_job_id_scenarios(self, mock_config, mock_launcher, mock_launcher.list_suspended_jobs.return_value = mock_jobs - with patch('scrapyd_k8s.k8s_scheduler.k8s_scheduler.logger') as mock_logger: + with patch('scrapyd_k8s.launcher.k8s_scheduler.k8s_scheduler.logger') as mock_logger: job_id = scheduler.get_next_suspended_job_id() assert job_id == expected_job_id From 371b2b65f2e14b59be61a9d1b8ddda851a084475 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 1 Apr 2025 10:35:44 +0200 Subject: [PATCH 28/28] edited config to two params, removed those params from the sample config, added an empty line in api.py --- CONFIG.md | 2 +- scrapyd_k8s.sample-k8s.conf | 7 ------- scrapyd_k8s/api.py | 1 + 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 5b618d1..f285328 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -59,7 +59,7 @@ choose to use them. The event watcher establishes a connection to the Kubernetes API and receives a stream of events from it. However, the nature of this long-lived connection is unstable; it can be interrupted by network issues, proxies configured to terminate long-lived connections, and other factors. For this reason, a mechanism was implemented to re-establish the long-lived -connection to the Kubernetes API. To achieve this, three parameters were introduced: +connection to the Kubernetes API. To achieve this, two parameters were introduced: `backoff_time` and `backoff_coefficient`. #### What are these parameters about? diff --git a/scrapyd_k8s.sample-k8s.conf b/scrapyd_k8s.sample-k8s.conf index b9e8457..44aa99e 100644 --- a/scrapyd_k8s.sample-k8s.conf +++ b/scrapyd_k8s.sample-k8s.conf @@ -19,13 +19,6 @@ namespace = default # Optional pull secret, in case you have private spiders. 
#pull_secret = ghcr-registry -# Minimum time in seconds to wait before reconnecting to k8s API to watch events, default is 5 -backoff_time = 5 - -# Coefficient that is multiplied by backoff_time to provide exponential backoff to prevent k8s API from being overwhelmed -# default is 2, every reconnection attempt will take backoff_time*backoff_coefficient -backoff_coefficient = 2 - # For each project, define a project section. # This contains a repository that points to the remote container repository. # An optional env_secret is the name of a secret with additional environment diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 0e8a0c0..c763b6e 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -156,5 +156,6 @@ def run(): config_password = scrapyd_config.get('password') if config_username is not None and config_password is not None: enable_authentication(app, config_username, config_password) + # run server app.run(host=host, port=port)