Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions estela-api/config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
GOOGLE_APPLICATION_LOCATION=(str, "dummy"),
MAX_CLI_DOWNLOAD_CHUNK_MB=(int, 2),
MAX_WEB_DOWNLOAD_SIZE_MB=(int, 1024),
MULTI_NODE_MODE=(str, "False"),
DEDICATED_SPIDER_NODES=(str, "True"),
SPIDER_NODE_ROLE=(str, "bitmaker-worker"),
NODE_CAPACITY_THRESHOLD=(float, 0.95),
WORKERS_CAPACITY_THRESHOLD=(float, 0.95),
DISPATCH_RETRY_DELAY=(int, 30),
RUN_JOBS_PER_LOT=(int, 100),
BUCKET_NAME_PROJECTS=(str, "dummy"),
Expand Down Expand Up @@ -251,9 +251,9 @@


# Cluster settings
MULTI_NODE_MODE = env("MULTI_NODE_MODE")
DEDICATED_SPIDER_NODES = env("DEDICATED_SPIDER_NODES")
SPIDER_NODE_ROLE = env("SPIDER_NODE_ROLE")
NODE_CAPACITY_THRESHOLD = env("NODE_CAPACITY_THRESHOLD")
WORKERS_CAPACITY_THRESHOLD = env("WORKERS_CAPACITY_THRESHOLD")
DISPATCH_RETRY_DELAY = env("DISPATCH_RETRY_DELAY")
RUN_JOBS_PER_LOT = env("RUN_JOBS_PER_LOT")
CHECK_JOB_ERRORS_BATCH_SIZE = 100
Expand Down
28 changes: 17 additions & 11 deletions estela-api/core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import redis
from kubernetes import client, config

NODE_CAPACITY_THRESHOLD = settings.NODE_CAPACITY_THRESHOLD
WORKERS_CAPACITY_THRESHOLD = settings.WORKERS_CAPACITY_THRESHOLD

def get_default_token(job):
user = job.spider.project.users.first()
Expand Down Expand Up @@ -73,8 +73,8 @@ def run_spider_jobs():
new_cpu = used_cpu + job_cpu
new_mem = used_mem + job_mem

if (alloc_cpu > 0 and (new_cpu / alloc_cpu) >= NODE_CAPACITY_THRESHOLD) or \
(alloc_mem > 0 and (new_mem / alloc_mem) >= NODE_CAPACITY_THRESHOLD):
if (alloc_cpu > 0 and (new_cpu / alloc_cpu) >= WORKERS_CAPACITY_THRESHOLD) or \
(alloc_mem > 0 and (new_mem / alloc_mem) >= WORKERS_CAPACITY_THRESHOLD):
skipped += 1
continue

Expand Down Expand Up @@ -119,9 +119,6 @@ def _dispatch_single_job(job):

token = get_default_token(job)

job.status = SpiderJob.WAITING_STATUS
job.save()

job_manager.create_job(
job.name,
job.key,
Expand All @@ -135,17 +132,25 @@ def _dispatch_single_job(job):
resource_tier=job.resource_tier,
)

job.status = SpiderJob.WAITING_STATUS
job.save()


def _get_cluster_resources():
try:
config.load_incluster_config()
v1 = client.CoreV1Api()

dedicated = settings.DEDICATED_SPIDER_NODES == "True"
spider_node_role = settings.SPIDER_NODE_ROLE
nodes = v1.list_node(label_selector=f"role={spider_node_role}")

if dedicated:
nodes = v1.list_node(label_selector=f"role={spider_node_role}")
else:
nodes = v1.list_node()

if not nodes.items:
logging.warning("No worker nodes found with label role=%s", spider_node_role)
logging.warning("No worker nodes found")
return None

total_allocatable_mem = 0
Expand Down Expand Up @@ -173,10 +178,11 @@ def _get_cluster_resources():
)
for pod in pending_pods.items:
if pod.spec.node_name:
continue
node_selector = pod.spec.node_selector or {}
if node_selector.get("role") != spider_node_role:
continue
if dedicated:
node_selector = pod.spec.node_selector or {}
if node_selector.get("role") != spider_node_role:
continue
for container in pod.spec.containers:
requests = (container.resources.requests or {}) if container.resources else {}
total_requested_mem += _parse_k8s_resource(requests.get("memory", "0"))
Expand Down
2 changes: 1 addition & 1 deletion estela-api/engines/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def create_job_object(
else ([volume] if volume else None)
),
node_selector={"role": settings.SPIDER_NODE_ROLE}
if settings.MULTI_NODE_MODE == "True"
if settings.DEDICATED_SPIDER_NODES == "True"
else None,
)
if not isbuild:
Expand Down
4 changes: 2 additions & 2 deletions installation/helm-chart/templates/API/api-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ data:
DJANGO_API_HOST: http://{{ .Values.DJANGO_API_HOST }}
DJANGO_ALLOWED_HOSTS: {{ .Values.DJANGO_API_HOST }},{{ .Values.DJANGO_ALLOWED_HOSTS }},127.0.0.1
STAGE: {{ .Values.STAGE }}
MULTI_NODE_MODE: {{ .Values.MULTINODE | quote}}
DEDICATED_SPIDER_NODES: {{ .Values.DEDICATED_SPIDER_NODES | quote}}
SPIDER_NODE_ROLE: {{ .Values.SPIDER_NODE_ROLE | quote }}
NODE_CAPACITY_THRESHOLD: {{ .Values.NODE_CAPACITY_THRESHOLD | quote }}
WORKERS_CAPACITY_THRESHOLD: {{ .Values.WORKERS_CAPACITY_THRESHOLD | quote }}
DISPATCH_RETRY_DELAY: {{ .Values.DISPATCH_RETRY_DELAY | quote }}
RUN_JOBS_PER_LOT: {{ .Values.RUN_JOBS_PER_LOT | quote }}
BUILD: {{ .Values.BUILD }}
Expand Down
8 changes: 7 additions & 1 deletion installation/helm-chart/values.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ DJANGO_ALLOWED_HOSTS: ""
DJANGO_EXTERNAL_APPS: "" # "app_1,app_2,..."
EXTERNAL_APP_KEYS: "" # "key_1,key_2,..."
EXTERNAL_MIDDLEWARES: "" # "app1.middleware,app2.middleware"
MULTINODE: "" # "False"
DEDICATED_SPIDER_NODES: "" # "True"
BUILD: "" # "default"

# Celery
Expand Down Expand Up @@ -107,6 +107,12 @@ SIZE_THRESHOLD: ""
INSERT_TIME_THRESHOLD: ""
ACTIVITY_TIME_THRESHOLD: ""

############ SPIDER JOB RESOURCES #########
SPIDER_NODE_ROLE: "bitmaker-worker"
WORKERS_CAPACITY_THRESHOLD: "0.95"
DISPATCH_RETRY_DELAY: "30"
RUN_JOBS_PER_LOT: "100"

############ RESOURCES LIMITS #############
API_CPU_LIMIT: "" # "250m"
API_MEM_LIMIT: "" # "1Gi"
Expand Down