diff --git a/README.md b/README.md index 627de1fae6f..fed00d814ef 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,11 @@ KubeRay provides several tools to improve running and managing Ray's experience - Operator Integration with Kubernetes node problem detector (future work) - Kubernetes based workspace to easily submit ray jobs (future work) +## Use Helm chart + +A Helm chart is a collection of files that describe a related set of Kubernetes resources. It helps users deploy the Ray operator and Ray clusters conveniently. +Please read [kubray-operator](helm-chart/kubray-operator/README.md) to deploy an operator and [ray-cluster](helm-chart/ray-cluster/README.md) to deploy a custom cluster. + ## Development Please read our [CONTRIBUTING](CONTRIBUTING.md) guide before making a pull request. Refer to our [DEVELOPMENT](./ray-operator/DEVELOPMENT.md) to build and run tests locally. diff --git a/helm-chart/kubray-operator/.helmignore b/helm-chart/kubray-operator/.helmignore new file mode 100644 index 00000000000..50af0317254 --- /dev/null +++ b/helm-chart/kubray-operator/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line.
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-chart/kubray-operator/Chart.yaml b/helm-chart/kubray-operator/Chart.yaml new file mode 100644 index 00000000000..60a0af1232d --- /dev/null +++ b/helm-chart/kubray-operator/Chart.yaml @@ -0,0 +1,7 @@ +apiVersion: v2 +appVersion: "1.0" +description: A Helm chart for Kubernetes +name: kubray-operator +version: 0.1.0 +icon: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png +type: application diff --git a/helm-chart/kubray-operator/README.md b/helm-chart/kubray-operator/README.md new file mode 100644 index 00000000000..f3ee2ad87a5 --- /dev/null +++ b/helm-chart/kubray-operator/README.md @@ -0,0 +1,40 @@ +# Ray Operator + +KubRay-Operator: A simple Helm chart + +Run a deployment of Ray Operator. + +Deploy the Ray operator first, then deploy a Ray cluster. + +## Helm + +Make sure your Helm version is v3+: +```console +$ helm version +version.BuildInfo{Version:"v3.6.2", GitCommit:"ee407bdf364942bcb8e8c665f82e15aa28009b71", GitTreeState:"dirty", GoVersion:"go1.16.5"} +``` + +## Installing the Chart + +Please use the command below: +```console +$ helm install kubray-operator . --values values.yaml --namespace default --create-namespace +``` +## List the Chart + +To list the `kubray-operator` release: + +```console +$ helm list -n default +``` + +## Uninstalling the Chart + +To uninstall/delete the `kubray-operator` release: + +```console +$ helm delete kubray-operator +``` + +The command removes nearly all the Kubernetes components associated with the +chart and deletes the release.
\ No newline at end of file
diff --git a/helm-chart/kubray-operator/templates/_helpers.tpl b/helm-chart/kubray-operator/templates/_helpers.tpl
new file mode 100644
index 00000000000..1f45d8237f7
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/_helpers.tpl
@@ -0,0 +1,56 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "kubray-operator.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "kubray-operator.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "kubray-operator.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Common labels
+*/}}
+{{- define "kubray-operator.labels" -}}
+app.kubernetes.io/name: {{ include "kubray-operator.name" . }}
+helm.sh/chart: {{ include "kubray-operator.chart" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "kubray-operator.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create -}}
+    {{ default (include "kubray-operator.fullname" .) .Values.serviceAccount.name }}
+{{- else -}}
+    {{ default "default" .Values.serviceAccount.name }}
+{{- end -}}
+{{- end -}}
diff --git a/helm-chart/kubray-operator/templates/deployment.yaml b/helm-chart/kubray-operator/templates/deployment.yaml
new file mode 100644
index 00000000000..c5c5aed20f7
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/deployment.yaml
@@ -0,0 +1,65 @@
+# Deployment for the operator; runs the /manager binary from the configured image.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "kubray-operator.fullname" . }}
+  labels:
+{{ include "kubray-operator.labels" . | indent 4 }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "kubray-operator.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kubray-operator.name" . }}
+        app.kubernetes.io/instance: {{ .Release.Name }}
+    spec:
+    {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+      # Resolved through the shared helper so the Deployment always references the
+      # same ServiceAccount that serviceaccount.yaml creates (also when name is empty).
+      serviceAccountName: {{ include "kubray-operator.serviceAccountName" . }}
+      volumes: []
+      securityContext:
+        {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          volumeMounts: []
+          command:
+            - /manager
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          env: []
+          livenessProbe:
+            httpGet:
+              path: /metrics
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /metrics
+              port: http
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
diff --git a/helm-chart/kubray-operator/templates/role.yaml b/helm-chart/kubray-operator/templates/role.yaml
new file mode 100644
index 00000000000..9ffdbdc6821
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/role.yaml
@@ -0,0 +1,53 @@
+{{- if .Values.rbacEnable }}
+# Cluster-wide permissions bound to the operator's ServiceAccount by rolebinding.yaml.
+# NOTE(review): no rule covers the ray.io API group (RayCluster resources); confirm the operator does not need one.
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  labels:
+{{ include "kubray-operator.labels" . | indent 4 }}
+  name: {{ include "kubray-operator.fullname" . }}
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - pods/log
+  - pods/status
+  - pods/exec
+  - services
+  - endpoints
+  - persistentvolumeclaims
+  - events
+  - configmaps
+  - secrets
+  verbs:
+  - "*"
+- apiGroups:
+  - ""
+  resources:
+  - namespaces
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - apps
+  resources:
+  - deployments
+  - deployments/status
+  - daemonsets
+  - replicasets
+  - statefulsets
+  - statefulsets/status
+  verbs:
+  - "*"
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  verbs:
+  - get
+  - list
+  - watch
+{{- end }}
diff --git a/helm-chart/kubray-operator/templates/rolebinding.yaml b/helm-chart/kubray-operator/templates/rolebinding.yaml
new file mode 100644
index 00000000000..5c0f17cdf56
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/rolebinding.yaml
@@ -0,0 +1,17 @@
+{{- if .Values.rbacEnable }}
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  labels:
+{{ include "kubray-operator.labels" . | indent 4 }}
+  name: {{ include "kubray-operator.fullname" . }}
+subjects:
+- kind: ServiceAccount
+  # Use the helper so the binding targets the account name the chart actually resolves.
+  name: {{ include "kubray-operator.serviceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: ClusterRole
+  name: {{ include "kubray-operator.fullname" . }}
+  apiGroup: rbac.authorization.k8s.io
+{{- end }}
diff --git a/helm-chart/kubray-operator/templates/service.yaml b/helm-chart/kubray-operator/templates/service.yaml
new file mode 100644
index 00000000000..59986fa2072
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "kubray-operator.fullname" . }}
+  labels:
+{{ include "kubray-operator.labels" . | indent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    app.kubernetes.io/name: {{ include "kubray-operator.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
diff --git a/helm-chart/kubray-operator/templates/serviceaccount.yaml b/helm-chart/kubray-operator/templates/serviceaccount.yaml
new file mode 100644
index 00000000000..f36ae842282
--- /dev/null
+++ b/helm-chart/kubray-operator/templates/serviceaccount.yaml
@@ -0,0 +1,8 @@
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ template "kubray-operator.serviceAccountName" . }}
+  labels:
+{{ include "kubray-operator.labels" . | indent 4 }}
+{{- end -}}
diff --git a/helm-chart/kubray-operator/values.yaml b/helm-chart/kubray-operator/values.yaml
new file mode 100644
index 00000000000..e5bb900fc3e
--- /dev/null
+++ b/helm-chart/kubray-operator/values.yaml
@@ -0,0 +1,47 @@
+# Default values for kubray-operator.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+replicaCount: 1
+
+image:
+  repository: docker.io/library/controller
+  tag: latest
+  pullPolicy: IfNotPresent
+
+nameOverride: "kubray-operator"
+fullnameOverride: "kubray-operator"
+
+## Install Default RBAC roles and bindings
+rbac:
+  create: true
+  apiVersion: v1
+
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # The name of the service account to use.
+ # If not set and create is true, a name is generated using the fullname template + name: "ray-operator" + +service: + type: ClusterIP + port: 8080 + +ingress: + enabled: false + +resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, adjust the limits below + # as necessary and uncomment the 'requests:' block. + limits: + cpu: 100m + memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +createCustomResource: true +rbacEnable: true diff --git a/helm-chart/ray-cluster/.helmignore b/helm-chart/ray-cluster/.helmignore new file mode 100644 index 00000000000..50af0317254 --- /dev/null +++ b/helm-chart/ray-cluster/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-chart/ray-cluster/Chart.yaml b/helm-chart/ray-cluster/Chart.yaml new file mode 100644 index 00000000000..558afcdc7f9 --- /dev/null +++ b/helm-chart/ray-cluster/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +appVersion: "1.0" +description: A Helm chart for Kubernetes +name: ray-cluster +version: 0.1.0 +icon: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png diff --git a/helm-chart/ray-cluster/README.md b/helm-chart/ray-cluster/README.md new file mode 100644 index 00000000000..e7aa75b7ecf --- /dev/null +++ b/helm-chart/ray-cluster/README.md @@ -0,0 +1,65 @@ +# Ray Cluster + +Make sure ray-operator has been deployed.
+ +[Ray](https://ray.io/) is an open source framework that provides a simple, universal API for building distributed applications. Ray is packaged with RLlib, a scalable reinforcement learning library, and Tune, a scalable hyperparameter tuning library. + +## Helm + +```console +$ helm version +version.BuildInfo{Version:"v3.6.2", GitCommit:"ee407bdf364942bcb8e8c665f82e15aa28009b71", GitTreeState:"dirty", GoVersion:"go1.16.5"} +``` + +## TL;DR; + +```console +helm install ray-cluster . --values values.yaml --namespace default +``` + +## Installing the Chart + +To install the chart with the release name `sample`: + + +```console +helm install sample ray-cluster --values ray-cluster/values.yaml --namespace default +``` + +> note: The chart will submit a RayCluster. + + +## Uninstalling the Chart + +To uninstall/delete the `ray-cluster` release: + +```console +helm delete ray-cluster +``` + +The command removes nearly all the Kubernetes components associated with the +chart and deletes the release.
+ +## Check Cluster status + +### Get Service + +```console +$ kubectl get svc -l ray.io/cluster=ray-cluster +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +ray-cluster-head-svc ClusterIP 10.103.36.68 <none> 10001/TCP,6379/TCP,8265/TCP 9m24s +``` + +## Forward to dashboard + +```console +$ kubectl get pod -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +ray-cluster-head-sd77l 1/1 Running 0 8h 10.1.61.208 docker-desktop <none> <none> +ray-cluster-worker-workergroup-czxd6 1/1 Running 0 8h 10.1.61.207 docker-desktop <none> <none> +kubray-operator-687785b964-jgfhv 1/1 Running 6 3d4h 10.1.61.196 docker-desktop <none> <none> + +$ kubectl port-forward ray-cluster-head-sd77l 8265 +Forwarding from 127.0.0.1:8265 -> 8265 +Forwarding from [::1]:8265 -> 8265 +``` diff --git a/helm-chart/ray-cluster/templates/_helpers.tpl b/helm-chart/ray-cluster/templates/_helpers.tpl new file mode 100644 index 00000000000..38d0a90d5fa --- /dev/null +++ b/helm-chart/ray-cluster/templates/_helpers.tpl @@ -0,0 +1,56 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "ray-cluster.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "ray-cluster.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label.
+*/}}
+{{- define "ray-cluster.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Common labels
+*/}}
+{{- define "ray-cluster.labels" -}}
+app.kubernetes.io/name: {{ include "ray-cluster.name" . }}
+helm.sh/chart: {{ include "ray-cluster.chart" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "ray-cluster.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create -}}
+    {{ default (include "ray-cluster.fullname" .) .Values.serviceAccount.name }}
+{{- else -}}
+    {{ default "default" .Values.serviceAccount.name }}
+{{- end -}}
+{{- end -}}
diff --git a/helm-chart/ray-cluster/templates/raycluster-cluster.yaml b/helm-chart/ray-cluster/templates/raycluster-cluster.yaml
new file mode 100644
index 00000000000..e0458a775bc
--- /dev/null
+++ b/helm-chart/ray-cluster/templates/raycluster-cluster.yaml
@@ -0,0 +1,74 @@
+# RayCluster resource with one head group and one worker group, filled in from values.yaml.
+apiVersion: ray.io/v1alpha1
+kind: RayCluster
+metadata:
+  labels:
+{{ include "ray-cluster.labels" . | indent 4 }}
+  name: {{ include "ray-cluster.fullname" . }}
+spec:
+  headGroupSpec:
+    serviceType: ClusterIP
+    rayStartParams:
+    {{- range $key, $val := .Values.head.initArgs }}
+      {{ $key }}: {{ $val | quote }}
+    {{- end }}
+    replicas: {{ .Values.head.replicas }}
+    template:
+      spec:
+        containers:
+          - volumeMounts: {{- toYaml .Values.head.volumeMounts | nindent 12 }}
+            name: ray-head
+            image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+            imagePullPolicy: {{ .Values.image.pullPolicy }}
+            resources: {{- toYaml .Values.head.resources | nindent 14 }}
+            env:
+              - name: TYPE
+                value: head
+            {{- toYaml .Values.head.containerEnv | nindent 14}}
+        volumes: {{- toYaml .Values.head.volumes | nindent 10 }}
+        affinity: {{- toYaml .Values.head.affinity | nindent 10 }}
+      metadata:
+        # Pod metadata field is the plural "annotations"; the singular key is not part of ObjectMeta.
+        annotations: {{- toYaml .Values.head.annotations | nindent 10 }}
+        labels:
+          groupName: {{ .Values.head.groupName }}
+          rayNodeType: {{ .Values.head.type }}
+          rayCluster: {{ include "ray-cluster.fullname" . }}
+{{ include "ray-cluster.labels" . | indent 10 }}
+
+  workerGroupSpecs:
+  - rayStartParams:
+    {{- range $key, $val := .Values.worker.initArgs }}
+      {{ $key }}: {{ $val | quote }}
+    {{- end }}
+    replicas: {{ .Values.worker.replicas }}
+    minReplicas: {{ .Values.worker.miniReplicas | default 1 }}
+    maxReplicas: {{ .Values.worker.maxiReplicas | default 2147483647 }}
+    groupName: {{ .Values.worker.groupName }}
+    template:
+      spec:
+        initContainers:
+          - name: init-myservice
+            image: busybox:1.28
+            command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
+        containers:
+          - volumeMounts: {{- toYaml .Values.worker.volumeMounts | nindent 12 }}
+            name: ray-worker
+            image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+            imagePullPolicy: {{ .Values.image.pullPolicy }}
+            resources: {{- toYaml .Values.worker.resources | nindent 14 }}
+            env:
+              - name: TYPE
+                value: worker
+            {{- toYaml .Values.worker.containerEnv | nindent 14}}
+            ports: {{- toYaml .Values.worker.ports | nindent 14}}
+        volumes: {{- toYaml .Values.worker.volumes | nindent 10 }}
+        affinity: {{- toYaml .Values.worker.affinity | nindent 10 }}
+      metadata:
+        # Plural "annotations" here as well, so worker.annotations from values reaches the pods.
+        annotations: {{- toYaml .Values.worker.annotations | nindent 10 }}
+        labels:
+          rayNodeType: {{ .Values.worker.type }}
+          groupName: {{ .Values.worker.groupName }}
+          rayCluster: {{ include "ray-cluster.fullname" . }}
+{{ include "ray-cluster.labels" . | indent 10 }}
\ No newline at end of file
diff --git a/helm-chart/ray-cluster/values.yaml b/helm-chart/ray-cluster/values.yaml
new file mode 100644
index 00000000000..d9c1e2da126
--- /dev/null
+++ b/helm-chart/ray-cluster/values.yaml
@@ -0,0 +1,100 @@
+# Default values for ray-cluster.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+image:
+  repository: rayproject/ray
+  tag: latest
+  pullPolicy: IfNotPresent
+
+nameOverride: "ray"
+fullnameOverride: ""
+
+head:
+  groupName: headgroup
+  replicas: 1
+  type: head
+  labels:
+    key: value
+  initArgs:
+    port: '6379'
+    object-manager-port: '12345'
+    node-manager-port: '12346'
+    object-store-memory: '100000000'
+    redis-password: 'LetMeInRay'
+    dashboard-host: '0.0.0.0'
+    num-cpus: '1' # can be auto-completed from the limits
+    node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
+  containerEnv:
+  - name: MY_POD_IP
+    valueFrom:
+      fieldRef:
+        fieldPath: status.podIP
+  resources:
+    limits:
+      cpu: 1
+    requests:
+      cpu: 200m
+  annotations: {}
+  volumes:
+    - name: config
+      configMap:
+        name: ray-code
+        items:
+          - key: code.py
+            path: code.py
+  volumeMounts:
+    - mountPath: /opt
+      name: config
+
+
+worker:
+  groupName: workergroup
+  replicas: 1
+  type: worker
+  labels:
+    key: value
+  initArgs:
+    node-ip-address: $MY_POD_IP
+    redis-password: LetMeInRay
+  containerEnv:
+  - name: MY_POD_IP
+    valueFrom:
+      fieldRef:
+        fieldPath: status.podIP
+  - name: RAY_DISABLE_DOCKER_CPU_WARNING
+    value: "1"
+  - name: CPU_REQUEST
+    valueFrom:
+      resourceFieldRef:
+        containerName: ray-worker
+        resource: requests.cpu
+  - name: MY_POD_NAME
+    valueFrom:
+      fieldRef:
+        fieldPath: metadata.name
+  ports:
+  - containerPort: 80
+    protocol: TCP
+  resources:
+    limits:
+      cpu: 1
+    requests:
+      cpu: 200m
+  annotations:
+    key: value
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
+  volumes:
+    - name: log-volume
+      emptyDir: {}
+  volumeMounts:
+    - mountPath: /var/log
+      name: log-volume
+
+headServiceSuffix: "ray-operator.svc"
+
+service:
+  type: ClusterIP
+  port: 8080