---
# Namespaces required before operator installation.
# All carry the ArgoCD PreSync hook so they exist before the main sync wave.
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    openshift.io/display-name: NVIDIA GPU Operator
  labels:
    # opt namespace into OpenShift cluster monitoring (string, not bool)
    openshift.io/cluster-monitoring: 'true'
  name: nvidia-gpu-operator
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    openshift.io/display-name: Node Feature Discovery Operator
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: openshift-nfd
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    openshift.io/display-name: Red Hat OpenShift Serverless
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: openshift-serverless
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    # ServerSideApply avoids last-applied-configuration size/ownership issues
    argocd.argoproj.io/sync-options: ServerSideApply=true
    openshift.io/display-name: OpenShift AI - Main Applications
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: redhat-ods-applications
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/sync-options: ServerSideApply=true
    openshift.io/display-name: OpenShift AI - Monitoring
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: redhat-ods-monitoring
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    openshift.io/display-name: Red Hat OpenShift AI
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: redhat-ods-operator
---
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/sync-options: ServerSideApply=true
    openshift.io/display-name: OpenShift AI - Individual Notebooks
  labels:
    openshift.io/cluster-monitoring: 'true'
  name: rhods-notebooks
---
# ServiceAccounts for the PreSync hook Jobs defined elsewhere in this app.
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aro-gpu-machineset
  namespace: nvidia-gpu-operator
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aws-gpu-machineset
  namespace: nvidia-gpu-operator
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-gpu-console-plugin
  namespace: nvidia-gpu-operator
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    autoscale: config
  name: job-setup-autoscale
  namespace: openshift-machine-api
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-pipelines-console-plugin
  namespace: openshift-operators
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-dashboard-magic
  namespace: redhat-ods-applications
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: approve-after-servicemesh
  namespace: redhat-ods-operator
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-operator-scale
  namespace: redhat-ods-operator
---
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/sync-options: ServerSideApply=true
  name: wait-for-servicemesh
  namespace: redhat-ods-operator
---
# Role: lets the fix-dashboard-magic hook Job scale/patch the rhods-dashboard
# Deployment and recycle its pods.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-dashboard-magic
  namespace: redhat-ods-applications
rules:
  - apiGroups:
      - apps
    resourceNames:
      - rhods-dashboard
    resources:
      - deployments
      - deployments/scale
    verbs:
      - get
      - list
      - patch
  - apiGroups:
      - ''
    resources:
      - pods
    verbs:
      - get
      - list
      - delete
---
# Role: lets the fix-operator-scale hook Job manage CSVs and recycle pods in
# the operator namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-operator-scale
  namespace: redhat-ods-operator
rules:
  - apiGroups:
      - operators.coreos.com
    resources:
      - clusterserviceversions
    verbs:
      - '*'
  - apiGroups:
      - ''
    resources:
      - pods
    verbs:
      - get
      - list
      - delete
---
# ClusterRole: approve-after-servicemesh hook Job — watches CRDs, approves
# OLM InstallPlans, and cleans up its sibling Jobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: approve-after-servicemesh
rules:
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    verbs:
      - get
      - list
  - apiGroups:
      - operators.coreos.com
    resources:
      - subscriptions
      - installplans
    verbs:
      - get
      - list
      - patch
  - apiGroups:
      - batch
    resourceNames:
      - approve-after-servicemesh
      - wait-for-servicemesh
    resources:
      - jobs
    verbs:
      - get
      - list
      - delete
---
# ClusterRole: ARO GPU machineset Job — manages MachineSets/autoscalers and
# reads the azure-credentials secret to detect an ARO cluster.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aro-gpu-machineset
rules:
  - apiGroups:
      - machine.openshift.io
    resources:
      - machinesets
    verbs:
      - '*'
  - apiGroups:
      - autoscaling.openshift.io
    resources:
      - machineautoscalers
    verbs:
      - '*'
  - apiGroups:
      - ''
    resourceNames:
      - azure-credentials
    resources:
      - secrets
    verbs:
      - get
      - list
---
# ClusterRole: AWS GPU machineset Job — same as ARO variant but keyed off the
# aws-creds secret.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aws-gpu-machineset
rules:
  - apiGroups:
      - machine.openshift.io
    resources:
      - machinesets
    verbs:
      - '*'
  - apiGroups:
      - autoscaling.openshift.io
    resources:
      - machineautoscalers
    verbs:
      - '*'
  - apiGroups:
      - ''
    resourceNames:
      - aws-creds
    resources:
      - secrets
    verbs:
      - get
      - list
---
# ClusterRole: enable the GPU console plugin on the cluster Console operator.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-gpu-console-plugin
rules:
  - apiGroups:
      - operator.openshift.io
    resources:
      - consoles
    verbs:
      - get
      - list
      - patch
      # NOTE(review): 'label' is not a standard API verb — kept from the
      # original; confirm intent (it is inert for RBAC evaluation).
      - label
---
# ClusterRole: enable the Pipelines console plugin on the Console operator.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-pipelines-console-plugin
rules:
  - apiGroups:
      - operator.openshift.io
    resources:
      - consoles
    verbs:
      - get
      - list
      - patch
      # NOTE(review): 'label' is not a standard API verb — kept from the
      # original; confirm intent.
      - label
---
# ClusterRole: autoscale setup Job — manages MachineSets/autoscalers on both
# cloud types (reads either credential secret to detect the platform).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    autoscale: config
  name: job-setup-autoscale
rules:
  - apiGroups:
      - machine.openshift.io
    resources:
      - machinesets
    verbs:
      - '*'
  - apiGroups:
      - autoscaling.openshift.io
    resources:
      - machineautoscalers
    verbs:
      - '*'
  - apiGroups:
      - ''
    resourceNames:
      - aws-creds
      - azure-credentials
    resources:
      - secrets
    verbs:
      - get
      - list
---
# ClusterRole: wait-for-servicemesh Job — read-only watch for CRDs to appear.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/sync-options: ServerSideApply=true
  name: wait-for-servicemesh
rules:
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    verbs:
      - get
      - list
---
# RoleBindings tying each hook Job's ServiceAccount to its namespaced Role.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-dashboard-magic
  namespace: redhat-ods-applications
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: fix-dashboard-magic
subjects:
  - kind: ServiceAccount
    name: fix-dashboard-magic
    namespace: redhat-ods-applications
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-operator-scale
  namespace: redhat-ods-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: fix-operator-scale
subjects:
  - kind: ServiceAccount
    name: fix-operator-scale
    namespace: redhat-ods-operator
---
# ClusterRoleBindings tying hook-Job ServiceAccounts to their ClusterRoles.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: approve-after-servicemesh
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: approve-after-servicemesh
subjects:
  - kind: ServiceAccount
    name: approve-after-servicemesh
    namespace: redhat-ods-operator
---
# NOTE(review): grants cluster-admin to kube:admin — broad privilege; confirm
# this is intended for the target (demo) environment.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: fix-rhoai-kubeadmin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - apiGroup: rbac.authorization.k8s.io
    kind: User
    # quoted: value contains a colon
    name: 'kube:admin'
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aro-gpu-machineset
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-aro-gpu-machineset
subjects:
  - kind: ServiceAccount
    name: job-aro-gpu-machineset
    namespace: nvidia-gpu-operator
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aws-gpu-machineset
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-aws-gpu-machineset
subjects:
  - kind: ServiceAccount
    name: job-aws-gpu-machineset
    namespace: nvidia-gpu-operator
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-gpu-console-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-gpu-console-plugin
subjects:
  - kind: ServiceAccount
    name: job-gpu-console-plugin
    namespace: nvidia-gpu-operator
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-pipelines-console-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-pipelines-console-plugin
subjects:
  - kind: ServiceAccount
    name: job-pipelines-console-plugin
    namespace: openshift-operators
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    autoscale: config
  name: job-setup-autoscale
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-setup-autoscale
subjects:
  - kind: ServiceAccount
    name: job-setup-autoscale
    namespace: openshift-machine-api
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/sync-options: ServerSideApply=true
  name: wait-for-servicemesh
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: wait-for-servicemesh
subjects:
  - kind: ServiceAccount
    name: wait-for-servicemesh
    namespace: redhat-ods-operator
---
# ConfigMap: DCGM metrics list consumed by the NVIDIA console plugin.
# Converted from a single-quoted flow scalar (whose blank lines were lost in
# the export, which would fold all rows onto one line) to a literal block
# scalar so each CSV row keeps its own line.
apiVersion: v1
data:
  dcgm-metrics.csv: |
    # see https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/dcp-metrics-included.csv
    DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
    DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
    DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
    DCGM_FI_DEV_FB_FREE, gauge, mem free.
    DCGM_FI_DEV_FB_USED, gauge, mem used.
    DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
    DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
    DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit.
    DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
    DCGM_FI_DEV_SM_CLOCK, gauge, sm clock.
    DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock.
    DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock.
    DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock.
kind: ConfigMap
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.kubernetes.io/version: latest
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.4
  name: console-plugin-nvidia-gpu
  namespace: nvidia-gpu-operator
---
# ConfigMap: NVIDIA device-plugin configuration profiles (GPU time-slicing).
# Escaped double-quoted scalars converted to readable block scalars.
# NOTE(review): the embedded YAML indentation was reconstructed from a
# whitespace-collapsed export following the NVIDIA time-slicing config
# schema — confirm against the upstream catalog.
apiVersion: v1
data:
  default: 'version: v1'
  time-sliced-2: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 2
  time-sliced-4: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 4
  time-sliced-99: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 99
kind: ConfigMap
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: device-plugin-config
  namespace: nvidia-gpu-operator
---
# ConfigMap: scripts for the PreSync Job that provisions a GPU MachineSet on
# Azure Red Hat OpenShift (ARO).
# NOTE(review): recovered from a whitespace-collapsed UI export. The escaped
# shell scripts were reconstructed into literal block scalars; command text
# and logic are preserved from the export, while blank lines and internal
# indentation (cosmetic for bash) are reconstructed — confirm against
# github.com/redhat-na-ssa/demo-ai-gitops-catalog.
apiVersion: v1
data:
  job.sh: |
    #!/bin/bash
    # shellcheck disable=SC1091
    . /scripts/ocp.sh

    INSTANCE_TYPE=${INSTANCE_TYPE:-Standard_NC4as_T4_v3}

    # only act on ARO clusters; exit quietly elsewhere
    ocp_aro_cluster || exit 0

    ocp_aro_machineset_create_gpu "${INSTANCE_TYPE}"
    ocp_machineset_create_autoscale
    # ocp_machineset_taint_gpu
  ocp.sh: |
    #!/bin/bash
    # shellcheck disable=SC2120

    # See https://github.com/redhat-na-ssa/demo-ai-gitops-catalog
    # FUNCTIONS='
    # ocp_aro_cluster
    # ocp_aro_machineset_create_gpu
    # ocp_aro_machineset_clone_worker
    # ocp_aro_machineset_fix_storage
    # ocp_machineset_create_autoscale
    # ocp_machineset_taint_gpu
    # '

    # for function in ${FUNCTIONS}
    # do
    #   function_extract $function scripts/library/ocp*.sh >> tmp
    #   echo >> tmp
    # done

    # create a MachineAutoscaler (min/max replicas) for every machine set
    ocp_machineset_create_autoscale(){
      MACHINE_MIN=${1:-0}
      MACHINE_MAX=${2:-4}
      MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}

      for machine_set in ${MACHINE_SETS}
      do
    cat << YAML | oc apply -f -
    apiVersion: "autoscaling.openshift.io/v1beta1"
    kind: "MachineAutoscaler"
    metadata:
      name: "${machine_set}"
      namespace: "openshift-machine-api"
    spec:
      minReplicas: ${MACHINE_MIN}
      maxReplicas: ${MACHINE_MAX}
      scaleTargetRef:
        apiVersion: machine.openshift.io/v1beta1
        kind: MachineSet
        name: "${machine_set}"
    YAML
      done
    }

    ocp_machineset_taint_gpu(){
      SHORT_NAME=${1:-g4dn}
      MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${SHORT_NAME}" | head -n1)

      echo "Patching: ${MACHINE_SET}"

      # taint nodes for gpu-only workloads
      oc -n openshift-machine-api \
        patch "${MACHINE_SET}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
    }

    # detect ARO by the presence of the azure-credentials secret
    ocp_aro_cluster(){
      TARGET_NS=kube-system
      OBJ=secret/azure-credentials
      echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
      oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
      echo "ARO cluster detected"
    }

    ocp_aro_machineset_create_gpu(){
      # https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nv-family

      INSTANCE_TYPE=${1:-Standard_NC64as_T4_v3}
      SHORT_NAME=${2:-${INSTANCE_TYPE//_/-}}
      SHORT_NAME=${SHORT_NAME,,}

      ocp_aro_machineset_clone_worker "${INSTANCE_TYPE}"

      MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "/${SHORT_NAME}" | head -n1)

      echo "Patching: ${MACHINE_SET_TYPE}"

      # cosmetic
      oc -n openshift-machine-api \
        patch "${MACHINE_SET_TYPE}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

      # should use the default profile
      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'

      # should help auto provisioner
      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'

      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'

      oc -n openshift-machine-api \
        patch "${MACHINE_SET_TYPE}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"vmSize":"'"${INSTANCE_TYPE}"'"}}}}}}'
    }

    # clone the first worker machine set, renamed for the instance type,
    # with replicas: 0 and server-generated fields stripped
    ocp_aro_machineset_clone_worker(){
      [ -z "${1}" ] && \
      echo "
      usage: ocp_aro_machineset_clone_worker < instance type, default Standard_D4s_v3 > < machine set name >
      "

      INSTANCE_TYPE=${1:-Standard_D4s_v3}
      SHORT_NAME=${2:-${INSTANCE_TYPE//_/-}}
      SHORT_NAME=${SHORT_NAME,,}

      MACHINE_SET_NAME=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "/${SHORT_NAME}" | head -n1)
      MACHINE_SET_WORKER=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)

      # check for an existing instance machine set
      if [ -n "${MACHINE_SET_NAME}" ]; then
        echo "Exists: machineset - ${MACHINE_SET_NAME}"
      else
        echo "Creating: machineset - ${SHORT_NAME}"

        oc -n openshift-machine-api \
          get "${MACHINE_SET_WORKER}" -o yaml | \
          sed '/machine/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            /^  name:/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            /name/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            s/vmSize.*/vmSize: '"${INSTANCE_TYPE}"'/
            /cluster-api-autoscaler/d
            /uid:/d
            /generation:/d
            /resourceVersion:/d
            /creationTimestamp:/d
            s/replicas.*/replicas: 0/' | \
          oc apply -f -

        MACHINE_SET_NAME="machinesets.machine.openshift.io/${SHORT_NAME}"
      fi

      # cosmetic pretty
      oc -n openshift-machine-api \
        patch "${MACHINE_SET_NAME}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/'"${SHORT_NAME}"'":""}}}}}}'
    }
kind: ConfigMap
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aro-gpu-machineset
  namespace: nvidia-gpu-operator
---
# ConfigMap: scripts for the PreSync Job that provisions a GPU MachineSet on
# AWS OpenShift clusters.
# NOTE(review): recovered from a whitespace-collapsed UI export. The escaped
# shell scripts were reconstructed into literal block scalars; command text
# and logic are preserved from the export, while blank lines and internal
# indentation (cosmetic for bash) are reconstructed — confirm against
# github.com/redhat-na-ssa/demo-ai-gitops-catalog.
apiVersion: v1
data:
  job.sh: |
    #!/bin/bash
    # shellcheck disable=SC1091
    . /scripts/ocp.sh

    INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}

    # only act on AWS clusters; exit quietly elsewhere
    ocp_aws_cluster || exit 0

    ocp_aws_machineset_create_gpu "${INSTANCE_TYPE}"
    ocp_machineset_create_autoscale
    ocp_aws_machineset_fix_storage
    # ocp_machineset_taint_gpu
  ocp.sh: |
    #!/bin/bash
    # shellcheck disable=SC2120

    # See https://github.com/redhat-na-ssa/demo-ai-gitops-catalog
    # FUNCTIONS='
    # ocp_aws_cluster
    # ocp_aws_machineset_create_gpu
    # ocp_aws_machineset_clone_worker
    # ocp_aws_machineset_fix_storage
    # ocp_machineset_create_autoscale
    # ocp_machineset_taint_gpu
    # '

    # for function in ${FUNCTIONS}
    # do
    #   function_extract $function scripts/library/ocp*.sh >> tmp
    #   echo >> tmp
    # done

    # detect AWS by the presence of the aws-creds secret
    ocp_aws_cluster(){
      TARGET_NS=kube-system
      OBJ=secret/aws-creds
      echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
      oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
      echo "AWS cluster detected"
    }

    ocp_aws_machineset_create_gpu(){
      # https://aws.amazon.com/ec2/instance-types/g4
      # single gpu: g4dn.{2,4,8,16}xlarge
      # multi gpu: g4dn.12xlarge
      # practical: g4ad.4xlarge
      # a100 (MIG): p4d.24xlarge
      # h100 (MIG): p5.48xlarge

      # https://aws.amazon.com/ec2/instance-types/dl1
      # 8 x gaudi: dl1.24xlarge

      INSTANCE_TYPE=${1:-g4dn.4xlarge}

      ocp_aws_machineset_clone_worker "${INSTANCE_TYPE}"

      MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)

      echo "Patching: ${MACHINE_SET_TYPE}"

      # cosmetic
      oc -n openshift-machine-api \
        patch "${MACHINE_SET_TYPE}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

      # should use the default profile
      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'

      # should help auto provisioner
      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'

      # oc -n openshift-machine-api \
      #   patch "${MACHINE_SET_TYPE}" \
      #   --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'

      oc -n openshift-machine-api \
        patch "${MACHINE_SET_TYPE}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'

    # # fix storage

    # cat << YAML > /tmp/patch.yaml
    # spec:
    #   template:
    #     spec:
    #       providerSpec:
    #         value:
    #           blockDevices:
    #             - ebs:
    #                 volumeSize: 120
    #                 volumeType: gp3
    # YAML

    #   oc -n openshift-machine-api \
    #     patch "${MACHINE_SET_TYPE}" \
    #     --type=merge --patch "$(cat /tmp/patch.yaml)"
    }

    # clone the first worker machine set, renamed for the instance type,
    # with replicas: 0 and server-generated fields stripped
    ocp_aws_machineset_clone_worker(){
      [ -z "${1}" ] && \
      echo "
      usage: ocp_aws_machineset_clone_worker < instance type, default g4dn.4xlarge > < machine set name >
      "

      INSTANCE_TYPE=${1:-g4dn.4xlarge}
      SHORT_NAME=${2:-${INSTANCE_TYPE/./-}}

      MACHINE_SET_NAME=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${SHORT_NAME}" | head -n1)
      MACHINE_SET_WORKER=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)

      # check for an existing instance machine set
      if [ -n "${MACHINE_SET_NAME}" ]; then
        echo "Exists: machineset - ${MACHINE_SET_NAME}"
      else
        echo "Creating: machineset - ${SHORT_NAME}"
        oc -n openshift-machine-api \
          get "${MACHINE_SET_WORKER}" -o yaml | \
          sed '/machine/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            /^  name:/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            /name/ s/'"${MACHINE_SET_WORKER##*/}"'/'"${SHORT_NAME}"'/g
            s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
            /cluster-api-autoscaler/d
            /uid:/d
            /generation:/d
            /resourceVersion:/d
            /creationTimestamp:/d
            s/replicas.*/replicas: 0/' | \
          oc apply -f -
      fi

      # fix aws storage
      ocp_aws_machineset_fix_storage "${MACHINE_SET_NAME}"

      # cosmetic pretty
      oc -n openshift-machine-api \
        patch "${MACHINE_SET_NAME}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/'"${SHORT_NAME}"'":""}}}}}}'
    }

    # bump EBS volumes from the 100 GB gp2 default to ${HD_SIZE} GB gp3
    ocp_aws_machineset_fix_storage(){
      MACHINE_SETS=${1:-$(oc -n openshift-machine-api get machineset -o name)}
      HD_SIZE=${2:-200}

      for machine_set in ${MACHINE_SETS}
      do
        echo "Patching aws storage for machineset: ${machine_set}"
        oc -n openshift-machine-api \
          get "${machine_set}" -o yaml | \
          sed 's/volumeSize: 100/volumeSize: '"${HD_SIZE}"'/
            s/volumeType: gp2/volumeType: gp3/' | \
          oc apply -f -
      done
    }

    # create a MachineAutoscaler (min/max replicas) for every machine set
    ocp_machineset_create_autoscale(){
      MACHINE_MIN=${1:-0}
      MACHINE_MAX=${2:-4}
      MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}

      for machine_set in ${MACHINE_SETS}
      do
    cat << YAML | oc apply -f -
    apiVersion: "autoscaling.openshift.io/v1beta1"
    kind: "MachineAutoscaler"
    metadata:
      name: "${machine_set}"
      namespace: "openshift-machine-api"
    spec:
      minReplicas: ${MACHINE_MIN}
      maxReplicas: ${MACHINE_MAX}
      scaleTargetRef:
        apiVersion: machine.openshift.io/v1beta1
        kind: MachineSet
        name: "${machine_set}"
    YAML
      done
    }

    ocp_machineset_taint_gpu(){
      SHORT_NAME=${1:-g4dn}
      MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${SHORT_NAME}" | head -n1)

      echo "Patching: ${MACHINE_SET}"

      # taint nodes for gpu-only workloads
      oc -n openshift-machine-api \
        patch "${MACHINE_SET}" \
        --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
    }
kind: ConfigMap
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-aws-gpu-machineset
  namespace: nvidia-gpu-operator
---
# ConfigMap: script for the PreSync Job that registers the NVIDIA GPU plugin
# with the OpenShift Console operator. PLUGIN_NAME is expected in the Job's
# environment (not set here).
# NOTE(review): reconstructed from a whitespace-collapsed UI export into a
# literal block scalar; command text preserved, indentation reconstructed.
apiVersion: v1
data:
  console-plugin-job.sh: |
    #!/usr/bin/bash

    enable_console_plugin(){
      [ -z "${PLUGIN_NAME}" ] && return 1

      echo "Attempting to enable ${PLUGIN_NAME} plugin"
      echo ""

      # Create the plugins section on the object if it doesn't exist
      if [ -z "$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')" ]; then
        echo "Creating plugins object"
        oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": [] } }' --type=merge
      fi

      INSTALLED_PLUGINS=$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')
      echo "Current plugins:"
      echo "${INSTALLED_PLUGINS}"

      if [[ "${INSTALLED_PLUGINS}" == *"${PLUGIN_NAME}"* ]]; then
        echo "${PLUGIN_NAME} is already enabled"
      else
        echo "Enabling plugin: ${PLUGIN_NAME}"
        oc patch consoles.operator.openshift.io cluster --type=json --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "'"${PLUGIN_NAME}"'"}]'
      fi

      sleep 6
      oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}'
    }

    enable_console_plugin
kind: ConfigMap
metadata:
  annotations:
    argocd.argoproj.io/hook: PreSync
  name: job-gpu-console-plugin
  namespace: nvidia-gpu-operator
ConfigMapv1
nvidia-dcgm-exporter-dashboard
openshift-config-managed
▼ YAML
apiVersion : v1
data :
dcgm-exporter-dashboard.json : "{\n \"__requires\": [\n {\n \"type\": \"\
panel\",\n \"id\": \"gauge\",\n \"name\": \"Gauge\",\n \"version\"\
: \"\"\n },\n {\n \"type\": \"grafana\",\n \"id\": \"grafana\"\
,\n \"name\": \"Grafana\",\n \"version\": \"6.7.3\"\n },\n {\n\
\ \"type\": \"panel\",\n \"id\": \"graph\",\n \"name\": \"Graph\"\
,\n \"version\": \"\"\n },\n {\n \"type\": \"datasource\",\n \
\ \"id\": \"prometheus\",\n \"name\": \"Prometheus\",\n \"version\"\
: \"1.0.0\"\n }\n ],\n \"annotations\": {\n \"list\": [\n {\n \
\ \"$$hashKey\": \"object:192\",\n \"builtIn\": 1,\n \"datasource\"\
: \"-- Grafana --\",\n \"enable\": true,\n \"hide\": true,\n \
\ \"iconColor\": \"rgba(0, 211, 255, 1)\",\n \"name\": \"Annotations\
\ & Alerts\",\n \"type\": \"dashboard\"\n }\n ]\n },\n \"description\"\
: \"This dashboard is to display the metrics from DCGM Exporter on a Kubernetes\
\ (1.19+) cluster\",\n \"editable\": true,\n \"gnetId\": 12239,\n \"graphTooltip\"\
: 0,\n \"id\": null,\n \"iteration\": 1588401887165,\n \"links\": [],\n \"\
panels\": [\n {\n \"aliasColors\": {},\n \"bars\": false,\n \
\ \"dashLength\": 10,\n \"dashes\": false,\n \"datasource\": \"$datasource\"\
,\n \"fill\": 1,\n \"fillGradient\": 0,\n \"gridPos\": {\n \
\ \"h\": 8,\n \"w\": 18,\n \"x\": 0,\n \"y\": 0\n \
\ },\n \"hiddenSeries\": false,\n \"id\": 12,\n \"legend\": {\n\
\ \"alignAsTable\": true,\n \"avg\": true,\n \"current\"\
: true,\n \"max\": true,\n \"min\": false,\n \"rightSide\"\
: true,\n \"show\": true,\n \"total\": false,\n \"values\"\
: true\n },\n \"lines\": true,\n \"linewidth\": 2,\n \"nullPointMode\"\
: \"null\",\n \"options\": {\n \"dataLinks\": []\n },\n \
\ \"percentage\": false,\n \"pointradius\": 2,\n \"points\": false,\n\
\ \"renderer\": \"flot\",\n \"seriesOverrides\": [],\n \"spaceLength\"\
: 10,\n \"stack\": false,\n \"steppedLine\": false,\n \"targets\"\
: [\n {\n \"expr\": \"DCGM_FI_DEV_GPU_TEMP{instance=~\\\"$instance\\\
\", gpu=~\\\"$gpu\\\"}\",\n \"instant\": false,\n \"interval\"\
: \"\",\n \"legendFormat\": \"GPU {{gpu}}\",\n \"refId\": \"\
A\"\n }\n ],\n \"thresholds\": [],\n \"timeFrom\": null,\n\
\ \"timeRegions\": [],\n \"timeShift\": null,\n \"title\": \"GPU\
\ Temperature\",\n \"tooltip\": {\n \"shared\": true,\n \"\
sort\": 0,\n \"value_type\": \"individual\"\n },\n \"type\":\
\ \"graph\",\n \"xaxis\": {\n \"buckets\": null,\n \"mode\"\
: \"time\",\n \"name\": null,\n \"show\": true,\n \"values\"\
: []\n },\n \"yaxes\": [\n {\n \"format\": \"celsius\"\
,\n \"label\": null,\n \"logBase\": 1,\n \"max\": null,\n\
\ \"min\": null,\n \"show\": true\n },\n {\n \
\ \"format\": \"short\",\n \"label\": null,\n \"logBase\"\
: 1,\n \"max\": null,\n \"min\": null,\n \"show\":\
\ true\n }\n ],\n \"yaxis\": {\n \"align\": false,\n \
\ \"alignLevel\": null\n }\n },\n {\n \"datasource\": \"\
$datasource\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \
\ \"x\": 18,\n \"y\": 0\n },\n \"id\": 14,\n \"options\"\
: {\n \"fieldOptions\": {\n \"calcs\": [\n \"mean\"\
\n ],\n \"defaults\": {\n \"color\": {\n \
\ \"mode\": \"thresholds\"\n },\n \"mappings\": [],\n\
\ \"max\": 100,\n \"min\": 0,\n \"thresholds\"\
: {\n \"mode\": \"absolute\",\n \"steps\": [\n \
\ {\n \"color\": \"green\",\n \"value\"\
: null\n },\n {\n \"color\": \"\
#EAB839\",\n \"value\": 83\n },\n \
\ {\n \"color\": \"red\",\n \"value\": 87\n\
\ }\n ]\n },\n \"unit\": \"\
celsius\"\n },\n \"overrides\": [],\n \"values\": false\n\
\ },\n \"orientation\": \"auto\",\n \"showThresholdLabels\"\
: false,\n \"showThresholdMarkers\": true\n },\n \"pluginVersion\"\
: \"6.7.3\",\n \"targets\": [\n {\n \"expr\": \"avg(DCGM_FI_DEV_GPU_TEMP{instance=~\\\
\"$instance\\\", gpu=~\\\"$gpu\\\"})\",\n \"interval\": \"\",\n \
\ \"legendFormat\": \"\",\n \"refId\": \"A\"\n }\n ],\n\
\ \"timeFrom\": null,\n \"timeShift\": null,\n \"title\": \"GPU\
\ Avg. Temp\",\n \"type\": \"gauge\"\n },\n {\n \"aliasColors\"\
: {},\n \"bars\": false,\n \"dashLength\": 10,\n \"dashes\": false,\n\
\ \"datasource\": \"$datasource\",\n \"fill\": 1,\n \"fillGradient\"\
: 0,\n \"gridPos\": {\n \"h\": 8,\n \"w\": 18,\n \"\
x\": 0,\n \"y\": 8\n },\n \"hiddenSeries\": false,\n \"\
id\": 10,\n \"legend\": {\n \"alignAsTable\": true,\n \"avg\"\
: true,\n \"current\": true,\n \"max\": true,\n \"min\":\
\ false,\n \"rightSide\": true,\n \"show\": true,\n \"total\"\
: false,\n \"values\": true\n },\n \"lines\": true,\n \"\
linewidth\": 2,\n \"nullPointMode\": \"null\",\n \"options\": {\n \
\ \"dataLinks\": []\n },\n \"percentage\": false,\n \"pluginVersion\"\
: \"6.5.2\",\n \"pointradius\": 2,\n \"points\": false,\n \"renderer\"\
: \"flot\",\n \"seriesOverrides\": [],\n \"spaceLength\": 10,\n \
\ \"stack\": false,\n \"steppedLine\": false,\n \"targets\": [\n \
\ {\n \"expr\": \"DCGM_FI_DEV_POWER_USAGE{instance=~\\\"$instance\\\
\", gpu=~\\\"$gpu\\\"}\",\n \"interval\": \"\",\n \"legendFormat\"\
: \"GPU {{gpu}}\",\n \"refId\": \"A\"\n }\n ],\n \"\
thresholds\": [],\n \"timeFrom\": null,\n \"timeRegions\": [],\n \
\ \"timeShift\": null,\n \"title\": \"GPU Power Usage\",\n \"tooltip\"\
: {\n \"shared\": true,\n \"sort\": 0,\n \"value_type\":\
\ \"individual\"\n },\n \"type\": \"graph\",\n \"xaxis\": {\n \
\ \"buckets\": null,\n \"mode\": \"time\",\n \"name\": null,\n\
\ \"show\": true,\n \"values\": []\n },\n \"yaxes\": [\n\
\ {\n \"format\": \"watt\",\n \"label\": null,\n \
\ \"logBase\": 1,\n \"max\": null,\n \"min\": null,\n \
\ \"show\": true\n },\n {\n \"format\": \"short\"\
,\n \"label\": null,\n \"logBase\": 1,\n \"max\": null,\n\
\ \"min\": null,\n \"show\": true\n }\n ],\n \
\ \"yaxis\": {\n \"align\": false,\n \"alignLevel\": null\n \
\ }\n },\n {\n \"cacheTimeout\": null,\n \"datasource\": \"\
$datasource\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \
\ \"x\": 18,\n \"y\": 8\n },\n \"id\": 16,\n \"links\"\
: [],\n \"options\": {\n \"fieldOptions\": {\n \"calcs\"\
: [\n \"sum\"\n ],\n \"defaults\": {\n \
\ \"color\": {\n \"mode\": \"thresholds\"\n },\n \
\ \"mappings\": [],\n \"max\": 2400,\n \"min\": 0,\n\
\ \"nullValueMode\": \"connected\",\n \"thresholds\": {\n\
\ \"mode\": \"absolute\",\n \"steps\": [\n \
\ {\n \"color\": \"green\",\n \"value\"\
: null\n },\n {\n \"color\": \"\
#EAB839\",\n \"value\": 1800\n },\n \
\ {\n \"color\": \"red\",\n \"value\":\
\ 2200\n }\n ]\n },\n \"unit\"\
: \"watt\"\n },\n \"overrides\": [],\n \"values\":\
\ false\n },\n \"orientation\": \"horizontal\",\n \"showThresholdLabels\"\
: false,\n \"showThresholdMarkers\": true\n },\n \"pluginVersion\"\
: \"6.7.3\",\n \"targets\": [\n {\n \"expr\": \"sum(DCGM_FI_DEV_POWER_USAGE{instance=~\\\
\"$instance\\\", gpu=~\\\"$gpu\\\"})\",\n \"instant\": true,\n \
\ \"interval\": \"\",\n \"legendFormat\": \"\",\n \"range\"\
: false,\n \"refId\": \"A\"\n }\n ],\n \"timeFrom\"\
: null,\n \"timeShift\": null,\n \"title\": \"GPU Power Total\",\n \
\ \"type\": \"gauge\"\n },\n {\n \"aliasColors\": {},\n \"\
bars\": false,\n \"dashLength\": 10,\n \"dashes\": false,\n \"\
datasource\": \"$datasource\",\n \"fill\": 1,\n \"fillGradient\": 0,\n\
\ \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n\
\ \"y\": 16\n },\n \"hiddenSeries\": false,\n \"id\": 2,\n\
\ \"interval\": \"\",\n \"legend\": {\n \"alignAsTable\": true,\n\
\ \"avg\": true,\n \"current\": true,\n \"max\": true,\n\
\ \"min\": false,\n \"rightSide\": true,\n \"show\": true,\n\
\ \"sideWidth\": null,\n \"total\": false,\n \"values\":\
\ true\n },\n \"lines\": true,\n \"linewidth\": 2,\n \"nullPointMode\"\
: \"null\",\n \"options\": {\n \"dataLinks\": []\n },\n \
\ \"percentage\": false,\n \"pointradius\": 2,\n \"points\": false,\n\
\ \"renderer\": \"flot\",\n \"seriesOverrides\": [],\n \"spaceLength\"\
: 10,\n \"stack\": false,\n \"steppedLine\": false,\n \"targets\"\
: [\n {\n \"expr\": \"DCGM_FI_DEV_SM_CLOCK{instance=~\\\"$instance\\\
\", gpu=~\\\"$gpu\\\"} * 1000000\",\n \"format\": \"time_series\",\n\
\ \"interval\": \"\",\n \"intervalFactor\": 1,\n \"\
legendFormat\": \"GPU {{gpu}}\",\n \"refId\": \"A\"\n }\n \
\ ],\n \"thresholds\": [],\n \"timeFrom\": null,\n \"timeRegions\"\
: [],\n \"timeShift\": null,\n \"title\": \"GPU SM Clocks\",\n \
\ \"tooltip\": {\n \"shared\": true,\n \"sort\": 0,\n \"\
value_type\": \"individual\"\n },\n \"type\": \"graph\",\n \"xaxis\"\
: {\n \"buckets\": null,\n \"mode\": \"time\",\n \"name\"\
: null,\n \"show\": true,\n \"values\": []\n },\n \"yaxes\"\
: [\n {\n \"decimals\": null,\n \"format\": \"hertz\"\
,\n \"label\": \"\",\n \"logBase\": 1,\n \"max\": null,\n\
\ \"min\": null,\n \"show\": true\n },\n {\n \
\ \"format\": \"short\",\n \"label\": null,\n \"logBase\"\
: 1,\n \"max\": null,\n \"min\": null,\n \"show\":\
\ true\n }\n ],\n \"yaxis\": {\n \"align\": false,\n \
\ \"alignLevel\": null\n }\n },\n {\n \"aliasColors\": {},\n\
\ \"bars\": false,\n \"dashLength\": 10,\n \"dashes\": false,\n\
\ \"datasource\": \"$datasource\",\n \"fill\": 1,\n \"fillGradient\"\
: 0,\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"\
x\": 0,\n \"y\": 24\n },\n \"hiddenSeries\": false,\n \"\
id\": 6,\n \"legend\": {\n \"alignAsTable\": true,\n \"avg\"\
: true,\n \"current\": true,\n \"max\": true,\n \"min\":\
\ false,\n \"rightSide\": true,\n \"show\": true,\n \"total\"\
: false,\n \"values\": true\n },\n \"lines\": true,\n \"\
linewidth\": 2,\n \"nullPointMode\": \"null\",\n \"options\": {\n \
\ \"dataLinks\": []\n },\n \"percentage\": false,\n \"pointradius\"\
: 2,\n \"points\": false,\n \"renderer\": \"flot\",\n \"seriesOverrides\"\
: [],\n \"spaceLength\": 10,\n \"stack\": false,\n \"steppedLine\"\
: false,\n \"targets\": [\n {\n \"expr\": \"DCGM_FI_DEV_GPU_UTIL{instance=~\\\
\"$instance\\\", gpu=~\\\"$gpu\\\"}\",\n \"interval\": \"\",\n \
\ \"legendFormat\": \"GPU {{gpu}}\",\n \"refId\": \"A\"\n }\n\
\ ],\n \"thresholds\": [],\n \"timeFrom\": null,\n \"timeRegions\"\
: [],\n \"timeShift\": null,\n \"title\": \"GPU Utilization\",\n \
\ \"tooltip\": {\n \"shared\": true,\n \"sort\": 0,\n \"\
value_type\": \"cumulative\"\n },\n \"type\": \"graph\",\n \"xaxis\"\
: {\n \"buckets\": null,\n \"mode\": \"time\",\n \"name\"\
: null,\n \"show\": true,\n \"values\": []\n },\n \"yaxes\"\
: [\n {\n \"format\": \"percent\",\n \"label\": null,\n\
\ \"logBase\": 1,\n \"max\": \"100\",\n \"min\": \"\
0\",\n \"show\": true\n },\n {\n \"format\": \"\
short\",\n \"label\": null,\n \"logBase\": 1,\n \"\
max\": null,\n \"min\": null,\n \"show\": true\n }\n\
\ ],\n \"yaxis\": {\n \"align\": false,\n \"alignLevel\"\
: null\n }\n },\n {\n \"aliasColors\": {},\n \"bars\": false,\n\
\ \"dashLength\": 10,\n \"dashes\": false,\n \"datasource\": \"\
$datasource\",\n \"fill\": 1,\n \"fillGradient\": 0,\n \"gridPos\"\
: {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\":\
\ 32\n },\n \"hiddenSeries\": false,\n \"id\": 18,\n \"legend\"\
: {\n \"alignAsTable\": true,\n \"avg\": true,\n \"current\"\
: true,\n \"max\": true,\n \"min\": false,\n \"rightSide\"\
: true,\n \"show\": true,\n \"total\": false,\n \"values\"\
: true\n },\n \"lines\": true,\n \"linewidth\": 2,\n \"nullPointMode\"\
: \"null\",\n \"options\": {\n \"dataLinks\": []\n },\n \
\ \"percentage\": false,\n \"pointradius\": 2,\n \"points\": false,\n\
\ \"renderer\": \"flot\",\n \"seriesOverrides\": [],\n \"spaceLength\"\
: 10,\n \"stack\": false,\n \"steppedLine\": false,\n \"targets\"\
: [\n {\n \"expr\": \"DCGM_FI_DEV_FB_USED{instance=~\\\"$instance\\\
\", gpu=~\\\"$gpu\\\"}\",\n \"interval\": \"\",\n \"legendFormat\"\
: \"GPU {{gpu}}\",\n \"refId\": \"A\"\n }\n ],\n \"\
thresholds\": [],\n \"timeFrom\": null,\n \"timeRegions\": [],\n \
\ \"timeShift\": null,\n \"title\": \"GPU Framebuffer Mem Used\",\n \
\ \"tooltip\": {\n \"shared\": true,\n \"sort\": 0,\n \"\
value_type\": \"individual\"\n },\n \"type\": \"graph\",\n \"xaxis\"\
: {\n \"buckets\": null,\n \"mode\": \"time\",\n \"name\"\
: null,\n \"show\": true,\n \"values\": []\n },\n \"yaxes\"\
: [\n {\n \"format\": \"decmbytes\",\n \"label\": null,\n\
\ \"logBase\": 1,\n \"max\": null,\n \"min\": null,\n\
\ \"show\": true\n },\n {\n \"format\": \"short\"\
,\n \"label\": null,\n \"logBase\": 1,\n \"max\": null,\n\
\ \"min\": null,\n \"show\": true\n }\n ],\n \
\ \"yaxis\": {\n \"align\": false,\n \"alignLevel\": null\n \
\ }\n },\n {\n \"aliasColors\": {},\n \"bars\": false,\n \
\ \"dashLength\": 10,\n \"dashes\": false,\n \"datasource\": \"$datasource\"\
,\n \"fill\": 1,\n \"fillGradient\": 0,\n \"gridPos\": {\n \
\ \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 24\n \
\ },\n \"hiddenSeries\": false,\n \"id\": 4,\n \"legend\": {\n\
\ \"alignAsTable\": true,\n \"avg\": true,\n \"current\"\
: true,\n \"max\": true,\n \"min\": false,\n \"rightSide\"\
: true,\n \"show\": true,\n \"total\": false,\n \"values\"\
: true\n },\n \"lines\": true,\n \"linewidth\": 2,\n \"nullPointMode\"\
: \"null\",\n \"options\": {\n \"dataLinks\": []\n },\n \
\ \"percentage\": false,\n \"pointradius\": 2,\n \"points\": false,\n\
\ \"renderer\": \"flot\",\n \"seriesOverrides\": [],\n \"spaceLength\"\
: 10,\n \"stack\": false,\n \"steppedLine\": false,\n \"targets\"\
: [\n {\n \"expr\": \"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\\\
\"$instance\\\", gpu=~\\\"$gpu\\\"}\",\n \"interval\": \"\",\n \
\ \"legendFormat\": \"GPU {{gpu}}\",\n \"refId\": \"A\"\n }\n\
\ ],\n \"thresholds\": [],\n \"timeFrom\": null,\n \"timeRegions\"\
: [],\n \"timeShift\": null,\n \"title\": \"Tensor Core Utilization\"\
,\n \"tooltip\": {\n \"shared\": true,\n \"sort\": 0,\n \
\ \"value_type\": \"cumulative\"\n },\n \"type\": \"graph\",\n \
\ \"xaxis\": {\n \"buckets\": null,\n \"mode\": \"time\",\n\
\ \"name\": null,\n \"show\": true,\n \"values\": []\n \
\ },\n \"yaxes\": [\n {\n \"format\": \"percentunit\"\
,\n \"label\": null,\n \"logBase\": 1,\n \"max\": \"\
1\",\n \"min\": \"0\",\n \"show\": true\n },\n \
\ {\n \"format\": \"short\",\n \"label\": null,\n \
\ \"logBase\": 1,\n \"max\": null,\n \"min\": null,\n \
\ \"show\": true\n }\n ],\n \"yaxis\": {\n \"align\"\
: false,\n \"alignLevel\": null\n }\n }\n ],\n \"refresh\": false,\n\
\ \"schemaVersion\": 22,\n \"style\": \"dark\",\n \"tags\": [],\n \"templating\"\
: {\n \"list\": [\n {\n \"current\": {\n \"selected\"\
: true,\n \"text\": \"Prometheus\",\n \"value\": \"Prometheus\"\
\n },\n \"hide\": 0,\n \"includeAll\": false,\n \"\
multi\": false,\n \"name\": \"datasource\",\n \"options\": [],\n\
\ \"query\": \"prometheus\",\n \"queryValue\": \"\",\n \"\
refresh\": 1,\n \"regex\": \"\",\n \"skipUrlSync\": false,\n \
\ \"type\": \"datasource\"\n },\n {\n \"allValue\": null,\n\
\ \"current\": {},\n \"datasource\": \"$datasource\",\n \"\
definition\": \"label_values(DCGM_FI_DEV_GPU_TEMP, instance)\",\n \"hide\"\
: 0,\n \"includeAll\": true,\n \"index\": -1,\n \"label\"\
: null,\n \"multi\": true,\n \"name\": \"instance\",\n \"\
options\": [],\n \"query\": \"label_values(DCGM_FI_DEV_GPU_TEMP, instance)\"\
,\n \"refresh\": 1,\n \"regex\": \"\",\n \"skipUrlSync\"\
: false,\n \"sort\": 1,\n \"tagValuesQuery\": \"\",\n \"\
tags\": [],\n \"tagsQuery\": \"\",\n \"type\": \"query\",\n \
\ \"useTags\": false\n },\n {\n \"allValue\": null,\n \
\ \"current\": {},\n \"datasource\": \"$datasource\",\n \"definition\"\
: \"label_values(DCGM_FI_DEV_GPU_TEMP, gpu)\",\n \"hide\": 0,\n \
\ \"includeAll\": true,\n \"index\": -1,\n \"label\": null,\n \
\ \"multi\": true,\n \"name\": \"gpu\",\n \"options\": [],\n\
\ \"query\": \"label_values(DCGM_FI_DEV_GPU_TEMP, gpu)\",\n \"refresh\"\
: 1,\n \"regex\": \"\",\n \"skipUrlSync\": false,\n \"sort\"\
: 1,\n \"tagValuesQuery\": \"\",\n \"tags\": [],\n \"tagsQuery\"\
: \"\",\n \"type\": \"query\",\n \"useTags\": false\n }\n \
\ ]\n },\n \"time\": {\n \"from\": \"now-15m\",\n \"to\": \"now\"\n\
\ },\n \"timepicker\": {\n \"refresh_intervals\": [\n \"5s\",\n \
\ \"10s\",\n \"30s\",\n \"1m\",\n \"5m\",\n \"15m\",\n \
\ \"30m\",\n \"1h\",\n \"2h\",\n \"1d\"\n ]\n },\n \"timezone\"\
: \"\",\n \"title\": \"NVIDIA DCGM Exporter Dashboard\",\n \"uid\": \"Oxed_c6Wz\"\
,\n \"variables\": {\n \"list\": []\n },\n \"version\": 1\n}\n"
kind : ConfigMap
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
console.openshift.io/dashboard : 'true'
console.openshift.io/odc-dashboard : 'true'
name : nvidia-dcgm-exporter-dashboard
namespace : openshift-config-managed
ConfigMapv1
job-setup-autoscale
openshift-machine-api
▼ YAML
# ConfigMap "job-setup-autoscale" (namespace: openshift-machine-api).
# Holds two shell scripts, presumably mounted into a setup Job: job.sh is the
# entrypoint and ocp.sh is the helper library it sources.
# NOTE(review): ocp.sh below is one YAML double-quoted scalar; the "\ " and
# trailing "\" sequences are YAML escapes, not script content — do not edit the
# escaped text by hand, regenerate it from the source script instead.
apiVersion : v1
data :
# job.sh: Job entrypoint. Sources /scripts/ocp.sh, then calls
# ocp_machineset_create_autoscale with MACHINE_MIN / MACHINE_MAX taken from the
# environment (presumably injected by the Job spec — confirm against the Job).
job.sh : '#!/bin/bash
# shellcheck disable=SC1091
. /scripts/ocp.sh
ocp_machineset_create_autoscale "${MACHINE_MIN}" "${MACHINE_MAX}"
'
# ocp.sh: grab-bag of OpenShift admin helpers (htpasswd users, node labeling/
# tainting, MachineSet scaling/autoscaling, pull-secret management, upgrades).
# Only ocp_machineset_create_autoscale is invoked by job.sh; the rest ride along.
ocp.sh : "#!/bin/bash\n\n# https://mirror.openshift.com/pub/openshift-v4\n\nocp_add_admin_user(){\n\
\ HT_USERNAME=${1:-admin}\n HT_PASSWORD=${2:-$(genpass)}\n\n htpasswd_ocp_get_file\n\
\ htpasswd_add_user \"${HT_USERNAME}\" \"${HT_PASSWORD}\"\n htpasswd_ocp_set_file\n\
\ htpasswd_validate_user \"${HT_USERNAME}\" \"${HT_PASSWORD}\"\n}\n\nocp_auth_add_to_group(){\n\
\ USER=${1:-admin}\n OCP_GROUP=${2:-${DEFAULT_OCP_GROUP}}\n\n ocp_auth_create_group\
\ \"${OCP_GROUP}\"\n\n oc adm groups add-users \\\n \"${OCP_GROUP}\" \"${USER}\"\
\n}\n\nocp_auth_create_group(){\n OCP_GROUP=${1:-${DEFAULT_OCP_GROUP}}\n\n oc\
\ get group \"${OCP_GROUP}\" > /dev/null 2>&1 && return\n\necho \"\napiVersion:\
\ user.openshift.io/v1\nkind: Group\nmetadata:\n name: ${OCP_GROUP}\n\" | oc\
\ apply -f-\n\n}\n\nocp_auth_setup_user(){\n USER=${1:-admin}\n PASS=${2:-$(genpass)}\n\
\ OCP_GROUP=${3:-${DEFAULT_OCP_GROUP}}\n\n htpasswd_add_user \"${USER}\" \"\
${PASS}\"\n ocp_auth_add_to_group \"${USER}\" \"${OCP_GROUP}\"\n\n echo \"\n\
\ run: htpasswd_ocp_set_file\n \"\n}\n\nocp_check_info(){\n echo \"== OCP\
\ INFO ==\"\n ocp_check_login || return 1\n\n echo \"NAMESPACE: $(oc project\
\ -q)\"\n sleep \"${SLEEP_SECONDS:-8}\"\n}\n\nocp_check_login(){\n oc whoami\
\ || return 1\n oc cluster-info | head -n1\n echo\n}\n\nocp_clean_install_pods(){\n\
\ oc delete pod \\\n -A \\\n -l app=installer\n}\n\nocp_control_nodes_not_schedulable(){\n\
\ oc patch schedulers.config.openshift.io/cluster --type merge --patch '{\"spec\"\
:{\"mastersSchedulable\": false}}'\n}\n\nocp_control_nodes_schedulable(){\n oc\
\ patch schedulers.config.openshift.io/cluster --type merge --patch '{\"spec\"\
:{\"mastersSchedulable\": true}}'\n}\n\nocp_expose_image_registry(){\n oc patch\
\ configs.imageregistry.operator.openshift.io/cluster --type=merge --patch '{\"\
spec\":{\"defaultRoute\":true}}'\n\n # remove 'default-route-openshift-image-'\
\ from route\n HOST=$(oc get route default-route -n openshift-image-registry\
\ --template='{{ .spec.host }}')\n SHORTER_HOST=$(echo \"${HOST}\" | sed '/host/\
\ s/default-route-openshift-image-//')\n oc patch configs.imageregistry.operator.openshift.io/cluster\
\ --type=merge --patch '{\"spec\":{\"host\": \"'\"${SHORTER_HOST}\"'\"}}'\n\n\
\ echo \"OCP image registry is available at: ${SHORTER_HOST}\"\n}\n\nocp_fix_duplicate_operator_groups(){\n\
\ for ns in $(oc get og -A | awk '{print $1}' | uniq -d)\n do\n oc -n \"\
${ns}\" \\\n get og -o name | \\\n tail -n+2 | \\\n xargs oc\
\ -n \"${ns}\" delete\n \n # oc -n \"${ns}\" \\\n # delete pod --all\n\
\ done\n}\n\nocp_get_apps_domain(){\n oc get ingresses.config.openshift.io cluster\
\ -o jsonpath='{.spec.domain}'\n}\n\nocp_get_domain(){\n OCP_APPS_DOMAIN=$(ocp_get_apps_domain)\n\
\ echo \"${OCP_APPS_DOMAIN#apps.}\"\n}\n\nocp_get_kubeconfigs(){\n # https://rcarrata.com/openshift/regenerate-kubeconfig/\n\
\ # https://gist.githubusercontent.com/rcarrata/016da295c1421cccbfbd66ed9a7922bc/raw/855486c363734892988cdf1b5d0d26ece5e0960a/regenerate-kubeconfig.sh\n\
\ # https://access.redhat.com/solutions/6054981\n # https://access.redhat.com/solutions/5286371\n\
\ # https://access.redhat.com/solutions/6112601\n\n oc -n openshift-kube-apiserver\
\ extract secret/node-kubeconfigs\n}\n\nocp_get_pull_secret(){\n oc -n openshift-config\
\ \\\n get secret/pull-secret \\\n --template='{{index .data \".dockerconfigjson\"\
\ | base64decode}}'\n}\n\nocp_gpu_pretty_label(){\n oc label node -l nvidia.com/gpu.machine\
\ node-role.kubernetes.io/gpu=''\n}\n\nocp_gpu_taint_nodes(){\n oc adm taint\
\ node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule --overwrite\n\
\ oc adm drain -l node-role.kubernetes.io/gpu --ignore-daemonsets --delete-emptydir-data\n\
\ oc adm uncordon -l node-role.kubernetes.io/gpu\n}\n\nocp_gpu_untaint_nodes(){\n\
\ oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule-\n\
}\n\nocp_infra_label_control(){\n echo \"see https://docs.redhat.com/en/documentation/openshift_container_platform/4.8/html/machine_management/creating-infrastructure-machinesets#moving-resources-to-infrastructure-machinesets\"\
\n\n oc label node -l node-role.kubernetes.io/control-plane node-role.kubernetes.io/infra=\"\
\"\n\n # oc patch \\\n # scheduler cluster \\\n # --type=merge --patch\
\ '{\"spec\":{\"defaultNodeSelector\":\"node-role.kubernetes.io/infra=\\\"\\\"\
\"}}'\n\n}\n\nocp_infra_move_registry_to_control(){\n\ncat <<YAML > /tmp/patch.yaml\n\
spec:\n nodeSelector:\n node-role.kubernetes.io/infra: \"\"\n tolerations:\n\
\ - effect: NoSchedule\n key: node-role.kubernetes.io/master\n operator:\
\ Exists\n - effect: NoExecute\n key: node-role.kubernetes.io/master\n \
\ operator: Exists\nYAML\n\n oc patch \\\n configs.imageregistry.operator.openshift.io/cluster\
\ \\\n --type=merge --patch-file /tmp/patch.yaml\n\n}\n\nocp_infra_move_router_to_control(){\n\
\ncat <<YAML > /tmp/patch.yaml\nspec:\n nodePlacement:\n nodeSelector:\n \
\ matchLabels:\n node-role.kubernetes.io/infra: \"\"\n tolerations:\n\
\ - effect: NoSchedule\n key: node-role.kubernetes.io/master\n operator:\
\ Exists\n - effect: NoExecute\n key: node-role.kubernetes.io/master\n\
\ operator: Exists\nYAML\n\n oc -n openshift-ingress-operator \\\n patch\
\ \\\n ingresscontroller default \\\n --type=merge --patch-file /tmp/patch.yaml\n\
\n}\n\nocp_infra_move_monitoring_to_control(){\n\ncat <<YAML > /tmp/patch.yaml\n\
spec:\n logStore:\n elasticsearch:\n nodeCount: 3\n nodeSelector:\n\
\ node-role.kubernetes.io/infra: \"\"\n tolerations:\n - effect:\
\ NoSchedule\n key: node-role.kubernetes.io/master\n operator: Exists\n\
\ - effect: NoExecute\n key: node-role.kubernetes.io/master\n \
\ operator: Exists\n visualization:\n kibana:\n nodeSelector:\n \
\ node-role.kubernetes.io/infra: \"\"\n tolerations:\n - effect:\
\ NoSchedule\n key: node-role.kubernetes.io/master\n operator: Exists\n\
\ - effect: NoExecute\n key: node-role.kubernetes.io/master\n \
\ operator: Exists\nYAML\n\n oc -n openshift-logging \\\n patch \\\n \
\ clusterlogging instance \\\n --type=merge --patch-file /tmp/patch.yaml\n\
\n}\n\nocp_kubeadmin_create(){\n PASS=${1:-$(genpass 5 )-$(genpass 5 )-$(genpass\
\ 5 )-$(genpass 5 )}\n\n which htpasswd >/dev/null || return 1\n\n HTPASSWD=$(htpasswd\
\ -nbB -C10 null \"${PASS}\")\n HASH=${HTPASSWD##*:}\n\n echo \"\n PASSWORD:\
\ ${PASS}\n HASH: ${HASH}\n\n oc apply -f scratch/kubeadmin.yaml\n \"\n\
\ncat << YAML > scratch/kubeadmin.yaml\nkind: Secret\napiVersion: v1\nmetadata:\n\
\ name: kubeadmin\n namespace: kube-system\nstringData:\n kubeadmin: ${HASH}\n\
\ password: ${PASS}\ntype: Opaque\nYAML\n}\n\nocp_kubeadmin_remove(){\n FORCE=${1:-No}\n\
\n if [ \"${FORCE}\" = \"YES\" ]; then\n [ ! -e scratch/kubeadmin.yaml ] &&\
\ \\\n oc get secret kubeadmin -n kube-system -o yaml > scratch/kubeadmin.yaml\
\ || return 1\n oc delete secret kubeadmin -n kube-system\n else\n echo\
\ -e \"${RED}\n WARNING: you must run - ocp_remove_kubeadmin YES\n\n WARNING:\
\ you will lose access to your cluster if you do not\n have a way to login\
\ to your cluster without kubeadmin. \n \n Examples:\n - An identity\
\ provider with a cluster-admin user setup\n - A kubeconfig file\n ${NC}\"\
\n return\n fi\n}\n\nocp_machineset_create_autoscale(){\n MACHINE_MIN=${1:-0}\n\
\ MACHINE_MAX=${2:-4}\n MACHINE_SETS=${3:-$(oc -n openshift-machine-api get\
\ machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}\n\n for machine_set\
\ in ${MACHINE_SETS}\n do\ncat << YAML | oc apply -f -\napiVersion: \"autoscaling.openshift.io/v1beta1\"\
\nkind: \"MachineAutoscaler\"\nmetadata:\n name: \"${machine_set}\"\n namespace:\
\ \"openshift-machine-api\"\nspec:\n minReplicas: ${MACHINE_MIN}\n maxReplicas:\
\ ${MACHINE_MAX}\n scaleTargetRef:\n apiVersion: machine.openshift.io/v1beta1\n\
\ kind: MachineSet\n name: \"${machine_set}\"\nYAML\n done\n}\n\nocp_machineset_patch_accelerator(){\n\
\ MACHINE_SET_NAME=${1:-gpu}\n LABEL=${2:-nvidia-gpu}\n\n oc -n openshift-machine-api\
\ \\\n patch machineset \"${MACHINE_SET_NAME}\" \\\n --type=merge --patch\
\ '{\"spec\":{\"template\":{\"spec\":{\"metadata\":{\"labels\":{\"cluster-api/accelerator\"\
:\"'\"${LABEL}\"'\"}}}}}}'\n \n oc -n openshift-machine-api \\\n patch machineset\
\ \"${MACHINE_SET_NAME}\" \\\n --type=merge --patch '{\"spec\":{\"template\"\
:{\"spec\":{\"metadata\":{\"labels\":{\"node-role.kubernetes.io/gpu\":\"\"}}}}}}'\n\
}\n\nocp_machineset_scale(){\n REPLICAS=${1:-1}\n MACHINE_SETS=${2:-$(oc -n\
\ openshift-machine-api get machineset -o name)}\n\n # scale workers\n echo\
\ \"${MACHINE_SETS}\" | \\\n xargs \\\n oc -n openshift-machine-api \\\
\n scale --replicas=\"${REPLICAS}\"\n}\n\nocp_machineset_taint_gpu(){\n \
\ SHORT_NAME=${1:-g4dn}\n MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
\ -o name | grep \"${SHORT_NAME}\" | head -n1)\n\n echo \"Patching: ${MACHINE_SET}\"\
\n\n # taint nodes for gpu-only workloads\n oc -n openshift-machine-api \\\n\
\ patch \"${MACHINE_SET}\" \\\n --type=merge --patch '{\"spec\":{\"template\"\
:{\"spec\":{\"taints\":[{\"key\":\"nvidia.com/gpu\",\"value\":\"\",\"effect\"\
:\"NoSchedule\"}]}}}}'\n}\n\nocp_release_info(){\n VERSION=${1:-stable-4.12}\n\
\ echo \"VERSION: ${VERSION}\"\n curl -sL \"https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/${VERSION}/release.txt\"\
\n}\n\nocp_run_on_all_nodes(){\n case $1 in\n --confirm)\n shift\n\n\
\ COMMAND=${*:-uptime}\n ALL_NODES=$(oc get nodes --show-kind --no-headers|awk\
\ '/node/{print $1}')\n\n for node in ${ALL_NODES}\n do\n \
\ # wipefs -af /dev/nvme0n1\n # oc debug $node -- chroot /host bash\
\ -c \"$(cat -)\"\n # shellcheck disable=SC2086\n oc debug \"\
$node\" -- chroot /host ${COMMAND}\n done\n ;;\n *)\n echo \"\
-------------------------------------------------------------------\"\n echo\
\ \"WARNING. This runs as root on all nodes!\"\n echo \"You can DESTROY ALL\
\ DATA, without recovery, if used incorrectly!\"\n echo \"-------------------------------------------------------------------\"\
\n echo \"Usage:\"\n echo \" ocp_run_on_all_nodes --confirm < command\
\ >\"\n esac\n\n}\n\nocp_save_money(){\n\n # run work on masters\n ocp_control_nodes_schedulable\n\
\n # scale to zero\n ocp_machineset_scale 0\n\n # place as many pods on as\
\ few nodes as possible\n ocp_scheduler_set_profile HighNodeUtilization\n}\n\n\
ocp_scheduler_set_profile(){\n SCHED_PROFILE=${1:-LowNodeUtilization}\n\n #\
\ LowNodeUtilization, HighNodeUtilization, NoScoring\n echo \"see https://docs.openshift.com/container-platform/4.16/nodes/scheduling/nodes-scheduler-profiles.html\"\
\n echo \"OPTIONS: LowNodeUtilization (default), HighNodeUtilization, NoScoring\"\
\n echo \"SCHED_PROFILE: ${SCHED_PROFILE}\"\n\n oc patch schedulers.config.openshift.io/cluster\
\ --type merge --patch '{\"spec\":{\"profile\": \"'\"${SCHED_PROFILE}\"'\"}}'\n\
}\n\nocp_setup_namespace(){\n NAMESPACE=${1}\n\n oc new-project \"${NAMESPACE}\"\
\ 2>/dev/null || \\\n oc project \"${NAMESPACE}\"\n}\n\nocp_update_pull_secret(){\n\
\ echo \"see https://access.redhat.com/solutions/4902871\"\n\n PULL_SECRET_FILE=${1:-${GIT_ROOT}/scratch/pull-secret}\n\
\n oc extract secret/pull-secret \\\n -n openshift-config \\\n --keys .dockerconfigjson\
\ \\\n --to=- > \"${PULL_SECRET_FILE}\"\n \n oc get secret/pull-secret \\\
\n -n openshift-config \\\n -o yaml > \"${PULL_SECRET_FILE}.yaml\"\n\n \
\ [ -e \"${PULL_SECRET_FILE}\" ] || return 0\n\n if oc get secret/pull-secret\
\ -n openshift-config -o name; then\n oc set data secret/pull-secret \\\n \
\ -n openshift-config \\\n --from-file=.dockerconfigjson=\"${PULL_SECRET_FILE}\"\
\n else\n oc create secret generic pull-secret \\\n -n openshift-config\
\ \\\n --type=kubernetes.io/dockerconfigjson \\\n --from-file=.dockerconfigjson=\"\
${PULL_SECRET_FILE}\"\n fi \n}\n\nocp_upgrade_ack_4.13(){\n oc -n openshift-config\
\ patch cm admin-acks --patch '{\"data\":{\"ack-4.12-kube-1.26-api-removals-in-4.13\"\
:\"true\"}}' --type=merge\n}\n\nocp_upgrade_ack_4.19(){\n oc -n openshift-config\
\ patch cm admin-acks --patch '{\"data\":{\"ack-4.18-kube-1.32-api-removals-in-4.19\"\
:\"true\"}}' --type=merge\n}\n\nocp_upgrade_cluster(){\n OCP_VERSION=\"${1:-latest}\"\
\n\n if [ \"${OCP_VERSION}\" = \"latest\" ]; then\n oc adm upgrade --to-latest=true\n\
\ else\n oc adm upgrade --to=\"${OCP_VERSION}\"\n fi\n}\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
labels :
autoscale : config
name : job-setup-autoscale
namespace : openshift-machine-api
ConfigMapv1
job-pipelines-console-plugin
openshift-operators
▼ YAML
# ConfigMap "job-pipelines-console-plugin" (namespace: openshift-operators).
# Carries a script, presumably mounted by a Job, that appends ${PLUGIN_NAME}
# (expected from the environment; returns 1 if unset) to
# consoles.operator.openshift.io/cluster .spec.plugins — creating the plugins
# array first if absent, and skipping if the plugin is already listed.
# NOTE(review): the escaped block is a YAML double-quoted scalar; the "\ " and
# trailing "\" sequences are encoding, not script text.
apiVersion : v1
data :
console-plugin-job.sh : "#!/usr/bin/bash\n\nenable_console_plugin(){\n [ -z \"${PLUGIN_NAME}\"\
\ ] && return 1\n\n echo \"Attempting to enable ${PLUGIN_NAME} plugin\"\n echo\
\ \"\"\n\n # Create the plugins section on the object if it doesn't exist\n \
\ if [ -z \"$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')\"\
\ ]; then\n echo \"Creating plugins object\"\n oc patch consoles.operator.openshift.io\
\ cluster --patch '{ \"spec\": { \"plugins\": [] } }' --type=merge\n fi\n\n \
\ INSTALLED_PLUGINS=$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')\n\
\ echo \"Current plugins:\"\n echo \"${INSTALLED_PLUGINS}\"\n\n if [[ \"${INSTALLED_PLUGINS}\"\
\ == *\"${PLUGIN_NAME}\"* ]]; then\n echo \"${PLUGIN_NAME} is already enabled\"\
\n else\n echo \"Enabling plugin: ${PLUGIN_NAME}\"\n oc patch consoles.operator.openshift.io\
\ cluster --type=json --patch '[{\"op\": \"add\", \"path\": \"/spec/plugins/-\"\
, \"value\": \"'\"${PLUGIN_NAME}\"'\"}]'\n fi\n\n sleep 6\n oc get consoles.operator.openshift.io\
\ cluster -o=jsonpath='{.spec.plugins}'\n}\n\nenable_console_plugin\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
name : job-pipelines-console-plugin
namespace : openshift-operators
ConfigMapv1
fix-dashboard-magic
redhat-ods-applications
▼ YAML
# ConfigMap "fix-dashboard-magic" (namespace: redhat-ods-applications).
# Workaround script: waits for deployment/rhods-dashboard to exist, scales it
# to 2 replicas, then after TIMEOUT_SECONDS (60s) deletes its pods
# (-l deployment=rhods-dashboard) to force a restart.
# NOTE(review): the escaped block is a YAML double-quoted scalar; the "\ " and
# trailing "\" sequences are encoding, not script text.
apiVersion : v1
data :
job.sh : "#!/usr/bin/bash\nset -e\n\nTIMEOUT_SECONDS=60\n\nrestart_pods(){\n oc\
\ -n redhat-ods-applications \\\n delete pods \\\n -l deployment=rhods-dashboard\n\
}\n\nfix_dashboard_bugs(){\n sleep \"${TIMEOUT_SECONDS}\"\n restart_pods\n}\n\
\nscale_down_dashboard_madness(){\n\n echo -n 'Waiting for RHOAI dashboard.'\n\
\ until oc get -n redhat-ods-applications deployment/rhods-dashboard -o name\
\ 2>/dev/null\n do\n echo -n .\n sleep 5\n done; echo\n\n oc -n redhat-ods-applications\
\ \\\n scale deployment/rhods-dashboard \\\n --replicas=2\n}\n\nscale_down_dashboard_madness\n\
fix_dashboard_bugs\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
name : fix-dashboard-magic
namespace : redhat-ods-applications
ConfigMapv1
notebook-controller-culler-config
redhat-ods-applications
▼ YAML
# ConfigMap: notebook-controller-culler-config
#
# Idle-culling settings for notebooks (consumer inferred from the ConfigMap
# name — presumably the ODH/RHOAI notebook controller; confirm against its
# deployment). ConfigMap values are always strings, hence the quoting.
apiVersion: v1
kind: ConfigMap
metadata:
  name: notebook-controller-culler-config
  namespace: redhat-ods-applications
  annotations:
    # Applied by Argo CD before the main sync wave.
    argocd.argoproj.io/hook: PreSync
  labels:
    opendatahub.io/dashboard: 'true'
data:
  # Hours of inactivity before a notebook is culled — TODO confirm unit.
  CULL_IDLE_TIME: '24'
  ENABLE_CULLING: 'true'
  # How often idleness is checked — TODO confirm unit.
  IDLENESS_CHECK_PERIOD: '1'
ConfigMapv1
odh-segment-key-config
redhat-ods-applications
▼ YAML
# ConfigMap: odh-segment-key-config
#
# Disables Segment.io telemetry for Open Data Hub / RHOAI. The flag is the
# string 'false' (ConfigMap values are strings, not booleans).
apiVersion: v1
kind: ConfigMap
metadata:
  name: odh-segment-key-config
  namespace: redhat-ods-applications
  annotations:
    # Applied by Argo CD before the main sync wave.
    argocd.argoproj.io/hook: PreSync
  labels:
    app.kubernetes.io/part-of: segment-io
    app.opendatahub.io/segment-io: 'true'
data:
  segmentKeyEnabled: 'false'
ConfigMapv1
approve-after-servicemesh
redhat-ods-operator
▼ YAML
# ConfigMap "approve-after-servicemesh" (namespace: redhat-ods-operator).
# Gate script for the RHOAI operator install: waits for the Service Mesh /
# Knative CRDs to be established, switches the rhods-operator subscription's
# installPlanApproval (default Automatic), approves the pending InstallPlan,
# then after TIMEOUT_SECONDS (120s) deletes all Jobs in redhat-ods-operator
# ("self_destruct").
# NOTE(review): the escaped block is a YAML double-quoted scalar; the "\ " and
# trailing "\" sequences are encoding, not script text.
apiVersion : v1
data :
job.sh : "#!/usr/bin/bash\n# shellcheck disable=SC2119,SC2120\nset -e\n\nTIMEOUT_SECONDS=120\n\
\nself_destruct(){\n\n echo \"\n engaging self cleaning in ${TIMEOUT_SECONDS}s...\n\
\ \"\n sleep \"${TIMEOUT_SECONDS}\"\n oc -n redhat-ods-operator delete jobs\
\ --all\n}\n\napprove_installplan(){\n echo -n 'Waiting for RHOAI install plan...'\n\
\ until oc -n redhat-ods-operator get installplan -l operators.coreos.com/rhods-operator.redhat-ods-operator\
\ -o name >/dev/null 2>&1\n do\n echo -n .\n sleep 5\n done; echo\n\n\
\ INSTALL_PLAN=$(oc -n redhat-ods-operator get installplan -l operators.coreos.com/rhods-operator.redhat-ods-operator\
\ -o name)\n oc -n redhat-ods-operator \\\n patch \"${INSTALL_PLAN}\" \\\n\
\ --type=merge --patch '{\"spec\":{\"approved\":true}}'\n}\n\npatch_approval(){\n\
\ APPROVAL=${1:-Automatic}\n\n echo -n 'Waiting for RHOAI subscription...'\n\
\ until oc get -n redhat-ods-operator subscriptions.operators.coreos.com/rhods-operator\
\ -o name >/dev/null 2>&1\n do\n echo -n .\n sleep 5\n done; echo\n\n\
\ oc -n redhat-ods-operator \\\n patch subscriptions.operators.coreos.com/rhods-operator\
\ \\\n --type=merge --patch '{\"spec\":{\"installPlanApproval\":\"'\"${APPROVAL}\"\
'\"}}'\n}\n\nwait_for_service_mesh(){\n echo \"Checking status of all service_mesh\
\ pre-reqs\"\n\n SERVICEMESH_RESOURCES=(\n crd/knativeservings.operator.knative.dev:condition=established\n\
\ crd/servicemeshcontrolplanes.maistra.io:condition=established\n crd/servicemeshmembers.maistra.io:condition=established\n\
\ )\n\n for crd in \"${SERVICEMESH_RESOURCES[@]}\"\n do\n RESOURCE=$(echo\
\ \"$crd\" | cut -d \":\" -f 1)\n CONDITION=$(echo \"$crd\" | cut -d \":\"\
\ -f 2)\n\n echo \"Waiting for ${RESOURCE} state to be ${CONDITION}...\"\n\
\ oc wait --for=\"${CONDITION}\" \"${RESOURCE}\" --timeout=\"${TIMEOUT_SECONDS}s\"\
\ >/dev/null 2>&1\n done\n}\n\nwait_for_service_mesh\npatch_approval\napprove_installplan\n\
self_destruct\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
name : approve-after-servicemesh
namespace : redhat-ods-operator
ConfigMapv1
fix-operator-scale
redhat-ods-operator
▼ YAML
# ConfigMap "fix-operator-scale" (namespace: redhat-ods-operator).
# Workaround script: waits for the rhods-operator CSV, dumps it to YAML with
# "replicas: 3" rewritten to "replicas: 1" via sed, and `oc replace`s it —
# i.e. scales the operator deployment spec inside the CSV down to 1 replica.
# NOTE(review): the sed is a blunt text substitution on the whole CSV; it would
# also rewrite any other "replicas: 3" occurrence in that object.
# The escaped block is a YAML double-quoted scalar; the "\ " and trailing "\"
# sequences are encoding, not script text.
apiVersion : v1
data :
job.sh : "#!/usr/bin/bash\nset -e\n\nscale_down_operator_madness(){\n\n echo -n\
\ 'Waiting for RHOAI csv.'\n until oc get -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator\
\ csv -o name 2>/dev/null\n do\n echo -n .\n sleep 5\n done; echo\n\n\
oc get csv \\\n -n redhat-ods-operator \\\n -l operators.coreos.com/rhods-operator.redhat-ods-operator\
\ \\\n -o yaml | sed 's@replicas: 3@replicas: 1@' > /tmp/replace.yaml\n\noc replace\
\ -f /tmp/replace.yaml\n\n}\n\nscale_down_operator_madness\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
name : fix-operator-scale
namespace : redhat-ods-operator
ConfigMapv1
wait-for-servicemesh
redhat-ods-operator
▼ YAML
# ConfigMap "wait-for-servicemesh" (namespace: redhat-ods-operator).
# Blocks until the KnativeServing and ServiceMeshControlPlane CRDs are
# established (oc wait, 60s timeout each). Unlike approve-after-servicemesh's
# copy of this function, it does not wait on servicemeshmembers.maistra.io and
# does not suppress oc wait output.
# NOTE(review): the escaped block is a YAML double-quoted scalar; the "\ " and
# trailing "\" sequences are encoding, not script text.
apiVersion : v1
data :
job.sh : "#!/usr/bin/bash\nset -e\n\nTIMEOUT_SECONDS=60\n\nwait_for_service_mesh(){\n\
\ echo \"Checking status of all service_mesh pre-reqs\"\n\n SERVICEMESH_RESOURCES=(\n\
\ crd/knativeservings.operator.knative.dev:condition=established\n crd/servicemeshcontrolplanes.maistra.io:condition=established\n\
\ )\n\n for crd in \"${SERVICEMESH_RESOURCES[@]}\"\n do\n RESOURCE=$(echo\
\ \"$crd\" | cut -d \":\" -f 1)\n CONDITION=$(echo \"$crd\" | cut -d \":\"\
\ -f 2)\n\n echo \"Waiting for ${RESOURCE} state to be ${CONDITION}...\"\n\
\ oc wait --for=\"${CONDITION}\" \"${RESOURCE}\" --timeout=\"${TIMEOUT_SECONDS}s\"\
\n done\n}\n\nwait_for_service_mesh\n"
kind : ConfigMap
metadata :
annotations :
# PreSync hook: Argo CD applies this resource before the main sync wave.
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
name : wait-for-servicemesh
namespace : redhat-ods-operator
Servicev1
console-plugin-nvidia-gpu
nvidia-gpu-operator
▼ YAML
apiVersion : v1
kind : Service
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
service.alpha.openshift.io/serving-cert-secret-name : plugin-serving-cert
labels :
app.kubernetes.io/component : console-plugin-nvidia-gpu
app.kubernetes.io/instance : console-plugin-nvidia-gpu
app.kubernetes.io/managed-by : Helm
app.kubernetes.io/name : console-plugin-nvidia-gpu
app.kubernetes.io/part-of : console-plugin-nvidia-gpu
app.kubernetes.io/version : latest
helm.sh/chart : console-plugin-nvidia-gpu-0.2.4
name : console-plugin-nvidia-gpu
namespace : nvidia-gpu-operator
spec :
ports :
- name : 9443-tcp
port : 9443
protocol : TCP
targetPort : 9443
selector :
app.kubernetes.io/name : console-plugin-nvidia-gpu
sessionAffinity : None
type : ClusterIP
Deploymentapps/v1
console-plugin-nvidia-gpu
nvidia-gpu-operator
▼ YAML
apiVersion : apps/v1
kind : Deployment
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
app.kubernetes.io/component : console-plugin-nvidia-gpu
app.kubernetes.io/instance : console-plugin-nvidia-gpu
app.kubernetes.io/managed-by : Helm
app.kubernetes.io/name : console-plugin-nvidia-gpu
app.kubernetes.io/part-of : console-plugin-nvidia-gpu
app.kubernetes.io/version : latest
app.openshift.io/runtime-namespace : console-plugin-nvidia-gpu
helm.sh/chart : console-plugin-nvidia-gpu-0.2.4
name : console-plugin-nvidia-gpu
namespace : nvidia-gpu-operator
spec :
replicas : 1
selector :
matchLabels :
app.kubernetes.io/name : console-plugin-nvidia-gpu
strategy :
rollingUpdate :
maxSurge : 25%
maxUnavailable : 25%
type : RollingUpdate
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
app.kubernetes.io/name : console-plugin-nvidia-gpu
spec :
containers :
- image : quay.io/edge-infrastructure/console-plugin-nvidia-gpu:latest
imagePullPolicy : Always
name : console-plugin-nvidia-gpu
ports :
- containerPort : 9443
protocol : TCP
resources : {}
securityContext :
allowPrivilegeEscalation : false
volumeMounts :
- mountPath : /var/serving-cert
name : plugin-serving-cert
readOnly : true
dnsPolicy : ClusterFirst
restartPolicy : Always
securityContext :
runAsNonRoot : true
volumes :
- name : plugin-serving-cert
secret :
defaultMode : 420
secretName : plugin-serving-cert
# NOTE(review): the nginx-conf volume below is declared but never mounted —
# the only container volumeMount is plugin-serving-cert. Confirm a mount
# (e.g. the nginx config path) was intended, or drop the volume.
- configMap :
defaultMode : 420
name : nginx-conf
name : nginx-conf
ClusterAutoscalerautoscaling.openshift.io/v1
default
openshift-machine-api
▼ YAML
apiVersion : autoscaling.openshift.io/v1
kind : ClusterAutoscaler
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
autoscale : config
name : default
namespace : openshift-machine-api
spec :
podPriorityThreshold : -10
resourceLimits :
cores :
max : 176
min : 0
gpus :
- max : 8
min : 0
type : nvidia.com/gpu
- max : 1
min : 0
type : amd.com/gpu
maxNodesTotal : 16
memory :
max : 512
min : 0
scaleDown :
delayAfterAdd : 5m
delayAfterDelete : 1m
delayAfterFailure : 30s
enabled : true
unneededTime : 5m
utilizationThreshold : '0.7'
Jobbatch/v1
job-aro-gpu-machineset
nvidia-gpu-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
generateName : job-aro-gpu-machineset-
name : job-aro-gpu-machineset
namespace : nvidia-gpu-operator
spec :
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
env :
- name : INSTANCE_TYPE
value : Standard_NC4as_T4_v3
- name : NAMESPACE
valueFrom :
fieldRef :
fieldPath : metadata.namespace
image : registry.redhat.io/openshift4/ose-cli
name : job-aro-gpu-machineset
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : job-aro-gpu-machineset
serviceAccountName : job-aro-gpu-machineset
terminationGracePeriodSeconds : 30
volumes :
- configMap :
defaultMode : 493
name : job-aro-gpu-machineset
name : scripts
Jobbatch/v1
job-aws-gpu-machineset
nvidia-gpu-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
generateName : job-aws-gpu-machineset-
name : job-aws-gpu-machineset
namespace : nvidia-gpu-operator
spec :
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
env :
- name : INSTANCE_TYPE
value : g4dn.4xlarge
- name : NAMESPACE
valueFrom :
fieldRef :
fieldPath : metadata.namespace
image : registry.redhat.io/openshift4/ose-cli
name : job-aws-gpu-machineset
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : job-aws-gpu-machineset
serviceAccountName : job-aws-gpu-machineset
terminationGracePeriodSeconds : 30
volumes :
- configMap :
defaultMode : 493
name : job-aws-gpu-machineset
name : scripts
Jobbatch/v1
job-gpu-console-plugin
nvidia-gpu-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-wave : '10'
generateName : job-gpu-console-plugin-
name : job-gpu-console-plugin
namespace : nvidia-gpu-operator
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/console-plugin-job.sh
env :
- name : PLUGIN_NAME
value : console-plugin-nvidia-gpu
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : job-gpu-console-plugin
serviceAccountName : job-gpu-console-plugin
volumes :
- configMap :
defaultMode : 493
name : job-gpu-console-plugin
name : scripts
Jobbatch/v1
job-setup-autoscale
openshift-machine-api
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
autoscale : config
name : job-setup-autoscale
namespace : openshift-machine-api
spec :
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
env :
- name : NAMESPACE
valueFrom :
fieldRef :
fieldPath : metadata.namespace
- name : MACHINE_MIN
value : '0'
- name : MACHINE_MAX
value : '4'
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : job-setup-autoscale
serviceAccountName : job-setup-autoscale
terminationGracePeriodSeconds : 30
volumes :
- configMap :
defaultMode : 493
name : job-setup-autoscale
name : scripts
Jobbatch/v1
job-pipelines-console-plugin
openshift-operators
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-wave : '10'
name : job-pipelines-console-plugin
namespace : openshift-operators
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/console-plugin-job.sh
env :
- name : PLUGIN_NAME
value : pipelines-console-plugin
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : job-pipelines-console-plugin
serviceAccountName : job-pipelines-console-plugin
volumes :
- configMap :
defaultMode : 493
name : job-pipelines-console-plugin
name : scripts
Jobbatch/v1
fix-dashboard-magic
redhat-ods-applications
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-wave : '11'
name : fix-dashboard-magic
namespace : redhat-ods-applications
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : fix-dashboard-magic
serviceAccountName : fix-dashboard-magic
volumes :
- configMap :
defaultMode : 493
name : fix-dashboard-magic
name : scripts
Jobbatch/v1
approve-after-servicemesh
redhat-ods-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : approve-after-servicemesh
namespace : redhat-ods-operator
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : approve-after-servicemesh
serviceAccountName : approve-after-servicemesh
volumes :
- configMap :
defaultMode : 493
name : approve-after-servicemesh
name : scripts
Jobbatch/v1
fix-operator-scale
redhat-ods-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-wave : '11'
name : fix-operator-scale
namespace : redhat-ods-operator
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : fix-operator-scale
serviceAccountName : fix-operator-scale
volumes :
- configMap :
defaultMode : 493
name : fix-operator-scale
name : scripts
Jobbatch/v1
wait-for-servicemesh
redhat-ods-operator
▼ YAML
apiVersion : batch/v1
kind : Job
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
argocd.argoproj.io/sync-wave : '10'
name : wait-for-servicemesh
namespace : redhat-ods-operator
spec :
backoffLimit : 4
template :
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
spec :
containers :
- command :
- /bin/bash
- -c
- /scripts/job.sh
image : registry.redhat.io/openshift4/ose-cli
name : minion
volumeMounts :
- mountPath : /scripts
name : scripts
restartPolicy : Never
serviceAccount : wait-for-servicemesh
serviceAccountName : wait-for-servicemesh
volumes :
- configMap :
defaultMode : 493
name : wait-for-servicemesh
name : scripts
ConsoleLinkconsole.openshift.io/v1
github-demo-gitops
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleLink
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : Prune=true
source : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
labels :
demo : ai-gitops-catalog
name : github-demo-gitops
spec :
applicationMenu :
imageURL : /static/assets/public/imgs/logos/github.svg
section : Git Repos
href : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog
location : ApplicationMenu
text : GitHub - Demo GitOps Catalog
ConsoleLinkconsole.openshift.io/v1
github-ssa
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleLink
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : Prune=true
source : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
labels :
demo : ai-gitops-catalog
name : github-ssa
spec :
applicationMenu :
imageURL : /static/assets/public/imgs/logos/github.svg
section : Git Repos
href : https://github.com/redhat-na-ssa
location : ApplicationMenu
text : GitHub - NA SSA
ConsoleLinkconsole.openshift.io/v1
help-link
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleLink
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : Prune=true
source : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
labels :
demo : ai-gitops-catalog
name : help-link
spec :
href : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/issues
location : HelpMenu
text : Demo Catalog - Open Issue
ConsoleLinkconsole.openshift.io/v1
rhoai-docs
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleLink
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : rhoai-docs
spec :
applicationMenu :
imageURL : /static/assets/public/imgs/logos/redhat.svg
section : Documentation
href : https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2-latest/
location : ApplicationMenu
text : Red Hat OpenShift AI Documentation
ConsoleNotificationconsole.openshift.io/v1
banner-cluster
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleNotification
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : Prune=true
source : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
labels :
demo : ai-gitops-catalog
name : banner-cluster
spec :
backgroundColor : '#0066FF'
color : '#FFF'
location : BannerBottom
text : This cluster was configured via the AI GitOps catalog
ConsoleNotificationconsole.openshift.io/v1
banner-demo
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsoleNotification
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
source : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
labels :
demo : ai-gitops-catalog
name : banner-demo
spec :
backgroundColor : '#9F0000'
color : '#FFF'
location : BannerTop
text : 'DEMO: Red Hat OpenShift AI (RHOAI)'
ConsolePluginconsole.openshift.io/v1
console-plugin-nvidia-gpu
nvidia-gpu-operator
▼ YAML
apiVersion : console.openshift.io/v1
kind : ConsolePlugin
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
labels :
app.kubernetes.io/component : console-plugin-nvidia-gpu
app.kubernetes.io/instance : console-plugin-nvidia-gpu
app.kubernetes.io/managed-by : Helm
app.kubernetes.io/name : console-plugin-nvidia-gpu
app.kubernetes.io/part-of : console-plugin-nvidia-gpu
app.kubernetes.io/version : latest
helm.sh/chart : console-plugin-nvidia-gpu-0.2.4
name : console-plugin-nvidia-gpu
namespace : nvidia-gpu-operator
spec :
backend :
service :
basePath : /
name : console-plugin-nvidia-gpu
namespace : nvidia-gpu-operator
port : 9443
type : Service
displayName : Console Plugin NVIDIA GPU Template
HardwareProfiledashboard.opendatahub.io/v1alpha1
all-nvidia
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : all-nvidia
namespace : redhat-ods-applications
spec :
description : Nvidia GPU - Full Core
displayName : Nvidia GPU
enabled : true
identifiers :
- defaultCount : '1'
displayName : GPU
identifier : nvidia.com/gpu
maxCount : '4'
minCount : '1'
resourceType : Accelerator
- defaultCount : '1'
displayName : CPU
identifier : cpu
minCount : '1'
resourceType : CPU
- defaultCount : 4Gi
displayName : Memory
identifier : memory
minCount : 4Gi
resourceType : Memory
nodeSelector :
nvidia.com/gpu.present : 'true'
tolerations :
- effect : NoSchedule
key : nvidia.com/gpu
operator : Exists
HardwareProfiledashboard.opendatahub.io/v1alpha1
all-nvidia-sliced-4
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : all-nvidia-sliced-4
namespace : redhat-ods-applications
spec :
description : Nvidia GPU - 1/4 (Time Sliced)
displayName : Nvidia GPU - 1/4
enabled : true
identifiers :
- defaultCount : '1'
displayName : CPU
identifier : cpu
minCount : '1'
resourceType : CPU
- defaultCount : 4Gi
displayName : Memory
identifier : memory
minCount : 4Gi
resourceType : Memory
nodeSelector :
nvidia.com/gpu.present : 'true'
tolerations :
- effect : NoSchedule
key : nvidia.com/gpu
operator : Exists
HardwareProfiledashboard.opendatahub.io/v1alpha1
notebooks-demo-workshop
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/dashboard-feature-visibility : '["workbench"]'
name : notebooks-demo-workshop
namespace : redhat-ods-applications
spec :
displayName : Demo / Workshop
enabled : true
identifiers :
- defaultCount : '3'
displayName : CPU
identifier : cpu
maxCount : '6'
minCount : '3'
resourceType : CPU
- defaultCount : 24Gi
displayName : Memory
identifier : memory
maxCount : 24Gi
minCount : 24Gi
resourceType : Memory
HardwareProfiledashboard.opendatahub.io/v1alpha1
notebooks-small
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/dashboard-feature-visibility : '["workbench"]'
name : notebooks-small
namespace : redhat-ods-applications
spec :
displayName : Small
enabled : true
identifiers :
- defaultCount : '1'
displayName : CPU
identifier : cpu
maxCount : '2'
minCount : '1'
resourceType : CPU
- defaultCount : 8Gi
displayName : Memory
identifier : memory
maxCount : 8Gi
minCount : 8Gi
resourceType : Memory
HardwareProfiledashboard.opendatahub.io/v1alpha1
serving-large
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/dashboard-feature-visibility : '["model-serving"]'
name : serving-large
namespace : redhat-ods-applications
spec :
displayName : Large
enabled : true
identifiers :
- defaultCount : '6'
displayName : CPU
identifier : cpu
maxCount : '10'
minCount : '6'
resourceType : CPU
- defaultCount : 16Gi
displayName : Memory
identifier : memory
maxCount : 20Gi
minCount : 16Gi
resourceType : Memory
HardwareProfiledashboard.opendatahub.io/v1alpha1
serving-medium
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/dashboard-feature-visibility : '["model-serving"]'
name : serving-medium
namespace : redhat-ods-applications
spec :
displayName : Medium
enabled : true
identifiers :
- defaultCount : '4'
displayName : CPU
identifier : cpu
maxCount : '8'
minCount : '4'
resourceType : CPU
- defaultCount : 8Gi
displayName : Memory
identifier : memory
maxCount : 10Gi
minCount : 8Gi
resourceType : Memory
HardwareProfiledashboard.opendatahub.io/v1alpha1
serving-small
redhat-ods-applications
▼ YAML
apiVersion : dashboard.opendatahub.io/v1alpha1
kind : HardwareProfile
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/dashboard-feature-visibility : '["model-serving"]'
name : serving-small
namespace : redhat-ods-applications
spec :
displayName : Small
enabled : true
identifiers :
- defaultCount : '1'
displayName : CPU
identifier : cpu
maxCount : '2'
minCount : '1'
resourceType : CPU
- defaultCount : 4Gi
displayName : Memory
identifier : memory
maxCount : 8Gi
minCount : 4Gi
resourceType : Memory
DataScienceClusterdatasciencecluster.opendatahub.io/v1
default-dsc
redhat-ods-applications
▼ YAML
apiVersion : datasciencecluster.opendatahub.io/v1
kind : DataScienceCluster
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
name : default-dsc
namespace : redhat-ods-applications
spec :
components :
codeflare :
managementState : Managed
dashboard :
managementState : Managed
datasciencepipelines :
managementState : Managed
feastoperator :
managementState : Managed
kserve :
managementState : Managed
serving :
ingressGateway :
certificate :
type : SelfSigned
managementState : Managed
name : knative-serving
kueue :
managementState : Managed
llamastackoperator :
managementState : Managed
modelmeshserving :
managementState : Removed
ray :
managementState : Managed
trainingoperator :
managementState : Managed
trustyai :
managementState : Managed
workbenches :
managementState : Managed
DSCInitializationdscinitialization.opendatahub.io/v1
default-dsci
redhat-ods-applications
▼ YAML
apiVersion : dscinitialization.opendatahub.io/v1
kind : DSCInitialization
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
name : default-dsci
namespace : redhat-ods-applications
spec :
applicationsNamespace : redhat-ods-applications
monitoring :
managementState : Managed
namespace : redhat-ods-monitoring
serviceMesh :
auth :
audiences :
- https://kubernetes.default.svc
controlPlane :
metricsCollection : Istio
name : data-science-smcp
namespace : istio-system
managementState : Managed
trustedCABundle :
customCABundle : ''
managementState : Managed
AlertingRulemonitoring.openshift.io/v1
gpu-pods
openshift-monitoring
▼ YAML
apiVersion : monitoring.openshift.io/v1
kind : AlertingRule
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : gpu-pods
namespace : openshift-monitoring
spec :
groups :
- name : gpu-pods
rules :
- alert : GpuPods
annotations :
description : A total of {{ $value }} 'nvidia.com/gpu' resources are currently requested on the cluster.
runbook_url : https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/tree/main/components/operators/gpu-operator-certified/instance/components/gpu-monitoring/gpu-pods.md
summary : Cloud costs may increase when specialized (GPU) resources are requested.
expr : 'sum (kube_pod_resource_request{resource="nvidia.com/gpu"} >= 1 ) > 0
# sum by (namespace, pod,resource) (kube_pod_resource_request{resource="nvidia.com/gpu"}
>= 1) > 0
'
labels :
severity : info
NodeFeatureDiscoverynfd.openshift.io/v1
nfd-instance
openshift-nfd
▼ YAML
apiVersion : nfd.openshift.io/v1
kind : NodeFeatureDiscovery
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : nfd-instance
namespace : openshift-nfd
spec :
instance : ''
operand :
image : registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.16
servicePort : 12000
topologyUpdater : false
workerConfig :
configData : "core:\n sleepInterval: 60s\nsources:\n pci:\n deviceClassWhitelist:\n\
\ - \"0200\"\n - \"03\"\n - \"12\"\n deviceLabelFields:\n\
\ - \"vendor\"\n"
ClusterPolicynvidia.com/v1
gpu-cluster-policy
nvidia-gpu-operator
▼ YAML
apiVersion : nvidia.com/v1
kind : ClusterPolicy
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : gpu-cluster-policy
namespace : nvidia-gpu-operator
spec :
daemonsets :
rollingUpdate :
maxUnavailable : '1'
tolerations :
- effect : NoSchedule
key : nvidia.com/gpu
operator : Exists
updateStrategy : RollingUpdate
dcgm :
enabled : true
dcgmExporter :
config :
name : console-plugin-nvidia-gpu
enabled : true
serviceMonitor :
enabled : true
devicePlugin :
config :
default : time-sliced-4
name : device-plugin-config
enabled : true
driver :
certConfig :
name : ''
enabled : true
kernelModuleConfig :
name : ''
licensingConfig :
configMapName : ''
nlsEnabled : false
repoConfig :
configMapName : ''
upgradePolicy :
autoUpgrade : true
drain :
deleteEmptyDir : false
enable : false
force : false
timeoutSeconds : 300
maxParallelUpgrades : 1
maxUnavailable : 25%
podDeletion :
deleteEmptyDir : false
force : false
timeoutSeconds : 300
waitForCompletion :
timeoutSeconds : 0
virtualTopology :
config : ''
gds :
enabled : false
gfd :
enabled : true
mig :
strategy : single
migManager :
enabled : true
nodeStatusExporter :
enabled : true
operator :
defaultRuntime : crio
initContainer : {}
use_ocp_driver_toolkit : true
sandboxDevicePlugin :
enabled : true
sandboxWorkloads :
defaultWorkload : container
enabled : false
toolkit :
enabled : true
validator :
plugin :
env :
- name : WITH_WORKLOAD
value : 'true'
vfioManager :
enabled : true
vgpuDeviceManager :
enabled : true
vgpuManager :
enabled : false
OdhDashboardConfigopendatahub.io/v1alpha
odh-dashboard-config
redhat-ods-applications
▼ YAML
apiVersion : opendatahub.io/v1alpha
kind : OdhDashboardConfig
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
argocd.argoproj.io/sync-options : ServerSideApply=true
name : odh-dashboard-config
namespace : redhat-ods-applications
spec :
dashboardConfig :
disableBiasMetrics : true
disableHardwareProfiles : false
disableKServe : false
disableModelMesh : false
groupsConfig :
adminGroups : rhods-admins
allowedGroups : system:authenticated
modelServerSizes : []
notebookController :
enabled : true
pvcSize : 20Gi
notebookSizes : []
OperatorGroupoperators.coreos.com/v1
gpu-operator-certified
nvidia-gpu-operator
▼ YAML
apiVersion : operators.coreos.com/v1
kind : OperatorGroup
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : gpu-operator-certified
namespace : nvidia-gpu-operator
spec :
targetNamespaces :
- nvidia-gpu-operator
OperatorGroupoperators.coreos.com/v1
nfd
openshift-nfd
▼ YAML
apiVersion : operators.coreos.com/v1
kind : OperatorGroup
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : nfd
namespace : openshift-nfd
spec :
targetNamespaces :
- openshift-nfd
OperatorGroupoperators.coreos.com/v1
serverless-operator
openshift-serverless
▼ YAML
apiVersion : operators.coreos.com/v1
kind : OperatorGroup
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : serverless-operator
namespace : openshift-serverless
OperatorGroupoperators.coreos.com/v1
rhods-operator
redhat-ods-operator
▼ YAML
apiVersion : operators.coreos.com/v1
kind : OperatorGroup
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : rhods-operator
namespace : redhat-ods-operator
Subscriptionoperators.coreos.com/v1alpha1
gpu-operator-certified
nvidia-gpu-operator
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : gpu-operator-certified
namespace : nvidia-gpu-operator
spec :
channel : stable
installPlanApproval : Automatic
name : gpu-operator-certified
source : certified-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
nfd
openshift-nfd
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : nfd
namespace : openshift-nfd
spec :
channel : stable
installPlanApproval : Automatic
name : nfd
source : redhat-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
authorino-operator
openshift-operators
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : authorino-operator
namespace : openshift-operators
spec :
channel : stable
installPlanApproval : Automatic
name : authorino-operator
source : redhat-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
openshift-pipelines-operator-rh
openshift-operators
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : openshift-pipelines-operator-rh
namespace : openshift-operators
spec :
channel : latest
installPlanApproval : Automatic
name : openshift-pipelines-operator-rh
source : redhat-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
servicemeshoperator
openshift-operators
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : servicemeshoperator
namespace : openshift-operators
spec :
channel : stable
installPlanApproval : Automatic
name : servicemeshoperator
source : redhat-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
serverless-operator
openshift-serverless
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : serverless-operator
namespace : openshift-serverless
spec :
channel : stable
installPlanApproval : Automatic
name : serverless-operator
source : redhat-operators
sourceNamespace : openshift-marketplace
Subscriptionoperators.coreos.com/v1alpha1
rhods-operator
redhat-ods-operator
▼ YAML
apiVersion : operators.coreos.com/v1alpha1
kind : Subscription
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
name : rhods-operator
namespace : redhat-ods-operator
spec :
channel : stable
installPlanApproval : Manual
name : rhods-operator
source : redhat-operators
sourceNamespace : openshift-marketplace
Templatetemplate.openshift.io/v1
triton-serving-runtime
redhat-ods-applications
▼ YAML
apiVersion : template.openshift.io/v1
kind : Template
metadata :
annotations :
argocd.argoproj.io/hook : PreSync
opendatahub.io/apiProtocol : REST
opendatahub.io/modelServingSupport : '["multi"]'
labels :
opendatahub.io/dashboard : 'true'
name : triton-serving-runtime
namespace : redhat-ods-applications
objects :
- apiVersion : serving.kserve.io/v1alpha1
kind : ServingRuntime
metadata :
annotations :
maxLoadingConcurrency : '2'
openshift.io/display-name : Triton runtime 23.10
labels :
name : triton-23.10
name : triton-23.10
spec :
builtInAdapter :
memBufferBytes : 134217728
modelLoadingTimeoutMillis : 90000
runtimeManagementPort : 8001
serverType : triton
containers :
- args :
- -c
- 'mkdir -p /models/_triton_models; chmod 777 /models/_triton_models; exec tritonserver
"--model-repository=/models/_triton_models" "--model-control-mode=explicit"
"--strict-model-config=false" "--strict-readiness=false" "--allow-http=true"
"--allow-sagemaker=false" '
command :
- /bin/sh
image : nvcr.io/nvidia/tritonserver:23.10-py3
livenessProbe :
exec :
command :
- curl
- --fail
- --silent
- --show-error
- --max-time
- '9'
- http://localhost:8000/v2/health/live
initialDelaySeconds : 5
periodSeconds : 30
timeoutSeconds : 10
name : triton
resources :
limits :
cpu : '5'
memory : 1Gi
requests :
cpu : 500m
memory : 1Gi
volumeMounts :
- mountPath : /dev/shm
name : shm
grpcDataEndpoint : port:8001
grpcEndpoint : port:8085
multiModel : true
protocolVersions :
- grpc-v2
supportedModelFormats :
- autoSelect : true
name : keras
version : '2'
- autoSelect : true
name : onnx
version : '1'
- autoSelect : true
name : pytorch
version : '1'
- autoSelect : true
name : tensorflow
version : '1'
- autoSelect : true
name : tensorflow
version : '2'
- autoSelect : true
name : tensorrt
version : '7'
volumes :
- emptyDir :
medium : Memory
sizeLimit : 2Gi
name : shm