Skip to content

NVIDIA GPU Autoscale

Usage

If you have cloned the demo-ai-gitops-catalog repository, install this overlay by running the following from the repository root:

oc apply -k demos/overlays/nvidia-gpu-autoscale

Or, without cloning:

oc apply -k https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/demos/overlays/nvidia-gpu-autoscale

As part of a different overlay in your own GitOps repo:

# Kustomization that pulls in the nvidia-gpu-autoscale overlay as a remote
# resource; "?ref=main" selects the main branch of the catalog repository.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/demos/overlays/nvidia-gpu-autoscale?ref=main

Resources Analysis

Kind Name Namespace Details
Namespace v1 nvidia-gpu-operator
# Namespace that holds the ServiceAccounts and ConfigMaps used by the GPU
# machineset and console-plugin jobs in this overlay.
kind: Namespace
apiVersion: v1
metadata:
  name: nvidia-gpu-operator
  annotations:
    openshift.io/display-name: NVIDIA GPU Operator
  labels:
    # Quoted so the label value stays the string "true" rather than a boolean.
    openshift.io/cluster-monitoring: "true"
Namespace v1 openshift-nfd
# Namespace for the Node Feature Discovery Operator (see the display-name
# annotation).
kind: Namespace
apiVersion: v1
metadata:
  name: openshift-nfd
  annotations:
    openshift.io/display-name: Node Feature Discovery Operator
  labels:
    # Quoted so the label value stays the string "true" rather than a boolean.
    openshift.io/cluster-monitoring: "true"
ServiceAccount v1 job-aro-gpu-machineset nvidia-gpu-operator
# Identity that the job-aro-gpu-machineset ClusterRoleBinding binds to the
# ClusterRole of the same name.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: nvidia-gpu-operator
  name: job-aro-gpu-machineset
ServiceAccount v1 job-aws-gpu-machineset nvidia-gpu-operator
# Identity that the job-aws-gpu-machineset ClusterRoleBinding binds to the
# ClusterRole of the same name.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: nvidia-gpu-operator
  name: job-aws-gpu-machineset
ServiceAccount v1 job-gpu-console-plugin nvidia-gpu-operator
# Identity that the job-gpu-console-plugin ClusterRoleBinding binds to the
# ClusterRole of the same name.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: nvidia-gpu-operator
  name: job-gpu-console-plugin
ServiceAccount v1 job-setup-autoscale openshift-machine-api
# Identity that the job-setup-autoscale ClusterRoleBinding binds to the
# ClusterRole of the same name; lives in openshift-machine-api.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: openshift-machine-api
  name: job-setup-autoscale
  labels:
    autoscale: config
ClusterRole rbac.authorization.k8s.io/v1 job-aro-gpu-machineset
# Permissions for the job-aro-gpu-machineset ServiceAccount: every verb on
# MachineSets and MachineAutoscalers, plus get/list on Secrets named
# azure-credentials.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-aro-gpu-machineset
rules:
  - apiGroups: [machine.openshift.io]
    resources: [machinesets]
    verbs: ["*"]
  - apiGroups: [autoscaling.openshift.io]
    resources: [machineautoscalers]
    verbs: ["*"]
  # "" is the core API group, which serves Secrets.
  - apiGroups: [""]
    resources: [secrets]
    resourceNames: [azure-credentials]
    verbs: [get, list]
ClusterRole rbac.authorization.k8s.io/v1 job-aws-gpu-machineset
# Permissions for the job-aws-gpu-machineset ServiceAccount: every verb on
# MachineSets and MachineAutoscalers, plus get/list on Secrets named aws-creds.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-aws-gpu-machineset
rules:
  - apiGroups: [machine.openshift.io]
    resources: [machinesets]
    verbs: ["*"]
  - apiGroups: [autoscaling.openshift.io]
    resources: [machineautoscalers]
    verbs: ["*"]
  # "" is the core API group, which serves Secrets.
  - apiGroups: [""]
    resources: [secrets]
    resourceNames: [aws-creds]
    verbs: [get, list]
ClusterRole rbac.authorization.k8s.io/v1 job-gpu-console-plugin
# Permissions for the job-gpu-console-plugin ServiceAccount: read and patch
# consoles.operator.openshift.io, which the console-plugin job script reads
# with `oc get` and updates with `oc patch`.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: job-gpu-console-plugin
rules:
- apiGroups:
  - operator.openshift.io
  resources:
  - consoles
  verbs:
  # The former "label" entry was removed: it is not a Kubernetes API request
  # verb, so it never matched a request. Labelling is authorized as "patch".
  - get
  - list
  - patch
ClusterRole rbac.authorization.k8s.io/v1 job-setup-autoscale
# Permissions for the job-setup-autoscale ServiceAccount: every verb on
# MachineSets and MachineAutoscalers, plus get/list on Secrets named
# aws-creds or azure-credentials.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-setup-autoscale
  labels:
    autoscale: config
rules:
  - apiGroups: [machine.openshift.io]
    resources: [machinesets]
    verbs: ["*"]
  - apiGroups: [autoscaling.openshift.io]
    resources: [machineautoscalers]
    verbs: ["*"]
  # "" is the core API group, which serves Secrets.
  - apiGroups: [""]
    resources: [secrets]
    resourceNames: [aws-creds, azure-credentials]
    verbs: [get, list]
ClusterRoleBinding rbac.authorization.k8s.io/v1 job-aro-gpu-machineset
# Grants the job-aro-gpu-machineset ClusterRole to the ServiceAccount of the
# same name in the nvidia-gpu-operator namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-aro-gpu-machineset
subjects:
  - kind: ServiceAccount
    namespace: nvidia-gpu-operator
    name: job-aro-gpu-machineset
roleRef:
  kind: ClusterRole
  apiGroup: rbac.authorization.k8s.io
  name: job-aro-gpu-machineset
ClusterRoleBinding rbac.authorization.k8s.io/v1 job-aws-gpu-machineset
# Grants the job-aws-gpu-machineset ClusterRole to the ServiceAccount of the
# same name in the nvidia-gpu-operator namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-aws-gpu-machineset
subjects:
  - kind: ServiceAccount
    namespace: nvidia-gpu-operator
    name: job-aws-gpu-machineset
roleRef:
  kind: ClusterRole
  apiGroup: rbac.authorization.k8s.io
  name: job-aws-gpu-machineset
ClusterRoleBinding rbac.authorization.k8s.io/v1 job-gpu-console-plugin
# Grants the job-gpu-console-plugin ClusterRole to the ServiceAccount of the
# same name in the nvidia-gpu-operator namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-gpu-console-plugin
subjects:
  - kind: ServiceAccount
    namespace: nvidia-gpu-operator
    name: job-gpu-console-plugin
roleRef:
  kind: ClusterRole
  apiGroup: rbac.authorization.k8s.io
  name: job-gpu-console-plugin
ClusterRoleBinding rbac.authorization.k8s.io/v1 job-setup-autoscale
# Grants the job-setup-autoscale ClusterRole to the ServiceAccount of the
# same name in the openshift-machine-api namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: job-setup-autoscale
  labels:
    autoscale: config
subjects:
  - kind: ServiceAccount
    namespace: openshift-machine-api
    name: job-setup-autoscale
roleRef:
  kind: ClusterRole
  apiGroup: rbac.authorization.k8s.io
  name: job-setup-autoscale
ConfigMap v1 console-plugin-nvidia-gpu nvidia-gpu-operator
# DCGM metrics list shipped with the console-plugin-nvidia-gpu Helm chart
# (chart version 0.2.4 per the helm.sh/chart label).
kind: ConfigMap
apiVersion: v1
metadata:
  name: console-plugin-nvidia-gpu
  namespace: nvidia-gpu-operator
  labels:
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.kubernetes.io/version: latest
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.4
data:
  # One CSV row per metric; each row reads as: DCGM field, metric type,
  # description. The leading "# see ..." line is part of the file content.
  dcgm-metrics.csv: |
    # see https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/dcp-metrics-included.csv
    DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
    DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
    DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
    DCGM_FI_DEV_FB_FREE, gauge, mem free.
    DCGM_FI_DEV_FB_USED, gauge, mem used.
    DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
    DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
    DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit.
    DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
    DCGM_FI_DEV_SM_CLOCK, gauge, sm clock.
    DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock.
    DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock.
    DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock.
ConfigMap v1 device-plugin-config nvidia-gpu-operator
# Named device-plugin configurations: "default" has no sharing section, and
# each "time-sliced-N" entry sets N time-slicing replicas for nvidia.com/gpu.
kind: ConfigMap
apiVersion: v1
metadata:
  name: device-plugin-config
  namespace: nvidia-gpu-operator
data:
  default: "version: v1"
  # "|-" keeps the line breaks and strips the trailing newline, so each value
  # is the same string as the escaped one-line form.
  time-sliced-2: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 2
  time-sliced-4: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 4
  time-sliced-99: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 99
ConfigMap v1 job-aro-gpu-machineset nvidia-gpu-operator
# Shell scripts for creating a GPU MachineSet on an ARO (Azure) cluster.
# job.sh is the entry point and sources /scripts/ocp.sh
# (presumably this ConfigMap is mounted at /scripts - confirm against the Job).
apiVersion: v1
data:
  # Entry point. Exits 0 without changes when ocp_aro_cluster fails, i.e. when
  # secret/azure-credentials is not found in kube-system. Otherwise it creates
  # the GPU MachineSet for INSTANCE_TYPE (default Standard_NC4as_T4_v3) and
  # then creates MachineAutoscalers. The GPU taint step is commented out.
  job.sh: '#!/bin/bash


    # shellcheck disable=SC1091

    . /scripts/ocp.sh


    INSTANCE_TYPE=${INSTANCE_TYPE:-Standard_NC4as_T4_v3}


    ocp_aro_cluster || exit 0

    ocp_aro_machineset_create_gpu "${INSTANCE_TYPE}"

    ocp_machineset_create_autoscale

    # ocp_machineset_taint_gpu

    '
  # Function library sourced by job.sh. Defines:
  #   ocp_machineset_create_autoscale - applies a MachineAutoscaler (default
  #     min 0, max 4) for each MachineSet in openshift-machine-api
  #   ocp_machineset_taint_gpu        - patches a nvidia.com/gpu NoSchedule
  #     taint onto the first MachineSet matching a short name (default g4dn)
  #   ocp_aro_cluster                 - returns 1 unless
  #     secret/azure-credentials exists in kube-system
  #   ocp_aro_machineset_create_gpu   - clones a worker MachineSet, then
  #     patches the gpu node-role label and vmSize onto it
  #   ocp_aro_machineset_clone_worker - copies the first "worker" MachineSet
  #     under a new name with replicas 0 when no matching MachineSet exists
  # NOTE(review): the commented FUNCTIONS list inside the script mentions
  # ocp_aro_machineset_fix_storage, which this script does not define.
  ocp.sh: "#!/bin/bash\n# shellcheck disable=SC2120\n\n# See https://github.com/redhat-na-ssa/demo-ai-gitops-catalog\n\
    # FUNCTIONS='\n# ocp_aro_cluster\n# ocp_aro_machineset_create_gpu\n# ocp_aro_machineset_clone_worker\n\
    # ocp_aro_machineset_fix_storage\n# ocp_machineset_create_autoscale\n# ocp_machineset_taint_gpu\n\
    # '\n\n# for function in ${FUNCTIONS}\n# do\n#   function_extract $function scripts/library/ocp*.sh\
    \ >> tmp\n#   echo >> tmp\n# done\n\nocp_machineset_create_autoscale(){\n  MACHINE_MIN=${1:-0}\n\
    \  MACHINE_MAX=${2:-4}\n  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get\
    \ machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}\n\n  for machine_set\
    \ in ${MACHINE_SETS}\n  do\ncat << YAML | oc apply -f -\napiVersion: \"autoscaling.openshift.io/v1beta1\"\
    \nkind: \"MachineAutoscaler\"\nmetadata:\n  name: \"${machine_set}\"\n  namespace:\
    \ \"openshift-machine-api\"\nspec:\n  minReplicas: ${MACHINE_MIN}\n  maxReplicas:\
    \ ${MACHINE_MAX}\n  scaleTargetRef:\n    apiVersion: machine.openshift.io/v1beta1\n\
    \    kind: MachineSet\n    name: \"${machine_set}\"\nYAML\n  done\n}\n\nocp_machineset_taint_gpu(){\n\
    \  SHORT_NAME=${1:-g4dn}\n  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep \"${SHORT_NAME}\" | head -n1)\n\n  echo \"Patching: ${MACHINE_SET}\"\
    \n\n  # taint nodes for gpu-only workloads\n  oc -n openshift-machine-api \\\n\
    \    patch \"${MACHINE_SET}\" \\\n    --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"taints\":[{\"key\":\"nvidia.com/gpu\",\"value\":\"\",\"effect\"\
    :\"NoSchedule\"}]}}}}'\n}\n\nocp_aro_cluster(){\n  TARGET_NS=kube-system\n  OBJ=secret/azure-credentials\n\
    \  echo \"Checking if ${OBJ} exists in ${TARGET_NS} namespace\"\n  oc -n \"${TARGET_NS}\"\
    \ get \"${OBJ}\" -o name > /dev/null 2>&1 || return 1\n  echo \"ARO cluster detected\"\
    \n}\n\nocp_aro_machineset_create_gpu(){\n  # https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nv-family\n\
    \n  INSTANCE_TYPE=${1:-Standard_NC64as_T4_v3}\n  SHORT_NAME=${2:-${INSTANCE_TYPE//_/-}}\n\
    \  SHORT_NAME=${SHORT_NAME,,}\n\n  ocp_aro_machineset_clone_worker \"${INSTANCE_TYPE}\"\
    \n\n  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep \"/${SHORT_NAME}\" | head -n1)\n\n  echo \"Patching: ${MACHINE_SET_TYPE}\"\
    \n\n  # cosmetic\n  oc -n openshift-machine-api \\\n    patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n    --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"metadata\"\
    :{\"labels\":{\"node-role.kubernetes.io/gpu\":\"\"}}}}}}'\n\n  # should use the\
    \ default profile\n  # oc -n openshift-machine-api \\\n  #   patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n  #   --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"metadata\"\
    :{\"labels\":{\"nvidia.com/device-plugin.config\":\"no-time-sliced\"}}}}}}'\n\n\
    \  # should help auto provisioner\n  # oc -n openshift-machine-api \\\n  #   patch\
    \ \"${MACHINE_SET_TYPE}\" \\\n  #   --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"metadata\":{\"labels\":{\"cluster-api/accelerator\":\"nvidia-gpu\"\
    }}}}}}'\n\n  # oc -n openshift-machine-api \\\n  #   patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n  #   --type=merge --patch '{\"metadata\":{\"labels\":{\"cluster-api/accelerator\"\
    :\"nvidia-gpu\"}}}'\n\n  oc -n openshift-machine-api \\\n    patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n    --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"providerSpec\"\
    :{\"value\":{\"vmSize\":\"'\"${INSTANCE_TYPE}\"'\"}}}}}}'\n}\n\nocp_aro_machineset_clone_worker(){\n\
    \  [ -z \"${1}\" ] && \\\n  echo \"\n    usage: ocp_aro_machineset_clone_worker\
    \ < instance type, default Standard_D4s_v3 > < machine set name >\n  \"\n\n  INSTANCE_TYPE=${1:-Standard_D4s_v3}\n\
    \  SHORT_NAME=${2:-${INSTANCE_TYPE//_/-}}\n  SHORT_NAME=${SHORT_NAME,,}\n\n  MACHINE_SET_NAME=$(oc\
    \ -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep\
    \ \"/${SHORT_NAME}\" | head -n1)\n  MACHINE_SET_WORKER=$(oc -n openshift-machine-api\
    \ get machinesets.machine.openshift.io -o name | grep worker | head -n1)\n\n \
    \ # check for an existing instance machine set\n  if [ -n \"${MACHINE_SET_NAME}\"\
    \ ]; then\n    echo \"Exists: machineset - ${MACHINE_SET_NAME}\"\n  else\n   \
    \ echo \"Creating: machineset - ${SHORT_NAME}\"\n\n    oc -n openshift-machine-api\
    \ \\\n      get \"${MACHINE_SET_WORKER}\" -o yaml | \\\n        sed '/machine/\
    \ s/'\"${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"'/g\n          /^  name:/\
    \ s/'\"${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"'/g\n          /name/ s/'\"\
    ${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"'/g\n          s/vmSize.*/vmSize:\
    \ '\"${INSTANCE_TYPE}\"'/\n          /cluster-api-autoscaler/d\n          /uid:/d\n\
    \          /generation:/d\n          /resourceVersion:/d\n          /creationTimestamp:/d\n\
    \          s/replicas.*/replicas: 0/' | \\\n      oc apply -f -\n\n    MACHINE_SET_NAME=\"\
    machinesets.machine.openshift.io/${SHORT_NAME}\"\n  fi\n\n  # cosmetic pretty\n\
    \  oc -n openshift-machine-api \\\n    patch \"${MACHINE_SET_NAME}\" \\\n    --type=merge\
    \ --patch '{\"spec\":{\"template\":{\"spec\":{\"metadata\":{\"labels\":{\"node-role.kubernetes.io/'\"\
    ${SHORT_NAME}\"'\":\"\"}}}}}}'\n}\n\n\n\n\n"
kind: ConfigMap
metadata:
  name: job-aro-gpu-machineset
  namespace: nvidia-gpu-operator
ConfigMap v1 job-aws-gpu-machineset nvidia-gpu-operator
# Shell scripts for creating a GPU MachineSet on an AWS cluster.
# job.sh is the entry point and sources /scripts/ocp.sh
# (presumably this ConfigMap is mounted at /scripts - confirm against the Job).
apiVersion: v1
data:
  # Entry point. Exits 0 without changes when ocp_aws_cluster fails, i.e. when
  # secret/aws-creds is not found in kube-system. Otherwise it creates the GPU
  # MachineSet for INSTANCE_TYPE (default g4dn.4xlarge), creates
  # MachineAutoscalers, and rewrites the storage settings of all MachineSets.
  # The GPU taint step is commented out.
  job.sh: '#!/bin/bash


    # shellcheck disable=SC1091

    . /scripts/ocp.sh


    INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}


    ocp_aws_cluster || exit 0

    ocp_aws_machineset_create_gpu "${INSTANCE_TYPE}"

    ocp_machineset_create_autoscale

    ocp_aws_machineset_fix_storage

    # ocp_machineset_taint_gpu

    '
  # Function library sourced by job.sh. Defines:
  #   ocp_aws_cluster                 - returns 1 unless secret/aws-creds
  #     exists in kube-system
  #   ocp_aws_machineset_create_gpu   - clones a worker MachineSet, then
  #     patches the gpu node-role label and instanceType onto it
  #   ocp_aws_machineset_clone_worker - copies the first "worker" MachineSet
  #     under a new name with replicas 0 when no matching MachineSet exists
  #   ocp_aws_machineset_fix_storage  - rewrites "volumeSize: 100" to the
  #     requested size (default 200) and gp2 to gp3 on the given MachineSets,
  #     or on all of them when called without an argument
  #   ocp_machineset_create_autoscale - applies a MachineAutoscaler (default
  #     min 0, max 4) for each MachineSet in openshift-machine-api
  #   ocp_machineset_taint_gpu        - patches a nvidia.com/gpu NoSchedule
  #     taint onto the first MachineSet matching a short name (default g4dn)
  #
  # Fix: ocp_aws_machineset_clone_worker now sets MACHINE_SET_NAME after it
  # creates a new MachineSet, as the ARO variant of this script does. Without
  # it the variable stayed empty on first creation, so the storage fix fell
  # back to every MachineSet and the node-role label patch ran against an
  # empty resource name.
  ocp.sh: "#!/bin/bash\n# shellcheck disable=SC2120\n\n# See https://github.com/redhat-na-ssa/demo-ai-gitops-catalog\n\
    # FUNCTIONS='\n# ocp_aws_cluster\n# ocp_aws_machineset_create_gpu\n# ocp_aws_machineset_clone_worker\n\
    # ocp_aws_machineset_fix_storage\n# ocp_machineset_create_autoscale\n# ocp_machineset_taint_gpu\n\
    # '\n\n# for function in ${FUNCTIONS}\n# do\n#   function_extract $function scripts/library/ocp*.sh\
    \ >> tmp\n#   echo >> tmp\n# done\n\nocp_aws_cluster(){\n  TARGET_NS=kube-system\n\
    \  OBJ=secret/aws-creds\n  echo \"Checking if ${OBJ} exists in ${TARGET_NS} namespace\"\
    \n  oc -n \"${TARGET_NS}\" get \"${OBJ}\" -o name > /dev/null 2>&1 || return 1\n\
    \  echo \"AWS cluster detected\"\n}\n\nocp_aws_machineset_create_gpu(){\n  # https://aws.amazon.com/ec2/instance-types/g4\n\
    \  # single gpu: g4dn.{2,4,8,16}xlarge\n  # multi gpu:  g4dn.12xlarge\n  # practical:\
    \  g4ad.4xlarge\n  # a100 (MIG): p4d.24xlarge\n  # h100 (MIG): p5.48xlarge\n\n\
    \  # https://aws.amazon.com/ec2/instance-types/dl1\n  # 8 x gaudi:  dl1.24xlarge\n\
    \n  INSTANCE_TYPE=${1:-g4dn.4xlarge}\n\n  ocp_aws_machineset_clone_worker \"${INSTANCE_TYPE}\"\
    \n\n  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep \"${INSTANCE_TYPE%.*}\" | head -n1)\n\n  echo \"Patching: ${MACHINE_SET_TYPE}\"\
    \n\n  # cosmetic\n  oc -n openshift-machine-api \\\n    patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n    --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"metadata\"\
    :{\"labels\":{\"node-role.kubernetes.io/gpu\":\"\"}}}}}}'\n\n  # should use the\
    \ default profile\n  # oc -n openshift-machine-api \\\n  #   patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n  #   --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"metadata\"\
    :{\"labels\":{\"nvidia.com/device-plugin.config\":\"no-time-sliced\"}}}}}}'\n\n\
    \  # should help auto provisioner\n  # oc -n openshift-machine-api \\\n  #   patch\
    \ \"${MACHINE_SET_TYPE}\" \\\n  #   --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"metadata\":{\"labels\":{\"cluster-api/accelerator\":\"nvidia-gpu\"\
    }}}}}}'\n\n  # oc -n openshift-machine-api \\\n  #   patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n  #   --type=merge --patch '{\"metadata\":{\"labels\":{\"cluster-api/accelerator\"\
    :\"nvidia-gpu\"}}}'\n\n  oc -n openshift-machine-api \\\n    patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n    --type=merge --patch '{\"spec\":{\"template\":{\"spec\":{\"providerSpec\"\
    :{\"value\":{\"instanceType\":\"'\"${INSTANCE_TYPE}\"'\"}}}}}}'\n\n#  # fix storage\n\
    \n# cat << YAML > /tmp/patch.yaml\n# spec:\n#   template:\n#     spec:\n#    \
    \   providerSpec:\n#         value:\n#           blockDevices:\n#            \
    \ - ebs:\n#                 volumeSize: 120\n#                 volumeType: gp3\n\
    # YAML\n\n#   oc -n openshift-machine-api \\\n#     patch \"${MACHINE_SET_TYPE}\"\
    \ \\\n#     --type=merge --patch \"$(cat /tmp/patch.yaml)\"\n}\n\nocp_aws_machineset_clone_worker(){\n\
    \  [ -z \"${1}\" ] && \\\n  echo \"\n    usage: ocp_aws_machineset_clone_worker\
    \ < instance type, default g4dn.4xlarge > < machine set name >\n  \"\n\n  INSTANCE_TYPE=${1:-g4dn.4xlarge}\n\
    \  SHORT_NAME=${2:-${INSTANCE_TYPE/./-}}\n\n  MACHINE_SET_NAME=$(oc -n openshift-machine-api\
    \ get machinesets.machine.openshift.io -o name | grep \"${SHORT_NAME}\" | head\
    \ -n1)\n  MACHINE_SET_WORKER=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep worker | head -n1)\n\n  # check for an existing instance machine\
    \ set\n  if [ -n \"${MACHINE_SET_NAME}\" ]; then\n    echo \"Exists: machineset\
    \ - ${MACHINE_SET_NAME}\"\n  else\n    echo \"Creating: machineset - ${SHORT_NAME}\"\
    \n    oc -n openshift-machine-api \\\n      get \"${MACHINE_SET_WORKER}\" -o yaml\
    \ | \\\n        sed '/machine/ s/'\"${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"\
    '/g\n          /^  name:/ s/'\"${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"\
    '/g\n          /name/ s/'\"${MACHINE_SET_WORKER##*/}\"'/'\"${SHORT_NAME}\"'/g\n\
    \          s/instanceType.*/instanceType: '\"${INSTANCE_TYPE}\"'/\n          /cluster-api-autoscaler/d\n\
    \          /uid:/d\n          /generation:/d\n          /resourceVersion:/d\n\
    \          /creationTimestamp:/d\n          s/replicas.*/replicas: 0/' | \\\n\
    \      oc apply -f -\n\n    MACHINE_SET_NAME=\"machinesets.machine.openshift.io/${SHORT_NAME}\"\
    \n  fi\n\n  # fix aws storage\n  ocp_aws_machineset_fix_storage\
    \ \"${MACHINE_SET_NAME}\"\n\n  # cosmetic pretty\n  oc -n openshift-machine-api\
    \ \\\n    patch \"${MACHINE_SET_NAME}\" \\\n    --type=merge --patch '{\"spec\"\
    :{\"template\":{\"spec\":{\"metadata\":{\"labels\":{\"node-role.kubernetes.io/'\"\
    ${SHORT_NAME}\"'\":\"\"}}}}}}'\n}\n\nocp_aws_machineset_fix_storage(){\n  MACHINE_SETS=${1:-$(oc\
    \ -n openshift-machine-api get machineset -o name)}\n  HD_SIZE=${2:-200}\n\n \
    \ for machine_set in ${MACHINE_SETS}\n  do\n    echo \"Patching aws storage for\
    \ machineset: ${machine_set}\"\n    oc -n openshift-machine-api \\\n      get\
    \ \"${machine_set}\" -o yaml | \\\n        sed 's/volumeSize: 100/volumeSize:\
    \ '\"${HD_SIZE}\"'/\n          s/volumeType: gp2/volumeType: gp3/' | \\\n    \
    \  oc apply -f -\n  done\n}\n\nocp_machineset_create_autoscale(){\n  MACHINE_MIN=${1:-0}\n\
    \  MACHINE_MAX=${2:-4}\n  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get\
    \ machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}\n\n  for machine_set\
    \ in ${MACHINE_SETS}\n  do\ncat << YAML | oc apply -f -\napiVersion: \"autoscaling.openshift.io/v1beta1\"\
    \nkind: \"MachineAutoscaler\"\nmetadata:\n  name: \"${machine_set}\"\n  namespace:\
    \ \"openshift-machine-api\"\nspec:\n  minReplicas: ${MACHINE_MIN}\n  maxReplicas:\
    \ ${MACHINE_MAX}\n  scaleTargetRef:\n    apiVersion: machine.openshift.io/v1beta1\n\
    \    kind: MachineSet\n    name: \"${machine_set}\"\nYAML\n  done\n}\n\nocp_machineset_taint_gpu(){\n\
    \  SHORT_NAME=${1:-g4dn}\n  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep \"${SHORT_NAME}\" | head -n1)\n\n  echo \"Patching: ${MACHINE_SET}\"\
    \n\n  # taint nodes for gpu-only workloads\n  oc -n openshift-machine-api \\\n\
    \    patch \"${MACHINE_SET}\" \\\n    --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"taints\":[{\"key\":\"nvidia.com/gpu\",\"value\":\"\",\"effect\"\
    :\"NoSchedule\"}]}}}}'\n}\n"
kind: ConfigMap
metadata:
  name: job-aws-gpu-machineset
  namespace: nvidia-gpu-operator
ConfigMap v1 job-gpu-console-plugin nvidia-gpu-operator
# Script that adds ${PLUGIN_NAME} to spec.plugins of the cluster Console
# operator resource, unless it is already listed there.
kind: ConfigMap
apiVersion: v1
metadata:
  name: job-gpu-console-plugin
  namespace: nvidia-gpu-operator
data:
  # Literal block scalar: the script text is the same string as the escaped
  # one-line form, including the single trailing newline.
  console-plugin-job.sh: |
    #!/usr/bin/bash

    enable_console_plugin(){
      [ -z "${PLUGIN_NAME}" ] && return 1

      echo "Attempting to enable ${PLUGIN_NAME} plugin"
      echo ""

      # Create the plugins section on the object if it doesn't exist
      if [ -z "$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')" ]; then
        echo "Creating plugins object"
        oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": [] } }' --type=merge
      fi

      INSTALLED_PLUGINS=$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')
      echo "Current plugins:"
      echo "${INSTALLED_PLUGINS}"

      if [[ "${INSTALLED_PLUGINS}" == *"${PLUGIN_NAME}"* ]]; then
          echo "${PLUGIN_NAME} is already enabled"
      else
          echo "Enabling plugin: ${PLUGIN_NAME}"
          oc patch consoles.operator.openshift.io cluster --type=json --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "'"${PLUGIN_NAME}"'"}]'
      fi

      sleep 6
      oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}'
    }

    enable_console_plugin
ConfigMap v1 nvidia-dcgm-exporter-dashboard openshift-config-managed
apiVersion: v1
data:
  dcgm-exporter-dashboard.json: "{\n  \"__requires\": [\n    {\n      \"type\": \"\
    panel\",\n      \"id\": \"gauge\",\n      \"name\": \"Gauge\",\n      \"version\"\
    : \"\"\n    },\n    {\n      \"type\": \"grafana\",\n      \"id\": \"grafana\"\
    ,\n      \"name\": \"Grafana\",\n      \"version\": \"6.7.3\"\n    },\n    {\n\
    \      \"type\": \"panel\",\n      \"id\": \"graph\",\n      \"name\": \"Graph\"\
    ,\n      \"version\": \"\"\n    },\n    {\n      \"type\": \"datasource\",\n \
    \     \"id\": \"prometheus\",\n      \"name\": \"Prometheus\",\n      \"version\"\
    : \"1.0.0\"\n    }\n  ],\n  \"annotations\": {\n    \"list\": [\n      {\n   \
    \     \"$$hashKey\": \"object:192\",\n        \"builtIn\": 1,\n        \"datasource\"\
    : \"-- Grafana --\",\n        \"enable\": true,\n        \"hide\": true,\n   \
    \     \"iconColor\": \"rgba(0, 211, 255, 1)\",\n        \"name\": \"Annotations\
    \ & Alerts\",\n        \"type\": \"dashboard\"\n      }\n    ]\n  },\n  \"description\"\
    : \"This dashboard is to display the metrics from DCGM Exporter on a Kubernetes\
    \ (1.19+) cluster\",\n  \"editable\": true,\n  \"gnetId\": 12239,\n  \"graphTooltip\"\
    : 0,\n  \"id\": null,\n  \"iteration\": 1588401887165,\n  \"links\": [],\n  \"\
    panels\": [\n    {\n      \"aliasColors\": {},\n      \"bars\": false,\n     \
    \ \"dashLength\": 10,\n      \"dashes\": false,\n      \"datasource\": \"$datasource\"\
    ,\n      \"fill\": 1,\n      \"fillGradient\": 0,\n      \"gridPos\": {\n    \
    \    \"h\": 8,\n        \"w\": 18,\n        \"x\": 0,\n        \"y\": 0\n    \
    \  },\n      \"hiddenSeries\": false,\n      \"id\": 12,\n      \"legend\": {\n\
    \        \"alignAsTable\": true,\n        \"avg\": true,\n        \"current\"\
    : true,\n        \"max\": true,\n        \"min\": false,\n        \"rightSide\"\
    : true,\n        \"show\": true,\n        \"total\": false,\n        \"values\"\
    : true\n      },\n      \"lines\": true,\n      \"linewidth\": 2,\n      \"nullPointMode\"\
    : \"null\",\n      \"options\": {\n        \"dataLinks\": []\n      },\n     \
    \ \"percentage\": false,\n      \"pointradius\": 2,\n      \"points\": false,\n\
    \      \"renderer\": \"flot\",\n      \"seriesOverrides\": [],\n      \"spaceLength\"\
    : 10,\n      \"stack\": false,\n      \"steppedLine\": false,\n      \"targets\"\
    : [\n        {\n          \"expr\": \"DCGM_FI_DEV_GPU_TEMP{instance=~\\\"$instance\\\
    \", gpu=~\\\"$gpu\\\"}\",\n          \"instant\": false,\n          \"interval\"\
    : \"\",\n          \"legendFormat\": \"GPU {{gpu}}\",\n          \"refId\": \"\
    A\"\n        }\n      ],\n      \"thresholds\": [],\n      \"timeFrom\": null,\n\
    \      \"timeRegions\": [],\n      \"timeShift\": null,\n      \"title\": \"GPU\
    \ Temperature\",\n      \"tooltip\": {\n        \"shared\": true,\n        \"\
    sort\": 0,\n        \"value_type\": \"individual\"\n      },\n      \"type\":\
    \ \"graph\",\n      \"xaxis\": {\n        \"buckets\": null,\n        \"mode\"\
    : \"time\",\n        \"name\": null,\n        \"show\": true,\n        \"values\"\
    : []\n      },\n      \"yaxes\": [\n        {\n          \"format\": \"celsius\"\
    ,\n          \"label\": null,\n          \"logBase\": 1,\n          \"max\": null,\n\
    \          \"min\": null,\n          \"show\": true\n        },\n        {\n \
    \         \"format\": \"short\",\n          \"label\": null,\n          \"logBase\"\
    : 1,\n          \"max\": null,\n          \"min\": null,\n          \"show\":\
    \ true\n        }\n      ],\n      \"yaxis\": {\n        \"align\": false,\n \
    \       \"alignLevel\": null\n      }\n    },\n    {\n      \"datasource\": \"\
    $datasource\",\n      \"gridPos\": {\n        \"h\": 8,\n        \"w\": 6,\n \
    \       \"x\": 18,\n        \"y\": 0\n      },\n      \"id\": 14,\n      \"options\"\
    : {\n        \"fieldOptions\": {\n          \"calcs\": [\n            \"mean\"\
    \n          ],\n          \"defaults\": {\n            \"color\": {\n        \
    \      \"mode\": \"thresholds\"\n            },\n            \"mappings\": [],\n\
    \            \"max\": 100,\n            \"min\": 0,\n            \"thresholds\"\
    : {\n              \"mode\": \"absolute\",\n              \"steps\": [\n     \
    \           {\n                  \"color\": \"green\",\n                  \"value\"\
    : null\n                },\n                {\n                  \"color\": \"\
    #EAB839\",\n                  \"value\": 83\n                },\n            \
    \    {\n                  \"color\": \"red\",\n                  \"value\": 87\n\
    \                }\n              ]\n            },\n            \"unit\": \"\
    celsius\"\n          },\n          \"overrides\": [],\n          \"values\": false\n\
    \        },\n        \"orientation\": \"auto\",\n        \"showThresholdLabels\"\
    : false,\n        \"showThresholdMarkers\": true\n      },\n      \"pluginVersion\"\
    : \"6.7.3\",\n      \"targets\": [\n        {\n          \"expr\": \"avg(DCGM_FI_DEV_GPU_TEMP{instance=~\\\
    \"$instance\\\", gpu=~\\\"$gpu\\\"})\",\n          \"interval\": \"\",\n     \
    \     \"legendFormat\": \"\",\n          \"refId\": \"A\"\n        }\n      ],\n\
    \      \"timeFrom\": null,\n      \"timeShift\": null,\n      \"title\": \"GPU\
    \ Avg. Temp\",\n      \"type\": \"gauge\"\n    },\n    {\n      \"aliasColors\"\
    : {},\n      \"bars\": false,\n      \"dashLength\": 10,\n      \"dashes\": false,\n\
    \      \"datasource\": \"$datasource\",\n      \"fill\": 1,\n      \"fillGradient\"\
    : 0,\n      \"gridPos\": {\n        \"h\": 8,\n        \"w\": 18,\n        \"\
    x\": 0,\n        \"y\": 8\n      },\n      \"hiddenSeries\": false,\n      \"\
    id\": 10,\n      \"legend\": {\n        \"alignAsTable\": true,\n        \"avg\"\
    : true,\n        \"current\": true,\n        \"max\": true,\n        \"min\":\
    \ false,\n        \"rightSide\": true,\n        \"show\": true,\n        \"total\"\
    : false,\n        \"values\": true\n      },\n      \"lines\": true,\n      \"\
    linewidth\": 2,\n      \"nullPointMode\": \"null\",\n      \"options\": {\n  \
    \      \"dataLinks\": []\n      },\n      \"percentage\": false,\n      \"pluginVersion\"\
    : \"6.5.2\",\n      \"pointradius\": 2,\n      \"points\": false,\n      \"renderer\"\
    : \"flot\",\n      \"seriesOverrides\": [],\n      \"spaceLength\": 10,\n    \
    \  \"stack\": false,\n      \"steppedLine\": false,\n      \"targets\": [\n  \
    \      {\n          \"expr\": \"DCGM_FI_DEV_POWER_USAGE{instance=~\\\"$instance\\\
    \", gpu=~\\\"$gpu\\\"}\",\n          \"interval\": \"\",\n          \"legendFormat\"\
    : \"GPU {{gpu}}\",\n          \"refId\": \"A\"\n        }\n      ],\n      \"\
    thresholds\": [],\n      \"timeFrom\": null,\n      \"timeRegions\": [],\n   \
    \   \"timeShift\": null,\n      \"title\": \"GPU Power Usage\",\n      \"tooltip\"\
    : {\n        \"shared\": true,\n        \"sort\": 0,\n        \"value_type\":\
    \ \"individual\"\n      },\n      \"type\": \"graph\",\n      \"xaxis\": {\n \
    \       \"buckets\": null,\n        \"mode\": \"time\",\n        \"name\": null,\n\
    \        \"show\": true,\n        \"values\": []\n      },\n      \"yaxes\": [\n\
    \        {\n          \"format\": \"watt\",\n          \"label\": null,\n    \
    \      \"logBase\": 1,\n          \"max\": null,\n          \"min\": null,\n \
    \         \"show\": true\n        },\n        {\n          \"format\": \"short\"\
    ,\n          \"label\": null,\n          \"logBase\": 1,\n          \"max\": null,\n\
    \          \"min\": null,\n          \"show\": true\n        }\n      ],\n   \
    \   \"yaxis\": {\n        \"align\": false,\n        \"alignLevel\": null\n  \
    \    }\n    },\n    {\n      \"cacheTimeout\": null,\n      \"datasource\": \"\
    $datasource\",\n      \"gridPos\": {\n        \"h\": 8,\n        \"w\": 6,\n \
    \       \"x\": 18,\n        \"y\": 8\n      },\n      \"id\": 16,\n      \"links\"\
    : [],\n      \"options\": {\n        \"fieldOptions\": {\n          \"calcs\"\
    : [\n            \"sum\"\n          ],\n          \"defaults\": {\n          \
    \  \"color\": {\n              \"mode\": \"thresholds\"\n            },\n    \
    \        \"mappings\": [],\n            \"max\": 2400,\n            \"min\": 0,\n\
    \            \"nullValueMode\": \"connected\",\n            \"thresholds\": {\n\
    \              \"mode\": \"absolute\",\n              \"steps\": [\n         \
    \       {\n                  \"color\": \"green\",\n                  \"value\"\
    : null\n                },\n                {\n                  \"color\": \"\
    #EAB839\",\n                  \"value\": 1800\n                },\n          \
    \      {\n                  \"color\": \"red\",\n                  \"value\":\
    \ 2200\n                }\n              ]\n            },\n            \"unit\"\
    : \"watt\"\n          },\n          \"overrides\": [],\n          \"values\":\
    \ false\n        },\n        \"orientation\": \"horizontal\",\n        \"showThresholdLabels\"\
    : false,\n        \"showThresholdMarkers\": true\n      },\n      \"pluginVersion\"\
    : \"6.7.3\",\n      \"targets\": [\n        {\n          \"expr\": \"sum(DCGM_FI_DEV_POWER_USAGE{instance=~\\\
    \"$instance\\\", gpu=~\\\"$gpu\\\"})\",\n          \"instant\": true,\n      \
    \    \"interval\": \"\",\n          \"legendFormat\": \"\",\n          \"range\"\
    : false,\n          \"refId\": \"A\"\n        }\n      ],\n      \"timeFrom\"\
    : null,\n      \"timeShift\": null,\n      \"title\": \"GPU Power Total\",\n \
    \     \"type\": \"gauge\"\n    },\n    {\n      \"aliasColors\": {},\n      \"\
    bars\": false,\n      \"dashLength\": 10,\n      \"dashes\": false,\n      \"\
    datasource\": \"$datasource\",\n      \"fill\": 1,\n      \"fillGradient\": 0,\n\
    \      \"gridPos\": {\n        \"h\": 8,\n        \"w\": 12,\n        \"x\": 0,\n\
    \        \"y\": 16\n      },\n      \"hiddenSeries\": false,\n      \"id\": 2,\n\
    \      \"interval\": \"\",\n      \"legend\": {\n        \"alignAsTable\": true,\n\
    \        \"avg\": true,\n        \"current\": true,\n        \"max\": true,\n\
    \        \"min\": false,\n        \"rightSide\": true,\n        \"show\": true,\n\
    \        \"sideWidth\": null,\n        \"total\": false,\n        \"values\":\
    \ true\n      },\n      \"lines\": true,\n      \"linewidth\": 2,\n      \"nullPointMode\"\
    : \"null\",\n      \"options\": {\n        \"dataLinks\": []\n      },\n     \
    \ \"percentage\": false,\n      \"pointradius\": 2,\n      \"points\": false,\n\
    \      \"renderer\": \"flot\",\n      \"seriesOverrides\": [],\n      \"spaceLength\"\
    : 10,\n      \"stack\": false,\n      \"steppedLine\": false,\n      \"targets\"\
    : [\n        {\n          \"expr\": \"DCGM_FI_DEV_SM_CLOCK{instance=~\\\"$instance\\\
    \", gpu=~\\\"$gpu\\\"} * 1000000\",\n          \"format\": \"time_series\",\n\
    \          \"interval\": \"\",\n          \"intervalFactor\": 1,\n          \"\
    legendFormat\": \"GPU {{gpu}}\",\n          \"refId\": \"A\"\n        }\n    \
    \  ],\n      \"thresholds\": [],\n      \"timeFrom\": null,\n      \"timeRegions\"\
    : [],\n      \"timeShift\": null,\n      \"title\": \"GPU SM Clocks\",\n     \
    \ \"tooltip\": {\n        \"shared\": true,\n        \"sort\": 0,\n        \"\
    value_type\": \"individual\"\n      },\n      \"type\": \"graph\",\n      \"xaxis\"\
    : {\n        \"buckets\": null,\n        \"mode\": \"time\",\n        \"name\"\
    : null,\n        \"show\": true,\n        \"values\": []\n      },\n      \"yaxes\"\
    : [\n        {\n          \"decimals\": null,\n          \"format\": \"hertz\"\
    ,\n          \"label\": \"\",\n          \"logBase\": 1,\n          \"max\": null,\n\
    \          \"min\": null,\n          \"show\": true\n        },\n        {\n \
    \         \"format\": \"short\",\n          \"label\": null,\n          \"logBase\"\
    : 1,\n          \"max\": null,\n          \"min\": null,\n          \"show\":\
    \ true\n        }\n      ],\n      \"yaxis\": {\n        \"align\": false,\n \
    \       \"alignLevel\": null\n      }\n    },\n    {\n      \"aliasColors\": {},\n\
    \      \"bars\": false,\n      \"dashLength\": 10,\n      \"dashes\": false,\n\
    \      \"datasource\": \"$datasource\",\n      \"fill\": 1,\n      \"fillGradient\"\
    : 0,\n      \"gridPos\": {\n        \"h\": 8,\n        \"w\": 12,\n        \"\
    x\": 0,\n        \"y\": 24\n      },\n      \"hiddenSeries\": false,\n      \"\
    id\": 6,\n      \"legend\": {\n        \"alignAsTable\": true,\n        \"avg\"\
    : true,\n        \"current\": true,\n        \"max\": true,\n        \"min\":\
    \ false,\n        \"rightSide\": true,\n        \"show\": true,\n        \"total\"\
    : false,\n        \"values\": true\n      },\n      \"lines\": true,\n      \"\
    linewidth\": 2,\n      \"nullPointMode\": \"null\",\n      \"options\": {\n  \
    \      \"dataLinks\": []\n      },\n      \"percentage\": false,\n      \"pointradius\"\
    : 2,\n      \"points\": false,\n      \"renderer\": \"flot\",\n      \"seriesOverrides\"\
    : [],\n      \"spaceLength\": 10,\n      \"stack\": false,\n      \"steppedLine\"\
    : false,\n      \"targets\": [\n        {\n          \"expr\": \"DCGM_FI_DEV_GPU_UTIL{instance=~\\\
    \"$instance\\\", gpu=~\\\"$gpu\\\"}\",\n          \"interval\": \"\",\n      \
    \    \"legendFormat\": \"GPU {{gpu}}\",\n          \"refId\": \"A\"\n        }\n\
    \      ],\n      \"thresholds\": [],\n      \"timeFrom\": null,\n      \"timeRegions\"\
    : [],\n      \"timeShift\": null,\n      \"title\": \"GPU Utilization\",\n   \
    \   \"tooltip\": {\n        \"shared\": true,\n        \"sort\": 0,\n        \"\
    value_type\": \"cumulative\"\n      },\n      \"type\": \"graph\",\n      \"xaxis\"\
    : {\n        \"buckets\": null,\n        \"mode\": \"time\",\n        \"name\"\
    : null,\n        \"show\": true,\n        \"values\": []\n      },\n      \"yaxes\"\
    : [\n        {\n          \"format\": \"percent\",\n          \"label\": null,\n\
    \          \"logBase\": 1,\n          \"max\": \"100\",\n          \"min\": \"\
    0\",\n          \"show\": true\n        },\n        {\n          \"format\": \"\
    short\",\n          \"label\": null,\n          \"logBase\": 1,\n          \"\
    max\": null,\n          \"min\": null,\n          \"show\": true\n        }\n\
    \      ],\n      \"yaxis\": {\n        \"align\": false,\n        \"alignLevel\"\
    : null\n      }\n    },\n    {\n      \"aliasColors\": {},\n      \"bars\": false,\n\
    \      \"dashLength\": 10,\n      \"dashes\": false,\n      \"datasource\": \"\
    $datasource\",\n      \"fill\": 1,\n      \"fillGradient\": 0,\n      \"gridPos\"\
    : {\n        \"h\": 8,\n        \"w\": 12,\n        \"x\": 0,\n        \"y\":\
    \ 32\n      },\n      \"hiddenSeries\": false,\n      \"id\": 18,\n      \"legend\"\
    : {\n        \"alignAsTable\": true,\n        \"avg\": true,\n        \"current\"\
    : true,\n        \"max\": true,\n        \"min\": false,\n        \"rightSide\"\
    : true,\n        \"show\": true,\n        \"total\": false,\n        \"values\"\
    : true\n      },\n      \"lines\": true,\n      \"linewidth\": 2,\n      \"nullPointMode\"\
    : \"null\",\n      \"options\": {\n        \"dataLinks\": []\n      },\n     \
    \ \"percentage\": false,\n      \"pointradius\": 2,\n      \"points\": false,\n\
    \      \"renderer\": \"flot\",\n      \"seriesOverrides\": [],\n      \"spaceLength\"\
    : 10,\n      \"stack\": false,\n      \"steppedLine\": false,\n      \"targets\"\
    : [\n        {\n          \"expr\": \"DCGM_FI_DEV_FB_USED{instance=~\\\"$instance\\\
    \", gpu=~\\\"$gpu\\\"}\",\n          \"interval\": \"\",\n          \"legendFormat\"\
    : \"GPU {{gpu}}\",\n          \"refId\": \"A\"\n        }\n      ],\n      \"\
    thresholds\": [],\n      \"timeFrom\": null,\n      \"timeRegions\": [],\n   \
    \   \"timeShift\": null,\n      \"title\": \"GPU Framebuffer Mem Used\",\n   \
    \   \"tooltip\": {\n        \"shared\": true,\n        \"sort\": 0,\n        \"\
    value_type\": \"individual\"\n      },\n      \"type\": \"graph\",\n      \"xaxis\"\
    : {\n        \"buckets\": null,\n        \"mode\": \"time\",\n        \"name\"\
    : null,\n        \"show\": true,\n        \"values\": []\n      },\n      \"yaxes\"\
    : [\n        {\n          \"format\": \"decmbytes\",\n          \"label\": null,\n\
    \          \"logBase\": 1,\n          \"max\": null,\n          \"min\": null,\n\
    \          \"show\": true\n        },\n        {\n          \"format\": \"short\"\
    ,\n          \"label\": null,\n          \"logBase\": 1,\n          \"max\": null,\n\
    \          \"min\": null,\n          \"show\": true\n        }\n      ],\n   \
    \   \"yaxis\": {\n        \"align\": false,\n        \"alignLevel\": null\n  \
    \    }\n    },\n    {\n      \"aliasColors\": {},\n      \"bars\": false,\n  \
    \    \"dashLength\": 10,\n      \"dashes\": false,\n      \"datasource\": \"$datasource\"\
    ,\n      \"fill\": 1,\n      \"fillGradient\": 0,\n      \"gridPos\": {\n    \
    \    \"h\": 8,\n        \"w\": 12,\n        \"x\": 0,\n        \"y\": 24\n   \
    \   },\n      \"hiddenSeries\": false,\n      \"id\": 4,\n      \"legend\": {\n\
    \        \"alignAsTable\": true,\n        \"avg\": true,\n        \"current\"\
    : true,\n        \"max\": true,\n        \"min\": false,\n        \"rightSide\"\
    : true,\n        \"show\": true,\n        \"total\": false,\n        \"values\"\
    : true\n      },\n      \"lines\": true,\n      \"linewidth\": 2,\n      \"nullPointMode\"\
    : \"null\",\n      \"options\": {\n        \"dataLinks\": []\n      },\n     \
    \ \"percentage\": false,\n      \"pointradius\": 2,\n      \"points\": false,\n\
    \      \"renderer\": \"flot\",\n      \"seriesOverrides\": [],\n      \"spaceLength\"\
    : 10,\n      \"stack\": false,\n      \"steppedLine\": false,\n      \"targets\"\
    : [\n        {\n          \"expr\": \"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\\\
    \"$instance\\\", gpu=~\\\"$gpu\\\"}\",\n          \"interval\": \"\",\n      \
    \    \"legendFormat\": \"GPU {{gpu}}\",\n          \"refId\": \"A\"\n        }\n\
    \      ],\n      \"thresholds\": [],\n      \"timeFrom\": null,\n      \"timeRegions\"\
    : [],\n      \"timeShift\": null,\n      \"title\": \"Tensor Core Utilization\"\
    ,\n      \"tooltip\": {\n        \"shared\": true,\n        \"sort\": 0,\n   \
    \     \"value_type\": \"cumulative\"\n      },\n      \"type\": \"graph\",\n \
    \     \"xaxis\": {\n        \"buckets\": null,\n        \"mode\": \"time\",\n\
    \        \"name\": null,\n        \"show\": true,\n        \"values\": []\n  \
    \    },\n      \"yaxes\": [\n        {\n          \"format\": \"percentunit\"\
    ,\n          \"label\": null,\n          \"logBase\": 1,\n          \"max\": \"\
    1\",\n          \"min\": \"0\",\n          \"show\": true\n        },\n      \
    \  {\n          \"format\": \"short\",\n          \"label\": null,\n         \
    \ \"logBase\": 1,\n          \"max\": null,\n          \"min\": null,\n      \
    \    \"show\": true\n        }\n      ],\n      \"yaxis\": {\n        \"align\"\
    : false,\n        \"alignLevel\": null\n      }\n    }\n  ],\n  \"refresh\": false,\n\
    \  \"schemaVersion\": 22,\n  \"style\": \"dark\",\n  \"tags\": [],\n  \"templating\"\
    : {\n    \"list\": [\n      {\n        \"current\": {\n          \"selected\"\
    : true,\n          \"text\": \"Prometheus\",\n          \"value\": \"Prometheus\"\
    \n        },\n        \"hide\": 0,\n        \"includeAll\": false,\n        \"\
    multi\": false,\n        \"name\": \"datasource\",\n        \"options\": [],\n\
    \        \"query\": \"prometheus\",\n        \"queryValue\": \"\",\n        \"\
    refresh\": 1,\n        \"regex\": \"\",\n        \"skipUrlSync\": false,\n   \
    \     \"type\": \"datasource\"\n      },\n      {\n        \"allValue\": null,\n\
    \        \"current\": {},\n        \"datasource\": \"$datasource\",\n        \"\
    definition\": \"label_values(DCGM_FI_DEV_GPU_TEMP, instance)\",\n        \"hide\"\
    : 0,\n        \"includeAll\": true,\n        \"index\": -1,\n        \"label\"\
    : null,\n        \"multi\": true,\n        \"name\": \"instance\",\n        \"\
    options\": [],\n        \"query\": \"label_values(DCGM_FI_DEV_GPU_TEMP, instance)\"\
    ,\n        \"refresh\": 1,\n        \"regex\": \"\",\n        \"skipUrlSync\"\
    : false,\n        \"sort\": 1,\n        \"tagValuesQuery\": \"\",\n        \"\
    tags\": [],\n        \"tagsQuery\": \"\",\n        \"type\": \"query\",\n    \
    \    \"useTags\": false\n      },\n      {\n        \"allValue\": null,\n    \
    \    \"current\": {},\n        \"datasource\": \"$datasource\",\n        \"definition\"\
    : \"label_values(DCGM_FI_DEV_GPU_TEMP, gpu)\",\n        \"hide\": 0,\n       \
    \ \"includeAll\": true,\n        \"index\": -1,\n        \"label\": null,\n  \
    \      \"multi\": true,\n        \"name\": \"gpu\",\n        \"options\": [],\n\
    \        \"query\": \"label_values(DCGM_FI_DEV_GPU_TEMP, gpu)\",\n        \"refresh\"\
    : 1,\n        \"regex\": \"\",\n        \"skipUrlSync\": false,\n        \"sort\"\
    : 1,\n        \"tagValuesQuery\": \"\",\n        \"tags\": [],\n        \"tagsQuery\"\
    : \"\",\n        \"type\": \"query\",\n        \"useTags\": false\n      }\n \
    \   ]\n  },\n  \"time\": {\n    \"from\": \"now-15m\",\n    \"to\": \"now\"\n\
    \  },\n  \"timepicker\": {\n    \"refresh_intervals\": [\n      \"5s\",\n    \
    \  \"10s\",\n      \"30s\",\n      \"1m\",\n      \"5m\",\n      \"15m\",\n  \
    \    \"30m\",\n      \"1h\",\n      \"2h\",\n      \"1d\"\n    ]\n  },\n  \"timezone\"\
    : \"\",\n  \"title\": \"NVIDIA DCGM Exporter Dashboard\",\n  \"uid\": \"Oxed_c6Wz\"\
    ,\n  \"variables\": {\n    \"list\": []\n  },\n  \"version\": 1\n}\n"
kind: ConfigMap
metadata:
  labels:
    console.openshift.io/dashboard: 'true'
    console.openshift.io/odc-dashboard: 'true'
  name: nvidia-dcgm-exporter-dashboard
  namespace: openshift-config-managed
ConfigMap v1 job-setup-autoscale openshift-machine-api
# ConfigMap carrying the scripts that the job-setup-autoscale Job mounts at
# /scripts and runs via /scripts/job.sh.
apiVersion: v1
kind: ConfigMap
metadata:
  name: job-setup-autoscale
  namespace: openshift-machine-api
  labels:
    autoscale: config
data:
  # Entry point: sources the helper library below, then creates one
  # MachineAutoscaler per MachineSet using the MACHINE_MIN / MACHINE_MAX
  # environment variables.
  job.sh: |
    #!/bin/bash
    # shellcheck disable=SC1091

    . /scripts/ocp.sh

    ocp_machineset_create_autoscale "${MACHINE_MIN}" "${MACHINE_MAX}"
  # Shell helper library sourced by job.sh. Only
  # ocp_machineset_create_autoscale is called from job.sh; the remaining
  # functions are general-purpose cluster helpers. Kept as a double-quoted
  # scalar because the script contains lines with trailing whitespace.
  ocp.sh: "#!/bin/bash\n\n# https://mirror.openshift.com/pub/openshift-v4\n\nocp_add_admin_user(){\n\
    \  HT_USERNAME=${1:-admin}\n  HT_PASSWORD=${2:-$(genpass)}\n\n  htpasswd_ocp_get_file\n\
    \  htpasswd_add_user \"${HT_USERNAME}\" \"${HT_PASSWORD}\"\n  htpasswd_ocp_set_file\n\
    \  htpasswd_validate_user \"${HT_USERNAME}\" \"${HT_PASSWORD}\"\n}\n\nocp_auth_add_to_group(){\n\
    \  USER=${1:-admin}\n  OCP_GROUP=${2:-${DEFAULT_OCP_GROUP}}\n\n  ocp_auth_create_group\
    \ \"${OCP_GROUP}\"\n\n  oc adm groups add-users \\\n  \"${OCP_GROUP}\" \"${USER}\"\
    \n}\n\nocp_auth_create_group(){\n  OCP_GROUP=${1:-${DEFAULT_OCP_GROUP}}\n\n  oc\
    \ get group \"${OCP_GROUP}\" > /dev/null 2>&1 && return\n\necho \"\napiVersion:\
    \ user.openshift.io/v1\nkind: Group\nmetadata:\n  name: ${OCP_GROUP}\n\" | oc\
    \ apply -f-\n\n}\n\nocp_auth_setup_user(){\n  USER=${1:-admin}\n  PASS=${2:-$(genpass)}\n\
    \  OCP_GROUP=${3:-${DEFAULT_OCP_GROUP}}\n\n  htpasswd_add_user \"${USER}\" \"\
    ${PASS}\"\n  ocp_auth_add_to_group \"${USER}\" \"${OCP_GROUP}\"\n\n  echo \"\n\
    \    run: htpasswd_ocp_set_file\n  \"\n}\n\nocp_check_info(){\n  echo \"== OCP\
    \ INFO ==\"\n  ocp_check_login || return 1\n\n  echo \"NAMESPACE: $(oc project\
    \ -q)\"\n  sleep \"${SLEEP_SECONDS:-8}\"\n}\n\nocp_check_login(){\n  oc whoami\
    \ || return 1\n  oc cluster-info | head -n1\n  echo\n}\n\nocp_clean_install_pods(){\n\
    \  oc delete pod \\\n    -A \\\n    -l app=installer\n}\n\nocp_control_nodes_not_schedulable(){\n\
    \  oc patch schedulers.config.openshift.io/cluster --type merge --patch '{\"spec\"\
    :{\"mastersSchedulable\": false}}'\n}\n\nocp_control_nodes_schedulable(){\n  oc\
    \ patch schedulers.config.openshift.io/cluster --type merge --patch '{\"spec\"\
    :{\"mastersSchedulable\": true}}'\n}\n\nocp_expose_image_registry(){\n  oc patch\
    \ configs.imageregistry.operator.openshift.io/cluster --type=merge --patch '{\"\
    spec\":{\"defaultRoute\":true}}'\n\n  # remove 'default-route-openshift-image-'\
    \ from route\n  HOST=$(oc get route default-route -n openshift-image-registry\
    \ --template='{{ .spec.host }}')\n  SHORTER_HOST=$(echo \"${HOST}\" | sed '/host/\
    \ s/default-route-openshift-image-//')\n  oc patch configs.imageregistry.operator.openshift.io/cluster\
    \ --type=merge --patch '{\"spec\":{\"host\": \"'\"${SHORTER_HOST}\"'\"}}'\n\n\
    \  echo \"OCP image registry is available at: ${SHORTER_HOST}\"\n}\n\nocp_fix_duplicate_operator_groups(){\n\
    \  for ns in $(oc get og -A | awk '{print $1}' | uniq -d)\n  do\n    oc -n \"\
    ${ns}\" \\\n      get og -o name | \\\n        tail -n+2 | \\\n        xargs oc\
    \ -n \"${ns}\" delete\n    \n    # oc -n \"${ns}\" \\\n    #   delete pod --all\n\
    \  done\n}\n\nocp_get_apps_domain(){\n  oc get ingresses.config.openshift.io cluster\
    \ -o jsonpath='{.spec.domain}'\n}\n\nocp_get_domain(){\n  OCP_APPS_DOMAIN=$(ocp_get_apps_domain)\n\
    \  echo \"${OCP_APPS_DOMAIN#apps.}\"\n}\n\nocp_get_kubeconfigs(){\n  # https://rcarrata.com/openshift/regenerate-kubeconfig/\n\
    \  # https://gist.githubusercontent.com/rcarrata/016da295c1421cccbfbd66ed9a7922bc/raw/855486c363734892988cdf1b5d0d26ece5e0960a/regenerate-kubeconfig.sh\n\
    \  # https://access.redhat.com/solutions/6054981\n  # https://access.redhat.com/solutions/5286371\n\
    \  # https://access.redhat.com/solutions/6112601\n\n  oc -n openshift-kube-apiserver\
    \ extract secret/node-kubeconfigs\n}\n\nocp_get_pull_secret(){\n  oc -n openshift-config\
    \ \\\n    get secret/pull-secret \\\n    --template='{{index .data \".dockerconfigjson\"\
    \ | base64decode}}'\n}\n\nocp_gpu_pretty_label(){\n  oc label node -l nvidia.com/gpu.machine\
    \ node-role.kubernetes.io/gpu=''\n}\n\nocp_gpu_taint_nodes(){\n  oc adm taint\
    \ node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule --overwrite\n\
    \  oc adm drain -l node-role.kubernetes.io/gpu --ignore-daemonsets --delete-emptydir-data\n\
    \  oc adm uncordon -l node-role.kubernetes.io/gpu\n}\n\nocp_gpu_untaint_nodes(){\n\
    \  oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule-\n\
    }\n\nocp_infra_label_control(){\n  echo \"see https://docs.redhat.com/en/documentation/openshift_container_platform/4.8/html/machine_management/creating-infrastructure-machinesets#moving-resources-to-infrastructure-machinesets\"\
    \n\n  oc label node -l node-role.kubernetes.io/control-plane node-role.kubernetes.io/infra=\"\
    \"\n\n  # oc patch \\\n  #   scheduler cluster \\\n  #   --type=merge --patch\
    \ '{\"spec\":{\"defaultNodeSelector\":\"node-role.kubernetes.io/infra=\\\"\\\"\
    \"}}'\n\n}\n\nocp_infra_move_registry_to_control(){\n\ncat <<YAML > /tmp/patch.yaml\n\
    spec:\n  nodeSelector:\n    node-role.kubernetes.io/infra: \"\"\n  tolerations:\n\
    \  - effect: NoSchedule\n    key: node-role.kubernetes.io/master\n    operator:\
    \ Exists\n  - effect: NoExecute\n    key: node-role.kubernetes.io/master\n   \
    \ operator: Exists\nYAML\n\n oc patch \\\n    configs.imageregistry.operator.openshift.io/cluster\
    \ \\\n    --type=merge --patch-file /tmp/patch.yaml\n\n}\n\nocp_infra_move_router_to_control(){\n\
    \ncat <<YAML > /tmp/patch.yaml\nspec:\n  nodePlacement:\n    nodeSelector:\n \
    \     matchLabels:\n        node-role.kubernetes.io/infra: \"\"\n    tolerations:\n\
    \    - effect: NoSchedule\n      key: node-role.kubernetes.io/master\n      operator:\
    \ Exists\n    - effect: NoExecute\n      key: node-role.kubernetes.io/master\n\
    \      operator: Exists\nYAML\n\n  oc -n openshift-ingress-operator \\\n    patch\
    \ \\\n    ingresscontroller default \\\n    --type=merge --patch-file /tmp/patch.yaml\n\
    \n}\n\nocp_infra_move_monitoring_to_control(){\n\ncat <<YAML > /tmp/patch.yaml\n\
    spec:\n  logStore:\n    elasticsearch:\n      nodeCount: 3\n      nodeSelector:\n\
    \        node-role.kubernetes.io/infra: \"\"\n      tolerations:\n      - effect:\
    \ NoSchedule\n        key: node-role.kubernetes.io/master\n        operator: Exists\n\
    \      - effect: NoExecute\n        key: node-role.kubernetes.io/master\n    \
    \    operator: Exists\n  visualization:\n    kibana:\n      nodeSelector:\n  \
    \      node-role.kubernetes.io/infra: \"\"\n      tolerations:\n      - effect:\
    \ NoSchedule\n        key: node-role.kubernetes.io/master\n        operator: Exists\n\
    \      - effect: NoExecute\n        key: node-role.kubernetes.io/master\n    \
    \    operator: Exists\nYAML\n\n  oc -n openshift-logging \\\n    patch \\\n  \
    \  clusterlogging instance \\\n    --type=merge --patch-file /tmp/patch.yaml\n\
    \n}\n\nocp_kubeadmin_create(){\n  PASS=${1:-$(genpass 5 )-$(genpass 5 )-$(genpass\
    \ 5 )-$(genpass 5 )}\n\n  which htpasswd >/dev/null || return 1\n\n  HTPASSWD=$(htpasswd\
    \ -nbB -C10 null \"${PASS}\")\n  HASH=${HTPASSWD##*:}\n\n  echo \"\n  PASSWORD:\
    \ ${PASS}\n  HASH:     ${HASH}\n\n  oc apply -f scratch/kubeadmin.yaml\n  \"\n\
    \ncat << YAML > scratch/kubeadmin.yaml\nkind: Secret\napiVersion: v1\nmetadata:\n\
    \  name: kubeadmin\n  namespace: kube-system\nstringData:\n  kubeadmin: ${HASH}\n\
    \  password: ${PASS}\ntype: Opaque\nYAML\n}\n\nocp_kubeadmin_remove(){\n  FORCE=${1:-No}\n\
    \n  if [ \"${FORCE}\" = \"YES\" ]; then\n    [ ! -e scratch/kubeadmin.yaml ] &&\
    \ \\\n      oc get secret kubeadmin -n kube-system -o yaml > scratch/kubeadmin.yaml\
    \ || return 1\n    oc delete secret kubeadmin -n kube-system\n  else\n    echo\
    \ -e \"${RED}\n    WARNING: you must run - ocp_kubeadmin_remove YES\n\n    WARNING:\
    \ you will lose access to your cluster if you do not\n      have a way to login\
    \ to your cluster without kubeadmin. \n      \n      Examples:\n        - An identity\
    \ provider with a cluster-admin user setup\n        - A kubeconfig file\n    ${NC}\"\
    \n    return\n  fi\n}\n\nocp_machineset_create_autoscale(){\n  MACHINE_MIN=${1:-0}\n\
    \  MACHINE_MAX=${2:-4}\n  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get\
    \ machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}\n\n  for machine_set\
    \ in ${MACHINE_SETS}\n  do\ncat << YAML | oc apply -f -\napiVersion: \"autoscaling.openshift.io/v1beta1\"\
    \nkind: \"MachineAutoscaler\"\nmetadata:\n  name: \"${machine_set}\"\n  namespace:\
    \ \"openshift-machine-api\"\nspec:\n  minReplicas: ${MACHINE_MIN}\n  maxReplicas:\
    \ ${MACHINE_MAX}\n  scaleTargetRef:\n    apiVersion: machine.openshift.io/v1beta1\n\
    \    kind: MachineSet\n    name: \"${machine_set}\"\nYAML\n  done\n}\n\nocp_machineset_patch_accelerator(){\n\
    \  MACHINE_SET_NAME=${1:-gpu}\n  LABEL=${2:-nvidia-gpu}\n\n  oc -n openshift-machine-api\
    \ \\\n    patch machineset \"${MACHINE_SET_NAME}\" \\\n    --type=merge --patch\
    \ '{\"spec\":{\"template\":{\"spec\":{\"metadata\":{\"labels\":{\"cluster-api/accelerator\"\
    :\"'\"${LABEL}\"'\"}}}}}}'\n  \n  oc -n openshift-machine-api \\\n    patch machineset\
    \ \"${MACHINE_SET_NAME}\" \\\n    --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"metadata\":{\"labels\":{\"node-role.kubernetes.io/gpu\":\"\"}}}}}}'\n\
    }\n\nocp_machineset_scale(){\n  REPLICAS=${1:-1}\n  MACHINE_SETS=${2:-$(oc -n\
    \ openshift-machine-api get machineset -o name)}\n\n  # scale workers\n  echo\
    \ \"${MACHINE_SETS}\" | \\\n    xargs \\\n      oc -n openshift-machine-api \\\
    \n      scale --replicas=\"${REPLICAS}\"\n}\n\nocp_machineset_taint_gpu(){\n \
    \ SHORT_NAME=${1:-g4dn}\n  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io\
    \ -o name | grep \"${SHORT_NAME}\" | head -n1)\n\n  echo \"Patching: ${MACHINE_SET}\"\
    \n\n  # taint nodes for gpu-only workloads\n  oc -n openshift-machine-api \\\n\
    \    patch \"${MACHINE_SET}\" \\\n    --type=merge --patch '{\"spec\":{\"template\"\
    :{\"spec\":{\"taints\":[{\"key\":\"nvidia.com/gpu\",\"value\":\"\",\"effect\"\
    :\"NoSchedule\"}]}}}}'\n}\n\nocp_release_info(){\n  VERSION=${1:-stable-4.12}\n\
    \  echo \"VERSION: ${VERSION}\"\n  curl -sL \"https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/${VERSION}/release.txt\"\
    \n}\n\nocp_run_on_all_nodes(){\n  case $1 in\n    --confirm)\n      shift\n\n\
    \      COMMAND=${*:-uptime}\n      ALL_NODES=$(oc get nodes --show-kind --no-headers|awk\
    \ '/node/{print $1}')\n\n      for node in ${ALL_NODES}\n        do\n        \
    \  # wipefs -af /dev/nvme0n1\n          # oc debug $node -- chroot /host  bash\
    \ -c \"$(cat -)\"\n          # shellcheck disable=SC2086\n          oc debug \"\
    $node\" -- chroot /host ${COMMAND}\n      done\n      ;;\n   *)\n      echo \"\
    -------------------------------------------------------------------\"\n      echo\
    \ \"WARNING. This runs as root on all nodes!\"\n      echo \"You can DESTROY ALL\
    \ DATA, without recovery, if used incorrectly!\"\n      echo \"-------------------------------------------------------------------\"\
    \n      echo \"Usage:\"\n      echo \"  ocp_run_on_all_nodes --confirm < command\
    \ >\"\n  esac\n\n}\n\nocp_save_money(){\n\n  # run work on masters\n  ocp_control_nodes_schedulable\n\
    \n  # scale to zero\n  ocp_machineset_scale 0\n\n  # place as many pods on as\
    \ few nodes as possible\n  ocp_scheduler_set_profile HighNodeUtilization\n}\n\n\
    ocp_scheduler_set_profile(){\n  SCHED_PROFILE=${1:-LowNodeUtilization}\n\n  #\
    \ LowNodeUtilization, HighNodeUtilization, NoScoring\n  echo \"see https://docs.openshift.com/container-platform/4.16/nodes/scheduling/nodes-scheduler-profiles.html\"\
    \n  echo \"OPTIONS: LowNodeUtilization (default), HighNodeUtilization, NoScoring\"\
    \n  echo \"SCHED_PROFILE: ${SCHED_PROFILE}\"\n\n  oc patch schedulers.config.openshift.io/cluster\
    \ --type merge --patch '{\"spec\":{\"profile\": \"'\"${SCHED_PROFILE}\"'\"}}'\n\
    }\n\nocp_setup_namespace(){\n  NAMESPACE=${1}\n\n  oc new-project \"${NAMESPACE}\"\
    \ 2>/dev/null || \\\n    oc project \"${NAMESPACE}\"\n}\n\nocp_update_pull_secret(){\n\
    \  echo \"see https://access.redhat.com/solutions/4902871\"\n\n  PULL_SECRET_FILE=${1:-${GIT_ROOT}/scratch/pull-secret}\n\
    \n  oc extract secret/pull-secret \\\n    -n openshift-config \\\n    --keys .dockerconfigjson\
    \ \\\n    --to=- > \"${PULL_SECRET_FILE}\"\n  \n  oc get secret/pull-secret \\\
    \n    -n openshift-config \\\n    -o yaml > \"${PULL_SECRET_FILE}.yaml\"\n\n \
    \ [ -e \"${PULL_SECRET_FILE}\" ] || return 0\n\n  if oc get secret/pull-secret\
    \ -n openshift-config -o name; then\n    oc set data secret/pull-secret \\\n \
    \     -n openshift-config \\\n      --from-file=.dockerconfigjson=\"${PULL_SECRET_FILE}\"\
    \n  else\n    oc create secret generic pull-secret \\\n      -n openshift-config\
    \ \\\n      --type=kubernetes.io/dockerconfigjson \\\n      --from-file=.dockerconfigjson=\"\
    ${PULL_SECRET_FILE}\"\n  fi  \n}\n\nocp_upgrade_ack_4.13(){\n  oc -n openshift-config\
    \ patch cm admin-acks --patch '{\"data\":{\"ack-4.12-kube-1.26-api-removals-in-4.13\"\
    :\"true\"}}' --type=merge\n}\n\nocp_upgrade_ack_4.19(){\n  oc -n openshift-config\
    \ patch cm admin-acks --patch '{\"data\":{\"ack-4.18-kube-1.32-api-removals-in-4.19\"\
    :\"true\"}}' --type=merge\n}\n\nocp_upgrade_cluster(){\n  OCP_VERSION=\"${1:-latest}\"\
    \n\n  if [ \"${OCP_VERSION}\" = \"latest\" ]; then\n    oc adm upgrade --to-latest=true\n\
    \  else\n    oc adm upgrade --to=\"${OCP_VERSION}\"\n  fi\n}\n"
Service v1 console-plugin-nvidia-gpu nvidia-gpu-operator
# ClusterIP Service exposing the console-plugin-nvidia-gpu pods on TCP 9443.
apiVersion: v1
kind: Service
metadata:
  name: console-plugin-nvidia-gpu
  namespace: nvidia-gpu-operator
  labels:
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.kubernetes.io/version: latest
    app.kubernetes.io/managed-by: Helm
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.4
  annotations:
    # Secret name matches the secret volume the console-plugin-nvidia-gpu
    # Deployment mounts at /var/serving-cert.
    service.alpha.openshift.io/serving-cert-secret-name: plugin-serving-cert
spec:
  type: ClusterIP
  sessionAffinity: None
  # Selects pods by the same label the Deployment's pod template carries.
  selector:
    app.kubernetes.io/name: console-plugin-nvidia-gpu
  ports:
  - name: 9443-tcp
    protocol: TCP
    port: 9443
    targetPort: 9443
Deployment apps/v1 console-plugin-nvidia-gpu nvidia-gpu-operator
# Deployment running one replica of the NVIDIA GPU console plugin container,
# listening on 9443 with its serving certificate mounted read-only.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: console-plugin-nvidia-gpu
  namespace: nvidia-gpu-operator
  labels:
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.kubernetes.io/version: latest
    app.kubernetes.io/managed-by: Helm
    app.openshift.io/runtime-namespace: console-plugin-nvidia-gpu
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.4
spec:
  replicas: 1
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
  selector:
    matchLabels:
      app.kubernetes.io/name: console-plugin-nvidia-gpu
  template:
    metadata:
      labels:
        app.kubernetes.io/name: console-plugin-nvidia-gpu
    spec:
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      securityContext:
        runAsNonRoot: true
      containers:
      - name: console-plugin-nvidia-gpu
        # Floating "latest" tag, re-pulled on every pod start (Always).
        image: quay.io/edge-infrastructure/console-plugin-nvidia-gpu:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 9443
          protocol: TCP
        # No resource requests or limits are set.
        resources: {}
        securityContext:
          allowPrivilegeEscalation: false
        volumeMounts:
        - name: plugin-serving-cert
          mountPath: /var/serving-cert
          readOnly: true
      volumes:
      # defaultMode 420 is decimal for octal 0644.
      - name: plugin-serving-cert
        secret:
          secretName: plugin-serving-cert
          defaultMode: 420
      # NOTE(review): this volume is declared but no volumeMount in this
      # manifest references it; confirm whether a mount is missing.
      - name: nginx-conf
        configMap:
          name: nginx-conf
          defaultMode: 420
ClusterAutoscaler autoscaling.openshift.io/v1 default openshift-machine-api
# ClusterAutoscaler "default": cluster-wide scaling limits and scale-down
# timing used together with the per-MachineSet MachineAutoscalers.
apiVersion: autoscaling.openshift.io/v1
kind: ClusterAutoscaler
metadata:
  name: default
  namespace: openshift-machine-api
  labels:
    autoscale: config
spec:
  podPriorityThreshold: -10
  # Upper and lower bounds the autoscaler must stay within.
  # NOTE(review): memory is presumably expressed in GiB per the OpenShift
  # ClusterAutoscaler documentation; confirm.
  resourceLimits:
    maxNodesTotal: 16
    cores:
      min: 0
      max: 176
    memory:
      min: 0
      max: 512
    gpus:
    - type: nvidia.com/gpu
      min: 0
      max: 8
    - type: amd.com/gpu
      min: 0
      max: 1
  scaleDown:
    enabled: true
    delayAfterAdd: 5m
    delayAfterDelete: 1m
    delayAfterFailure: 30s
    unneededTime: 5m
    # Quoted so the value stays a string rather than a float.
    utilizationThreshold: "0.7"
Job batch/v1 job-aro-gpu-machineset nvidia-gpu-operator
# Job that runs /scripts/job.sh from the job-aro-gpu-machineset ConfigMap
# with INSTANCE_TYPE set to the Azure size Standard_NC4as_T4_v3.
apiVersion: batch/v1
kind: Job
metadata:
  # NOTE(review): both name and generateName are set; Kubernetes only uses
  # generateName when name is absent.
  name: job-aro-gpu-machineset
  generateName: job-aro-gpu-machineset-
  namespace: nvidia-gpu-operator
spec:
  template:
    spec:
      restartPolicy: Never
      terminationGracePeriodSeconds: 30
      serviceAccount: job-aro-gpu-machineset
      serviceAccountName: job-aro-gpu-machineset
      containers:
      - name: job-aro-gpu-machineset
        image: registry.redhat.io/openshift4/ose-cli
        command:
        - /bin/bash
        - -c
        - /scripts/job.sh
        env:
        - name: INSTANCE_TYPE
          value: Standard_NC4as_T4_v3
        # Namespace the pod runs in, injected via the downward API.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        volumeMounts:
        - name: scripts
          mountPath: /scripts
      volumes:
      # defaultMode 493 is decimal for octal 0755, so the scripts are executable.
      - name: scripts
        configMap:
          name: job-aro-gpu-machineset
          defaultMode: 493
Job batch/v1 job-aws-gpu-machineset nvidia-gpu-operator
# Job that runs /scripts/job.sh from the job-aws-gpu-machineset ConfigMap
# with INSTANCE_TYPE set to the AWS instance type g4dn.4xlarge.
apiVersion: batch/v1
kind: Job
metadata:
  # NOTE(review): both name and generateName are set; Kubernetes only uses
  # generateName when name is absent.
  name: job-aws-gpu-machineset
  generateName: job-aws-gpu-machineset-
  namespace: nvidia-gpu-operator
spec:
  template:
    spec:
      restartPolicy: Never
      terminationGracePeriodSeconds: 30
      serviceAccount: job-aws-gpu-machineset
      serviceAccountName: job-aws-gpu-machineset
      containers:
      - name: job-aws-gpu-machineset
        image: registry.redhat.io/openshift4/ose-cli
        command:
        - /bin/bash
        - -c
        - /scripts/job.sh
        env:
        - name: INSTANCE_TYPE
          value: g4dn.4xlarge
        # Namespace the pod runs in, injected via the downward API.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        volumeMounts:
        - name: scripts
          mountPath: /scripts
      volumes:
      # defaultMode 493 is decimal for octal 0755, so the scripts are executable.
      - name: scripts
        configMap:
          name: job-aws-gpu-machineset
          defaultMode: 493
Job batch/v1 job-gpu-console-plugin nvidia-gpu-operator
# Job that runs /scripts/console-plugin-job.sh from the
# job-gpu-console-plugin ConfigMap with PLUGIN_NAME=console-plugin-nvidia-gpu.
apiVersion: batch/v1
kind: Job
metadata:
  # NOTE(review): both name and generateName are set; Kubernetes only uses
  # generateName when name is absent.
  name: job-gpu-console-plugin
  generateName: job-gpu-console-plugin-
  namespace: nvidia-gpu-operator
  annotations:
    # Argo CD sync wave 10; quoted because annotation values must be strings.
    argocd.argoproj.io/sync-wave: "10"
spec:
  backoffLimit: 4
  template:
    spec:
      restartPolicy: Never
      serviceAccount: job-gpu-console-plugin
      serviceAccountName: job-gpu-console-plugin
      containers:
      - name: minion
        image: registry.redhat.io/openshift4/ose-cli
        command:
        - /bin/bash
        - -c
        - /scripts/console-plugin-job.sh
        env:
        - name: PLUGIN_NAME
          value: console-plugin-nvidia-gpu
        volumeMounts:
        - name: scripts
          mountPath: /scripts
      volumes:
      # defaultMode 493 is decimal for octal 0755, so the scripts are executable.
      - name: scripts
        configMap:
          name: job-gpu-console-plugin
          defaultMode: 493
Job batch/v1 job-setup-autoscale openshift-machine-api
# Job that runs /scripts/job.sh from the job-setup-autoscale ConfigMap,
# passing MACHINE_MIN / MACHINE_MAX as the autoscaler replica bounds.
apiVersion: batch/v1
kind: Job
metadata:
  name: job-setup-autoscale
  namespace: openshift-machine-api
  labels:
    autoscale: config
  annotations:
    # Marks this Job as an Argo CD Sync hook.
    argocd.argoproj.io/hook: Sync
spec:
  template:
    spec:
      restartPolicy: Never
      terminationGracePeriodSeconds: 30
      serviceAccount: job-setup-autoscale
      serviceAccountName: job-setup-autoscale
      containers:
      - name: minion
        image: registry.redhat.io/openshift4/ose-cli
        command:
        - /bin/bash
        - -c
        - /scripts/job.sh
        env:
        # Namespace the pod runs in, injected via the downward API.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        # Env values must be strings, hence the quoting of the numbers.
        - name: MACHINE_MIN
          value: "0"
        - name: MACHINE_MAX
          value: "4"
        volumeMounts:
        - name: scripts
          mountPath: /scripts
      volumes:
      # defaultMode 493 is decimal for octal 0755, so the scripts are executable.
      - name: scripts
        configMap:
          name: job-setup-autoscale
          defaultMode: 493
ConsoleLink console.openshift.io/v1 github-demo-gitops
# Console link to the demo GitOps catalog repository, placed in the
# application menu under the "Git Repos" section.
apiVersion: console.openshift.io/v1
kind: ConsoleLink
metadata:
  name: github-demo-gitops
  labels:
    demo: ai-gitops-catalog
  annotations:
    argocd.argoproj.io/sync-options: Prune=true
    source: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
spec:
  text: GitHub - Demo GitOps Catalog
  href: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog
  location: ApplicationMenu
  applicationMenu:
    section: Git Repos
    imageURL: /static/assets/public/imgs/logos/github.svg
ConsoleLink console.openshift.io/v1 github-ssa
# Console link to the redhat-na-ssa GitHub organization, placed in the
# application menu under the "Git Repos" section.
apiVersion: console.openshift.io/v1
kind: ConsoleLink
metadata:
  name: github-ssa
  labels:
    demo: ai-gitops-catalog
  annotations:
    argocd.argoproj.io/sync-options: Prune=true
    source: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
spec:
  text: GitHub - NA SSA
  href: https://github.com/redhat-na-ssa
  location: ApplicationMenu
  applicationMenu:
    section: Git Repos
    imageURL: /static/assets/public/imgs/logos/github.svg
ConsoleLink console.openshift.io/v1 help-link
# Console help-menu link pointing at the demo catalog's GitHub issues page.
apiVersion: console.openshift.io/v1
kind: ConsoleLink
metadata:
  name: help-link
  labels:
    demo: ai-gitops-catalog
  annotations:
    argocd.argoproj.io/sync-options: Prune=true
    source: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
spec:
  text: Demo Catalog - Open Issue
  href: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/issues
  location: HelpMenu
ConsoleNotification console.openshift.io/v1 banner-cluster
# Bottom console banner stating that the cluster was configured via the
# AI GitOps catalog.
apiVersion: console.openshift.io/v1
kind: ConsoleNotification
metadata:
  name: banner-cluster
  labels:
    demo: ai-gitops-catalog
  annotations:
    argocd.argoproj.io/sync-options: Prune=true
    source: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
spec:
  text: This cluster was configured via the AI GitOps catalog
  location: BannerBottom
  # Colour values are quoted: an unquoted leading '#' would start a YAML comment.
  color: '#FFF'
  backgroundColor: '#0066FF'
ConsoleNotification console.openshift.io/v1 banner-demo
# Top console banner announcing the GPU autoscaling demo.
apiVersion: console.openshift.io/v1
kind: ConsoleNotification
metadata:
  name: banner-demo
  labels:
    demo: ai-gitops-catalog
  annotations:
    source: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog.git
spec:
  # Quoted because the text contains ': ', which a plain scalar cannot hold.
  text: 'DEMO: Efficiently leveraging GPUs via autoscaling'
  location: BannerTop
  # Colour values are quoted: an unquoted leading '#' would start a YAML comment.
  color: '#FFF'
  backgroundColor: '#9F0000'
ConsolePlugin console.openshift.io/v1 console-plugin-nvidia-gpu nvidia-gpu-operator
# ConsolePlugin whose backend is the console-plugin-nvidia-gpu Service in
# nvidia-gpu-operator, on port 9443 at base path "/".
apiVersion: console.openshift.io/v1
kind: ConsolePlugin
metadata:
  name: console-plugin-nvidia-gpu
  namespace: nvidia-gpu-operator
  # Labels as stamped by the Helm chart named in helm.sh/chart.
  labels:
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.4
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.kubernetes.io/version: latest
    app.kubernetes.io/managed-by: Helm
spec:
  displayName: Console Plugin NVIDIA GPU Template
  backend:
    type: Service
    service:
      name: console-plugin-nvidia-gpu
      namespace: nvidia-gpu-operator
      port: 9443
      basePath: /
AlertingRule monitoring.openshift.io/v1 gpu-pods openshift-monitoring
# Info-level alert "GpuPods": fires when the summed 'nvidia.com/gpu' pod
# resource requests (counting only series with a value >= 1) exceed 0.
apiVersion: monitoring.openshift.io/v1
kind: AlertingRule
metadata:
  name: gpu-pods
  namespace: openshift-monitoring
spec:
  groups:
    - name: gpu-pods
      rules:
        - alert: GpuPods
          # Written as a literal block scalar so the two lines of the value are
          # visible as such. Inside the block, the line starting with '#' is
          # part of the string (a commented-out per-pod variant of the query),
          # not a YAML comment. The string value is unchanged, including its
          # single trailing newline.
          expr: |
            sum (kube_pod_resource_request{resource="nvidia.com/gpu"} >= 1 ) > 0
            # sum by (namespace, pod,resource) (kube_pod_resource_request{resource="nvidia.com/gpu"} >= 1) > 0
          labels:
            severity: info
          annotations:
            summary: Cloud costs may increase by requesting specialized resources.
            # {{ $value }} is left for the alert templating to expand.
            description: A total of {{ $value }} 'nvidia.com/gpu' requested on the cluster.
            runbook_url: https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/tree/main/components/operators/gpu-operator-certified/instance/components/gpu-monitoring/gpu-pods.md
NodeFeatureDiscovery nfd.openshift.io/v1 nfd-instance openshift-nfd
# NodeFeatureDiscovery instance for the openshift-nfd namespace; the worker
# configuration is passed as an embedded YAML document in configData.
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
  name: nfd-instance
  namespace: openshift-nfd
spec:
  instance: ''
  topologyUpdater: false
  operand:
    # NOTE(review): operand image is pinned to v4.16 - presumably it should
    # track the cluster's OpenShift version; confirm on upgrade.
    image: registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.16
    servicePort: 12000
  workerConfig:
    # Literal block scalar instead of a double-quoted string with \n and \"
    # escapes; the string value, including its one trailing newline, is
    # unchanged.
    configData: |
      core:
        sleepInterval: 60s
      sources:
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
            - "vendor"
ClusterPolicy nvidia.com/v1 gpu-cluster-policy nvidia-gpu-operator
# ClusterPolicy for the NVIDIA GPU operator. Keys under spec are grouped by
# concern and each component's `enabled` flag is listed first; the data is
# otherwise unchanged.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
  namespace: nvidia-gpu-operator
spec:
  # --- Operator-wide settings ---
  operator:
    defaultRuntime: crio
    use_ocp_driver_toolkit: true
    initContainer: {}
  daemonsets:
    updateStrategy: RollingUpdate
    rollingUpdate:
      maxUnavailable: '1'
    # Tolerate the nvidia.com/gpu NoSchedule taint so the operator daemonsets
    # can be scheduled on nodes carrying it.
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

  # --- Driver and container toolkit ---
  driver:
    enabled: true
    # Empty names mean no ConfigMap/Secret is referenced for these options.
    certConfig:
      name: ''
    kernelModuleConfig:
      name: ''
    licensingConfig:
      configMapName: ''
      nlsEnabled: false
    repoConfig:
      configMapName: ''
    virtualTopology:
      config: ''
    upgradePolicy:
      autoUpgrade: true
      maxParallelUpgrades: 1
      maxUnavailable: 25%
      drain:
        enable: false
        force: false
        deleteEmptyDir: false
        timeoutSeconds: 300
      podDeletion:
        force: false
        deleteEmptyDir: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
  toolkit:
    enabled: true

  # --- Device plugin, feature discovery and MIG ---
  devicePlugin:
    enabled: true
    # Uses the "time-sliced-4" entry of the device-plugin-config ConfigMap,
    # which is defined outside this manifest.
    # NOTE(review): presumably 4-way GPU time-slicing - confirm in that ConfigMap.
    config:
      name: device-plugin-config
      default: time-sliced-4
  gfd:
    enabled: true
  mig:
    strategy: single
  migManager:
    enabled: true

  # --- Monitoring ---
  dcgm:
    enabled: true
  dcgmExporter:
    enabled: true
    # References a config object named after the console plugin.
    # NOTE(review): presumably the metrics ConfigMap for that plugin - confirm.
    config:
      name: console-plugin-nvidia-gpu
    serviceMonitor:
      enabled: true
  nodeStatusExporter:
    enabled: true

  # --- Optional features and validation ---
  gds:
    enabled: false
  validator:
    plugin:
      env:
        # Env values are strings, hence the quoted 'true'.
        - name: WITH_WORKLOAD
          value: 'true'

  # --- Sandbox / virtualization components ---
  # NOTE(review): sandboxWorkloads is disabled while several related
  # components are enabled; presumably those only take effect once
  # sandboxWorkloads.enabled is true - confirm against the operator docs.
  sandboxWorkloads:
    enabled: false
    defaultWorkload: container
  sandboxDevicePlugin:
    enabled: true
  vfioManager:
    enabled: true
  vgpuManager:
    enabled: false
  vgpuDeviceManager:
    enabled: true
OperatorGroup operators.coreos.com/v1 gpu-operator-certified nvidia-gpu-operator
# OperatorGroup in nvidia-gpu-operator whose only target namespace is
# nvidia-gpu-operator itself.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: gpu-operator-certified
  namespace: nvidia-gpu-operator
spec:
  targetNamespaces:
    - nvidia-gpu-operator
OperatorGroup operators.coreos.com/v1 nfd openshift-nfd
# OperatorGroup in openshift-nfd whose only target namespace is
# openshift-nfd itself.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: nfd
  namespace: openshift-nfd
spec:
  targetNamespaces:
    - openshift-nfd
Subscription operators.coreos.com/v1alpha1 gpu-operator-certified nvidia-gpu-operator
# Subscription to the gpu-operator-certified package on the "stable" channel
# of the certified-operators catalog, with automatic InstallPlan approval.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: gpu-operator-certified
  namespace: nvidia-gpu-operator
spec:
  name: gpu-operator-certified
  channel: stable
  source: certified-operators
  sourceNamespace: openshift-marketplace
  installPlanApproval: Automatic
Subscription operators.coreos.com/v1alpha1 nfd openshift-nfd
# Subscription to the nfd package on the "stable" channel of the
# redhat-operators catalog, with automatic InstallPlan approval.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: nfd
  namespace: openshift-nfd
spec:
  name: nfd
  channel: stable
  source: redhat-operators
  sourceNamespace: openshift-marketplace
  installPlanApproval: Automatic