diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index bb10f4f48d..6c0f049f9a 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -174,3 +174,45 @@ jobs:
           tags: ${{ steps.image-names.outputs.tags }}
           build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
           push: true
+
+  build-and-push-demo-bundled-gpu-image:
+    runs-on: ubuntu-latest
+    needs: [build-and-push-demo-bundled-image]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Prepare Image Names
+        id: image-names
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2-gpu
+          tags: |
+            type=raw,value=nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
+            type=sha,format=long
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: "${{ secrets.FLYTE_BOT_USERNAME }}"
+          password: "${{ secrets.FLYTE_BOT_PAT }}"
+      - name: Set base image
+        id: base-image
+        run: |
+          if [ "${{ github.event_name }}" = "push" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "tag=sha-${{ github.sha }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "tag=nightly" >> "$GITHUB_OUTPUT"
+          fi
+      - name: Build and push GPU sandbox image
+        uses: docker/build-push-action@v6
+        with:
+          context: docker/demo-bundled
+          file: docker/demo-bundled/Dockerfile.gpu
+          platforms: linux/amd64, linux/arm64
+          tags: ${{ steps.image-names.outputs.tags }}
+          build-args: |
+            BASE_IMAGE=ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2:${{ steps.base-image.outputs.tag }}
+          push: true
diff --git a/docker/demo-bundled/Dockerfile.gpu b/docker/demo-bundled/Dockerfile.gpu
new file mode 100644
index 0000000000..4bff86380d
--- /dev/null
+++ b/docker/demo-bundled/Dockerfile.gpu
@@ -0,0 +1,29 @@
+# syntax=docker/dockerfile:1.4-labs
+#
+# GPU-enabled sandbox image.
+# Extends the base sandbox with NVIDIA Container Toolkit so that K3s
+# containerd can schedule GPU workloads via nvidia-container-runtime.
+
+ARG BASE_IMAGE=flyte-sandbox-v2:latest
+ARG NVIDIA_CTK_VERSION=1.17.5
+
+# ---------- Stage: grab NVIDIA Container Toolkit binaries ----------
+FROM nvcr.io/nvidia/k8s/container-toolkit:v${NVIDIA_CTK_VERSION}-ubuntu20.04 AS nvidia-toolkit
+
+# ---------- Final image ----------
+FROM ${BASE_IMAGE}
+
+# Copy NVIDIA Container Toolkit binaries
+COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime /usr/bin/
+COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime-hook /usr/bin/
+COPY --from=nvidia-toolkit /usr/bin/nvidia-container-cli /usr/bin/
+COPY --from=nvidia-toolkit /usr/bin/nvidia-ctk /usr/bin/
+
+# Copy NVIDIA container libraries (path varies by arch)
+RUN --mount=from=nvidia-toolkit,src=/usr/lib,dst=/nvidia-lib \
+    cp /nvidia-lib/x86_64-linux-gnu/libnvidia-container*.so* /usr/lib/ 2>/dev/null || \
+    cp /nvidia-lib/aarch64-linux-gnu/libnvidia-container*.so* /usr/lib/ 2>/dev/null || true; \
+    ldconfig 2>/dev/null || true
+
+# Pre-set GPU env so the entrypoint auto-configures K3s for NVIDIA
+ENV FLYTE_SANDBOX_GPU=true
diff --git a/docker/demo-bundled/Makefile b/docker/demo-bundled/Makefile
index 56eac5e051..cff0938970 100644
--- a/docker/demo-bundled/Makefile
+++ b/docker/demo-bundled/Makefile
@@ -80,6 +80,7 @@ build: sync-crds flyte console dep_update manifests
 .PHONY: start
 start: FLYTE_DEMO_IMAGE := flyte-demo:latest
 start: FLYTE_DEV := False
+start: FLYTE_SANDBOX_GPU ?= false
 start:
 	[ -n "$(shell docker volume ls --filter name=^flyte-demo$$ --format {{.Name}})" ] || \
 		docker volume create flyte-demo
@@ -88,7 +89,9 @@ start:
 	docker run --detach --rm --privileged --name flyte-demo \
 		--add-host "host.docker.internal:host-gateway" \
 		--env FLYTE_DEV=$(FLYTE_DEV) \
+		--env FLYTE_SANDBOX_GPU=$(FLYTE_SANDBOX_GPU) \
 		--env K3S_KUBECONFIG_OUTPUT=/.kube/kubeconfig \
+		$(if $(filter true,$(FLYTE_SANDBOX_GPU)),--gpus all,) \
 		--volume $(PWD)/.kube:/.kube \
 		--volume flyte-demo:/var/lib/flyte/storage \
 		--publish "6443":"6443" \
diff --git a/docker/demo-bundled/bin/k3d-entrypoint-gpu.sh b/docker/demo-bundled/bin/k3d-entrypoint-gpu.sh
new file mode 100755
index 0000000000..717f4ed71b
--- /dev/null
+++ b/docker/demo-bundled/bin/k3d-entrypoint-gpu.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+
+set -o errexit
+set -o nounset
+
+FLYTE_SANDBOX_GPU="${FLYTE_SANDBOX_GPU:-false}"
+
+if [ "$FLYTE_SANDBOX_GPU" != "true" ]; then
+    echo "[$(date -Iseconds)] [GPU] GPU support not enabled (FLYTE_SANDBOX_GPU=$FLYTE_SANDBOX_GPU)"
+    exit 0
+fi
+
+echo "[$(date -Iseconds)] [GPU] Configuring NVIDIA GPU support..."
+
+# Configure K3s containerd to use the NVIDIA container runtime.
+# K3s picks up containerd config templates from this path.
+mkdir -p /var/lib/rancher/k3s/agent/etc/containerd
+cat > /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl <<'EOF'
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."nvidia"]
+  privileged_without_host_devices = false
+  runtime_engine = ""
+  runtime_root = ""
+  runtime_type = "io.containerd.runc.v2"
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."nvidia".options]
+  BinaryName = "/usr/bin/nvidia-container-runtime"
+
+[plugins."io.containerd.grpc.v1.cri".containerd]
+  default_runtime_name = "nvidia"
+EOF
+
+# Deploy the NVIDIA device plugin as a K3s auto-deploy manifest.
+mkdir -p /var/lib/rancher/k3s/server/manifests
+cat > /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin.yaml <<'EOF'
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: nvidia-device-plugin
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: nvidia-device-plugin
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: nvidia-device-plugin
+    spec:
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      priorityClassName: system-node-critical
+      containers:
+        - name: nvidia-device-plugin
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+EOF
+
+echo "[$(date -Iseconds)] [GPU] NVIDIA GPU support configured"