Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion .github/workflows/flyte-binary-v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ jobs:
docker pull --platform linux/arm64 ghcr.io/flyteorg/flyte-client-v2:latest
docker save ghcr.io/flyteorg/flyte-client-v2:latest -o docker/demo-bundled/images/tar/arm64/flyte-client-v2.tar
- name: Build and push multi-arch image
if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
# if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
uses: docker/build-push-action@v6
with:
context: docker/demo-bundled
Expand All @@ -174,3 +174,45 @@ jobs:
tags: ${{ steps.image-names.outputs.tags }}
build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
push: true

build-and-push-demo-bundled-gpu-image:
  runs-on: ubuntu-latest
  # The GPU image layers on top of the base sandbox image, so the base
  # build must have completed (and pushed its tags) first.
  needs: [build-and-push-demo-bundled-image]
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Set up Docker Buildx
      # v3 matches the action generation used elsewhere in this workflow
      # (checkout@v4, build-push-action@v6); v2 runs on a deprecated
      # Node 16 runtime.
      uses: docker/setup-buildx-action@v3
    - name: Prepare Image Names
      id: image-names
      uses: docker/metadata-action@v5
      with:
        images: |
          ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2-gpu
        tags: |
          type=raw,value=nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
          type=sha,format=long
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: "${{ secrets.FLYTE_BOT_USERNAME }}"
        password: "${{ secrets.FLYTE_BOT_PAT }}"
    - name: Set base image
      id: base-image
      # On push / manual dispatch the base job has just pushed a
      # sha-tagged image, so pin the GPU build to that exact sha;
      # otherwise fall back to the latest nightly base.
      run: |
        if [ "${{ github.event_name }}" = "push" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
          echo "tag=sha-${{ github.sha }}" >> "$GITHUB_OUTPUT"
        else
          echo "tag=nightly" >> "$GITHUB_OUTPUT"
        fi
    - name: Build and push GPU sandbox image
      uses: docker/build-push-action@v6
      with:
        context: docker/demo-bundled
        file: docker/demo-bundled/Dockerfile.gpu
        platforms: linux/amd64,linux/arm64
        tags: ${{ steps.image-names.outputs.tags }}
        build-args: |
          BASE_IMAGE=ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2:${{ steps.base-image.outputs.tag }}
        push: true
29 changes: 29 additions & 0 deletions docker/demo-bundled/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# syntax=docker/dockerfile:1.4-labs
#
# GPU-enabled sandbox image.
# Extends the base sandbox with NVIDIA Container Toolkit so that K3s
# containerd can schedule GPU workloads via nvidia-container-runtime.

# ARGs declared before the first FROM are visible in FROM lines below.
# BASE_IMAGE is overridden by CI (build-args) to point at the freshly
# built sandbox image; the default targets a local dev build.
ARG BASE_IMAGE=flyte-sandbox-v2:latest
ARG NVIDIA_CTK_VERSION=1.17.5

# ---------- Stage: grab NVIDIA Container Toolkit binaries ----------
FROM nvcr.io/nvidia/k8s/container-toolkit:v${NVIDIA_CTK_VERSION}-ubuntu20.04 AS nvidia-toolkit

# ---------- Final image ----------
FROM ${BASE_IMAGE}

# Copy NVIDIA Container Toolkit binaries
COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime /usr/bin/
COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime-hook /usr/bin/
COPY --from=nvidia-toolkit /usr/bin/nvidia-container-cli /usr/bin/
COPY --from=nvidia-toolkit /usr/bin/nvidia-ctk /usr/bin/

# Copy NVIDIA container libraries (path varies by arch)
# Try the x86_64 lib dir first, then aarch64; whichever exists wins.
# NOTE(review): the final `|| true` means the build still succeeds even if
# NEITHER arch directory matches — the image would then silently lack the
# libnvidia-container libraries. Confirm this best-effort behavior is
# intended rather than a hard failure.
RUN --mount=from=nvidia-toolkit,src=/usr/lib,dst=/nvidia-lib \
cp /nvidia-lib/x86_64-linux-gnu/libnvidia-container*.so* /usr/lib/ 2>/dev/null || \
cp /nvidia-lib/aarch64-linux-gnu/libnvidia-container*.so* /usr/lib/ 2>/dev/null || true; \
ldconfig 2>/dev/null || true

# Pre-set GPU env so the entrypoint auto-configures K3s for NVIDIA
ENV FLYTE_SANDBOX_GPU=true
3 changes: 3 additions & 0 deletions docker/demo-bundled/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ build: sync-crds flyte console dep_update manifests
.PHONY: start
start: FLYTE_DEMO_IMAGE := flyte-demo:latest
start: FLYTE_DEV := False
start: FLYTE_SANDBOX_GPU ?= false
start:
[ -n "$(shell docker volume ls --filter name=^flyte-demo$$ --format {{.Name}})" ] || \
docker volume create flyte-demo
Expand All @@ -88,7 +89,9 @@ start:
docker run --detach --rm --privileged --name flyte-demo \
--add-host "host.docker.internal:host-gateway" \
--env FLYTE_DEV=$(FLYTE_DEV) \
--env FLYTE_SANDBOX_GPU=$(FLYTE_SANDBOX_GPU) \
--env K3S_KUBECONFIG_OUTPUT=/.kube/kubeconfig \
$(if $(filter true,$(FLYTE_SANDBOX_GPU)),--gpus all,) \
--volume $(PWD)/.kube:/.kube \
--volume flyte-demo:/var/lib/flyte/storage \
--publish "6443":"6443" \
Expand Down
73 changes: 73 additions & 0 deletions docker/demo-bundled/bin/k3d-entrypoint-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/sh
#
# k3d entrypoint hook: enable NVIDIA GPU support inside the sandbox.
#
# Exits 0 immediately unless FLYTE_SANDBOX_GPU=true, so it is safe to run
# unconditionally from the container entrypoint. When enabled it:
#   1. installs a containerd config template registering the NVIDIA
#      container runtime with K3s, and
#   2. drops the NVIDIA device plugin into K3s's auto-deploy manifests dir.

set -o errexit
set -o nounset

FLYTE_SANDBOX_GPU="${FLYTE_SANDBOX_GPU:-false}"

if [ "$FLYTE_SANDBOX_GPU" != "true" ]; then
    echo "[$(date -Iseconds)] [GPU] GPU support not enabled (FLYTE_SANDBOX_GPU=$FLYTE_SANDBOX_GPU)"
    exit 0
fi

echo "[$(date -Iseconds)] [GPU] Configuring NVIDIA GPU support..."

# Configure K3s containerd to use the NVIDIA container runtime.
# K3s picks up containerd config templates from this path.
#
# The leading `{{ template "base" . }}` pulls in the config K3s would have
# generated itself; without it this file REPLACES the entire containerd
# config (CRI defaults, registry settings, snapshotter, ...), breaking the
# node. NOTE(review): the base template requires a reasonably recent K3s —
# verify against the K3s version bundled in the sandbox image.
mkdir -p /var/lib/rancher/k3s/agent/etc/containerd
cat > /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl <<'EOF'
{{ template "base" . }}

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."nvidia"]
  privileged_without_host_devices = false
  runtime_engine = ""
  runtime_root = ""
  runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."nvidia".options]
  BinaryName = "/usr/bin/nvidia-container-runtime"

[plugins."io.containerd.grpc.v1.cri".containerd]
  default_runtime_name = "nvidia"
EOF

# Deploy the NVIDIA device plugin as a K3s auto-deploy manifest.
# The manifests directory may not exist before the K3s server has started,
# and with `errexit` a redirect into a missing directory would abort this
# script — create it explicitly first.
mkdir -p /var/lib/rancher/k3s/server/manifests
cat > /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin.yaml <<'EOF'
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: nvidia-device-plugin
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: nvidia-device-plugin
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      priorityClassName: system-node-critical
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
EOF

echo "[$(date -Iseconds)] [GPU] NVIDIA GPU support configured"
Loading