# NOTE(review): the three lines below were GitHub web-UI page chrome captured in a
# copy/paste (not workflow content); commented out so the file parses as YAML.
# Skip to content
# Deploy & Self-Heal
# Deploy & Self-Heal #10
name: Deploy & Self-Heal
# Autonomous deployment pipeline with self-healing loop.
# Deploys as a canary revision → smoke tests → promote or diagnose → auto-fix → retry.
# Constitution XV — Autonomous Development Pipeline (self-healing extension).
#
# Flow:
#   1. Build & push container image
#   2. Deploy new revision (canary)
#   3. Run smoke tests via composite action (standardised across workflows)
#   4. If pass → deployment successful
#   5. If fail → collect diagnostics, classify error, create fix issue, wait for agent PR
#   6. Repeat from step 1 with exponential backoff (max 3 iterations)
#   7. After max retries → rollback to last known-good revision → escalate to human review

on:
  workflow_dispatch:
    inputs:
      environment:
        description: "Target environment"
        required: true
        type: choice
        options:
          - staging
          - nightly
          - canary
      deploy-infrastructure:
        description: "Deploy infrastructure via Bicep before app update (required for new environments)"
        required: false
        type: boolean
        default: false
      max-heal-iterations:
        description: "Maximum self-heal iterations (1-3)"
        required: false
        type: number
        default: 3
      image-tag:
        description: "Image tag to deploy (optional — builds from HEAD if omitted)"
        required: false
        type: string
      source-issue-number:
        description: "Issue number to post deploy/self-heal diagnostics to"
        required: false
        type: string
  # Trigger self-healing when nightly deploy job fails (item 5)
  workflow_run:
    workflows: ["Nightly Build & Deploy"]
    types: [completed]

# Prevent overlapping self-heal runs.
# inputs.environment is empty for workflow_run triggers → falls back to 'nightly'.
concurrency:
  group: deploy-heal-${{ inputs.environment || 'nightly' }}
  cancel-in-progress: true

permissions:
  contents: read
  id-token: write
  issues: write
  pull-requests: read
jobs:
  # ── Job 1: Build container image (skip if image-tag provided or triggered by nightly failure) ──
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    # Skip build when: image-tag is provided, or triggered by nightly (which already built).
    # Only trigger self-heal on nightly failure, not success.
    if: ${{ github.event_name == 'workflow_dispatch' && !inputs.image-tag }}
    outputs:
      image-tag: sha-${{ github.sha }}
      environment: ${{ inputs.environment || 'nightly' }}
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v5
      # Use the nightly identity for nightly/canary environments; staging identity for staging
      - name: Azure login (OIDC)
        uses: azure/login@v3
        with:
          client-id: ${{ inputs.environment == 'staging' && secrets.AZURE_CLIENT_ID || secrets.AZURE_CLIENT_ID_NIGHTLY }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
      - name: ACR login (with retry)
        run: |
          REGISTRY="${{ secrets.AZURE_CONTAINER_REGISTRY }}"
          for attempt in 1 2 3; do
            echo "ACR login attempt ${attempt}/3..."
            if az acr login --name "${REGISTRY}"; then
              echo "::notice title=ACR Login::Success (attempt ${attempt})"
              exit 0
            fi
            [ "$attempt" -lt 3 ] && sleep 10
          done
          echo "::error title=ACR Login::Failed after 3 attempts"
          exit 1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: |
            ${{ secrets.AZURE_CONTAINER_REGISTRY }}/acroyoga-web:sha-${{ github.sha }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
# ── Job 2: Deploy canary, test, promote or heal ──
deploy-and-heal:
runs-on: ubuntu-latest
timeout-minutes: 120
needs: [build]
# Run when: build succeeded/skipped (dispatch), or nightly failed (workflow_run)
if: >
always() && (
(github.event_name == 'workflow_dispatch' && (needs.build.result == 'success' || needs.build.result == 'skipped')) ||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.head_sha == github.sha)
)
environment: ${{ inputs.environment || 'nightly' }}
permissions:
id-token: write
contents: read
issues: write
pull-requests: read
env:
# For workflow_run triggers, head_sha may be empty if nightly failed before checkout;
# fall back to github.sha in that case
IMAGE_TAG: ${{ needs.build.outputs.image-tag || inputs.image-tag || format('nightly-sha-{0}', github.event.workflow_run.head_sha || github.sha) }}
RESOURCE_GROUP: rg-acroyoga-${{ inputs.environment || 'nightly' }}
APP_NAME: ca-acroyoga-web-${{ inputs.environment || 'nightly' }}
MAX_ITERATIONS: ${{ inputs.max-heal-iterations || 3 }}
REGISTRY: ${{ secrets.AZURE_CONTAINER_REGISTRY }}
SOURCE_ISSUE_NUMBER: ${{ inputs.source-issue-number || '' }}
steps:
- uses: actions/checkout@v5
# Use the nightly identity for nightly/canary environments; staging identity for staging
- name: Azure login (OIDC)
uses: azure/login@v3
with:
client-id: ${{ (inputs.environment || 'nightly') == 'staging' && secrets.AZURE_CLIENT_ID || secrets.AZURE_CLIENT_ID_NIGHTLY }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# Fix IMAGE_TAG for workflow_run trigger: nightly tags images with 7-char SHA
# (nightly-sha-abc1234) but the env default uses the full SHA
- name: Resolve image tag for nightly trigger
if: github.event_name == 'workflow_run'
run: |
SHA="${{ github.event.workflow_run.head_sha || github.sha }}"
if [ -z "$SHA" ]; then
echo "::error title=Image Tag::Could not resolve commit SHA for nightly trigger"
exit 1
fi
echo "IMAGE_TAG=nightly-sha-${SHA:0:7}" >> "$GITHUB_ENV"
echo "::notice title=Image Tag::Resolved nightly image tag: nightly-sha-${SHA:0:7}"
# Ensure required labels exist before the self-healing loop tries to use them.
# GitHub returns 422 when creating an issue with non-existent labels.
- name: Ensure deploy-fix labels exist
uses: actions/github-script@v7
with:
script: |
const labels = [
{ name: 'deploy-fix-auto', color: 'd93f0b', description: 'Auto-created by self-healing deploy pipeline' },
{ name: 'copilot', color: '1d76db', description: 'Assigned to Copilot agent for autonomous fix' },
{ name: 'needs-human-review', color: 'e4e669', description: 'Requires human review — agent cannot fix autonomously' },
];
for (const label of labels) {
try {
await github.rest.issues.getLabel({
owner: context.repo.owner,
repo: context.repo.repo,
name: label.name,
});
} catch (e) {
if (e.status === 404) {
console.log(`Creating missing label: ${label.name}`);
await github.rest.issues.createLabel({
owner: context.repo.owner,
repo: context.repo.repo,
...label,
});
}
}
}
- name: Start PostgreSQL server if stopped
uses: ./.github/actions/pg-wake
with:
resource-group: ${{ env.RESOURCE_GROUP }}
- name: Validate infrastructure (Bicep)
if: inputs.deploy-infrastructure == true
run: |
ENV="${{ inputs.environment || 'nightly' }}"
PARAMS_FILE="infra/main.parameters.${ENV}.json"
echo "Running Bicep template validation for ${ENV}..."
if ! az deployment group validate \
--resource-group ${{ env.RESOURCE_GROUP }} \
--template-file infra/main.bicep \
--parameters "$PARAMS_FILE" \
--parameters \
imageTag="${{ env.IMAGE_TAG }}" \
dbAdminPassword="${{ secrets.DB_ADMIN_PASSWORD }}" \
nextAuthSecret="$(openssl rand -base64 32)" \
sharedContainerRegistryLoginServer="${{ secrets.AZURE_CONTAINER_REGISTRY }}" \
githubOwnerId="${{ github.repository_owner_id }}" \
githubRepoId="${{ github.repository_id }}" 2>&1; then
echo "::error title=DEPLOYMENT: Bicep validate::Template validation failed — fix Bicep errors before deployment"
exit 1
fi
echo "::notice title=Bicep Validate::Template validation passed"
- name: Deploy infrastructure (Bicep)
if: inputs.deploy-infrastructure == true
run: |
ENV="${{ inputs.environment || 'nightly' }}"
PARAMS_FILE="infra/main.parameters.${ENV}.json"
if [ ! -f "$PARAMS_FILE" ]; then
echo "::error title=DEPLOY::Parameter file ${PARAMS_FILE} not found"
exit 1
fi
echo "Deploying infrastructure for ${ENV} environment..."
MAX_ATTEMPTS=2
for attempt in $(seq 1 $MAX_ATTEMPTS); do
echo "Bicep deployment attempt ${attempt}/${MAX_ATTEMPTS}..."
if az deployment group create \
--resource-group ${{ env.RESOURCE_GROUP }} \
--template-file infra/main.bicep \
--parameters "$PARAMS_FILE" \
--parameters \
imageTag="${{ env.IMAGE_TAG }}" \
dbAdminPassword="${{ secrets.DB_ADMIN_PASSWORD }}" \
nextAuthSecret="$(openssl rand -base64 32)" \
sharedContainerRegistryLoginServer="${{ secrets.AZURE_CONTAINER_REGISTRY }}" \
githubOwnerId="${{ github.repository_owner_id }}" \
githubRepoId="${{ github.repository_id }}" 2>&1; then
echo "::notice title=Bicep Deploy::Infrastructure deployed successfully (attempt ${attempt})"
break
fi
if [ "$attempt" -lt "$MAX_ATTEMPTS" ]; then
echo "::warning title=DEPLOYMENT: Bicep deploy::Attempt ${attempt} failed, retrying in 60s..."
sleep 60
else
echo "::error title=DEPLOYMENT: Bicep deploy::Deployment failed after ${MAX_ATTEMPTS} attempts"
exit 1
fi
done
- name: Record known-good revision before deploy
id: baseline
run: |
GOOD_REV=$(az containerapp revision list \
--name ${{ env.APP_NAME }} --resource-group ${{ env.RESOURCE_GROUP }} \
--query "sort_by([?properties.runningState=='Running'], &properties.createdTime)[-1].name" \
-o tsv 2>/dev/null || echo "")
echo "revision=${GOOD_REV}" >> "$GITHUB_OUTPUT"
echo "::notice title=Baseline::Known-good revision: ${GOOD_REV:-none}"
- name: Self-healing deploy loop
id: heal-loop
uses: actions/github-script@v7
env:
IMAGE_TAG: ${{ env.IMAGE_TAG }}
RESOURCE_GROUP: ${{ env.RESOURCE_GROUP }}
APP_NAME: ${{ env.APP_NAME }}
MAX_ITERATIONS: ${{ env.MAX_ITERATIONS }}
REGISTRY: ${{ env.REGISTRY }}
BASELINE_REVISION: ${{ steps.baseline.outputs.revision }}
SOURCE_ISSUE_NUMBER: ${{ env.SOURCE_ISSUE_NUMBER }}
with:
script: |
const { execSync } = require('child_process');
const maxIter = parseInt(process.env.MAX_ITERATIONS, 10) || 3;
const appName = process.env.APP_NAME;
const rg = process.env.RESOURCE_GROUP;
const registry = process.env.REGISTRY;
const baselineRevision = process.env.BASELINE_REVISION;
const sourceIssueNumber = parseInt(process.env.SOURCE_ISSUE_NUMBER || '', 10) || 0;
const runIdSuffix = (process.env.GITHUB_RUN_ID || String(Date.now())).slice(-6);
let imageTag = process.env.IMAGE_TAG;
// Base cooldown: 60s, doubles each iteration (item 8: rate limiting)
const BASE_COOLDOWN_MS = 60000;
function run(cmd) {
console.log(`$ ${cmd}`);
return execSync(cmd, { encoding: 'utf8', timeout: 300000 }).trim();
}
function runSafe(cmd) {
try { return run(cmd); }
catch (e) { return e.stderr || e.message; }
}
// Item 7: Improved PR merge detection — search by issue reference
async function waitForPR(issueNumber, timeoutMs = 900000) {
const deadline = Date.now() + timeoutMs;
const pollInterval = 60000;
const createdAfter = new Date(Date.now() - timeoutMs).toISOString().split('T')[0];
while (Date.now() < deadline) {
try {
// Use search API to find merged PRs referencing this issue
const { data: searchResult } = await github.rest.search.issuesAndPullRequests({
q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged Fixes #${issueNumber} created:>=${createdAfter}`,
sort: 'updated',
order: 'desc',
per_page: 5
});
for (const item of searchResult.items) {
// Verify it's actually merged by fetching the PR details
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: item.number
});
if (pr.merged_at) {
console.log(`Found merged fix PR #${pr.number} for issue #${issueNumber}`);
return pr;
}
}
} catch (e) {
console.log(`Search API error (will retry): ${e.message}`);
}
console.log(`Waiting for fix PR for issue #${issueNumber}... (${Math.round((deadline - Date.now()) / 60000)} min remaining)`);
await new Promise(r => setTimeout(r, pollInterval));
}
return null;
}
async function commentSourceIssue(title, lines) {
if (!sourceIssueNumber) return;
try {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: sourceIssueNumber,
body: [
`## ${title}`,
'',
...lines,
].join('\n')
});
} catch (e) {
console.log(`Could not comment on source issue #${sourceIssueNumber}: ${e.message}`);
}
}
// Item 4: Automatic rollback helper
function rollbackToBaseline() {
if (!baselineRevision) {
console.log('No baseline revision recorded — cannot rollback');
return false;
}
try {
console.log(`Rolling back to baseline revision: ${baselineRevision}`);
run(`az containerapp revision activate \
--name ${appName} --resource-group ${rg} \
--revision ${baselineRevision} --output none`);
run(`az containerapp ingress traffic set \
--name ${appName} --resource-group ${rg} \
--revision-weight ${baselineRevision}=100 --output none`);
console.log(`::notice title=Rollback::Traffic shifted to baseline revision ${baselineRevision}`);
return true;
} catch (e) {
console.log(`::warning title=Rollback::Failed to rollback: ${e.message}`);
return false;
}
}
for (let iter = 1; iter <= maxIter; iter++) {
console.log(`\n${'='.repeat(60)}`);
console.log(`SELF-HEAL ITERATION ${iter}/${maxIter}`);
console.log(`${'='.repeat(60)}\n`);
// Item 8: Exponential backoff between iterations (skip cooldown on first)
if (iter > 1) {
const cooldownMs = BASE_COOLDOWN_MS * Math.pow(2, iter - 2);
console.log(`Cooldown: waiting ${cooldownMs / 1000}s before next attempt...`);
await new Promise(r => setTimeout(r, cooldownMs));
}
// ── Step 1: Deploy new revision as canary ──
console.log('Deploying canary revision...');
const tagSuffix = imageTag.replace(/[^a-z0-9]/gi, '').slice(-6);
const revisionSuffix = `h${iter}-${tagSuffix}-${runIdSuffix}`;
try {
run(`az containerapp update \
--name ${appName} \
--resource-group ${rg} \
--image ${registry}/acroyoga-web:${imageTag} \
--revision-suffix ${revisionSuffix} \
--output none`);
} catch (e) {
console.log(`::error title=DEPLOY::Canary deploy failed: ${e.message}`);
const infraError = runSafe(`az deployment group list \
--resource-group ${rg} --query "[?properties.provisioningState!='Succeeded'] | [0:3]" -o json`);
await commentSourceIssue('❌ Self-heal canary deploy failed', [
`- Environment: \`${rg}\``,
`- App: \`${appName}\``,
`- Iteration: ${iter}/${maxIter}`,
`- Image tag: \`${imageTag}\``,
`- Revision suffix: \`${revisionSuffix}\``,
'',
'### Error',
'```',
String(e.message || e).slice(-3000),
'```'
]);
core.setOutput('result', 'fail');
core.setOutput('error', `Infrastructure deployment failed at iteration ${iter}: ${infraError}`);
return;
}
// Get the new revision name
const latestRevision = run(`az containerapp revision list \
--name ${appName} --resource-group ${rg} \
--query "sort_by(@, &properties.createdTime)[-1].name" -o tsv`);
console.log(`New revision: ${latestRevision}`);
// ── Step 2: Get app FQDN for smoke tests ──
const appFqdn = run(`az containerapp show \
--name ${appName} --resource-group ${rg} \
--query properties.configuration.ingress.fqdn -o tsv`);
// Item 3: Run smoke tests via composite action output file
// Write FQDN to file for the composite action to use
// Since we're inside github-script, we invoke the smoke-test
// action's logic using the same curl patterns but in a
// standardised way. The composite action is used in the
// separate workflow steps (nightly/deploy); here we replicate
// the same parameters for consistency.
console.log(`Running smoke tests against https://${appFqdn} ...`);
let readinessOk = false;
for (let r = 0; r < 50; r++) {
try {
const result = run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/api/ready" -o /dev/null -w "%{http_code}"`);
if (result === '200') {
readinessOk = true;
console.log(`Readiness check passed after ${r + 1} attempts`);
break;
}
} catch { /* retry */ }
if (r < 49) await new Promise(resolve => setTimeout(resolve, 15000));
}
let healthOk = false;
let homepageOk = false;
if (readinessOk) {
try {
const healthResp = run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/api/health"`);
healthOk = healthResp.includes('"status":"healthy"');
if (!healthOk) console.log(`Health response: ${healthResp}`);
} catch (e) {
console.log(`Health check failed: ${e.message}`);
}
try {
run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/" -o /dev/null`);
homepageOk = true;
} catch (e) {
console.log(`Home page check failed: ${e.message}`);
}
}
const allPassed = readinessOk && healthOk && homepageOk;
// ── Step 3: If all pass → success ──
if (allPassed) {
console.log('\n✅ All smoke tests passed — deployment successful!');
await commentSourceIssue('✅ Self-heal deployment succeeded', [
`- Environment: \`${rg}\``,
`- App: \`${appName}\``,
`- Iteration: ${iter}/${maxIter}`,
`- Image tag: \`${imageTag}\``,
'',
'### Smoke Test Results',
'- Readiness (/api/ready): ✅ pass',
'- Health (/api/health): ✅ pass',
'- Home page (/): ✅ pass'
]);
core.setOutput('result', 'pass');
core.setOutput('iteration', String(iter));
return;
}
// ── Step 4: Tests failed — collect diagnostics ──
console.log('\n❌ Smoke tests failed — collecting diagnostics...');
const containerLogs = runSafe(`az containerapp logs show \
--name ${appName} --resource-group ${rg} --tail 100 2>&1 | tail -50`);
const systemLogs = runSafe(`az containerapp logs show \
--name ${appName} --resource-group ${rg} --type system --tail 50 2>&1 | tail -30`);
// Build a minimal diagnostics JSON for the classify-error action
const diagJson = JSON.stringify({
containerLogs: containerLogs,
systemLogs: systemLogs,
deploymentOperations: [],
smokeTestResults: {
readiness: readinessOk ? 'pass' : 'fail',
health: healthOk ? 'pass' : 'fail',
homepage: homepageOk ? 'pass' : 'fail'
}
});
// Write diagnostics for the classify-error action to consume
require('fs').writeFileSync('/tmp/heal-diagnostics.json', diagJson);
// Classify inline using the same patterns as classify-error action
let errorCategory = 'unknown';
let errorSummary = 'Deployment smoke tests failed';
const allLogs = `${containerLogs}\n${systemLogs}`;
// Credential patterns (expanded — item 9)
if (allLogs.match(/AZURE_CLIENT_ID|ManagedIdentityCredential|DefaultAzureCredential|AADSTS|credential|TokenExpiredError|AuthenticationFailedError|AuthorizationFailed|RBAC|authorization denied|Access denied|SecretNotFound|KeyVaultError|VaultAccessError|Forbidden.*vault|identity.*error/i)) {
errorCategory = 'credential';
errorSummary = 'Credential or managed identity error — requires human review';
} else if (allLogs.match(/BackOff|CrashLoop|OOMKilled/i)) {
errorCategory = 'runtime';
errorSummary = 'Container is crash-looping or OOM';
} else if (allLogs.match(/MODULE_NOT_FOUND|Cannot find module|ENOENT|ERR_MODULE_NOT_FOUND/i)) {
errorCategory = 'dependency';
errorSummary = 'Missing module or file at runtime';
} else if (allLogs.match(/ECONNREFUSED|ENOTFOUND|ETIMEDOUT|EHOSTUNREACH/i)) {
errorCategory = 'config';
errorSummary = 'Connection error — check env vars and service endpoints';
} else if (!readinessOk) {
errorCategory = 'runtime';
errorSummary = 'Application never became ready (/api/ready timeout)';
} else if (!healthOk) {
errorCategory = 'runtime';
errorSummary = 'Health check failed (/api/health not healthy)';
}
// ── Step 5: If this is the last iteration, rollback then escalate ──
if (iter >= maxIter) {
console.log(`\n🚨 Max iterations (${maxIter}) reached`);
// Item 4: Automatic rollback before escalation
console.log('Attempting rollback to last known-good revision...');
const rolledBack = rollbackToBaseline();
// Deactivate the broken revision
try {
run(`az containerapp revision deactivate \
--name ${appName} --resource-group ${rg} \
--revision ${latestRevision} --output none`);
console.log(`Deactivated failed revision: ${latestRevision}`);
} catch { /* best effort */ }
const issueBody = [
`## 🚨 Self-Healing Deployment Failed — Needs Human Review`,
'',
`**Environment**: \`${rg}\``,
`**Image Tag**: \`${imageTag}\``,
`**Iterations Attempted**: ${iter}`,
`**Error Category**: \`${errorCategory}\``,
`**Summary**: ${errorSummary}`,
`**Rollback**: ${rolledBack ? `✅ Rolled back to \`${baselineRevision}\`` : '❌ No baseline revision available'}`,
'',
'### Container Logs (last 50 lines)',
'```',
// Truncate to 3000 chars to stay within GitHub issue body limit (~65535 chars)
containerLogs.slice(-3000),
'```',
'',
'### System Logs (last 30 lines)',
'```',
// Truncate to 2000 chars — system logs are less verbose than container logs
systemLogs.slice(-2000),
'```',
'',
'### Smoke Test Results',
`- Readiness: ${readinessOk ? '✅' : '❌'}`,
`- Health: ${healthOk ? '✅' : '❌'}`,
`- Home page: ${homepageOk ? '✅' : '❌'}`,
'',
'> This issue was auto-created by the self-healing deploy pipeline after exhausting all retry attempts.',
].join('\n');
await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.repo,
title: `[Deploy Fix] Self-healing failed: ${errorSummary}`,
body: issueBody,
labels: ['deploy-fix-auto', 'needs-human-review']
});
core.setOutput('result', 'fail');
core.setOutput('error', `Self-healing exhausted after ${iter} iterations: ${errorSummary}`);
return;
}
// ── Step 6: Create a fix issue for the Copilot agent ──
console.log(`\nCreating fix issue for Copilot agent (iteration ${iter})...`);
const fixIssueBody = [
`## 🔧 Automated Deployment Fix Required`,
'',
`**Environment**: \`${rg}\``,
`**Image Tag**: \`${imageTag}\``,
`**Self-Heal Iteration**: ${iter}/${maxIter}`,
`**Error Category**: \`${errorCategory}\``,
`**Error Summary**: ${errorSummary}`,
'',
'### Agent Instructions',
'',
'This issue was auto-created by the self-healing deployment pipeline.',
'The deployment to Azure Container Apps failed smoke tests.',
'Diagnose and fix the root cause based on the error category and logs below.',
'',
'**Steps:**',
'1. Read the error diagnostics below',
'2. Identify the root cause in the codebase',
'3. Implement a fix (code, config, or infrastructure)',
'4. Create a PR with `Fixes #ISSUE_NUMBER` in the description (replace ISSUE_NUMBER with this issue number)',
'5. The self-healing pipeline will automatically redeploy after your fix merges',
'',
`**Error Category**: \`${errorCategory}\``,
'',
errorCategory === 'runtime' ? '> **Hint**: Check application startup, API route handlers, database migrations, and health check endpoints.' :
errorCategory === 'dependency' ? '> **Hint**: Check package.json, imports, and the Dockerfile build steps.' :
errorCategory === 'config' ? '> **Hint**: Check environment variables, connection strings, and Azure service endpoints.' :
errorCategory === 'infra' ? '> **Hint**: Check Bicep templates, resource parameters, and Azure resource configurations. Note: infra changes require human review per Constitution XV.' :
'> **Hint**: Review the full logs below to identify the failure pattern.',
'',
'### Container Logs',
'```',
// Truncate to 3000 chars to stay within GitHub issue body limit (~65535 chars)
containerLogs.slice(-3000),
'```',
'',
'### System Logs',
'```',
// Truncate to 2000 chars — system logs are less verbose than container logs
systemLogs.slice(-2000),
'```',
'',
'### Smoke Test Results',
`- Readiness (/api/ready): ${readinessOk ? '✅ pass' : '❌ fail'}`,
`- Health (/api/health): ${healthOk ? '✅ pass' : '❌ fail'}`,
`- Home page (/): ${homepageOk ? '✅ pass' : '❌ fail'}`,
].join('\n');
const { data: fixIssue } = await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.repo,
title: `[Deploy Fix] ${errorSummary} (iteration ${iter})`,
body: fixIssueBody,
labels: ['deploy-fix-auto', 'copilot']
});
await commentSourceIssue(`🔁 Self-heal iteration ${iter}/${maxIter} failed`, [
`- Environment: \`${rg}\``,
`- App: \`${appName}\``,
`- Image tag: \`${imageTag}\``,
`- Error category: \`${errorCategory}\``,
`- Summary: ${errorSummary}`,
`- Auto-created fix issue: #${fixIssue.number}`,
'',
'### Smoke Test Results',
`- Readiness (/api/ready): ${readinessOk ? '✅ pass' : '❌ fail'}`,
`- Health (/api/health): ${healthOk ? '✅ pass' : '❌ fail'}`,
`- Home page (/): ${homepageOk ? '✅ pass' : '❌ fail'}`
]);
// Update the issue body with the actual issue number now that we have it
await github.rest.issues.update({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: fixIssue.number,
body: fixIssueBody.replace('ISSUE_NUMBER', String(fixIssue.number))
});
console.log(`Created fix issue #${fixIssue.number}`);
// ── Step 7: Wait for the agent to merge a fix PR ──
console.log(`Waiting for Copilot agent to fix issue #${fixIssue.number}...`);
const fixPR = await waitForPR(fixIssue.number, 900000); // 15 minute timeout
if (!fixPR) {
console.log(`::warning title=SELF-HEAL::No fix PR merged within timeout for issue #${fixIssue.number}`);
// Continue to next iteration anyway — the agent might have pushed to main directly
} else {
console.log(`Fix PR #${fixPR.number} merged — rebuilding for next iteration...`);
// Update image tag for the next iteration using the merge commit
imageTag = `sha-${fixPR.merge_commit_sha}`;
// Rebuild with the new code
try {
// ACR login with retry (matching the initial login pattern)
let acrLoggedIn = false;
for (let a = 1; a <= 3; a++) {
try {
run(`az acr login --name ${registry}`);
acrLoggedIn = true;
break;
} catch (e) {
if (a < 3) {
console.log(`ACR login attempt ${a}/3 failed, retrying in 10s...`);
await new Promise(r => setTimeout(r, 10000));
} else {
console.log(`::warning title=SELF-HEAL::ACR login failed after 3 attempts: ${e.message}`);
}
}
}
// Trigger a rebuild via ACR task or use the existing image if deploy.yml already built it
// For now, check if the image exists from the deploy.yml pipeline
const imageExists = runSafe(`az acr repository show-tags \
--name ${registry.split('.')[0]} \
--repository acroyoga-web \
--query "contains(@, '${imageTag}')" -o tsv`);
if (imageExists !== 'true') {
console.log('Fix image not yet built — waiting for deploy.yml to build it...');
// Wait up to 10 minutes for the main deploy pipeline to build the image
for (let w = 0; w < 20; w++) {
await new Promise(r => setTimeout(r, 30000));
const exists = runSafe(`az acr repository show-tags \
--name ${registry.split('.')[0]} \
--repository acroyoga-web \
--query "contains(@, '${imageTag}')" -o tsv`);
if (exists === 'true') break;
}
}
} catch (e) {
console.log(`::warning title=SELF-HEAL::Could not verify fix image: ${e.message}`);
}
}
// Clean up failed revision to avoid revision limit
try {
run(`az containerapp revision deactivate \
--name ${appName} --resource-group ${rg} \
--revision ${latestRevision} --output none`);
console.log(`Deactivated failed revision: ${latestRevision}`);
} catch { /* best effort */ }
console.log(`\nProceeding to iteration ${iter + 1}...`);
}
- name: Report result
if: always()
run: |
RESULT="${{ steps.heal-loop.outputs.result || 'fail' }}"
if [ "$RESULT" = "pass" ]; then
echo "::notice title=Deploy & Self-Heal::✅ Deployment successful (iteration ${{ steps.heal-loop.outputs.iteration }})"
else
echo "::error title=Deploy & Self-Heal::❌ Deployment failed after self-healing attempts"
exit 1
fi