Deploy & Self-Heal #10

Workflow file for this run

.github/workflows/deploy-and-heal.yml at 386113e

	name: Deploy & Self-Heal

	# Autonomous deployment pipeline with self-healing loop.
	# Deploys as a canary revision → smoke tests → promote or diagnose → auto-fix → retry.
	# Constitution XV — Autonomous Development Pipeline (self-healing extension).
	#
	# Flow:
	# 1. Build & push container image
	# 2. Deploy new revision (canary)
	# 3. Run smoke tests via composite action (standardised across workflows)
	# 4. If pass → deployment successful
	# 5. If fail → collect diagnostics, classify error, create fix issue, wait for agent PR
	# 6. Repeat from step 2 with exponential backoff (max 3 iterations); the fix image is built by the main deploy pipeline, not rebuilt here
	# 7. After max retries → rollback to last known-good revision → escalate to human review

	# Two triggers: a manual dispatch with the inputs below, or completion of the nightly workflow.
	on:
	workflow_dispatch:
	inputs:
	# Target environment; the jobs derive the resource group, app name and Azure identity from it.
	environment:
	description: "Target environment"
	required: true
	type: choice
	options:
	- staging
	- nightly
	- canary
	# When true, the Bicep validate and deploy steps run before the app revision is updated.
	deploy-infrastructure:
	description: "Deploy infrastructure via Bicep before app update (required for new environments)"
	required: false
	type: boolean
	default: false
	# Upper bound for the self-heal loop; passed to the loop script as MAX_ITERATIONS.
	max-heal-iterations:
	description: "Maximum self-heal iterations (1-3)"
	required: false
	type: number
	default: 3
	# When set, the build job is skipped and this tag is deployed instead.
	image-tag:
	description: "Image tag to deploy (optional — builds from HEAD if omitted)"
	required: false
	type: string
	# Optional issue that receives progress comments from the self-heal loop.
	source-issue-number:
	description: "Issue number to post deploy/self-heal diagnostics to"
	required: false
	type: string
	# Trigger self-healing when nightly deploy job fails (item 5)
	# `completed` fires for every conclusion; the failure filter is on the deploy-and-heal job's `if`.
	workflow_run:
	workflows: ["Nightly Build & Deploy"]
	types: [completed]

	# Prevent overlapping self-heal runs
	# One concurrency group per target environment ('nightly' when no input is supplied,
	# i.e. for workflow_run triggers).
	# NOTE(review): with cancel-in-progress: true a newer run cancels one that may be in the
	# middle of the deploy loop, before it reaches its rollback — confirm this is preferred
	# over queueing (cancel-in-progress: false).
	concurrency:
	group: deploy-heal-${{ inputs.environment \|\| 'nightly' }}
	cancel-in-progress: true

	# Workflow-level token permissions (both jobs restate the subset they need).
	permissions:
	# Needed by actions/checkout.
	contents: read
	# Needed to request the OIDC token used by the Azure login steps.
	id-token: write
	# Needed to create labels, issues and issue comments from the github-script steps.
	issues: write
	# Needed to search for and fetch the merged fix pull request.
	pull-requests: read

	jobs:
	# ── Job 1: Build container image (skip if image-tag provided or triggered by nightly failure) ──
	build:
	runs-on: ubuntu-latest
	timeout-minutes: 30
	# Skip build when: image-tag is provided, or triggered by nightly (which already built)
	# (the nightly-failure filter itself lives on the deploy-and-heal job, not here)
	if: ${{ github.event_name == 'workflow_dispatch' && !inputs.image-tag }}
	outputs:
	# Tag pushed by the "Build and push" step below; consumed by deploy-and-heal as IMAGE_TAG.
	image-tag: sha-${{ github.sha }}
	# NOTE(review): this output is not referenced anywhere in the visible workflow.
	environment: ${{ inputs.environment \|\| 'nightly' }}
	permissions:
	id-token: write
	contents: read

	steps:
	- uses: actions/checkout@v5

	# Use the nightly identity for nightly/canary environments; staging identity for staging
	- name: Azure login (OIDC)
	uses: azure/login@v3
	with:
	client-id: ${{ inputs.environment == 'staging' && secrets.AZURE_CLIENT_ID \|\| secrets.AZURE_CLIENT_ID_NIGHTLY }}
	tenant-id: ${{ secrets.AZURE_TENANT_ID }}
	subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

	# Up to three `az acr login` attempts, 10 s apart; the step fails only if all three fail.
	- name: ACR login (with retry)
	run: \|
	REGISTRY="${{ secrets.AZURE_CONTAINER_REGISTRY }}"
	for attempt in 1 2 3; do
	echo "ACR login attempt ${attempt}/3..."
	if az acr login --name "${REGISTRY}"; then
	echo "::notice title=ACR Login::Success (attempt ${attempt})"
	exit 0
	fi
	[ "$attempt" -lt 3 ] && sleep 10
	done
	echo "::error title=ACR Login::Failed after 3 attempts"
	exit 1

	- name: Set up Docker Buildx
	uses: docker/setup-buildx-action@v3

	# Builds ./Dockerfile from the repository root and pushes it as acroyoga-web:sha-<commit>,
	# using the GitHub Actions cache for layers.
	- name: Build and push
	uses: docker/build-push-action@v6
	with:
	context: .
	file: ./Dockerfile
	push: true
	tags: \|
	${{ secrets.AZURE_CONTAINER_REGISTRY }}/acroyoga-web:sha-${{ github.sha }}
	cache-from: type=gha
	cache-to: type=gha,mode=max

	# ── Job 2: Deploy canary, test, promote or heal ──
	deploy-and-heal:
	runs-on: ubuntu-latest
	timeout-minutes: 120
	needs: [build]
	# Run when: build succeeded/skipped (dispatch), or nightly failed (workflow_run)
	# always() keeps this job eligible when the build job was skipped.
	# The workflow_run branch additionally requires the nightly run's head_sha to equal github.sha.
	if: >
	always() && (
	(github.event_name == 'workflow_dispatch' && (needs.build.result == 'success' \|\| needs.build.result == 'skipped')) \|\|
	(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.head_sha == github.sha)
	)
	environment: ${{ inputs.environment \|\| 'nightly' }}
	permissions:
	id-token: write
	contents: read
	issues: write
	pull-requests: read

	env:
	# For workflow_run triggers, head_sha may be empty if nightly failed before checkout;
	# fall back to github.sha in that case
	# Precedence: tag from the build job, then the image-tag input, then a nightly-sha-<sha> tag.
	IMAGE_TAG: ${{ needs.build.outputs.image-tag \|\| inputs.image-tag \|\| format('nightly-sha-{0}', github.event.workflow_run.head_sha \|\| github.sha) }}
	# Resource group and container app names are derived from the target environment.
	RESOURCE_GROUP: rg-acroyoga-${{ inputs.environment \|\| 'nightly' }}
	APP_NAME: ca-acroyoga-web-${{ inputs.environment \|\| 'nightly' }}
	MAX_ITERATIONS: ${{ inputs.max-heal-iterations \|\| 3 }}
	REGISTRY: ${{ secrets.AZURE_CONTAINER_REGISTRY }}
	# Empty string when no source issue was supplied; the loop script then skips commenting.
	SOURCE_ISSUE_NUMBER: ${{ inputs.source-issue-number \|\| '' }}

	steps:
	- uses: actions/checkout@v5

	# Use the nightly identity for nightly/canary environments; staging identity for staging
	- name: Azure login (OIDC)
	uses: azure/login@v3
	with:
	client-id: ${{ (inputs.environment \|\| 'nightly') == 'staging' && secrets.AZURE_CLIENT_ID \|\| secrets.AZURE_CLIENT_ID_NIGHTLY }}
	tenant-id: ${{ secrets.AZURE_TENANT_ID }}
	subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

	# The env-level IMAGE_TAG default embeds the full commit SHA, whereas the nightly workflow
	# tags images with the 7-character short SHA (nightly-sha-abc1234). For workflow_run
	# triggers, overwrite IMAGE_TAG for all later steps with the short-SHA form.
	- name: Resolve image tag for nightly trigger
	  if: github.event_name == 'workflow_run'
	  run: \|
	    SHA="${{ github.event.workflow_run.head_sha \|\| github.sha }}"
	    [ -n "$SHA" ] \|\| {
	      echo "::error title=Image Tag::Could not resolve commit SHA for nightly trigger"
	      exit 1
	    }
	    NIGHTLY_TAG="nightly-sha-${SHA:0:7}"
	    echo "IMAGE_TAG=${NIGHTLY_TAG}" >> "$GITHUB_ENV"
	    echo "::notice title=Image Tag::Resolved nightly image tag: ${NIGHTLY_TAG}"

	# Ensure required labels exist before the self-healing loop tries to use them.
	# GitHub returns 422 when creating an issue with non-existent labels.
	- name: Ensure deploy-fix labels exist
	  uses: actions/github-script@v7
	  with:
	    script: \|
	      const labels = [
	        { name: 'deploy-fix-auto', color: 'd93f0b', description: 'Auto-created by self-healing deploy pipeline' },
	        { name: 'copilot', color: '1d76db', description: 'Assigned to Copilot agent for autonomous fix' },
	        { name: 'needs-human-review', color: 'e4e669', description: 'Requires human review — agent cannot fix autonomously' },
	      ];
	      const { owner, repo } = context.repo;

	      for (const label of labels) {
	        // Look the label up first; only a 404 means it has to be created.
	        try {
	          await github.rest.issues.getLabel({ owner, repo, name: label.name });
	          continue;
	        } catch (e) {
	          if (e.status !== 404) {
	            // Any other lookup failure was previously swallowed silently; surface it
	            // as a warning but keep going, since this step is best-effort preparation.
	            core.warning(`Could not check label "${label.name}": ${e.message}`);
	            continue;
	          }
	        }

	        console.log(`Creating missing label: ${label.name}`);
	        try {
	          await github.rest.issues.createLabel({ owner, repo, ...label });
	        } catch (e) {
	          // A concurrent run may have created the label between the lookup and the
	          // create call; that is the desired end state, so do not fail the step for it.
	          const errors = (e.response && e.response.data && e.response.data.errors) \|\| [];
	          const alreadyExists = e.status === 422 && errors.some(err => err.code === 'already_exists');
	          if (!alreadyExists) throw e;
	          console.log(`Label already exists: ${label.name}`);
	        }
	      }

	# Runs the repository-local composite action at ./.github/actions/pg-wake against the
	# job's resource group. Its implementation is not part of this file.
	- name: Start PostgreSQL server if stopped
	uses: ./.github/actions/pg-wake
	with:
	resource-group: ${{ env.RESOURCE_GROUP }}

	# Validates the Bicep template for the target environment before anything is deployed.
	# The DB password, image tag, registry and resource group reach the script as environment
	# variables instead of being interpolated into the script text, so a value containing
	# quotes, `$` or backticks cannot break or alter the command (the image tag can come
	# from a free-text workflow input).
	- name: Validate infrastructure (Bicep)
	  if: inputs.deploy-infrastructure == true
	  env:
	    DB_ADMIN_PASSWORD: ${{ secrets.DB_ADMIN_PASSWORD }}
	  run: \|
	    ENV="${{ inputs.environment \|\| 'nightly' }}"
	    PARAMS_FILE="infra/main.parameters.${ENV}.json"
	    if [ ! -f "$PARAMS_FILE" ]; then
	      echo "::error title=DEPLOY::Parameter file ${PARAMS_FILE} not found"
	      exit 1
	    fi
	    echo "Running Bicep template validation for ${ENV}..."
	    if ! az deployment group validate \
	      --resource-group "$RESOURCE_GROUP" \
	      --template-file infra/main.bicep \
	      --parameters "$PARAMS_FILE" \
	      --parameters \
	        imageTag="$IMAGE_TAG" \
	        dbAdminPassword="$DB_ADMIN_PASSWORD" \
	        nextAuthSecret="$(openssl rand -base64 32)" \
	        sharedContainerRegistryLoginServer="$REGISTRY" \
	        githubOwnerId="${{ github.repository_owner_id }}" \
	        githubRepoId="${{ github.repository_id }}" 2>&1; then
	      echo "::error title=DEPLOYMENT: Bicep validate::Template validation failed — fix Bicep errors before deployment"
	      exit 1
	    fi
	    echo "::notice title=Bicep Validate::Template validation passed"

	# Deploys the Bicep template for the target environment, retrying once after 60 s.
	# The DB password, image tag, registry and resource group reach the script as environment
	# variables instead of being interpolated into the script text, so a value containing
	# quotes, `$` or backticks cannot break or alter the command.
	# NOTE(review): a fresh nextAuthSecret is generated on every attempt — confirm that
	# rotating it on each infrastructure deploy is intended.
	- name: Deploy infrastructure (Bicep)
	  if: inputs.deploy-infrastructure == true
	  env:
	    DB_ADMIN_PASSWORD: ${{ secrets.DB_ADMIN_PASSWORD }}
	  run: \|
	    ENV="${{ inputs.environment \|\| 'nightly' }}"
	    PARAMS_FILE="infra/main.parameters.${ENV}.json"
	    if [ ! -f "$PARAMS_FILE" ]; then
	      echo "::error title=DEPLOY::Parameter file ${PARAMS_FILE} not found"
	      exit 1
	    fi
	    echo "Deploying infrastructure for ${ENV} environment..."
	    MAX_ATTEMPTS=2
	    for attempt in $(seq 1 $MAX_ATTEMPTS); do
	      echo "Bicep deployment attempt ${attempt}/${MAX_ATTEMPTS}..."
	      if az deployment group create \
	        --resource-group "$RESOURCE_GROUP" \
	        --template-file infra/main.bicep \
	        --parameters "$PARAMS_FILE" \
	        --parameters \
	          imageTag="$IMAGE_TAG" \
	          dbAdminPassword="$DB_ADMIN_PASSWORD" \
	          nextAuthSecret="$(openssl rand -base64 32)" \
	          sharedContainerRegistryLoginServer="$REGISTRY" \
	          githubOwnerId="${{ github.repository_owner_id }}" \
	          githubRepoId="${{ github.repository_id }}" 2>&1; then
	        echo "::notice title=Bicep Deploy::Infrastructure deployed successfully (attempt ${attempt})"
	        break
	      fi
	      if [ "$attempt" -lt "$MAX_ATTEMPTS" ]; then
	        echo "::warning title=DEPLOYMENT: Bicep deploy::Attempt ${attempt} failed, retrying in 60s..."
	        sleep 60
	      else
	        echo "::error title=DEPLOYMENT: Bicep deploy::Deployment failed after ${MAX_ATTEMPTS} attempts"
	        exit 1
	      fi
	    done

	# Records the most recently created revision whose runningState is 'Running' as the
	# rollback target, exposed as steps.baseline.outputs.revision. If the query fails the
	# output is empty and the loop later reports that no rollback is possible.
	# NOTE(review): "most recent Running revision" is assumed to be the known-good one;
	# it is not checked against the revision currently receiving traffic.
	- name: Record known-good revision before deploy
	id: baseline
	run: \|
	GOOD_REV=$(az containerapp revision list \
	--name ${{ env.APP_NAME }} --resource-group ${{ env.RESOURCE_GROUP }} \
	--query "sort_by([?properties.runningState=='Running'], &properties.createdTime)[-1].name" \
	-o tsv 2>/dev/null \|\| echo "")
	echo "revision=${GOOD_REV}" >> "$GITHUB_OUTPUT"
	echo "::notice title=Baseline::Known-good revision: ${GOOD_REV:-none}"

	# Runs the deploy → smoke test → diagnose → wait-for-fix loop as one github-script step.
	# The script sets the step outputs `result` ('pass' or 'fail'), plus `iteration` on
	# success or `error` on failure; the "Report result" step reads them.
	- name: Self-healing deploy loop
	id: heal-loop
	uses: actions/github-script@v7
	# Job-level values are re-exported here so the script reads them from process.env.
	env:
	IMAGE_TAG: ${{ env.IMAGE_TAG }}
	RESOURCE_GROUP: ${{ env.RESOURCE_GROUP }}
	APP_NAME: ${{ env.APP_NAME }}
	MAX_ITERATIONS: ${{ env.MAX_ITERATIONS }}
	REGISTRY: ${{ env.REGISTRY }}
	# Rollback target recorded by the baseline step (may be empty).
	BASELINE_REVISION: ${{ steps.baseline.outputs.revision }}
	SOURCE_ISSUE_NUMBER: ${{ env.SOURCE_ISSUE_NUMBER }}
	with:
	script: \|
	const { execSync } = require('child_process');
	const maxIter = parseInt(process.env.MAX_ITERATIONS, 10) \|\| 3;
	const appName = process.env.APP_NAME;
	const rg = process.env.RESOURCE_GROUP;
	const registry = process.env.REGISTRY;
	const baselineRevision = process.env.BASELINE_REVISION;
	const sourceIssueNumber = parseInt(process.env.SOURCE_ISSUE_NUMBER \|\| '', 10) \|\| 0;
	const runIdSuffix = (process.env.GITHUB_RUN_ID \|\| String(Date.now())).slice(-6);
	let imageTag = process.env.IMAGE_TAG;

	// Base cooldown: 60s, doubles each iteration (item 8: rate limiting)
	const BASE_COOLDOWN_MS = 60000;

	function run(cmd) {
	console.log(`$ ${cmd}`);
	return execSync(cmd, { encoding: 'utf8', timeout: 300000 }).trim();
	}

	function runSafe(cmd) {
	try { return run(cmd); }
	catch (e) { return e.stderr \|\| e.message; }
	}

	// Item 7: Improved PR merge detection — search by issue reference
	//
	// Polls once a minute until a merged pull request mentioning "Fixes #<issueNumber>" is
	// found or the timeout elapses.
	//   issueNumber — number of the auto-created fix issue to look for.
	//   timeoutMs   — total time to keep polling (default 900000 ms = 15 minutes).
	// Returns the pull request object from pulls.get, or null on timeout.
	// Search/API errors are logged and retried on the next poll rather than thrown.
	async function waitForPR(issueNumber, timeoutMs = 900000) {
	const deadline = Date.now() + timeoutMs;
	const pollInterval = 60000;
	// Date-only (YYYY-MM-DD) lower bound for the search, timeoutMs before this call started.
	const createdAfter = new Date(Date.now() - timeoutMs).toISOString().split('T')[0];

	while (Date.now() < deadline) {
	try {
	// Use search API to find merged PRs referencing this issue
	const { data: searchResult } = await github.rest.search.issuesAndPullRequests({
	q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged Fixes #${issueNumber} created:>=${createdAfter}`,
	sort: 'updated',
	order: 'desc',
	per_page: 5
	});

	for (const item of searchResult.items) {
	// Verify it's actually merged by fetching the PR details
	const { data: pr } = await github.rest.pulls.get({
	owner: context.repo.owner,
	repo: context.repo.repo,
	pull_number: item.number
	});
	if (pr.merged_at) {
	console.log(`Found merged fix PR #${pr.number} for issue #${issueNumber}`);
	return pr;
	}
	}
	} catch (e) {
	console.log(`Search API error (will retry): ${e.message}`);
	}

	console.log(`Waiting for fix PR for issue #${issueNumber}... (${Math.round((deadline - Date.now()) / 60000)} min remaining)`);
	// Always sleeps a full interval, so the call can overrun the deadline by up to one poll.
	await new Promise(r => setTimeout(r, pollInterval));
	}
	return null;
	}

	// Posts a progress comment on the source issue, rendered as a "## title" heading
	// followed by the given lines. Does nothing when no source issue was supplied, and
	// only logs (never throws) when the comment cannot be created.
	async function commentSourceIssue(title, lines) {
	  if (!sourceIssueNumber) return;
	  try {
	    const { owner, repo } = context.repo;
	    const heading = `## ${title}`;
	    const body = [heading, '', ...lines].join('\n');
	    await github.rest.issues.createComment({
	      owner,
	      repo,
	      issue_number: sourceIssueNumber,
	      body
	    });
	  } catch (err) {
	    console.log(`Could not comment on source issue #${sourceIssueNumber}: ${err.message}`);
	  }
	}

	// Item 4: Automatic rollback helper
	//
	// Re-activates the baseline revision recorded before the deploy and routes 100% of
	// ingress traffic to it. Returns true on success; returns false when no baseline was
	// recorded or when either az command fails (the failure is logged as a warning).
	function rollbackToBaseline() {
	if (!baselineRevision) {
	console.log('No baseline revision recorded — cannot rollback');
	return false;
	}
	try {
	console.log(`Rolling back to baseline revision: ${baselineRevision}`);
	// Activate first so the revision is able to receive the traffic assigned next.
	run(`az containerapp revision activate \
	--name ${appName} --resource-group ${rg} \
	--revision ${baselineRevision} --output none`);
	run(`az containerapp ingress traffic set \
	--name ${appName} --resource-group ${rg} \
	--revision-weight ${baselineRevision}=100 --output none`);
	console.log(`::notice title=Rollback::Traffic shifted to baseline revision ${baselineRevision}`);
	return true;
	} catch (e) {
	console.log(`::warning title=Rollback::Failed to rollback: ${e.message}`);
	return false;
	}
	}

	for (let iter = 1; iter <= maxIter; iter++) {
	console.log(`\n${'='.repeat(60)}`);
	console.log(`SELF-HEAL ITERATION ${iter}/${maxIter}`);
	console.log(`${'='.repeat(60)}\n`);

	// Item 8: Exponential backoff between iterations (skip cooldown on first)
	if (iter > 1) {
	const cooldownMs = BASE_COOLDOWN_MS * Math.pow(2, iter - 2);
	console.log(`Cooldown: waiting ${cooldownMs / 1000}s before next attempt...`);
	await new Promise(r => setTimeout(r, cooldownMs));
	}

	// ── Step 1: Deploy new revision as canary ──
	console.log('Deploying canary revision...');
	// Revision suffix = h<iteration>-<last 6 alphanumerics of the tag>-<run id fragment>.
	// Container Apps only accepts lowercase alphanumerics and '-' in a revision suffix, so
	// the tag fragment is lowercased first; a custom image tag may contain uppercase letters.
	const tagSuffix = imageTag.toLowerCase().replace(/[^a-z0-9]/g, '').slice(-6);
	const revisionSuffix = `h${iter}-${tagSuffix}-${runIdSuffix}`;

	try {
	run(`az containerapp update \
	--name ${appName} \
	--resource-group ${rg} \
	--image ${registry}/acroyoga-web:${imageTag} \
	--revision-suffix ${revisionSuffix} \
	--output none`);
	} catch (e) {
	console.log(`::error title=DEPLOY::Canary deploy failed: ${e.message}`);
	const infraError = runSafe(`az deployment group list \
	--resource-group ${rg} --query "[?properties.provisioningState!='Succeeded'] \| [0:3]" -o json`);
	await commentSourceIssue('❌ Self-heal canary deploy failed', [
	`- Environment: \`${rg}\``,
	`- App: \`${appName}\``,
	`- Iteration: ${iter}/${maxIter}`,
	`- Image tag: \`${imageTag}\``,
	`- Revision suffix: \`${revisionSuffix}\``,
	'',
	'### Error',
	'```',
	String(e.message \|\| e).slice(-3000),
	'```'
	]);
	core.setOutput('result', 'fail');
	core.setOutput('error', `Infrastructure deployment failed at iteration ${iter}: ${infraError}`);
	return;
	}

	// Get the new revision name
	const latestRevision = run(`az containerapp revision list \
	--name ${appName} --resource-group ${rg} \
	--query "sort_by(@, &properties.createdTime)[-1].name" -o tsv`);
	console.log(`New revision: ${latestRevision}`);

	// ── Step 2: Get app FQDN for smoke tests ──
	const appFqdn = run(`az containerapp show \
	--name ${appName} --resource-group ${rg} \
	--query properties.configuration.ingress.fqdn -o tsv`);

	// Item 3: Run smoke tests via composite action output file
	// Write FQDN to file for the composite action to use
	// Since we're inside github-script, we invoke the smoke-test
	// action's logic using the same curl patterns but in a
	// standardised way. The composite action is used in the
	// separate workflow steps (nightly/deploy); here we replicate
	// the same parameters for consistency.
	console.log(`Running smoke tests against https://${appFqdn} ...`);

	let readinessOk = false;
	for (let r = 0; r < 50; r++) {
	try {
	const result = run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/api/ready" -o /dev/null -w "%{http_code}"`);
	if (result === '200') {
	readinessOk = true;
	console.log(`Readiness check passed after ${r + 1} attempts`);
	break;
	}
	} catch { /* retry */ }
	if (r < 49) await new Promise(resolve => setTimeout(resolve, 15000));
	}

	let healthOk = false;
	let homepageOk = false;

	if (readinessOk) {
	try {
	const healthResp = run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/api/health"`);
	healthOk = healthResp.includes('"status":"healthy"');
	if (!healthOk) console.log(`Health response: ${healthResp}`);
	} catch (e) {
	console.log(`Health check failed: ${e.message}`);
	}

	try {
	run(`curl --connect-timeout 10 --max-time 30 -sf "https://${appFqdn}/" -o /dev/null`);
	homepageOk = true;
	} catch (e) {
	console.log(`Home page check failed: ${e.message}`);
	}
	}

	const allPassed = readinessOk && healthOk && homepageOk;

	// ── Step 3: If all pass → success ──
	if (allPassed) {
	console.log('\n✅ All smoke tests passed — deployment successful!');
	await commentSourceIssue('✅ Self-heal deployment succeeded', [
	`- Environment: \`${rg}\``,
	`- App: \`${appName}\``,
	`- Iteration: ${iter}/${maxIter}`,
	`- Image tag: \`${imageTag}\``,
	'',
	'### Smoke Test Results',
	'- Readiness (/api/ready): ✅ pass',
	'- Health (/api/health): ✅ pass',
	'- Home page (/): ✅ pass'
	]);
	core.setOutput('result', 'pass');
	core.setOutput('iteration', String(iter));
	return;
	}

	// ── Step 4: Tests failed — collect diagnostics ──
	console.log('\n❌ Smoke tests failed — collecting diagnostics...');

	const containerLogs = runSafe(`az containerapp logs show \
	--name ${appName} --resource-group ${rg} --tail 100 2>&1 \| tail -50`);
	const systemLogs = runSafe(`az containerapp logs show \
	--name ${appName} --resource-group ${rg} --type system --tail 50 2>&1 \| tail -30`);

	// Build a minimal diagnostics JSON for the classify-error action
	const diagJson = JSON.stringify({
	containerLogs: containerLogs,
	systemLogs: systemLogs,
	deploymentOperations: [],
	smokeTestResults: {
	readiness: readinessOk ? 'pass' : 'fail',
	health: healthOk ? 'pass' : 'fail',
	homepage: homepageOk ? 'pass' : 'fail'
	}
	});

	// Write diagnostics for the classify-error action to consume
	require('fs').writeFileSync('/tmp/heal-diagnostics.json', diagJson);

	// Classify inline using the same patterns as classify-error action
	let errorCategory = 'unknown';
	let errorSummary = 'Deployment smoke tests failed';
	const allLogs = `${containerLogs}\n${systemLogs}`;

	// Credential patterns (expanded — item 9)
	if (allLogs.match(/AZURE_CLIENT_ID\|ManagedIdentityCredential\|DefaultAzureCredential\|AADSTS\|credential\|TokenExpiredError\|AuthenticationFailedError\|AuthorizationFailed\|RBAC\|authorization denied\|Access denied\|SecretNotFound\|KeyVaultError\|VaultAccessError\|Forbidden.vault\|identity.error/i)) {
	errorCategory = 'credential';
	errorSummary = 'Credential or managed identity error — requires human review';
	} else if (allLogs.match(/BackOff\|CrashLoop\|OOMKilled/i)) {
	errorCategory = 'runtime';
	errorSummary = 'Container is crash-looping or OOM';
	} else if (allLogs.match(/MODULE_NOT_FOUND\|Cannot find module\|ENOENT\|ERR_MODULE_NOT_FOUND/i)) {
	errorCategory = 'dependency';
	errorSummary = 'Missing module or file at runtime';
	} else if (allLogs.match(/ECONNREFUSED\|ENOTFOUND\|ETIMEDOUT\|EHOSTUNREACH/i)) {
	errorCategory = 'config';
	errorSummary = 'Connection error — check env vars and service endpoints';
	} else if (!readinessOk) {
	errorCategory = 'runtime';
	errorSummary = 'Application never became ready (/api/ready timeout)';
	} else if (!healthOk) {
	errorCategory = 'runtime';
	errorSummary = 'Health check failed (/api/health not healthy)';
	}

	// ── Step 5: If this is the last iteration, rollback then escalate ──
	if (iter >= maxIter) {
	console.log(`\n🚨 Max iterations (${maxIter}) reached`);

	// Item 4: Automatic rollback before escalation
	console.log('Attempting rollback to last known-good revision...');
	const rolledBack = rollbackToBaseline();

	// Deactivate the broken revision
	try {
	run(`az containerapp revision deactivate \
	--name ${appName} --resource-group ${rg} \
	--revision ${latestRevision} --output none`);
	console.log(`Deactivated failed revision: ${latestRevision}`);
	} catch { /* best effort */ }

	// Distinguish "rollback was attempted and failed" from "there was nothing to roll back
	// to", so the reviewer knows whether the baseline revision must be restored by hand.
	let rollbackStatus;
	if (rolledBack) {
	  rollbackStatus = `✅ Rolled back to \`${baselineRevision}\``;
	} else if (baselineRevision) {
	  rollbackStatus = `❌ Rollback to \`${baselineRevision}\` failed — see workflow logs`;
	} else {
	  rollbackStatus = '❌ No baseline revision available';
	}

	// Body of the escalation issue opened after the final iteration fails.
	const issueBody = [
	  `## 🚨 Self-Healing Deployment Failed — Needs Human Review`,
	  '',
	  `Environment: \`${rg}\``,
	  `Image Tag: \`${imageTag}\``,
	  `Iterations Attempted: ${iter}`,
	  `Error Category: \`${errorCategory}\``,
	  `Summary: ${errorSummary}`,
	  `Rollback: ${rollbackStatus}`,
	  '',
	  '### Container Logs (last 50 lines)',
	  '```',
	  // Truncate to 3000 chars to stay within GitHub issue body limit (~65535 chars)
	  containerLogs.slice(-3000),
	  '```',
	  '',
	  '### System Logs (last 30 lines)',
	  '```',
	  // Truncate to 2000 chars — system logs are less verbose than container logs
	  systemLogs.slice(-2000),
	  '```',
	  '',
	  '### Smoke Test Results',
	  `- Readiness: ${readinessOk ? '✅' : '❌'}`,
	  `- Health: ${healthOk ? '✅' : '❌'}`,
	  `- Home page: ${homepageOk ? '✅' : '❌'}`,
	  '',
	  '> This issue was auto-created by the self-healing deploy pipeline after exhausting all retry attempts.',
	].join('\n');

	await github.rest.issues.create({
	owner: context.repo.owner,
	repo: context.repo.repo,
	title: `[Deploy Fix] Self-healing failed: ${errorSummary}`,
	body: issueBody,
	labels: ['deploy-fix-auto', 'needs-human-review']
	});

	core.setOutput('result', 'fail');
	core.setOutput('error', `Self-healing exhausted after ${iter} iterations: ${errorSummary}`);
	return;
	}

	// ── Step 6: Create a fix issue for the Copilot agent ──
	console.log(`\nCreating fix issue for Copilot agent (iteration ${iter})...`);

	const fixIssueBody = [
	`## 🔧 Automated Deployment Fix Required`,
	'',
	`Environment: \`${rg}\``,
	`Image Tag: \`${imageTag}\``,
	`Self-Heal Iteration: ${iter}/${maxIter}`,
	`Error Category: \`${errorCategory}\``,
	`Error Summary: ${errorSummary}`,
	'',
	'### Agent Instructions',
	'',
	'This issue was auto-created by the self-healing deployment pipeline.',
	'The deployment to Azure Container Apps failed smoke tests.',
	'Diagnose and fix the root cause based on the error category and logs below.',
	'',
	'Steps:',
	'1. Read the error diagnostics below',
	'2. Identify the root cause in the codebase',
	'3. Implement a fix (code, config, or infrastructure)',
	'4. Create a PR with `Fixes #ISSUE_NUMBER` in the description (replace ISSUE_NUMBER with this issue number)',
	'5. The self-healing pipeline will automatically redeploy after your fix merges',
	'',
	`Error Category: \`${errorCategory}\``,
	'',
	errorCategory === 'runtime' ? '> Hint: Check application startup, API route handlers, database migrations, and health check endpoints.' :
	errorCategory === 'dependency' ? '> Hint: Check package.json, imports, and the Dockerfile build steps.' :
	errorCategory === 'config' ? '> Hint: Check environment variables, connection strings, and Azure service endpoints.' :
	errorCategory === 'infra' ? '> Hint: Check Bicep templates, resource parameters, and Azure resource configurations. Note: infra changes require human review per Constitution XV.' :
	'> Hint: Review the full logs below to identify the failure pattern.',
	'',
	'### Container Logs',
	'```',
	// Truncate to 3000 chars to stay within GitHub issue body limit (~65535 chars)
	containerLogs.slice(-3000),
	'```',
	'',
	'### System Logs',
	'```',
	// Truncate to 2000 chars — system logs are less verbose than container logs
	systemLogs.slice(-2000),
	'```',
	'',
	'### Smoke Test Results',
	`- Readiness (/api/ready): ${readinessOk ? '✅ pass' : '❌ fail'}`,
	`- Health (/api/health): ${healthOk ? '✅ pass' : '❌ fail'}`,
	`- Home page (/): ${homepageOk ? '✅ pass' : '❌ fail'}`,
	].join('\n');

	const { data: fixIssue } = await github.rest.issues.create({
	owner: context.repo.owner,
	repo: context.repo.repo,
	title: `[Deploy Fix] ${errorSummary} (iteration ${iter})`,
	body: fixIssueBody,
	labels: ['deploy-fix-auto', 'copilot']
	});

	await commentSourceIssue(`🔁 Self-heal iteration ${iter}/${maxIter} failed`, [
	`- Environment: \`${rg}\``,
	`- App: \`${appName}\``,
	`- Image tag: \`${imageTag}\``,
	`- Error category: \`${errorCategory}\``,
	`- Summary: ${errorSummary}`,
	`- Auto-created fix issue: #${fixIssue.number}`,
	'',
	'### Smoke Test Results',
	`- Readiness (/api/ready): ${readinessOk ? '✅ pass' : '❌ fail'}`,
	`- Health (/api/health): ${healthOk ? '✅ pass' : '❌ fail'}`,
	`- Home page (/): ${homepageOk ? '✅ pass' : '❌ fail'}`
	]);

	// Update the issue body with the actual issue number now that we have it.
	// replace() with a string pattern swaps only the first match, which used to leave the
	// "(replace ISSUE_NUMBER with this issue number)" hint behind even though the number
	// had already been filled in. Substitute the reference and drop the stale hint.
	await github.rest.issues.update({
	  owner: context.repo.owner,
	  repo: context.repo.repo,
	  issue_number: fixIssue.number,
	  body: fixIssueBody
	    .replace('#ISSUE_NUMBER', `#${fixIssue.number}`)
	    .replace(' (replace ISSUE_NUMBER with this issue number)', '')
	});

	console.log(`Created fix issue #${fixIssue.number}`);

	// ── Step 7: Wait for the agent to merge a fix PR ──
	console.log(`Waiting for Copilot agent to fix issue #${fixIssue.number}...`);

	const fixPR = await waitForPR(fixIssue.number, 900000); // 15 minute timeout

	if (!fixPR) {
	console.log(`::warning title=SELF-HEAL::No fix PR merged within timeout for issue #${fixIssue.number}`);
	// Continue to next iteration anyway — the agent might have pushed to main directly
	} else {
	console.log(`Fix PR #${fixPR.number} merged — rebuilding for next iteration...`);
	// Update image tag for the next iteration using the merge commit
	imageTag = `sha-${fixPR.merge_commit_sha}`;

	// Rebuild with the new code
	// Nothing is built here: this block logs in to the registry and then waits for the
	// image tagged with the merge commit to appear, built by the deploy.yml pipeline.
	try {
	  // ACR login with retry (matching the initial login pattern)
	  for (let a = 1; a <= 3; a++) {
	    try {
	      run(`az acr login --name ${registry}`);
	      break;
	    } catch (e) {
	      if (a < 3) {
	        console.log(`ACR login attempt ${a}/3 failed, retrying in 10s...`);
	        await new Promise(r => setTimeout(r, 10000));
	      } else {
	        console.log(`::warning title=SELF-HEAL::ACR login failed after 3 attempts: ${e.message}`);
	      }
	    }
	  }

	  // Trigger a rebuild via ACR task or use the existing image if deploy.yml already built it
	  // For now, check if the image exists from the deploy.yml pipeline
	  // The query prints 'true' when the tag list contains imageTag.
	  const tagQuery = `az acr repository show-tags \
	--name ${registry.split('.')[0]} \
	--repository acroyoga-web \
	--query "contains(@, '${imageTag}')" -o tsv`;
	  let imageReady = runSafe(tagQuery) === 'true';

	  if (!imageReady) {
	    console.log('Fix image not yet built — waiting for deploy.yml to build it...');
	    // Wait up to 10 minutes for the main deploy pipeline to build the image
	    for (let w = 0; w < 20 && !imageReady; w++) {
	      await new Promise(r => setTimeout(r, 30000));
	      imageReady = runSafe(tagQuery) === 'true';
	    }
	    if (!imageReady) {
	      // Previously the loop fell through silently; say why the next deploy may fail.
	      console.log(`::warning title=SELF-HEAL::Fix image ${imageTag} not found in registry after waiting — the next deploy attempt may fail`);
	    }
	  }
	} catch (e) {
	  console.log(`::warning title=SELF-HEAL::Could not verify fix image: ${e.message}`);
	}
	}

	// Clean up failed revision to avoid revision limit
	try {
	run(`az containerapp revision deactivate \
	--name ${appName} --resource-group ${rg} \
	--revision ${latestRevision} --output none`);
	console.log(`Deactivated failed revision: ${latestRevision}`);
	} catch { /* best effort */ }

	console.log(`\nProceeding to iteration ${iter + 1}...`);
	}

	# Turns the heal-loop's `result` output into the job's final status. Runs even when an
	# earlier step failed (if: always()); a missing output is treated as 'fail'.
	- name: Report result
	if: always()
	run: \|
	RESULT="${{ steps.heal-loop.outputs.result \|\| 'fail' }}"
	if [ "$RESULT" = "pass" ]; then
	echo "::notice title=Deploy & Self-Heal::✅ Deployment successful (iteration ${{ steps.heal-loop.outputs.iteration }})"
	else
	echo "::error title=Deploy & Self-Heal::❌ Deployment failed after self-healing attempts"
	exit 1
	fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Deploy & Self-Heal #10

Workflow file

Deploy & Self-Heal #10

Uh oh!

Workflow file for this run