Skip to content

Commit a1085e3

Browse files
Merge pull request #529 from microsoft/copilot/fix-nightly-deploy-issue
fix(infra): fix container app activation failures
2 parents 32c5892 + 8005770 commit a1085e3

5 files changed

Lines changed: 76 additions & 7 deletions

File tree

apps/web/src/app/api/ready/route.ts

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,23 @@ import { BlobServiceClient } from "@azure/storage-blob";
44
import { DefaultAzureCredential } from "@azure/identity";
55
import type { ReadinessResponse } from "@acroyoga/shared";
66

7+
const CHECK_TIMEOUT_MS = 5000;

/**
 * Bounds a readiness check so that a hanging dependency cannot stall the
 * whole probe response.
 *
 * The returned promise never rejects: every outcome is reported as a status
 * string so callers can place it straight into the readiness payload.
 *
 * @param promise - The in-flight check, expected to resolve to a status string.
 * @param timeoutMs - Maximum time to wait, in milliseconds. Defaults to
 *   `CHECK_TIMEOUT_MS`.
 * @returns The check's own value if it resolves in time,
 *   `"error: <message>"` if it rejects in time (`"error: unknown"` when the
 *   rejection reason is not an `Error`), or `"error: health check timeout"`
 *   if the deadline passes first.
 */
function withTimeout(
  promise: Promise<string>,
  timeoutMs: number = CHECK_TIMEOUT_MS,
): Promise<string> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const deadline = new Promise<string>((resolve) => {
    timer = setTimeout(() => resolve("error: health check timeout"), timeoutMs);
  });

  // Convert rejections into a status string up front. This also guarantees
  // that a check which fails *after* the deadline has already won the race
  // does not surface as an unhandled rejection.
  const settled = promise.catch(
    (err: unknown) => `error: ${err instanceof Error ? err.message : "unknown"}`,
  );

  // Whichever side wins, release the timer so it cannot keep the event loop
  // alive or fire needlessly later.
  return Promise.race([settled, deadline]).finally(() => clearTimeout(timer));
}
23+
724
async function checkDatabase(): Promise<string> {
825
try {
926
await db().query("SELECT 1");
@@ -33,8 +50,8 @@ async function checkStorage(): Promise<string> {
3350

3451
export async function GET(): Promise<NextResponse<ReadinessResponse>> {
3552
const [databaseStatus, storageStatus] = await Promise.all([
36-
checkDatabase(),
37-
checkStorage(),
53+
withTimeout(checkDatabase()),
54+
withTimeout(checkStorage()),
3855
]);
3956

4057
const allOk = databaseStatus === "ok" && storageStatus === "ok";

apps/web/src/lib/db/client.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ export function getDb(): DbClient {
4242
password: getEntraPassword,
4343
ssl: { rejectUnauthorized: true },
4444
max: 10,
45+
connectionTimeoutMillis: 5000,
4546
});
4647
}
4748
return pool;
@@ -68,6 +69,7 @@ export function getDb(): DbClient {
6869
pool = new pg.Pool({
6970
connectionString: dbUrl,
7071
max: 10,
72+
connectionTimeoutMillis: 5000,
7173
});
7274
}
7375
return pool;

apps/web/tests/integration/health.test.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { describe, it, expect, beforeAll, afterAll } from "vitest";
1+
import { describe, it, expect, beforeAll, afterAll, vi } from "vitest";
22
import { createTestDb } from "../helpers/db";
33
import { setTestDb, clearTestDb } from "../../src/lib/db/client";
44
import type { PGlite } from "@electric-sql/pglite";
@@ -69,5 +69,32 @@ describe("Health API endpoints", () => {
6969
}
7070
setTestDb(testDb);
7171
});
72+
73+
it("returns 503 with timeout error when database check hangs", async () => {
74+
vi.useFakeTimers();
75+
76+
// Inject a DB client whose query never resolves (simulates a hanging connection)
77+
setTestDb({
78+
query: () => new Promise(() => {}),
79+
} as unknown as PGlite);
80+
81+
try {
82+
const { GET } = await readyModule();
83+
const responsePromise = GET();
84+
85+
// Advance past the 5000 ms check timeout
86+
await vi.advanceTimersByTimeAsync(5001);
87+
88+
const response = await responsePromise;
89+
expect(response.status).toBe(503);
90+
91+
const body = await response.json();
92+
expect(body).toHaveProperty("status", "not_ready");
93+
expect(body.checks.database).toBe("error: health check timeout");
94+
} finally {
95+
vi.useRealTimers();
96+
setTestDb(testDb);
97+
}
98+
});
7299
});
73100
});

infra/modules/container-apps.bicep

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ resource containerApp 'Microsoft.App/containerApps@2024-03-01' = {
9494
properties: {
9595
managedEnvironmentId: containerAppEnvironment.id
9696
configuration: {
97-
activeRevisionsMode: 'Multiple'
97+
activeRevisionsMode: 'Single'
9898
ingress: {
9999
external: true
100100
targetPort: 3000
@@ -208,14 +208,19 @@ resource containerApp 'Microsoft.App/containerApps@2024-03-01' = {
208208
failureThreshold: 10
209209
}
210210
{
211+
// Startup probe window: initialDelaySeconds + failureThreshold × periodSeconds
212+
// = 30 + 60 × 5 = 330 s total. Worst-case migration time is
213+
// 3 attempts × 60 s + 2 × 10 s backoff = 200 s, leaving ~130 s of
214+
// margin for Node.js to start and pass this probe.
211215
type: 'Startup'
212216
httpGet: {
213217
path: '/api/health'
214218
port: 3000
215219
}
220+
initialDelaySeconds: 30
216221
periodSeconds: 5
217222
timeoutSeconds: 5
218-
failureThreshold: 30
223+
failureThreshold: 60
219224
}
220225
]
221226
}

start.sh

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
11
#!/bin/sh
22
set -e
3-
# Run database migrations (idempotent — safe on every container start)
4-
MIGRATIONS_DIR=/app/migrations node /app/migrate.cjs
3+
4+
# Run database migrations with retries to handle transient cold-start failures
5+
# (e.g. Managed Identity IMDS delays, PostgreSQL not yet accepting connections).
6+
MAX_RETRIES=3
7+
RETRY_DELAY=10
8+
MIGRATION_TIMEOUT=60
9+
RETRY=0
10+
while [ "$RETRY" -lt "$MAX_RETRIES" ]; do
11+
if MIGRATIONS_DIR=/app/migrations timeout "$MIGRATION_TIMEOUT" node /app/migrate.cjs; then
12+
break
13+
fi
14+
RETRY=$((RETRY + 1))
15+
if [ "$RETRY" -eq "$MAX_RETRIES" ]; then
16+
echo "DB migrations failed after $MAX_RETRIES attempts, exiting"
17+
exit 1
18+
fi
19+
echo "Migration attempt $RETRY failed, retrying in ${RETRY_DELAY}s..."
20+
sleep "$RETRY_DELAY"
21+
done
22+
523
# Start the Next.js server
624
exec node apps/web/server.js

0 commit comments

Comments
 (0)