Retry Failed CI #195
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This workflow automatically re-runs failed jobs from the Daily CI and PR CI.
# It only retries if ALL failures match known infrastructure error patterns
# (e.g., Maven Central outages, Docker pull errors, Windows credential issues).
# If any failure looks like a real test assertion failure, the retry is skipped.
# It only retries once to avoid infinite loops.
name: Retry Failed CI

on:
  workflow_run:
    workflows: ["Daily CI", "PR CI"]
    types:
      - completed

jobs:
  retry:
    # Only react to runs that finished with a failure conclusion.
    if: github.event.workflow_run.conclusion == 'failure'
    runs-on: ubuntu-latest
    permissions:
      # Needed to read job logs and to request a re-run of the failed jobs.
      actions: write
    steps:
      - name: Check failure patterns and retry if infrastructure-related
        uses: actions/github-script@v7
        with:
          script: |
            // Decides whether the failed workflow run that triggered this job
            // should be re-run. The decision fails closed: any doubt (a
            // skip-listed job, unreadable logs, a test-failure marker, or no
            // recognised infra marker) means no retry.
            const { owner, repo } = context.repo;
            const runId = context.payload.workflow_run.id;

            // Check if this is already a retry to avoid infinite loops.
            const run = await github.rest.actions.getWorkflowRun({
              owner,
              repo,
              run_id: runId,
            });
            if (run.data.run_attempt > 1) {
              console.log(`Already a retry (attempt ${run.data.run_attempt}). Skipping.`);
              return;
            }

            // Known infrastructure error patterns that are safe to retry.
            // All patterns are lowercase because logs are lowercased before matching.
            const infraPatterns = [
              // Maven Central / dependency download outages
              'could not get',
              'could not resolve',
              'status code 502',
              'status code 403',
              'status code 500',
              'bad gateway',
              'server returned http response code: 500',
              // Docker/Colima failures on macOS
              'docker: unexpected eof',
              'connection reset by peer',
              'wrong diff id',
              // Windows DLL/process crashes
              'exit code -1073741502',
              'exit code -1073741819',
              // Windows OIDC credential signing issues
              'invalidsignatureexception',
              'the request signature we calculated does not match',
              // Transient DynamoDB errors
              'provisionedthroughputexceededexception',
            ];

            // Patterns that indicate real test failures — never retry these.
            const testFailurePatterns = [
              'assertionerror',
              'assertionfailederror',
              'expected:<',
              'nullpointerexception',
            ];

            // Jobs that should never be retried regardless of error pattern.
            const skipJobPatterns = [
              'fuzz',
            ];

            // Get all jobs for this run (paginated so large matrices are fully covered).
            const jobs = await github.paginate(
              github.rest.actions.listJobsForWorkflowRun,
              { owner, repo, run_id: runId }
            );
            const failedJobs = jobs.filter(j => j.conclusion === 'failure');
            console.log(`Found ${failedJobs.length} failed job(s):`);
            failedJobs.forEach(j => console.log(` - ${j.name} (id: ${j.id})`));
            if (failedJobs.length === 0) {
              console.log('No failed jobs found. Skipping.');
              return;
            }

            // Check skip list first: one skip-listed failure vetoes the whole retry.
            const skippedJobs = failedJobs.filter(job =>
              skipJobPatterns.some(p => job.name.toLowerCase().includes(p))
            );
            if (skippedJobs.length > 0) {
              console.log('Skip-listed job(s) failed. Not retrying:');
              skippedJobs.forEach(j => console.log(` - ${j.name}`));
              return;
            }

            // Check each failed job's logs. The first job that cannot be
            // classified as an infrastructure failure stops the scan and
            // records why, so the final message reports the actual cause.
            let skipReason = null;
            for (const job of failedJobs) {
              console.log(`\nAnalyzing logs for: ${job.name}`);
              let logs;
              try {
                const response = await github.rest.actions.downloadJobLogsForWorkflowRun({
                  owner,
                  repo,
                  job_id: job.id,
                });
                logs = response.data.toLowerCase();
              } catch (e) {
                // Unreadable logs cannot be classified, so treat them as a real failure.
                console.log(` Could not fetch logs: ${e.message}. Assuming real failure.`);
                skipReason = `logs for "${job.name}" could not be fetched`;
                break;
              }

              // Check for real test failures first: a test-failure marker wins
              // even when an infra pattern is also present in the same log.
              const hasTestFailure = testFailurePatterns.some(p => logs.includes(p));
              if (hasTestFailure) {
                console.log(' Found test assertion failure. Not retrying.');
                skipReason = `test failure detected in "${job.name}"`;
                break;
              }

              // Check if failure matches known infra patterns.
              const matchedInfra = infraPatterns.filter(p => logs.includes(p));
              if (matchedInfra.length === 0) {
                console.log(' No known infra pattern matched. Assuming real failure.');
                skipReason = `no known infra pattern matched in "${job.name}"`;
                break;
              }
              console.log(` Matched infra patterns: ${matchedInfra.join(', ')}`);
            }

            if (skipReason !== null) {
              console.log(`\nSkipping retry: ${skipReason}.`);
              return;
            }

            console.log('\nAll failures are infrastructure-related. Re-running failed jobs...');
            await github.rest.actions.reRunWorkflowFailedJobs({
              owner,
              repo,
              run_id: runId,
            });