fix: improve batch job monitoring with 1-minute polling

- Change from exponential backoff to fixed 1-minute polling interval
- Log status on EVERY check (not just on status change)
- Show check number and elapsed time in each log
- Increase max timeout to 120 minutes (was 100 attempts with variable times)
- Add better status symbols for completed/failed states

This fixes the issue where batches completed on OpenAI's side but monitoring
appeared to hang because it was waiting too long between checks.

Note: Error files with API tokens are now excluded from commits for security.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
czlonkowski
2025-09-30 10:38:36 +02:00
parent fff47f9f9d
commit 2057f98e76
3 changed files with 27 additions and 141 deletions

View File

@@ -258,54 +258,52 @@ export class BatchProcessor {
}
/**
* Monitor batch job with exponential backoff
* Monitor batch job with fixed 1-minute polling interval
*/
private async monitorBatchJob(batchId: string): Promise<any> {
// Start with shorter wait times for better UX
const waitTimes = [30, 60, 120, 300, 600, 900, 1800]; // Progressive wait times in seconds
let waitIndex = 0;
const pollInterval = 60; // Check every 60 seconds (1 minute)
let attempts = 0;
const maxAttempts = 100; // Safety limit
const maxAttempts = 120; // 120 minutes max (2 hours)
const startTime = Date.now();
let lastStatus = '';
while (attempts < maxAttempts) {
const batchJob = await this.client.batches.retrieve(batchId);
// Only log if status changed
const elapsedMinutes = Math.floor((Date.now() - startTime) / 60000);
// Log status on every check (not just on change)
const statusSymbol = batchJob.status === 'in_progress' ? '⚙️' :
batchJob.status === 'finalizing' ? '📦' :
batchJob.status === 'validating' ? '🔍' :
batchJob.status === 'completed' ? '✅' :
batchJob.status === 'failed' ? '❌' : '⏳';
console.log(` ${statusSymbol} Batch ${batchId.slice(-8)}: ${batchJob.status} (${elapsedMinutes} min, check ${attempts + 1})`);
if (batchJob.status !== lastStatus) {
const elapsedMinutes = Math.floor((Date.now() - startTime) / 60000);
const statusSymbol = batchJob.status === 'in_progress' ? '⚙️' :
batchJob.status === 'finalizing' ? '📦' :
batchJob.status === 'validating' ? '🔍' : '⏳';
console.log(` ${statusSymbol} Batch ${batchId.slice(-8)}: ${batchJob.status} (${elapsedMinutes} min)`);
logger.info(`Batch ${batchId} status changed: ${lastStatus} -> ${batchJob.status}`);
lastStatus = batchJob.status;
}
logger.debug(`Batch ${batchId} status: ${batchJob.status} (attempt ${attempts + 1})`);
if (batchJob.status === 'completed') {
const elapsedMinutes = Math.floor((Date.now() - startTime) / 60000);
console.log(` ✅ Batch ${batchId.slice(-8)} completed in ${elapsedMinutes} minutes`);
console.log(` ✅ Batch ${batchId.slice(-8)} completed successfully in ${elapsedMinutes} minutes`);
logger.info(`Batch job ${batchId} completed successfully`);
return batchJob;
}
if (['failed', 'expired', 'cancelled'].includes(batchJob.status)) {
logger.error(`Batch job ${batchId} failed with status: ${batchJob.status}`);
throw new Error(`Batch job failed with status: ${batchJob.status}`);
}
// Wait before next check
const waitTime = waitTimes[Math.min(waitIndex, waitTimes.length - 1)];
logger.debug(`Waiting ${waitTime} seconds before next check...`);
await this.sleep(waitTime * 1000);
waitIndex = Math.min(waitIndex + 1, waitTimes.length - 1);
// Wait before next check (always 1 minute)
logger.debug(`Waiting ${pollInterval} seconds before next check...`);
await this.sleep(pollInterval * 1000);
attempts++;
}
throw new Error(`Batch job monitoring timed out after ${maxAttempts} attempts`);
throw new Error(`Batch job monitoring timed out after ${maxAttempts} minutes`);
}
/**