/**
 * Backfill duplicate-detection comments for old issues.
 *
 * Identifies old issues that have no duplicate-detection comment and triggers
 * the existing `claude-dedupe-issues` workflow for each one, so that
 * historical issues get duplicate-detection coverage.
 *
 * Features:
 * - Scans issues created within a configurable time period
 *   (`DAYS_BACK`, default 90 days).
 * - Skips pull requests and issues that already have a duplicate-detection
 *   comment.
 * - Triggers the existing workflow instead of duplicating its logic.
 * - Dry-run mode (on by default) and a delay between triggers for safety.
 *
 * Usage:
 *   GITHUB_TOKEN=your_token bun run scripts/backfill-duplicate-comments.ts
 *
 * Provenance: scripts/backfill-duplicate-comments.ts as added by the patch
 * "Add script to backfill duplicate comments for old issues"
 * (commit d7aa61e6f1ef808b1817160fb4f3b2d8bec7023c, Boris Cherny,
 * Fri, 8 Aug 2025 12:03:40 -0700; generated with Claude Code).
 */

declare global {
  // Minimal ambient declaration so the script type-checks without Node/Bun
  // type packages; only `process.env` is used.
  var process: {
    env: Record<string, string | undefined>;
  };
}

/** Subset of the GitHub issue payload that this script reads. */
interface GitHubIssue {
  number: number;
  title: string;
  state: string;
  state_reason?: string;
  user: { id: number };
  created_at: string;
  closed_at?: string;
  /** Present only when the "issue" is actually a pull request. */
  pull_request?: unknown;
}

/** Subset of the GitHub issue-comment payload that this script reads. */
interface GitHubComment {
  id: number;
  /** May be absent/null in API payloads, hence the guard in the matcher. */
  body?: string | null;
  created_at: string;
  /** Null when the authoring account no longer exists. */
  user: { type: string; id: number } | null;
}

/** Items requested per page for every paginated listing. */
const PER_PAGE = 100;
/** Safety limit on pages fetched per listing, to avoid unbounded loops. */
const MAX_PAGES = 100;
/** Look-back window used when DAYS_BACK is not set. */
const DEFAULT_DAYS_BACK = 90;
/** Pause between workflow triggers so the system is not overwhelmed. */
const TRIGGER_DELAY_MS = 1000;

/**
 * Performs an authenticated request against the GitHub REST API.
 *
 * @param endpoint Path (and query string) appended to `https://api.github.com`.
 * @param token    Token sent as a Bearer credential.
 * @param method   HTTP method, `GET` by default.
 * @param body     Optional payload, sent JSON-encoded.
 * @returns The parsed JSON response, or `undefined` when the response has no
 *          body (e.g. `204 No Content` from the workflow-dispatch endpoint).
 * @throws Error when the response status is not 2xx.
 */
async function githubRequest<T = unknown>(
  endpoint: string,
  token: string,
  method: string = "GET",
  body?: unknown
): Promise<T> {
  const hasBody = body !== undefined && body !== null;

  const headers: Record<string, string> = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github.v3+json",
    "User-Agent": "backfill-duplicate-comments-script",
  };
  if (hasBody) {
    headers["Content-Type"] = "application/json";
  }

  const response = await fetch(`https://api.github.com${endpoint}`, {
    method,
    headers,
    body: hasBody ? JSON.stringify(body) : undefined,
  });

  if (!response.ok) {
    throw new Error(
      `GitHub API request failed: ${response.status} ${response.statusText}`
    );
  }

  // Read as text first: calling response.json() on an empty body throws,
  // which would report every successful 204 dispatch as a failure.
  const text = await response.text();
  if (text.trim() === "") {
    return undefined as T;
  }
  return JSON.parse(text) as T;
}

/**
 * Fetches every page of a list endpoint and concatenates the results.
 *
 * Stops at the first empty or short page, or after `maxPages` pages.
 *
 * @param endpoint List endpoint; may already contain a query string.
 * @param token    Token passed through to `githubRequest`.
 * @param maxPages Upper bound on the number of pages requested.
 */
async function fetchAllPages<T>(
  endpoint: string,
  token: string,
  maxPages: number = MAX_PAGES
): Promise<T[]> {
  const separator = endpoint.includes("?") ? "&" : "?";
  const items: T[] = [];

  for (let page = 1; page <= maxPages; page++) {
    const pageItems = await githubRequest<T[]>(
      `${endpoint}${separator}per_page=${PER_PAGE}&page=${page}`,
      token
    );

    if (!Array.isArray(pageItems) || pageItems.length === 0) {
      return items;
    }
    items.push(...pageItems);

    // A short page is the last page; skip the extra empty request.
    if (pageItems.length < PER_PAGE) {
      return items;
    }
  }

  console.log("[DEBUG] Reached page limit, stopping pagination");
  return items;
}

/**
 * Parses the DAYS_BACK setting.
 *
 * @param raw Raw environment value; unset or blank selects the default.
 * @returns A non-negative whole number of days.
 * @throws Error when the value is not a non-negative integer.
 */
function parseDaysBack(raw: string | undefined): number {
  const trimmed = raw?.trim();
  if (!trimmed) {
    return DEFAULT_DAYS_BACK;
  }

  const days = parseInt(trimmed, 10);
  if (!Number.isSafeInteger(days) || days < 0) {
    throw new Error(
      `DAYS_BACK must be a non-negative integer, got "${raw}"`
    );
  }
  return days;
}

/**
 * Keeps only real issues (not pull requests) created at or after `cutoff`.
 *
 * The issues listing also returns pull requests, and its `since` parameter
 * filters on last-update time, so both conditions are enforced here.
 */
function selectBackfillCandidates(
  issues: readonly GitHubIssue[],
  cutoff: Date
): GitHubIssue[] {
  const cutoffMs = cutoff.getTime();
  return issues.filter(
    (issue) =>
      issue.pull_request === undefined &&
      new Date(issue.created_at).getTime() >= cutoffMs
  );
}

/**
 * Reports whether a comment looks like one posted by the dedupe bot: authored
 * by a Bot account and containing both "Found" and "possible duplicate".
 */
function isDuplicateDetectionComment(comment: GitHubComment): boolean {
  const body = comment.body ?? "";
  return (
    comment.user?.type === "Bot" &&
    body.includes("Found") &&
    body.includes("possible duplicate")
  );
}

/**
 * Dispatches the `claude-dedupe-issues.yml` workflow on `main` for one issue.
 *
 * @param owner       Repository owner.
 * @param repo        Repository name.
 * @param issueNumber Issue passed to the workflow as `issue_number`.
 * @param token       Token passed through to `githubRequest`.
 * @param dryRun      When true (the default), only logs what would happen.
 * @throws Error when the dispatch request fails.
 */
async function triggerDedupeWorkflow(
  owner: string,
  repo: string,
  issueNumber: number,
  token: string,
  dryRun: boolean = true
): Promise<void> {
  if (dryRun) {
    console.log(`[DRY RUN] Would trigger dedupe workflow for issue #${issueNumber}`);
    return;
  }

  await githubRequest(
    `/repos/${owner}/${repo}/actions/workflows/claude-dedupe-issues.yml/dispatches`,
    token,
    "POST",
    {
      ref: "main",
      inputs: {
        issue_number: issueNumber.toString(),
      },
    }
  );
}

/**
 * Entry point: finds recent issues without a duplicate-detection comment and
 * triggers (or, in dry-run mode, reports) the dedupe workflow for each.
 *
 * Reads `GITHUB_TOKEN` (required), `DRY_RUN` (anything other than "false"
 * keeps dry-run on) and `DAYS_BACK` from the environment.
 *
 * @throws Error when `GITHUB_TOKEN` is missing, `DAYS_BACK` is invalid, or a
 *         listing request fails. Failures of individual workflow triggers are
 *         logged and do not stop the run.
 */
async function backfillDuplicateComments(): Promise<void> {
  console.log("[DEBUG] Starting backfill duplicate comments script");

  const token = process.env.GITHUB_TOKEN;
  if (!token) {
    throw new Error(`GITHUB_TOKEN environment variable is required

Usage:
  GITHUB_TOKEN=your_token bun run scripts/backfill-duplicate-comments.ts

Environment Variables:
  GITHUB_TOKEN - GitHub personal access token with repo and actions permissions (required)
  DRY_RUN - Set to "false" to actually trigger workflows (default: true for safety)
  DAYS_BACK - How many days back to look for old issues (default: 90)`);
  }
  console.log("[DEBUG] GitHub token found");

  const owner = "anthropics";
  const repo = "claude-code";
  const dryRun = process.env.DRY_RUN !== "false";
  const daysBack = parseDaysBack(process.env.DAYS_BACK);

  console.log(`[DEBUG] Repository: ${owner}/${repo}`);
  console.log(`[DEBUG] Dry run mode: ${dryRun}`);
  console.log(`[DEBUG] Looking back ${daysBack} days`);

  const cutoffDate = new Date();
  cutoffDate.setDate(cutoffDate.getDate() - daysBack);
  const cutoffIso = cutoffDate.toISOString();

  console.log(`[DEBUG] Fetching issues created since ${cutoffIso}...`);
  // `since` only narrows the listing to recently *updated* items; the exact
  // created-at and pull-request filtering happens client-side below.
  const fetchedIssues = await fetchAllPages<GitHubIssue>(
    `/repos/${owner}/${repo}/issues?state=all&since=${encodeURIComponent(cutoffIso)}`,
    token
  );
  const allIssues = selectBackfillCandidates(fetchedIssues, cutoffDate);

  console.log(
    `[DEBUG] Ignoring ${fetchedIssues.length - allIssues.length} pull requests or issues created before the cutoff`
  );
  console.log(`[DEBUG] Found ${allIssues.length} issues from the last ${daysBack} days`);

  let processedCount = 0;
  let candidateCount = 0;
  let triggeredCount = 0;

  for (const issue of allIssues) {
    processedCount++;
    console.log(
      `[DEBUG] Processing issue #${issue.number} (${processedCount}/${allIssues.length}): ${issue.title}`
    );

    console.log(`[DEBUG] Fetching comments for issue #${issue.number}...`);
    // Paginated: the bot comment may not be on the first page of a long thread.
    const comments = await fetchAllPages<GitHubComment>(
      `/repos/${owner}/${repo}/issues/${issue.number}/comments`,
      token
    );
    console.log(
      `[DEBUG] Issue #${issue.number} has ${comments.length} comments`
    );

    // Look for existing duplicate detection comments (from the dedupe bot)
    const dupeDetectionComments = comments.filter(isDuplicateDetectionComment);

    console.log(
      `[DEBUG] Issue #${issue.number} has ${dupeDetectionComments.length} duplicate detection comments`
    );

    // Skip if there's already a duplicate detection comment
    if (dupeDetectionComments.length > 0) {
      console.log(
        `[DEBUG] Issue #${issue.number} already has duplicate detection comment, skipping`
      );
      continue;
    }

    candidateCount++;
    const issueUrl = `https://github.com/${owner}/${repo}/issues/${issue.number}`;

    try {
      console.log(
        `[INFO] ${dryRun ? "[DRY RUN] " : ""}Triggering dedupe workflow for issue #${issue.number}: ${issueUrl}`
      );
      await triggerDedupeWorkflow(owner, repo, issue.number, token, dryRun);

      if (!dryRun) {
        console.log(
          `[SUCCESS] Successfully triggered dedupe workflow for issue #${issue.number}`
        );
      }
      triggeredCount++;
    } catch (error: unknown) {
      // One failed trigger must not abort the rest of the backfill.
      console.error(
        `[ERROR] Failed to trigger workflow for issue #${issue.number}: ${String(error)}`
      );
    }

    // Add a delay between workflow triggers to avoid overwhelming the system
    await new Promise<void>((resolve) => setTimeout(resolve, TRIGGER_DELAY_MS));
  }

  console.log(
    `[DEBUG] Script completed. Processed ${processedCount} issues, found ${candidateCount} candidates without duplicate comments, ${dryRun ? "would trigger" : "triggered"} ${triggeredCount} workflows`
  );
}

backfillDuplicateComments().catch(console.error);

// Make it a module
export {};