fix(hookify): add size limits to transcript reading to prevent OOM

When hookify rules use `field: transcript`, the entire transcript file
was read into memory. For long-running sessions or ralph loops,
transcripts can grow to gigabytes, causing OOM kills.

This fix adds:
- 10MB max size limit for transcript reading
- For larger files, only the tail (most recent content) is read
- Warning at 5MB to alert users of large transcripts
- Proper error handling consolidated into a reusable function

This addresses memory issues reported in a monorepo where CC was
consuming 30-40GB before being OOM-killed during startup/resume.
This commit is contained in:
Claude
2026-01-05 20:20:01 +00:00
parent 5c92b97cc4
commit 449aa9d3b0

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
"""Rule evaluation engine for hookify plugin."""
import os
import re
import sys
from functools import lru_cache
@@ -9,6 +10,13 @@ from typing import List, Dict, Any, Optional
# Import from local module
from hookify.core.config_loader import Rule, Condition
# Upper bound on how much of a transcript is loaded into memory (10MB).
# Files above this size are read from the tail only, to prevent OOM.
MAX_TRANSCRIPT_SIZE_BYTES = 10 * 1024 ** 2  # 10MB
# File size above which a large-transcript warning is emitted (5MB).
TRANSCRIPT_WARNING_SIZE_BYTES = 5 * 1024 ** 2  # 5MB
# Cache compiled regexes (max 128 patterns)
@lru_cache(maxsize=128)
@@ -24,6 +32,58 @@ def compile_regex(pattern: str) -> re.Pattern:
return re.compile(pattern, re.IGNORECASE)
def read_transcript_safely(
    transcript_path: str,
    *,
    max_bytes: Optional[int] = None,
    warn_bytes: Optional[int] = None,
) -> str:
    """Read a transcript file with size limits to prevent OOM.

    Files no larger than ``max_bytes`` are read in full. For larger files
    only the tail (roughly the last ``max_bytes`` bytes, starting at the
    next line boundary) is read, so the most recent content is preserved
    while memory use stays bounded.

    Args:
        transcript_path: Path to the transcript file.
        max_bytes: Largest file size (in bytes) that is read in full.
            Defaults to MAX_TRANSCRIPT_SIZE_BYTES.
        warn_bytes: File size (in bytes) above which a warning is printed
            to stderr. Defaults to TRANSCRIPT_WARNING_SIZE_BYTES.

    Returns:
        Transcript content as a string, possibly truncated to the tail for
        large files. Returns '' (after printing a warning to stderr) if the
        file is missing, unreadable, or cannot be decoded.
    """
    # Local import: only the large-file path needs it, and it keeps this
    # function self-contained.
    import io

    if max_bytes is None:
        max_bytes = MAX_TRANSCRIPT_SIZE_BYTES
    if warn_bytes is None:
        warn_bytes = TRANSCRIPT_WARNING_SIZE_BYTES

    try:
        file_size = os.path.getsize(transcript_path)
        size_mb = file_size / (1024 * 1024)

        # Warn about large transcripts
        if file_size > warn_bytes:
            print(
                f"Warning: Large transcript ({size_mb:.1f}MB): {transcript_path}",
                file=sys.stderr,
            )

        # For files within limit, read normally
        if file_size <= max_bytes:
            with open(transcript_path, 'r') as f:
                return f.read()

        # For large files, read only the tail to prevent OOM
        limit_mb = max_bytes / (1024 * 1024)
        print(
            f"Warning: Transcript too large ({size_mb:.1f}MB), reading last {limit_mb:.0f}MB only",
            file=sys.stderr,
        )
        # Seek in *binary* mode: text-mode streams only accept offsets that
        # came from tell(), and an arbitrary byte offset can land inside a
        # multi-byte character, which would make decoding fail and discard
        # the whole transcript.
        with open(transcript_path, 'rb') as raw:
            raw.seek(file_size - max_bytes)
            # Skip the partial line at the seek position (as bytes, so a
            # split character cannot raise); decoding then starts right
            # after a newline byte.
            raw.readline()
            # Decode the remainder with the same default encoding and
            # newline handling that open(..., 'r') would apply.
            with io.TextIOWrapper(raw) as text:
                return text.read()
    except FileNotFoundError:
        print(f"Warning: Transcript file not found: {transcript_path}", file=sys.stderr)
        return ''
    except PermissionError:
        print(f"Warning: Permission denied reading transcript: {transcript_path}", file=sys.stderr)
        return ''
    except OSError as e:
        print(f"Warning: Error reading transcript {transcript_path}: {e}", file=sys.stderr)
        return ''
    except UnicodeDecodeError as e:
        print(f"Warning: Encoding error in transcript {transcript_path}: {e}", file=sys.stderr)
        return ''
class RuleEngine:
"""Evaluates rules against hook input data."""
@@ -205,24 +265,10 @@ class RuleEngine:
if field == 'reason':
return input_data.get('reason', '')
elif field == 'transcript':
# Read transcript file if path provided
# Read transcript file with size limits to prevent OOM
transcript_path = input_data.get('transcript_path')
if transcript_path:
try:
with open(transcript_path, 'r') as f:
return f.read()
except FileNotFoundError:
print(f"Warning: Transcript file not found: {transcript_path}", file=sys.stderr)
return ''
except PermissionError:
print(f"Warning: Permission denied reading transcript: {transcript_path}", file=sys.stderr)
return ''
except (IOError, OSError) as e:
print(f"Warning: Error reading transcript {transcript_path}: {e}", file=sys.stderr)
return ''
except UnicodeDecodeError as e:
print(f"Warning: Encoding error in transcript {transcript_path}: {e}", file=sys.stderr)
return ''
return read_transcript_safely(transcript_path)
elif field == 'user_prompt':
# For UserPromptSubmit events
return input_data.get('user_prompt', '')