fix(hookify): add size limits to transcript reading to prevent OOM

When hookify rules use `field: transcript`, the entire transcript file was read into memory. For long-running sessions or ralph loops, transcripts can grow to gigabytes, causing OOM kills. This fix adds: - 10MB max size limit for transcript reading - For larger files, only the tail (most recent content) is read - Warning at 5MB to alert users of large transcripts - Proper error handling consolidated into a reusable function This addresses memory issues reported in monorepo where CC was consuming 30-40GB before getting OOM killed during startup/resume.
2026-01-30 04:02:03 +00:00 · 2026-01-05 20:20:01 +00:00
parent 5c92b97cc4
commit 449aa9d3b0
1 changed files with 62 additions and 16 deletions
--- a/plugins/hookify/core/rule_engine.py
+++ b/plugins/hookify/core/rule_engine.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 """Rule evaluation engine for hookify plugin."""

+import os
 import re
 import sys
 from functools import lru_cache
@@ -9,6 +10,13 @@ from typing import List, Dict, Any, Optional
 # Import from local module
 from hookify.core.config_loader import Rule, Condition

+# Maximum transcript size to load into memory (10MB default)
+# For larger files, only the tail is read to prevent OOM
+MAX_TRANSCRIPT_SIZE_BYTES = 10 * 1024 * 1024  # 10MB
+
+# Size threshold for warning about large transcripts
+TRANSCRIPT_WARNING_SIZE_BYTES = 5 * 1024 * 1024  # 5MB
+

 # Cache compiled regexes (max 128 patterns)
@lru_cache(maxsize=128)
@@ -24,6 +32,58 @@ def compile_regex(pattern: str) -> re.Pattern:
    return re.compile(pattern, re.IGNORECASE)


+def read_transcript_safely(transcript_path: str) -> str:
+    """Read transcript file with size limits to prevent OOM.
+
+    For large transcripts (>10MB), only reads the tail of the file
+    to prevent memory exhaustion. This preserves the most recent
+    conversation context which is typically what rules care about.
+
+    Args:
+        transcript_path: Path to the transcript file
+
+    Returns:
+        Transcript content as string, possibly truncated for large files
+    """
+    try:
+        file_size = os.path.getsize(transcript_path)
+
+        # Warn about large transcripts
+        if file_size > TRANSCRIPT_WARNING_SIZE_BYTES:
+            size_mb = file_size / (1024 * 1024)
+            print(f"Warning: Large transcript ({size_mb:.1f}MB): {transcript_path}", file=sys.stderr)
+
+        # For files within limit, read normally
+        if file_size <= MAX_TRANSCRIPT_SIZE_BYTES:
+            with open(transcript_path, 'r') as f:
+                return f.read()
+
+        # For large files, read only the tail to prevent OOM
+        size_mb = file_size / (1024 * 1024)
+        limit_mb = MAX_TRANSCRIPT_SIZE_BYTES / (1024 * 1024)
+        print(f"Warning: Transcript too large ({size_mb:.1f}MB), reading last {limit_mb:.0f}MB only", file=sys.stderr)
+
+        with open(transcript_path, 'r') as f:
+            # Seek to position near end, leaving room for MAX_TRANSCRIPT_SIZE_BYTES
+            f.seek(file_size - MAX_TRANSCRIPT_SIZE_BYTES)
+            # Skip partial line at seek position
+            f.readline()
+            return f.read()
+
+    except FileNotFoundError:
+        print(f"Warning: Transcript file not found: {transcript_path}", file=sys.stderr)
+        return ''
+    except PermissionError:
+        print(f"Warning: Permission denied reading transcript: {transcript_path}", file=sys.stderr)
+        return ''
+    except (IOError, OSError) as e:
+        print(f"Warning: Error reading transcript {transcript_path}: {e}", file=sys.stderr)
+        return ''
+    except UnicodeDecodeError as e:
+        print(f"Warning: Encoding error in transcript {transcript_path}: {e}", file=sys.stderr)
+        return ''
+
+
 class RuleEngine:
    """Evaluates rules against hook input data."""

@@ -205,24 +265,10 @@ class RuleEngine:
            if field == 'reason':
                return input_data.get('reason', '')
            elif field == 'transcript':
-                # Read transcript file if path provided
+                # Read transcript file with size limits to prevent OOM
                transcript_path = input_data.get('transcript_path')
                if transcript_path:
-                    try:
-                        with open(transcript_path, 'r') as f:
-                            return f.read()
-                    except FileNotFoundError:
-                        print(f"Warning: Transcript file not found: {transcript_path}", file=sys.stderr)
-                        return ''
-                    except PermissionError:
-                        print(f"Warning: Permission denied reading transcript: {transcript_path}", file=sys.stderr)
-                        return ''
-                    except (IOError, OSError) as e:
-                        print(f"Warning: Error reading transcript {transcript_path}: {e}", file=sys.stderr)
-                        return ''
-                    except UnicodeDecodeError as e:
-                        print(f"Warning: Encoding error in transcript {transcript_path}: {e}", file=sys.stderr)
-                        return ''
+                    return read_transcript_safely(transcript_path)
            elif field == 'user_prompt':
                # For UserPromptSubmit events
                return input_data.get('user_prompt', '')