feat: Add .taskmaster directory (#619)
.taskmaster/tasks/task_084.txt (new file, 93 lines)
@@ -0,0 +1,93 @@
# Task ID: 84
# Title: Implement token counting utility
# Status: pending
# Dependencies: 82
# Priority: high
# Description: Create a utility function to count tokens for prompts based on the model being used, primarily using tiktoken for OpenAI and Anthropic models with character-based fallbacks for other providers.
# Details:
1. Install the tiktoken package:
```bash
npm install tiktoken
```

2. Create a new file `scripts/modules/token-counter.js`:
```javascript
const tiktoken = require('tiktoken');

/**
 * Count tokens for a given text and model
 * @param {string} text - The text to count tokens for
 * @param {string} provider - The AI provider (e.g., 'openai', 'anthropic')
 * @param {string} modelId - The model ID
 * @returns {number} - Estimated token count
 */
function countTokens(text, provider, modelId) {
  if (!text) return 0;

  // Convert to lowercase for case-insensitive matching
  const providerLower = provider?.toLowerCase();

  try {
    // OpenAI models
    if (providerLower === 'openai') {
      // Most OpenAI chat models use cl100k_base encoding.
      // encoding_for_model() throws for unknown models, so fall back explicitly.
      let encoding;
      try {
        encoding = tiktoken.encoding_for_model(modelId);
      } catch (e) {
        encoding = tiktoken.get_encoding('cl100k_base');
      }
      const count = encoding.encode(text).length;
      encoding.free(); // release the WASM-backed encoder
      return count;
    }

    // Anthropic models - can use cl100k_base as an approximation
    // or follow Anthropic's guidance
    if (providerLower === 'anthropic') {
      try {
        // Try to use cl100k_base as a reasonable approximation
        const encoding = tiktoken.get_encoding('cl100k_base');
        const count = encoding.encode(text).length;
        encoding.free();
        return count;
      } catch (e) {
        // Fallback to Anthropic's character-based estimation
        return Math.ceil(text.length / 3.5); // ~3.5 chars per token for English
      }
    }

    // For other providers, use character-based estimation as fallback
    // Different providers may have different tokenization schemes
    return Math.ceil(text.length / 4); // General fallback estimate
  } catch (error) {
    console.warn(`Token counting error: ${error.message}. Using character-based estimate.`);
    return Math.ceil(text.length / 4); // Fallback if tiktoken fails
  }
}

module.exports = { countTokens };
```
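
A minimal usage sketch for the module above (the calling code, prompt text, and the 100k-token threshold are illustrative assumptions, not part of this task):

```javascript
const { countTokens } = require('./token-counter');

// Estimate prompt size before sending a request, e.g. to warn when a
// prompt approaches a context-window limit (threshold is illustrative).
const prompt = 'Summarize the following release notes...';
const estimated = countTokens(prompt, 'anthropic', 'claude-3-7-sonnet-20250219');

if (estimated > 100000) {
  console.warn(`Prompt is ~${estimated} tokens; consider truncating.`);
}
```

Because the non-OpenAI paths use character-based estimation, callers should treat the return value as an approximation rather than an exact count.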

3. Add tests for the token counter in `tests/token-counter.test.js`:
```javascript
const { countTokens } = require('../scripts/modules/token-counter');

describe('Token Counter', () => {
  test('counts tokens for OpenAI models', () => {
    const text = 'Hello, world! This is a test.';
    const count = countTokens(text, 'openai', 'gpt-4');
    expect(count).toBeGreaterThan(0);
    expect(typeof count).toBe('number');
  });

  test('counts tokens for Anthropic models', () => {
    const text = 'Hello, world! This is a test.';
    const count = countTokens(text, 'anthropic', 'claude-3-7-sonnet-20250219');
    expect(count).toBeGreaterThan(0);
    expect(typeof count).toBe('number');
  });

  test('handles empty text', () => {
    expect(countTokens('', 'openai', 'gpt-4')).toBe(0);
    expect(countTokens(null, 'openai', 'gpt-4')).toBe(0);
  });
});
```

# Test Strategy:
1. Unit test the countTokens function with various inputs and models
2. Compare token counts with known examples from the OpenAI and Anthropic documentation
3. Test edge cases: empty strings, very long texts, non-English texts
4. Test fallback behavior when tiktoken fails or is not applicable (see the sketch below)
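
One way item 4 could be covered is by mocking the tiktoken module so its encoders throw, forcing the character-based fallbacks; the file name and mock shape below are hypothetical, not existing code:

```javascript
// tests/token-counter.fallback.test.js (hypothetical file name)
// Force tiktoken to fail so countTokens falls back to character-based estimates.
jest.mock('tiktoken', () => ({
  encoding_for_model: () => { throw new Error('unknown model'); },
  get_encoding: () => { throw new Error('encoder unavailable'); },
}));

const { countTokens } = require('../scripts/modules/token-counter');

test('falls back to character-based estimation when tiktoken fails', () => {
  const text = 'a'.repeat(400);
  // Anthropic path falls back to ~3.5 chars per token, others to ~4.
  expect(countTokens(text, 'anthropic', 'claude-3-7-sonnet-20250219')).toBe(Math.ceil(400 / 3.5));
  expect(countTokens(text, 'openai', 'gpt-4')).toBe(Math.ceil(400 / 4));
  expect(countTokens(text, 'unknown-provider', 'some-model')).toBe(Math.ceil(400 / 4));
});
```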