Files
claude-task-master/.taskmaster/tasks/task_084.txt
2025-05-31 16:21:03 +02:00

94 lines
3.3 KiB
Plaintext

# Task ID: 84
# Title: Implement token counting utility
# Status: pending
# Dependencies: 82
# Priority: high
# Description: Create a utility function to count tokens for prompts based on the model being used, primarily using tiktoken for OpenAI and Anthropic models with character-based fallbacks for other providers.
# Details:
1. Install the tiktoken package:
```bash
npm install tiktoken
```
2. Create a new file `scripts/modules/token-counter.js`:
```javascript
const tiktoken = require('tiktoken');
/**
* Count tokens for a given text and model
* @param {string} text - The text to count tokens for
* @param {string} provider - The AI provider (e.g., 'openai', 'anthropic')
* @param {string} modelId - The model ID
* @returns {number} - Estimated token count
*/
function countTokens(text, provider, modelId) {
if (!text) return 0;
// Convert to lowercase for case-insensitive matching
const providerLower = provider?.toLowerCase();
try {
// OpenAI models
if (providerLower === 'openai') {
// Most OpenAI chat models use cl100k_base encoding
const encoding = tiktoken.encoding_for_model(modelId) || tiktoken.get_encoding('cl100k_base');
return encoding.encode(text).length;
}
// Anthropic models - can use cl100k_base as an approximation
// or follow Anthropic's guidance
if (providerLower === 'anthropic') {
try {
// Try to use cl100k_base as a reasonable approximation
const encoding = tiktoken.get_encoding('cl100k_base');
return encoding.encode(text).length;
} catch (e) {
// Fallback to Anthropic's character-based estimation
return Math.ceil(text.length / 3.5); // ~3.5 chars per token for English
}
}
// For other providers, use character-based estimation as fallback
// Different providers may have different tokenization schemes
return Math.ceil(text.length / 4); // General fallback estimate
} catch (error) {
console.warn(`Token counting error: ${error.message}. Using character-based estimate.`);
return Math.ceil(text.length / 4); // Fallback if tiktoken fails
}
}
module.exports = { countTokens };
```
3. Add tests for the token counter in `tests/token-counter.test.js`:
```javascript
const { countTokens } = require('../scripts/modules/token-counter');
describe('Token Counter', () => {
test('counts tokens for OpenAI models', () => {
const text = 'Hello, world! This is a test.';
const count = countTokens(text, 'openai', 'gpt-4');
expect(count).toBeGreaterThan(0);
expect(typeof count).toBe('number');
});
test('counts tokens for Anthropic models', () => {
const text = 'Hello, world! This is a test.';
const count = countTokens(text, 'anthropic', 'claude-3-7-sonnet-20250219');
expect(count).toBeGreaterThan(0);
expect(typeof count).toBe('number');
});
test('handles empty text', () => {
expect(countTokens('', 'openai', 'gpt-4')).toBe(0);
expect(countTokens(null, 'openai', 'gpt-4')).toBe(0);
});
});
```
# Test Strategy:
1. Unit test the countTokens function with various inputs and models
2. Compare token counts with known examples from OpenAI and Anthropic documentation
3. Test edge cases: empty strings, very long texts, non-English texts
4. Test fallback behavior when tiktoken fails or is not applicable