feat: Add .taskmaster directory (#619)
.taskmaster/tasks/task_084.txt (new file, 93 lines)
@@ -0,0 +1,93 @@
# Task ID: 84
# Title: Implement token counting utility
# Status: pending
# Dependencies: 82
# Priority: high
# Description: Create a utility function to count tokens for prompts based on the model being used, primarily using tiktoken for OpenAI and Anthropic models with character-based fallbacks for other providers.
# Details:
1. Install the tiktoken package:
```bash
npm install tiktoken
```

2. Create a new file `scripts/modules/token-counter.js`:
```javascript
const tiktoken = require('tiktoken');

/**
 * Count tokens for a given text and model
 * @param {string} text - The text to count tokens for
 * @param {string} provider - The AI provider (e.g., 'openai', 'anthropic')
 * @param {string} modelId - The model ID
 * @returns {number} - Estimated token count
 */
function countTokens(text, provider, modelId) {
  if (!text) return 0;

  // Convert to lowercase for case-insensitive matching
  const providerLower = provider?.toLowerCase();

  try {
    // OpenAI models
    if (providerLower === 'openai') {
      // Most OpenAI chat models use cl100k_base encoding.
      // encoding_for_model() throws for unknown models, so fall back explicitly.
      let encoding;
      try {
        encoding = tiktoken.encoding_for_model(modelId);
      } catch (e) {
        encoding = tiktoken.get_encoding('cl100k_base');
      }
      const count = encoding.encode(text).length;
      encoding.free(); // release the WASM-backed encoder
      return count;
    }

    // Anthropic models - can use cl100k_base as an approximation
    // or follow Anthropic's guidance
    if (providerLower === 'anthropic') {
      try {
        // Try to use cl100k_base as a reasonable approximation
        const encoding = tiktoken.get_encoding('cl100k_base');
        const count = encoding.encode(text).length;
        encoding.free();
        return count;
      } catch (e) {
        // Fallback to Anthropic's character-based estimation
        return Math.ceil(text.length / 3.5); // ~3.5 chars per token for English
      }
    }

    // For other providers, use character-based estimation as fallback
    // Different providers may have different tokenization schemes
    return Math.ceil(text.length / 4); // General fallback estimate
  } catch (error) {
    console.warn(`Token counting error: ${error.message}. Using character-based estimate.`);
    return Math.ceil(text.length / 4); // Fallback if tiktoken fails
  }
}

module.exports = { countTokens };
```
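
A minimal usage sketch for the module above (the calling code, prompt text, and the 100k-token threshold are illustrative assumptions, not part of this task):

```javascript
const { countTokens } = require('./token-counter');

// Estimate prompt size before sending a request, e.g. to warn when a
// prompt approaches a context-window limit (threshold is illustrative).
const prompt = 'Summarize the following release notes...';
const estimated = countTokens(prompt, 'anthropic', 'claude-3-7-sonnet-20250219');

if (estimated > 100000) {
  console.warn(`Prompt is ~${estimated} tokens; consider truncating.`);
}
```

Because the non-OpenAI paths use character-based estimation, callers should treat the return value as an approximation rather than an exact count.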

3. Add tests for the token counter in `tests/token-counter.test.js`:
```javascript
const { countTokens } = require('../scripts/modules/token-counter');

describe('Token Counter', () => {
  test('counts tokens for OpenAI models', () => {
    const text = 'Hello, world! This is a test.';
    const count = countTokens(text, 'openai', 'gpt-4');
    expect(count).toBeGreaterThan(0);
    expect(typeof count).toBe('number');
  });

  test('counts tokens for Anthropic models', () => {
    const text = 'Hello, world! This is a test.';
    const count = countTokens(text, 'anthropic', 'claude-3-7-sonnet-20250219');
    expect(count).toBeGreaterThan(0);
    expect(typeof count).toBe('number');
  });

  test('handles empty text', () => {
    expect(countTokens('', 'openai', 'gpt-4')).toBe(0);
    expect(countTokens(null, 'openai', 'gpt-4')).toBe(0);
  });
});
```

# Test Strategy:
1. Unit test the countTokens function with various inputs and models
2. Compare token counts with known examples from the OpenAI and Anthropic documentation
3. Test edge cases: empty strings, very long texts, non-English texts
4. Test fallback behavior when tiktoken fails or is not applicable (see the sketch below)
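
One way item 4 could be covered is by mocking the tiktoken module so its encoders throw, forcing the character-based fallbacks; the file name and mock shape below are hypothetical, not existing code:

```javascript
// tests/token-counter.fallback.test.js (hypothetical file name)
// Force tiktoken to fail so countTokens falls back to character-based estimates.
jest.mock('tiktoken', () => ({
  encoding_for_model: () => { throw new Error('unknown model'); },
  get_encoding: () => { throw new Error('encoder unavailable'); },
}));

const { countTokens } = require('../scripts/modules/token-counter');

test('falls back to character-based estimation when tiktoken fails', () => {
  const text = 'a'.repeat(400);
  // Anthropic path falls back to ~3.5 chars per token, others to ~4.
  expect(countTokens(text, 'anthropic', 'claude-3-7-sonnet-20250219')).toBe(Math.ceil(400 / 3.5));
  expect(countTokens(text, 'openai', 'gpt-4')).toBe(Math.ceil(400 / 4));
  expect(countTokens(text, 'unknown-provider', 'some-model')).toBe(Math.ceil(400 / 4));
});
```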