feat: implement intelligent language detection for podcast conversations

- Added robust language detection system that analyzes URL domains, content keywords, and special characters
- Supports automatic detection of English, Italian, Spanish, French, and German
- Generates podcast conversations in the same language as the website content
- Enhanced AI prompts with strong language-specific instructions
- Added comprehensive debug logging for language detection accuracy
- Improved detection thresholds to prevent false positives

The system now automatically detects the language of scraped website content and generates podcast conversations in that language, ensuring localization and improved user experience.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Rosario Moscato
2025-09-23 20:02:04 +02:00
parent 30bb11d62a
commit e32f433c79

View File

@@ -3,6 +3,187 @@ import { streamObject } from 'ai';
import { openai } from '@ai-sdk/openai';
import { z } from 'zod';
// Function to detect language from URL and content

/** Heuristic signals used to recognise one non-English language. */
interface LanguageProfile {
  /** Patterns tested against the URL's hostname (country-code TLDs). */
  readonly domains: readonly RegExp[];
  /** Patterns tested against the whole URL. */
  readonly keywords: readonly RegExp[];
  /** Frequent function words of the language. */
  readonly words: ReadonlySet<string>;
  /** Greetings/courtesy phrases searched as substrings of the content. */
  readonly phrases: readonly string[];
  /** Accented characters of the language; must be global so every hit is counted. */
  readonly chars: RegExp;
}

/** Most frequent English words, used to short-circuit to English. */
const ENGLISH_COMMON_WORDS: ReadonlySet<string> = new Set([
  'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
  'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there',
  'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no',
  'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then',
  'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well',
  'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
]);

/**
 * Per-language detection profiles. Insertion order matters only for ties:
 * when two languages reach the same score the one declared first wins.
 */
const LANGUAGE_PROFILES: Readonly<Record<string, LanguageProfile>> = {
  italian: {
    domains: [/\.it$/],
    keywords: [/italia/i, /italian/i, /italiano/i],
    words: new Set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'una', 'uno', 'che', 'di', 'a', 'da', 'in', 'con', 'su', 'per', 'tra', 'fra', 'e', 'ma', 'o', 'se', 'perché', 'come', 'dove', 'quando', 'quanto', 'questo', 'quello', 'essere', 'avere', 'fare', 'non', 'sì', 'tutto', 'molto', 'più', 'può', 'quale', 'chi', 'cui', 'senza', 'fra', 'tra', 'contro', 'durante', 'sulla', 'dallo', 'nella', 'dalla', 'coi', 'col', 'negli', 'nelle', 'dagli', 'dalle', 'sugli', 'sulle']),
    phrases: ['ciao', 'grazie', 'prego', 'scusi', 'buongiorno', 'buonasera', 'arrivederci'],
    chars: /[àèéìòù]/gi,
  },
  spanish: {
    domains: [/\.es$/, /\.mx$/, /\.ar$/, /\.cl$/, /\.co$/],
    keywords: [/españa/i, /espanol/i, /hispano/i],
    words: new Set(['el', 'la', 'los', 'las', 'un', 'una', 'uno', 'que', 'de', 'en', 'con', 'por', 'para', 'como', 'estar', 'tener', 'hacer', 'poder', 'decir', 'ir', 'ver', 'saber', 'y', 'no', 'sí', 'todo', 'muy', 'más', 'puede', 'cual', 'quién', 'sin', 'contra', 'durante', 'al', 'del', 'los', 'las', 'un', 'una', 'unos', 'unas', 'este', 'esta', 'estos', 'estas', 'ese', 'esa', 'esos', 'esas', 'aquel', 'aquella', 'aquellos', 'aquellas']),
    phrases: ['hola', 'gracias', 'por favor', 'de nada', 'buenos días', 'buenas tardes', 'buenas noches', 'adiós'],
    chars: /[ñáéíóúü]/gi,
  },
  french: {
    domains: [/\.fr$/, /\.ca$/, /\.be$/, /\.ch$/],
    keywords: [/france/i, /français/i, /french/i],
    words: new Set(['le', 'la', 'les', 'un', 'une', 'des', 'que', 'de', 'à', 'en', 'avec', 'pour', 'dans', 'sur', 'être', 'avoir', 'faire', 'pouvoir', 'dire', 'aller', 'voir', 'savoir', 'et', 'non', 'oui', 'tout', 'très', 'plus', 'peut', 'quel', 'qui', 'sans', 'contre', 'pendant', 'au', 'du', 'des', 'ces', 'cette', 'ces', 'cet', 'cette', 'mon', 'ma', 'mes', 'ton', 'ta', 'tes', 'son', 'sa', 'ses']),
    phrases: ['bonjour', 'merci', 's\'il vous plaît', 'de rien', 'bonsoir', 'au revoir', 'salut', 'excusez-moi'],
    chars: /[àâäçéèêëïîôùûüÿ]/gi,
  },
  german: {
    domains: [/\.de$/, /\.at$/, /\.ch$/],
    keywords: [/deutschland/i, /deutsch/i, /german/i],
    words: new Set(['der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'das', 'ist', 'nicht', 'zu', 'der', 'und', 'in', 'den', 'von', 'zu', 'mit', 'sich', 'für', 'haben', 'werden', 'können', 'auch', 'nur', 'oder', 'aber', 'wenn', 'dass', 'sie', 'ich', 'du', 'wir', 'dem', 'den', 'einem', 'einen', 'einer', 'eines', 'eine', 'einer']),
    phrases: ['hallo', 'danke', 'bitte', 'guten tag', 'guten morgen', 'guten abend', 'auf wiedersehen', 'entschuldigung'],
    chars: /[äöüß]/gi,
  },
};

/**
 * Returns the lower-cased host part of `url` (no scheme, credentials, port,
 * path, query or fragment). A bare host such as "example.it" is returned as is.
 */
function hostnameForDetection(url: string): string {
  const withoutScheme = url.trim().toLowerCase().replace(/^[a-z][a-z0-9+.-]*:\/\//, '');
  const authority = withoutScheme.split(/[\/?#]/)[0] ?? '';
  const hostAndPort = authority.slice(authority.lastIndexOf('@') + 1);
  return hostAndPort.replace(/:\d+$/, '');
}

/**
 * Splits `content` into lower-cased word tokens, dropping punctuation so that
 * "the," or "giorno:" still match the word lists. One-character tokens are
 * discarded.
 */
function tokenizeForDetection(content: string): string[] {
  return content
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}]+/u)
    .filter(word => word.length > 1);
}

/**
 * Guesses the language of scraped page content.
 *
 * The content is treated as English when more than 10% of its words are common
 * English words. Otherwise every supported language is scored from the URL's
 * TLD (+100), URL keywords (+50), language-specific words (+1 each), courtesy
 * phrases (+20 each) and accented characters (+5 each), and the highest-scoring
 * language is returned if it clears the threshold.
 *
 * @param url - Address the content was scraped from; only its hostname is used for the TLD check.
 * @param content - Plain text of the page.
 * @returns `'italian'`, `'spanish'`, `'french'`, `'german'`, or `'english'` as the fallback.
 */
function detectLanguage(url: string, content: string): string {
  const words = tokenizeForDetection(content);
  if (words.length === 0) return 'english';

  // First check if content is primarily English - more aggressive detection
  let englishScore = 0;
  for (const word of words) {
    if (ENGLISH_COMMON_WORDS.has(word)) {
      englishScore++;
    }
  }
  const englishRatio = englishScore / words.length;

  // If content is mostly English (>10% common English words), default to English
  if (englishRatio > 0.10) {
    console.log(`Detected English content: ${englishScore}/${words.length} = ${(englishRatio * 100).toFixed(1)}% English words`);
    return 'english';
  }

  const hostname = hostnameForDetection(url);
  const lowerContent = content.toLowerCase();

  // Score every language and keep the best one, so that words shared between
  // languages ("la", "un", "con", ...) or a shared TLD (.ch) cannot make an
  // earlier-declared language win over a better match.
  let best: { lang: string; score: number } | undefined;
  for (const [lang, profile] of Object.entries(LANGUAGE_PROFILES)) {
    let score = 0;

    // Check URL patterns (strong indicator); anchored TLD patterns only make
    // sense against the hostname, not the full URL with its path.
    if (profile.domains.some(domain => domain.test(hostname))) {
      score += 100;
    }

    // Check keywords in URL
    if (profile.keywords.some(keyword => keyword.test(url))) {
      score += 50;
    }

    // Check words in content (only language-specific words)
    let wordMatches = 0;
    for (const word of words) {
      if (profile.words.has(word) && !ENGLISH_COMMON_WORDS.has(word)) {
        wordMatches++;
      }
    }
    score += wordMatches;

    // Check phrases in content
    for (const phrase of profile.phrases) {
      if (lowerContent.includes(phrase)) {
        score += 20;
      }
    }

    // Check special characters (strong indicator for non-English). The
    // pattern is global, so the match array holds every occurrence.
    const charMatches = content.match(profile.chars)?.length ?? 0;
    score += charMatches * 5;

    console.log(`Language ${lang}: score=${score}, matches=${wordMatches}, chars=${charMatches}`);

    if (best === undefined || score > best.score) {
      best = { lang, score };
    }
  }

  if (best !== undefined) {
    // Calculate score relative to content length
    const scoreRatio = best.score / words.length;
    if (scoreRatio > 0.08 || best.score > 20) {
      console.log(`Detected ${best.lang}: scoreRatio=${scoreRatio.toFixed(3)}, score=${best.score}`);
      return best.lang;
    }
  }

  console.log('Defaulting to English - no other language detected');
  // Default to English
  return 'english';
}
/**
 * Returns the block of prompt instructions that tells the model which
 * language to write the podcast conversation in and how the two hosts behave.
 *
 * The instructions themselves are written in the target language.
 *
 * @param language - Language name as returned by `detectLanguage`
 *   (`'italian'`, `'spanish'`, `'french'`, `'german'`); any other value
 *   yields the English instructions.
 * @returns The instruction text to embed in the generation prompt.
 */
function getLanguageInstructions(language: string): string {
  switch (language) {
    case 'italian':
      return `IMPORTANTE: Questa conversazione DEVE essere generata interamente in LINGUA ITALIANA. Non usare inglese.
ISTRUZIONI PRECISE:
1. LINGUA OBBLIGATORIA: SOLO ITALIANO - ogni singola parola deve essere in italiano
2. CONTENUTO: Analizza e discuti il contenuto fornito in italiano
3. PERSONAGGI:
- HOST 1 (Alex): Entusiasta, ottimista, usa espressioni come "Wow!", "Incredibile!", "Fantastico!"
- HOST 2 (Sarah): Scettica, sarcastica, usa umorismo secco
4. FORMATO: Conversazione naturale con emozioni [parentesi quadre]
5. LINGUA: ASSOLUTAMENTE SOLO ITALIANO - zero inglese permesso
Ricorda: OGNI PAROLA di questa conversazione deve essere in italiano, senza eccezioni.`;
    case 'spanish':
      return `IMPORTANTE: Esta conversación DEBE ser generada completamente en IDIOMA ESPAÑOL. No uses inglés.
INSTRUCCIONES PRECISAS:
1. IDIOMA OBLIGATORIO: SOLO ESPAÑOL - cada palabra debe estar en español
2. CONTENIDO: Analiza y discute el contenido proporcionado en español
3. PERSONAJES:
- HOST 1 (Alex): Entusiasta, optimista, usa expresiones como "¡Wow!", "¡Increíble!", "¡Fantástico!"
- HOST 2 (Sarah): Escéptica, sarcástica, usa humor seco
4. FORMATO: Conversación natural con emociones [corchetes]
5. IDIOMA: ABSOLUTAMENTE SOLO ESPAÑOL - cero inglés permitido
Recuerda: CADA PALABRA de esta conversación debe estar en español, sin excepciones.`;
    case 'french':
      return `IMPORTANT: Cette conversation DOIT être générée entièrement en LANGUE FRANÇAISE. N'utilisez pas d'anglais.
INSTRUCTIONS PRÉCISES:
1. LANGUE OBLIGATOIRE: SEULEMENT FRANÇAIS - chaque mot doit être en français
2. CONTENU: Analysez et discutez du contenu fourni en français
3. PERSONNAGES:
- HOST 1 (Alex): Enthousiaste, optimiste, utilise des expressions comme "Wow!", "Incroyable!", "Fantastique!"
- HOST 2 (Sarah): Sceptique, sarcastique, utilise l'humour sec
4. FORMAT: Conversation naturelle avec émotions [crochets]
5. LANGUE: ABSOLUMENT SEULEMENT FRANÇAIS - zéro anglais permis
Rappelez-vous: CHAQUE MOT de cette conversation doit être en français, sans exceptions.`;
    case 'german':
      // The sample expression is "Unglaublich!" (not the English "Incredible!")
      // so the prompt does not contradict its own German-only rule.
      return `WICHTIG: Dieses Gespräch MUSS vollständig in DEUTSCHER SPRACHE generiert werden. Kein Englisch verwenden.
GENAUE ANWEISUNGEN:
1. OBLIGATORISCHE SPRACHE: NUR DEUTSCH - jedes Wort muss auf Deutsch sein
2. INHALT: Analysiere und diskutiere den bereitgestellten Inhalt auf Deutsch
3. PERSONEN:
- HOST 1 (Alex): Enthusiastisch, optimistisch, verwendet Ausdrücke wie "Wow!", "Unglaublich!", "Fantastisch!"
- HOST 2 (Sarah): Skeptisch, sarkastisch, verwendet trockenen Humor
4. FORMAT: Natürliche Unterhaltung mit Emotionen [eckige Klammern]
5. SPRACHE: ABSOLUT NUR DEUTSCH - Null Englisch erlaubt
Denken Sie daran: JEDES WORT dieses Gesprächs muss auf Deutsch sein, ohne Ausnahmen.`;
    default:
      // English, and the fallback for any unrecognised language name.
      return `Generate this conversation in ENGLISH. Use only English for the entire conversation.
HOST 1 PERSONALITY: Bubbly, excited, enthusiastic, and optimistic. Uses expressions like "Wow!", "Amazing!", "That's so cool!".
HOST 2 PERSONALITY: Skeptical, sarcastic, and thoughtful. Uses dry humor and irony.
USE ONLY ENGLISH FOR THE ENTIRE CONVERSATION.`;
  }
}
const messageSchema = z.object({
id: z.string(),
speaker: z.enum(['host1', 'host2']),
@@ -23,6 +204,17 @@ export async function POST(request: NextRequest) {
console.log('Generating streaming conversation for:', { title, url, contentLength: content.length, contentPreview: content.substring(0, 200) + '...' });
// Detect language from URL and content
const detectedLanguage = detectLanguage(url, content);
const languageInstructions = getLanguageInstructions(detectedLanguage);
console.log('Language detection debug:', {
url,
contentLength: content.length,
contentPreview: content.substring(0, 100) + '...',
detectedLanguage
});
// Stream podcast conversation using OpenAI
const result = streamObject({
model: openai('gpt-4o-mini'),
@@ -32,26 +224,29 @@ export async function POST(request: NextRequest) {
schemaDescription: 'A single message in a podcast-style conversation between two hosts',
prompt: `You are generating a podcast conversation between two hosts discussing the following scraped content from "${title}" at ${url}.
CRITICAL LANGUAGE DETECTION:
- URL: ${url}
- Content language detected: ${detectedLanguage}
- You MUST generate the conversation in the same language as the content
CONTENT:
${content}
${languageInstructions}
Generate a natural, engaging podcast conversation with at least 20 turns (10 per host). The conversation should:
1. HOST 1 PERSONALITY: Bubbly, excited, enthusiastic, and optimistic. Uses expressions like "Wow!", "Amazing!", "That's so cool!". Often laughs [giggles] and shows genuine excitement.
1. Include emotional expressions in brackets like [giggles], [laughs], [sarcastically], [whispers], [excitedly], [thoughtfully], etc.
2. HOST 2 PERSONALITY: Skeptical, sarcastic, and thoughtful. Uses dry humor and irony. Often makes sarcastic comments [sarcastically] and plays devil's advocate.
2. Make it sound natural and conversational, like a real podcast
3. Use the same language as the content (if content is in English, respond in English; if in Italian, respond in Italian, etc.)
3. Include timestamps in MM:SS format (starting from 0:15 and incrementing by 20-30 seconds each)
4. Include emotional expressions in brackets like [giggles], [laughs], [sarcastically], [whispers], [excitedly], [thoughtfully], etc.
4. The conversation should flow naturally and cover the main points of the content
5. Make it sound natural and conversational, like a real podcast
5. Create a substantial conversation that thoroughly explores the content from multiple angles
6. Include timestamps in MM:SS format (starting from 0:15 and incrementing by 20-30 seconds each)
7. The conversation should flow naturally and cover the main points of the content
8. Create a substantial conversation that thoroughly explores the content from multiple angles
REMEMBER: This conversation MUST be in ${detectedLanguage.toUpperCase()} language only.
Generate the messages one by one as an array. Each message should have:
- id: sequential number as string
@@ -59,7 +254,6 @@ Generate the messages one by one as an array. Each message should have:
- text: the message content with emotional expressions in brackets
- timestamp: in MM:SS format`,
temperature: 0.7,
maxTokens: 4000,
onError({ error }) {
console.error('Streaming error:', error);
},