feat: implement intelligent language detection for podcast conversations

- Added robust language detection system that analyzes URL domains, content keywords, and special characters
- Supports automatic detection of English, Italian, Spanish, French, and German
- Generates podcast conversations in the same language as the website content
- Enhanced AI prompts with strong language-specific instructions
- Added comprehensive debug logging for language detection accuracy
- Improved detection thresholds to prevent false positives

The system now automatically detects the language of scraped website content and generates podcast conversations in that language, ensuring localization and improved user experience.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 20:02:04 +02:00
parent 30bb11d62a
commit e32f433c79
1 changed files with 205 additions and 11 deletions
--- a/src/app/api/generate-conversation/route.ts
+++ b/src/app/api/generate-conversation/route.ts
@@ -3,6 +3,187 @@ import { streamObject } from 'ai';
 import { openai } from '@ai-sdk/openai';
 import { z } from 'zod';

+/** Signals used by `detectLanguage` to recognise one language. */
+interface LanguageProfile {
+  /** Patterns tested against the URL's hostname (country-code TLDs). */
+  domains: readonly RegExp[];
+  /** Patterns tested against the whole URL. */
+  keywords: readonly RegExp[];
+  /** Frequent function words of the language, lower-case. */
+  words: ReadonlySet<string>;
+  /** Greetings / courtesy phrases, lower-case, matched as whole words. */
+  phrases: readonly string[];
+  /** Accented / special letters; carries the `g` flag so every occurrence is counted. */
+  chars: RegExp;
+}
+
+// Most frequent English words. A Set gives O(1) lookups inside the per-token loops.
+const ENGLISH_COMMON_WORDS: ReadonlySet<string> = new Set([
+  'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with',
+  'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her',
+  'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up',
+  'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time',
+  'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could',
+  'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think',
+  'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even',
+  'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
+]);
+
+// Per-language detection signals. Declaration order is the tie-break order when scores are equal.
+const LANGUAGE_PROFILES: Readonly<Record<string, LanguageProfile>> = {
+  italian: {
+    domains: [/\.it$/],
+    keywords: [/italia/i, /italian/i, /italiano/i],
+    words: new Set([
+      'il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'una', 'uno', 'che', 'di', 'a', 'da', 'in', 'con',
+      'su', 'per', 'tra', 'fra', 'e', 'ma', 'o', 'se', 'perché', 'come', 'dove', 'quando', 'quanto',
+      'questo', 'quello', 'essere', 'avere', 'fare', 'non', 'sì', 'tutto', 'molto', 'più', 'può',
+      'quale', 'chi', 'cui', 'senza', 'contro', 'durante', 'sulla', 'dallo', 'nella', 'dalla',
+      'coi', 'col', 'negli', 'nelle', 'dagli', 'dalle', 'sugli', 'sulle',
+    ]),
+    phrases: ['ciao', 'grazie', 'prego', 'scusi', 'buongiorno', 'buonasera', 'arrivederci'],
+    chars: /[àèéìòù]/gi,
+  },
+  spanish: {
+    domains: [/\.es$/, /\.mx$/, /\.ar$/, /\.cl$/, /\.co$/],
+    keywords: [/españa/i, /espanol/i, /hispano/i],
+    words: new Set([
+      'el', 'la', 'los', 'las', 'un', 'una', 'uno', 'que', 'de', 'en', 'con', 'por', 'para', 'como',
+      'estar', 'tener', 'hacer', 'poder', 'decir', 'ir', 'ver', 'saber', 'y', 'no', 'sí', 'todo',
+      'muy', 'más', 'puede', 'cual', 'quién', 'sin', 'contra', 'durante', 'al', 'del', 'unos',
+      'unas', 'este', 'esta', 'estos', 'estas', 'ese', 'esa', 'esos', 'esas', 'aquel', 'aquella',
+      'aquellos', 'aquellas',
+    ]),
+    phrases: ['hola', 'gracias', 'por favor', 'de nada', 'buenos días', 'buenas tardes', 'buenas noches', 'adiós'],
+    chars: /[ñáéíóúü]/gi,
+  },
+  french: {
+    domains: [/\.fr$/, /\.ca$/, /\.be$/, /\.ch$/],
+    keywords: [/france/i, /français/i, /french/i],
+    words: new Set([
+      'le', 'la', 'les', 'un', 'une', 'des', 'que', 'de', 'à', 'en', 'avec', 'pour', 'dans', 'sur',
+      'être', 'avoir', 'faire', 'pouvoir', 'dire', 'aller', 'voir', 'savoir', 'et', 'non', 'oui',
+      'tout', 'très', 'plus', 'peut', 'quel', 'qui', 'sans', 'contre', 'pendant', 'au', 'du', 'ces',
+      'cette', 'cet', 'mon', 'ma', 'mes', 'ton', 'ta', 'tes', 'son', 'sa', 'ses',
+    ]),
+    phrases: ['bonjour', 'merci', 's\'il vous plaît', 'de rien', 'bonsoir', 'au revoir', 'salut', 'excusez-moi'],
+    chars: /[àâäçéèêëïîôùûüÿ]/gi,
+  },
+  german: {
+    domains: [/\.de$/, /\.at$/, /\.ch$/],
+    keywords: [/deutschland/i, /deutsch/i, /german/i],
+    words: new Set([
+      'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'ist', 'nicht', 'zu', 'und',
+      'in', 'von', 'mit', 'sich', 'für', 'haben', 'werden', 'können', 'auch', 'nur', 'oder', 'aber',
+      'wenn', 'dass', 'sie', 'ich', 'du', 'wir', 'einem', 'einen', 'eines',
+    ]),
+    phrases: ['hallo', 'danke', 'bitte', 'guten tag', 'guten morgen', 'guten abend', 'auf wiedersehen', 'entschuldigung'],
+    chars: /[äöüß]/gi,
+  },
+};
+
+const UNICODE_LETTER = /\p{L}/u;
+
+/** Returns the lower-cased hostname of `url`, tolerating URLs that lack a scheme. */
+function extractHostname(url: string): string {
+  const trimmed = url.trim();
+  for (const candidate of [trimmed, `https://${trimmed}`]) {
+    try {
+      const host = new URL(candidate).hostname;
+      if (host) return host.toLowerCase();
+    } catch {
+      // Not parseable in this form; fall through to the next candidate.
+    }
+  }
+  return trimmed.toLowerCase();
+}
+
+/** True when `phrase` occurs in `haystack` and is not embedded inside a longer word. */
+function containsPhrase(haystack: string, phrase: string): boolean {
+  const isLetter = (ch: string | undefined): boolean => ch !== undefined && UNICODE_LETTER.test(ch);
+  let from = 0;
+  for (;;) {
+    const at = haystack.indexOf(phrase, from);
+    if (at === -1) return false;
+    if (!isLetter(haystack[at - 1]) && !isLetter(haystack[at + phrase.length])) return true;
+    from = at + 1;
+  }
+}
+
+/**
+ * Guesses the language of scraped page content.
+ *
+ * The content is first checked for common English words; if more than 10% of
+ * its words are in that list the result is `'english'`. Otherwise every
+ * supported language is scored from the URL hostname (+100), URL keywords
+ * (+50), function words (+1 each), courtesy phrases (+20 each) and special
+ * characters (+5 each), and the highest-scoring language that clears the
+ * threshold is returned.
+ *
+ * @param url - Address the content was scraped from; a missing scheme is tolerated.
+ * @param content - Plain text of the page.
+ * @returns `'english'`, `'italian'`, `'spanish'`, `'french'` or `'german'`;
+ *   `'english'` when the content has no words or no language clears the threshold.
+ */
+function detectLanguage(url: string, content: string): string {
+  const lowered = content.toLowerCase();
+  // Tokenise on letter runs so trailing punctuation ("the,") does not hide a word.
+  const words = (lowered.match(/\p{L}+/gu) ?? []).filter(word => word.length > 1);
+  if (words.length === 0) return 'english';
+
+  // First check if content is primarily English - more aggressive detection
+  const englishScore = words.filter(word => ENGLISH_COMMON_WORDS.has(word)).length;
+  const englishRatio = englishScore / words.length;
+
+  // If content is mostly English (>10% common English words), default to English
+  if (englishRatio > 0.10) {
+    console.log(`Detected English content: ${englishScore}/${words.length} = ${(englishRatio * 100).toFixed(1)}% English words`);
+    return 'english';
+  }
+
+  // TLD patterns are anchored with `$`, so they must see the hostname, not the full URL.
+  const hostname = extractHostname(url);
+  let best: { lang: string; score: number; scoreRatio: number } | undefined = undefined;
+
+  // Score every language; picking the best one avoids returning whichever
+  // profile happens to be declared first when several share accents or a TLD.
+  for (const [lang, patterns] of Object.entries(LANGUAGE_PROFILES)) {
+    let score = 0;
+
+    // Check URL patterns (strong indicator)
+    if (patterns.domains.some(domain => domain.test(hostname))) {
+      score += 100;
+    }
+
+    // Check keywords in URL
+    if (patterns.keywords.some(keyword => keyword.test(url))) {
+      score += 50;
+    }
+
+    // Check words in content (only language-specific words)
+    const wordMatches = words.filter(
+      word => patterns.words.has(word) && !ENGLISH_COMMON_WORDS.has(word),
+    ).length;
+    score += wordMatches;
+
+    // Check phrases in content
+    for (const phrase of patterns.phrases) {
+      if (containsPhrase(lowered, phrase)) {
+        score += 20;
+      }
+    }
+
+    // Check special characters (strong indicator for non-English).
+    // The regex is global, so `match` returns one entry per occurrence.
+    const specialCharCount = content.match(patterns.chars)?.length ?? 0;
+    score += specialCharCount * 5;
+
+    console.log(`Language ${lang}: score=${score}, matches=${wordMatches}, chars=${specialCharCount}`);
+
+    // Calculate score relative to content length
+    const scoreRatio = score / words.length;
+
+    // Much higher threshold for non-English languages
+    const clearsThreshold = scoreRatio > 0.08 || score > 20;
+    if (clearsThreshold && (best === undefined || score > best.score)) {
+      best = { lang, score, scoreRatio };
+    }
+  }
+
+  if (best !== undefined) {
+    console.log(`Detected ${best.lang}: scoreRatio=${best.scoreRatio.toFixed(3)}, score=${best.score}`);
+    return best.lang;
+  }
+
+  console.log('Defaulting to English - no other language detected');
+  // Default to English
+  return 'english';
+}
+
+/**
+ * Returns the prompt section that tells the model which language to write the
+ * conversation in and how the two hosts should sound.
+ *
+ * Each non-English prompt is itself written in the target language.
+ *
+ * @param language - Lower-case language name: `'italian'`, `'spanish'`,
+ *   `'french'` or `'german'`. Any other value selects the English instructions.
+ * @returns The instruction text to embed in the generation prompt.
+ */
+function getLanguageInstructions(language: string): string {
+  switch (language) {
+    case 'italian':
+      return `IMPORTANTE: Questa conversazione DEVE essere generata interamente in LINGUA ITALIANA. Non usare inglese.
+
+ISTRUZIONI PRECISE:
+1. LINGUA OBBLIGATORIA: SOLO ITALIANO - ogni singola parola deve essere in italiano
+2. CONTENUTO: Analizza e discuti il contenuto fornito in italiano
+3. PERSONAGGI:
+   - HOST 1 (Alex): Entusiasta, ottimista, usa espressioni come "Wow!", "Incredibile!", "Fantastico!"
+   - HOST 2 (Sarah): Scettica, sarcastica, usa umorismo secco
+4. FORMATO: Conversazione naturale con emozioni [parentesi quadre]
+5. LINGUA: ASSOLUTAMENTE SOLO ITALIANO - zero inglese permesso
+
+Ricorda: OGNI PAROLA di questa conversazione deve essere in italiano, senza eccezioni.`;
+
+    case 'spanish':
+      return `IMPORTANTE: Esta conversación DEBE ser generada completamente en IDIOMA ESPAÑOL. No uses inglés.
+
+INSTRUCCIONES PRECISAS:
+1. IDIOMA OBLIGATORIO: SOLO ESPAÑOL - cada palabra debe estar en español
+2. CONTENIDO: Analiza y discute el contenido proporcionado en español
+3. PERSONAJES:
+   - HOST 1 (Alex): Entusiasta, optimista, usa expresiones como "¡Wow!", "¡Increíble!", "¡Fantástico!"
+   - HOST 2 (Sarah): Escéptica, sarcástica, usa humor seco
+4. FORMATO: Conversación natural con emociones [corchetes]
+5. IDIOMA: ABSOLUTAMENTE SOLO ESPAÑOL - cero inglés permitido
+
+Recuerda: CADA PALABRA de esta conversación debe estar en español, sin excepciones.`;
+
+    case 'french':
+      return `IMPORTANT: Cette conversation DOIT être générée entièrement en LANGUE FRANÇAISE. N'utilisez pas d'anglais.
+
+INSTRUCTIONS PRÉCISES:
+1. LANGUE OBLIGATOIRE: SEULEMENT FRANÇAIS - chaque mot doit être en français
+2. CONTENU: Analysez et discutez du contenu fourni en français
+3. PERSONNAGES:
+   - HOST 1 (Alex): Enthousiaste, optimiste, utilise des expressions comme "Wow!", "Incroyable!", "Fantastique!"
+   - HOST 2 (Sarah): Sceptique, sarcastique, utilise l'humour sec
+4. FORMAT: Conversation naturelle avec émotions [crochets]
+5. LANGUE: ABSOLUMENT SEULEMENT FRANÇAIS - zéro anglais permis
+
+Rappelez-vous: CHAQUE MOT de cette conversation doit être en français, sans exceptions.`;
+
+    case 'german':
+      // The example exclamations are localized like in the other languages;
+      // an English "Incredible!" would contradict the German-only rule below.
+      return `WICHTIG: Dieses Gespräch MUSS vollständig in DEUTSCHER SPRACHE generiert werden. Kein Englisch verwenden.
+
+GENAUE ANWEISUNGEN:
+1. OBLIGATORISCHE SPRACHE: NUR DEUTSCH - jedes Wort muss auf Deutsch sein
+2. INHALT: Analysiere und diskutiere den bereitgestellten Inhalt auf Deutsch
+3. PERSONEN:
+   - HOST 1 (Alex): Enthusiastisch, optimistisch, verwendet Ausdrücke wie "Wow!", "Unglaublich!", "Fantastisch!"
+   - HOST 2 (Sarah): Skeptisch, sarkastisch, verwendet trockenen Humor
+4. FORMAT: Natürliche Unterhaltung mit Emotionen [eckige Klammern]
+5. SPRACHE: ABSOLUT NUR DEUTSCH - Null Englisch erlaubt
+
+Denken Sie daran: JEDES WORT dieses Gesprächs muss auf Deutsch sein, ohne Ausnahmen.`;
+
+    default:
+      // Covers 'english' and any unrecognised language name.
+      return `Generate this conversation in ENGLISH. Use only English for the entire conversation.
+
+HOST 1 PERSONALITY: Bubbly, excited, enthusiastic, and optimistic. Uses expressions like "Wow!", "Amazing!", "That's so cool!".
+HOST 2 PERSONALITY: Skeptical, sarcastic, and thoughtful. Uses dry humor and irony.
+
+USE ONLY ENGLISH FOR THE ENTIRE CONVERSATION.`;
+  }
+}
+
 const messageSchema = z.object({
  id: z.string(),
  speaker: z.enum(['host1', 'host2']),
@@ -23,6 +204,17 @@ export async function POST(request: NextRequest) {

    console.log('Generating streaming conversation for:', { title, url, contentLength: content.length, contentPreview: content.substring(0, 200) + '...' });

+    // Detect language from URL and content
+    const detectedLanguage = detectLanguage(url, content);
+    const languageInstructions = getLanguageInstructions(detectedLanguage);
+    
+    console.log('Language detection debug:', {
+      url,
+      contentLength: content.length,
+      contentPreview: content.substring(0, 100) + '...',
+      detectedLanguage
+    });
+
    // Stream podcast conversation using OpenAI
    const result = streamObject({
      model: openai('gpt-4o-mini'),
@@ -32,26 +224,29 @@ export async function POST(request: NextRequest) {
      schemaDescription: 'A single message in a podcast-style conversation between two hosts',
      prompt: `You are generating a podcast conversation between two hosts discussing the following scraped content from "${title}" at ${url}.

+CRITICAL LANGUAGE DETECTION:
+- URL: ${url}
+- Content language detected: ${detectedLanguage}
+- You MUST generate the conversation in the same language as the content
+
 CONTENT:
 ${content}

+${languageInstructions}
+
 Generate a natural, engaging podcast conversation with at least 20 turns (10 per host). The conversation should:

-1. HOST 1 PERSONALITY: Bubbly, excited, enthusiastic, and optimistic. Uses expressions like "Wow!", "Amazing!", "That's so cool!". Often laughs [giggles] and shows genuine excitement.
+1. Include emotional expressions in brackets like [giggles], [laughs], [sarcastically], [whispers], [excitedly], [thoughtfully], etc.

-2. HOST 2 PERSONALITY: Skeptical, sarcastic, and thoughtful. Uses dry humor and irony. Often makes sarcastic comments [sarcastically] and plays devil's advocate.
+2. Make it sound natural and conversational, like a real podcast

-3. Use the same language as the content (if content is in English, respond in English; if in Italian, respond in Italian, etc.)
+3. Include timestamps in MM:SS format (starting from 0:15 and incrementing by 20-30 seconds each)

-4. Include emotional expressions in brackets like [giggles], [laughs], [sarcastically], [whispers], [excitedly], [thoughtfully], etc.
+4. The conversation should flow naturally and cover the main points of the content

-5. Make it sound natural and conversational, like a real podcast
+5. Create a substantial conversation that thoroughly explores the content from multiple angles

-6. Include timestamps in MM:SS format (starting from 0:15 and incrementing by 20-30 seconds each)
-
-7. The conversation should flow naturally and cover the main points of the content
-
-8. Create a substantial conversation that thoroughly explores the content from multiple angles
+REMEMBER: This conversation MUST be in ${detectedLanguage.toUpperCase()} language only.

 Generate the messages one by one as an array. Each message should have:
 - id: sequential number as string
@@ -59,7 +254,6 @@ Generate the messages one by one as an array. Each message should have:
 - text: the message content with emotional expressions in brackets
 - timestamp: in MM:SS format`,
      temperature: 0.7,
-      maxTokens: 4000,
      onError({ error }) {
        console.error('Streaming error:', error);
      },