mirror of
https://github.com/leonvanzyl/autocoder.git
synced 2026-03-25 23:03:09 +00:00
Add support for uploading Markdown, Text, Word (.docx), CSV, Excel (.xlsx), PDF, and PowerPoint (.pptx) files in addition to existing JPEG/PNG image uploads in the spec creation and project expansion chat interfaces. Backend changes: - New server/utils/document_extraction.py: in-memory text extraction for all document formats using python-docx, openpyxl, PyPDF2, python-pptx (no disk persistence) - Rename ImageAttachment to FileAttachment across schemas, routers, and chat session services - Add build_attachment_content_blocks() helper in chat_constants.py to route images as image content blocks and documents as extracted text blocks - Separate size limits: 5MB for images, 20MB for documents - Handle extraction errors (corrupt files, encrypted PDFs) gracefully Frontend changes: - Widen accepted MIME types and file extensions in both chat components - Add resolveMimeType() fallback for browsers that don't set MIME on .md files - Document attachments display with FileText icon instead of image thumbnail - ChatMessage renders documents as compact pills with filename and size - Update help text from "attach images" to "attach files" Dependencies added: python-docx, openpyxl, PyPDF2, python-pptx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
222 lines
7.0 KiB
Python
222 lines
7.0 KiB
Python
"""
|
|
Document Extraction Utility
|
|
============================
|
|
|
|
Extracts text content from various document formats in memory (no disk I/O).
|
|
Supports: TXT, MD, CSV, DOCX, XLSX, PDF, PPTX.
|
|
"""
|
|
|
|
import base64
|
|
import csv
|
|
import io
|
|
import logging
|
|
|
|
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Upper bound on the number of extracted characters forwarded to Claude
MAX_EXTRACTED_CHARS = 200_000

# Excel limits: rows read per sheet, and sheets read per workbook
MAX_EXCEL_ROWS_PER_SHEET = 10_000
MAX_EXCEL_SHEETS = 50

# Supported document MIME types, each mapped to its file extension
DOCUMENT_MIME_TYPES: dict[str, str] = {
    "text/plain": ".txt",
    "text/markdown": ".md",
    "text/csv": ".csv",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
}

# Supported image MIME types
IMAGE_MIME_TYPES = {"image/jpeg", "image/png"}

# Every MIME type accepted as an attachment: images plus documents
ALL_ALLOWED_MIME_TYPES = IMAGE_MIME_TYPES.union(DOCUMENT_MIME_TYPES)
|
|
|
|
|
|
def is_image(mime_type: str) -> bool:
    """Return True when ``mime_type`` is one of the entries in IMAGE_MIME_TYPES."""
    supported = IMAGE_MIME_TYPES
    return mime_type in supported
|
|
|
|
|
|
def is_document(mime_type: str) -> bool:
    """Return True when ``mime_type`` is a key of DOCUMENT_MIME_TYPES."""
    supported = DOCUMENT_MIME_TYPES
    return mime_type in supported
|
|
|
|
|
|
class DocumentExtractionError(Exception):
    """Signals that text could not be extracted from a document.

    The exception message has the form ``Failed to read <filename>: <reason>``;
    both parts are also kept as the ``filename`` and ``reason`` attributes.
    """

    def __init__(self, filename: str, reason: str):
        message = f"Failed to read {filename}: {reason}"
        super().__init__(message)
        self.filename = filename
        self.reason = reason
|
|
|
|
|
|
def _truncate(text: str) -> str:
    """Cap ``text`` at MAX_EXTRACTED_CHARS characters.

    Text within the limit is returned unchanged. Longer text is cut at the
    limit and followed by a notice stating how many characters were dropped.
    """
    overflow = len(text) - MAX_EXTRACTED_CHARS
    if overflow <= 0:
        return text

    kept = text[:MAX_EXTRACTED_CHARS]
    return kept + f"\n\n[... truncated, {overflow:,} characters omitted]"
|
|
|
|
|
|
def _extract_plain_text(data: bytes) -> str:
|
|
"""Extract text from plain text or markdown files."""
|
|
try:
|
|
return data.decode("utf-8")
|
|
except UnicodeDecodeError:
|
|
return data.decode("latin-1")
|
|
|
|
|
|
def _extract_csv(data: bytes) -> str:
|
|
"""Extract text from CSV files, formatted as a readable table."""
|
|
try:
|
|
text = data.decode("utf-8")
|
|
except UnicodeDecodeError:
|
|
text = data.decode("latin-1")
|
|
|
|
reader = csv.reader(io.StringIO(text))
|
|
lines = []
|
|
for i, row in enumerate(reader):
|
|
lines.append(f"Row {i + 1}: {', '.join(row)}")
|
|
return "\n".join(lines)
|
|
|
|
|
|
def _extract_docx(data: bytes) -> str:
    """Collect the non-blank paragraphs of a Word document.

    The document is opened from an in-memory buffer. Paragraphs whose text is
    empty or whitespace-only are dropped; the rest are joined with a blank
    line between them.
    """
    from docx import Document

    document = Document(io.BytesIO(data))
    non_blank = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            non_blank.append(content)
    return "\n\n".join(non_blank)
|
|
|
|
|
|
def _extract_xlsx(data: bytes) -> str:
    """Extract cell values from an Excel workbook as tab-separated text.

    Args:
        data: Raw .xlsx file content.

    Returns:
        One section per sheet, separated by blank lines. Each section starts
        with ``=== Sheet: <name> ===`` followed by one tab-separated line per
        row; empty cells become empty strings. At most MAX_EXCEL_SHEETS sheets
        and MAX_EXCEL_ROWS_PER_SHEET rows per sheet are read, and a notice
        line is added wherever content was left out.
    """
    from openpyxl import load_workbook

    # read_only avoids loading whole sheets at once; data_only asks for
    # values rather than formula text.
    wb = load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    sections = []

    try:
        for sheet_idx, sheet_name in enumerate(wb.sheetnames):
            if sheet_idx >= MAX_EXCEL_SHEETS:
                sections.append(f"\n[... {len(wb.sheetnames) - MAX_EXCEL_SHEETS} more sheets omitted]")
                break

            ws = wb[sheet_name]
            # Sheets without a cell grid (e.g. chart sheets) cannot be
            # iterated; skip them rather than failing the whole workbook.
            if not hasattr(ws, "iter_rows"):
                continue

            rows_text = [f"=== Sheet: {sheet_name} ==="]
            row_count = 0

            for row in ws.iter_rows(values_only=True):
                if row_count >= MAX_EXCEL_ROWS_PER_SHEET:
                    rows_text.append(f"[... more rows omitted, limit {MAX_EXCEL_ROWS_PER_SHEET:,} rows/sheet]")
                    break
                cells = [str(cell) if cell is not None else "" for cell in row]
                rows_text.append("\t".join(cells))
                row_count += 1

            sections.append("\n".join(rows_text))
    finally:
        # Close the workbook even when reading a sheet raises.
        wb.close()

    return "\n\n".join(sections)
|
|
|
|
|
|
def _extract_pdf(data: bytes, filename: str) -> str:
    """Extract the text of every page of a PDF.

    Args:
        data: Raw PDF file content.
        filename: Original filename, used only in error messages.

    Returns:
        The text of each page that has any, each prefixed with
        ``--- Page N ---`` (N starts at 1) and separated by blank lines.
        Pages with no extractable text are skipped but still counted.

    Raises:
        DocumentExtractionError: If the PDF is password-protected.
        PdfReadError: If the file cannot be parsed for any other reason.
    """
    from PyPDF2 import PdfReader
    from PyPDF2.errors import PdfReadError

    try:
        reader = PdfReader(io.BytesIO(data))
    except PdfReadError as e:
        detail = str(e).lower()
        if "encrypt" in detail or "password" in detail:
            # Keep the parser error attached as the explicit cause.
            raise DocumentExtractionError(filename, "PDF is password-protected") from e
        raise

    if reader.is_encrypted:
        raise DocumentExtractionError(filename, "PDF is password-protected")

    pages = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            pages.append(f"--- Page {page_number} ---\n{text}")

    return "\n\n".join(pages)
|
|
|
|
|
|
def _extract_pptx(data: bytes) -> str:
    """Collect the text of each slide in a PowerPoint presentation.

    Only shapes that have a text frame are read. Each non-blank paragraph is
    stripped and emitted on its own line under a ``--- Slide N ---`` header
    (N starts at 1). Slides that yield no text are left out, and the
    remaining slides are separated by blank lines.
    """
    from pptx import Presentation

    presentation = Presentation(io.BytesIO(data))
    rendered = []

    for number, slide in enumerate(presentation.slides, start=1):
        lines = [
            stripped
            for shape in slide.shapes
            if shape.has_text_frame
            for stripped in (para.text.strip() for para in shape.text_frame.paragraphs)
            if stripped
        ]
        if not lines:
            continue
        rendered.append(f"--- Slide {number} ---\n" + "\n".join(lines))

    return "\n\n".join(rendered)
|
|
|
|
|
|
def extract_text_from_document(base64_data: str, mime_type: str, filename: str) -> str:
    """
    Extract text content from a document file.

    Args:
        base64_data: Base64-encoded file content
        mime_type: MIME type of the document
        filename: Original filename (for error messages)

    Returns:
        Extracted text content, truncated if necessary. When nothing could be
        extracted, a bracketed placeholder naming the file is returned instead.

    Raises:
        DocumentExtractionError: If the MIME type is unsupported, the base64
            payload cannot be decoded, or extraction fails
    """
    if mime_type not in DOCUMENT_MIME_TYPES:
        raise DocumentExtractionError(filename, f"unsupported document type: {mime_type}")

    try:
        data = base64.b64decode(base64_data)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding) is a ValueError subclass; TypeError
        # covers input that is neither str nor bytes-like.
        raise DocumentExtractionError(filename, f"invalid base64 data: {e}") from e

    # One extractor per supported MIME type. The PDF extractor also needs the
    # filename for its own error messages.
    extractors = {
        "text/plain": _extract_plain_text,
        "text/markdown": _extract_plain_text,
        "text/csv": _extract_csv,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": _extract_docx,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": _extract_xlsx,
        "application/pdf": lambda raw: _extract_pdf(raw, filename),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": _extract_pptx,
    }

    extractor = extractors.get(mime_type)
    if extractor is None:
        # Guards against a type listed in DOCUMENT_MIME_TYPES with no extractor.
        raise DocumentExtractionError(filename, f"unsupported document type: {mime_type}")

    try:
        text = extractor(data)
    except DocumentExtractionError:
        # Already carries a specific reason; pass it through untouched.
        raise
    except Exception as e:
        # Any other failure is reported under one generic reason; the real
        # error is logged and kept as the cause.
        logger.warning("Document extraction failed for %s: %s", filename, e)
        raise DocumentExtractionError(
            filename, "file appears to be corrupt or in an unexpected format"
        ) from e

    if not text or not text.strip():
        return f"[File {filename} is empty or contains no extractable text]"

    return _truncate(text)
|