feat: add document file upload support for spec creation and project expansion

Add support for uploading Markdown, Text, Word (.docx), CSV, Excel (.xlsx), PDF, and PowerPoint (.pptx) files in addition to existing JPEG/PNG image uploads in the spec creation and project expansion chat interfaces. Backend changes: - New server/utils/document_extraction.py: in-memory text extraction for all document formats using python-docx, openpyxl, PyPDF2, python-pptx (no disk persistence) - Rename ImageAttachment to FileAttachment across schemas, routers, and chat session services - Add build_attachment_content_blocks() helper in chat_constants.py to route images as image content blocks and documents as extracted text blocks - Separate size limits: 5MB for images, 20MB for documents - Handle extraction errors (corrupt files, encrypted PDFs) gracefully Frontend changes: - Widen accepted MIME types and file extensions in both chat components - Add resolveMimeType() fallback for browsers that don't set MIME on .md files - Document attachments display with FileText icon instead of image thumbnail - ChatMessage renders documents as compact pills with filename and size - Update help text from "attach images" to "attach files" Dependencies added: python-docx, openpyxl, PyPDF2, python-pptx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 07:13:09 +00:00 · 2026-03-25 12:51:17 +02:00
parent fca1f6a5e2
commit 7210c6f066
15 changed files with 513 additions and 123 deletions
--- a/server/services/chat_constants.py
+++ b/server/services/chat_constants.py
@@ -35,6 +35,13 @@ if _root_str not in sys.path:
 from env_constants import API_ENV_VARS  # noqa: E402, F401
 from rate_limit_utils import is_rate_limit_error, parse_retry_after  # noqa: E402, F401

+from ..schemas import FileAttachment
+from ..utils.document_extraction import (
+    extract_text_from_document,
+    is_document,
+    is_image,
+)
+
 logger = logging.getLogger(__name__)


@@ -88,6 +95,35 @@ async def safe_receive_response(client: Any, log: logging.Logger) -> AsyncGenera
            raise


+def build_attachment_content_blocks(attachments: list[FileAttachment]) -> list[dict]:
+    """Convert FileAttachment objects to Claude API content blocks.
+
+    Images become image content blocks (passed directly to Claude's vision).
+    Documents are extracted to text and become text content blocks.
+
+    Raises:
+        DocumentExtractionError: If a document cannot be read.
+    """
+    blocks: list[dict] = []
+    for att in attachments:
+        if is_image(att.mimeType):
+            blocks.append({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": att.mimeType,
+                    "data": att.base64Data,
+                }
+            })
+        elif is_document(att.mimeType):
+            text = extract_text_from_document(att.base64Data, att.mimeType, att.filename)
+            blocks.append({
+                "type": "text",
+                "text": f"[Content of uploaded file: {att.filename}]\n\n{text}",
+            })
+    return blocks
+
+
 async def make_multimodal_message(content_blocks: list[dict]) -> AsyncGenerator[dict, None]:
    """Yield a single multimodal user message in Claude Agent SDK format.