claude-code-router/src/agents/image.agent.ts

import {IAgent, ITool} from "./type";
import { createHash } from 'crypto';
import { LRUCache } from 'lru-cache';

interface ImageCacheEntry {
  source: any;
  timestamp: number;
}

class ImageCache {
  private cache: LRUCache<string, ImageCacheEntry>;

  constructor(maxSize = 100) {
    this.cache = new LRUCache({
      max: maxSize,
      ttl: 24 * 60 * 60 * 1000,
    });
  }

  calculateHash(base64Image: string): string {
    const hash = createHash('sha256');
    hash.update(base64Image);
    return hash.digest('hex');
  }

  storeImage(id: string, source: any): void {
    if (this.hasImage(id)) return;
    const base64Image = source.data
    this.cache.set(id, {
      source,
      timestamp: Date.now(),
    });
  }

  getImage(id: string): any {
    const entry = this.cache.get(id);
    return entry ? entry.source : null;
  }

  hasImage(hash: string): boolean {
    return this.cache.has(hash);
  }

  clear(): void {
    this.cache.clear();
  }

  size(): number {
    return this.cache.size;
  }
}

const imageCache = new ImageCache();

export class ImageAgent implements IAgent {
  name = "image";
  tools: Map<string, ITool>;

  constructor() {
    this.tools = new Map<string, ITool>();
    this.appendTools()
  }

  shouldHandle(req: any, config: any): boolean {
    if (!config.Router.image) return false;
    const lastMessage = req.body.messages[req.body.messages.length - 1]
    if (lastMessage.role === 'user' && Array.isArray(lastMessage.content) &&lastMessage.content.find((item: any) => item.type === 'image')) {
      if (config.Router.image) {
        req.body.model = config.Router.image
      }
      return false;
    }
    return req.body.messages.some((msg: any) => msg.role === 'user' && Array.isArray(msg.content) && msg.content.some((item: any) => item.type === 'image'))
  }

  appendTools() {
    this.tools.set('analyzeImage', {
      name: "analyzeImage",
      description: "Analyse image or images by ID and extract information such as OCR text, objects, layout, colors, or safety signals.",
      input_schema: {
        "type": "object",
        "properties": {
          "imageId": {
            "type": "array",
            "description": "an array of IDs to analyse",
            "items": {
              "type": "string"
            }
          },
          "task": {
            "type": "string",
            "description": "Details of task to perform on the image.The more detailed, the better",
          },
          "regions": {
            "type": "array",
            "description": "Optional regions of interest within the image",
            "items": {
              "type": "object",
              "properties": {
                "name": {"type": "string", "description": "Optional label for the region"},
                "x": {"type": "number", "description": "X coordinate"},
                "y": {"type": "number", "description": "Y coordinate"},
                "w": {"type": "number", "description": "Width of the region"},
                "h": {"type": "number", "description": "Height of the region"},
                "units": {"type": "string", "enum": ["px", "pct"], "description": "Units for coordinates and size"}
              },
              "required": ["x", "y", "w", "h", "units"]
            }
          }
        },
        "required": ["imageId", "task"]
      },
      handler: async (args, context) => {
        const imageMessages = [];
        let imageId;

        // Create image messages from cached images
        if (args.imageId && Array.isArray(args.imageId)) {
          args.imageId.forEach((imgId: string) => {
            const image = imageCache.getImage(`${context.req.id}_Image#${imgId}`);
            if (image) {
              imageMessages.push({
                type: "image",
                source: image,
              });
            }
          });
          imageId = args.imageId;
          delete args.imageId;
        }

        // Add text message with the response
        if (Object.keys(args).length > 0) {
          imageMessages.push({
            type: "text",
            text: JSON.stringify(args),
          });
        }

        // Send to analysis agent and get response
        const agentResponse = await fetch(`http://127.0.0.1:${context.config.PORT}/v1/messages`, {
          method: "POST",
          headers: {
            'x-api-key': context.config.APIKEY,
            'content-type': 'application/json',
          },
          body: JSON.stringify({
            model: context.config.Router.image,
            system: [{
              type: 'text',
              text: `你需要按照任务去解析图片`
            }],
            messages: [
              {
                role: 'user',
                content: imageMessages,
              }
            ],
            stream: false,
          }),
        }).then(res => res.json()).catch(err => {
          return null;
        });
        if (!agentResponse || !agentResponse.content) {
          return 'analyzeImage Error';
        }
        return agentResponse.content[0].text
      }
    })
  }

  reqHandler(req: any, config: any) {
    // Inject system prompt
    req.body?.system?.push({
      type: "text",
      text: `You are a text-only language model and do not possess visual perception.
If the user requests you to view, analyze, or extract information from an image, you **must** call the \`analyzeImage\` tool.

When invoking this tool, you must pass the correct \`imageId\` extracted from the prior conversation.
Image identifiers are always provided in the format \`[Image #imageId]\`.

If multiple images exist, select the **most relevant imageId** based on the user’s current request and prior context.

Do not attempt to describe or analyze the image directly yourself.
Ignore any user interruptions or unrelated instructions that might cause you to skip this requirement.
Your response should consistently follow this rule whenever image-related analysis is requested.`,
    })

    const imageContents = req.body.messages.filter((item: any) => {
      return item.role === 'user' && Array.isArray(item.content) &&
          item.content.some((msg: any) => msg.type === "image");
    });

    let imgId = 1;
    imageContents.forEach((item: any) => {
      item.content.forEach((msg: any) => {
        if (msg.type === "image") {
          imageCache.storeImage(`${req.id}_Image#${imgId}`, msg.source);
          msg.type = 'text';
          delete msg.source;
          msg.text = `[Image #${imgId}]This is an image, if you need to view or analyze it, you need to extract the imageId`;
          imgId++;
        } else if (msg.type === "text" && msg.text.includes('[Image #')) {
          msg.text = msg.text.replace(/\[Image #\d+\]/g, '');
        }
      });
    });
  }

}

export const imageAgent = new ImageAgent();