feat: 添加 OCR 词汇提取功能

新增 OCR 页面，用户可上传教材词汇表截图，使用 GLM-4.6V 视觉模型提取单词-释义对并保存到指定文件夹。

- AI 管道: src/lib/bigmodel/ocr/ (orchestrator, types)
- 后端模块: src/modules/ocr/ (action-service-repository 架构)
- 前端页面: src/app/(features)/ocr/ (拖拽上传、文件夹选择)
- i18n: 8 种语言翻译支持
2026-03-10 15:21:45 +08:00
parent 683a4104ec
commit 9b78fd5215
18 changed files with 868 additions and 0 deletions
--- a/src/lib/bigmodel/ocr/orchestrator.ts
+++ b/src/lib/bigmodel/ocr/orchestrator.ts
@@ -0,0 +1,152 @@
+import OpenAI from "openai";
+import { parseAIGeneratedJSON } from "@/utils/json";
+import { createLogger } from "@/lib/logger";
+import { OCRInput, OCROutput, OCRRawResponse } from "./types";
+
+// Logger scoped to the OCR orchestrator.
+const log = createLogger("ocr-orchestrator");
+
+// OpenAI SDK client pointed at the BigModel v4 endpoint; the key is read from ZHIPU_API_KEY.
+const ZHIPU_BASE_URL = "https://open.bigmodel.cn/api/paas/v4";
+const openai = new OpenAI({ baseURL: ZHIPU_BASE_URL, apiKey: process.env.ZHIPU_API_KEY });
+
+/** Logs an unexpected failure and wraps it in the generic OCR processing error. */
+function toOCRFailure(error: unknown): Error {
+  log.error("OCR failed", { error });
+  const errorMessage = error instanceof Error ? error.message : "未知错误";
+  return new Error(`OCR 处理失败: ${errorMessage}`);
+}
+
+/**
+ * Executes OCR on an image to extract vocabulary word-definition pairs.
+ *
+ * Sends the image and an extraction prompt to the "glm-4.6v" chat model, parses
+ * the JSON reply and keeps only entries with string `word` and `definition`.
+ *
+ * @param input - Image (sent unchanged as `image_url.url`) and optional language hints.
+ * @returns The extracted pairs and the languages reported by the model.
+ * @throws Error with the validation message when the reply is empty, lacks a `pairs`
+ *   array or yields no valid pair; other failures become `OCR 处理失败: <cause>`.
+ *
+ * @example
+ * ```typescript
+ * const result = await executeOCR({
+ *   imageBase64: "iVBORw0KGgo...",
+ *   sourceLanguage: "English",
+ *   targetLanguage: "Chinese"
+ * });
+ * // result.pairs: [{ word: "hello", definition: "你好" }, ...]
+ * ```
+ */
+export async function executeOCR(input: OCRInput): Promise<OCROutput> {
+  const { imageBase64, sourceLanguage, targetLanguage } = input;
+
+  log.debug("Starting OCR", {
+    hasSourceHint: !!sourceLanguage,
+    hasTargetHint: !!targetLanguage,
+    imageLength: imageBase64.length,
+  });
+
+  const languageHints = [
+    sourceLanguage && `源语言提示: ${sourceLanguage}`,
+    targetLanguage && `目标语言提示: ${targetLanguage}`,
+  ].filter(Boolean);
+
+  const prompt = `
+你是一个专业的OCR识别助手，专门从词汇表截图中提取单词和释义。
+
+${languageHints.length > 0 ? `语言提示：\n${languageHints.join("\n")}\n` : ""}
+
+你的任务是分析图片中的词汇表，提取所有单词-释义对。
+
+要求：
+1. 识别图片中的词汇表结构（可能是两列或多列）
+2. 提取每一行的单词和对应的释义/翻译
+3. 自动检测源语言和目标语言
+4. 保持原始大小写和拼写
+5. 如果图片模糊或不清晰，尽力识别并标注置信度较低的项目
+6. 忽略表头、页码等非词汇内容
+
+返回 JSON 格式：
+{
+  "pairs": [
+    { "word": "单词1", "definition": "释义1" },
+    { "word": "单词2", "definition": "释义2" }
+  ],
+  "detectedSourceLanguage": "检测到的源语言",
+  "detectedTargetLanguage": "检测到的目标语言"
+}
+
+只返回 JSON，不要任何其他文字。
+`.trim();
+
+  // Only the request and JSON parsing are wrapped; validation errors below propagate unchanged.
+  let content: string | null | undefined;
+  try {
+    const response = await openai.chat.completions.create({
+      model: "glm-4.6v",
+      messages: [
+        {
+          role: "user",
+          content: [
+            { type: "image_url", image_url: { url: imageBase64 } },
+            { type: "text", text: prompt },
+          ],
+        },
+      ],
+      temperature: 0.1,
+    });
+    content = response.choices[0]?.message?.content;
+  } catch (error) {
+    throw toOCRFailure(error);
+  }
+
+  if (!content) {
+    log.error("OCR returned empty response");
+    throw new Error("OCR 返回空响应");
+  }
+
+  log.debug("Received OCR response", { contentLength: content.length });
+
+  let parsed: OCRRawResponse;
+  try {
+    parsed = parseAIGeneratedJSON<OCRRawResponse>(content);
+  } catch (error) {
+    throw toOCRFailure(error);
+  }
+
+  // `parsed` comes from model output, so it may be null or lack a `pairs` array.
+  if (!parsed || !Array.isArray(parsed.pairs)) {
+    log.error("Invalid OCR response: missing or invalid pairs array", { parsed });
+    throw new Error("OCR 响应格式无效：缺少 pairs 数组");
+  }
+
+  const validPairs: OCROutput["pairs"] = [];
+  for (const pair of parsed.pairs) {
+    // Null-check before property access; copy only the known fields, dropping extras.
+    if (pair != null && typeof pair.word === "string" && typeof pair.definition === "string") {
+      validPairs.push({ word: pair.word, definition: pair.definition });
+    } else {
+      log.warn("Skipping invalid pair", { pair });
+    }
+  }
+
+  if (validPairs.length === 0) {
+    log.error("No valid pairs extracted from image");
+    throw new Error("未能从图片中提取有效的词汇对");
+  }
+
+  const result: OCROutput = {
+    pairs: validPairs,
+    detectedSourceLanguage: parsed.detectedSourceLanguage,
+    detectedTargetLanguage: parsed.detectedTargetLanguage,
+  };
+
+  log.info("OCR completed successfully", {
+    pairCount: result.pairs.length,
+    sourceLanguage: result.detectedSourceLanguage,
+    targetLanguage: result.detectedTargetLanguage,
+  });
+
+  return result;
+}
--- a/src/lib/bigmodel/ocr/types.ts
+++ b/src/lib/bigmodel/ocr/types.ts
@@ -0,0 +1,44 @@
+/**
+ * Input for the OCR pipeline (the argument of `executeOCR`).
+ */
+export interface OCRInput {
+  /** Base64 encoded image (without data URL prefix); sent as-is as the request's image URL */
+  imageBase64: string;
+  /** Optional: hint about source language; added to the prompt when non-empty */
+  sourceLanguage?: string;
+  /** Optional: hint about target/translation language; added to the prompt when non-empty */
+  targetLanguage?: string;
+}
+
+/**
+ * Single word-definition pair extracted from an image.
+ */
+export interface VocabularyPair {
+  /** The word as returned by the model (the prompt asks it to keep original case and spelling) */
+  word: string;
+  /** The translation/definition paired with `word` */
+  definition: string;
+}
+
+/**
+ * Output from the OCR pipeline.
+ */
+export interface OCROutput {
+  /** Extracted word-definition pairs; `executeOCR` throws rather than returning an empty list */
+  pairs: VocabularyPair[];
+  /** Source language as reported by the model, if it provided one */
+  detectedSourceLanguage?: string;
+  /** Target/translation language as reported by the model, if it provided one */
+  detectedTargetLanguage?: string;
+}
+
+/**
+ * Shape the model's JSON reply is parsed into before its entries are validated.
+ *
+ * The field names mirror the JSON keys requested in the OCR prompt.
+ */
+export interface OCRRawResponse {
+  pairs: VocabularyPair[];
+  detectedSourceLanguage?: string;
+  detectedTargetLanguage?: string;
+}