feat: 添加 OCR 词汇提取功能

新增 OCR 页面,用户可上传教材词汇表截图,使用 GLM-4.6V 视觉模型
提取单词-释义对并保存到指定文件夹。

- AI 管道: src/lib/bigmodel/ocr/ (orchestrator, types)
- 后端模块: src/modules/ocr/ (action-service-repository 架构)
- 前端页面: src/app/(features)/ocr/ (拖拽上传、folder 选择)
- i18n: 8 种语言翻译支持
This commit is contained in:
2026-03-10 15:21:45 +08:00
parent 683a4104ec
commit 9b78fd5215
18 changed files with 868 additions and 0 deletions

View File

@@ -0,0 +1,253 @@
"use client";
import { useState, useCallback, useRef } from "react";
import { useTranslations } from "next-intl";
import { PageLayout } from "@/components/ui/PageLayout";
import { PrimaryButton, LightButton } from "@/design-system/base/button";
import { Input } from "@/design-system/base/input";
import { Select } from "@/design-system/base/select";
import { Card } from "@/design-system/base/card";
import { toast } from "sonner";
import { Upload, FileImage, Loader2 } from "lucide-react";
import { actionProcessOCR } from "@/modules/ocr/ocr-action";
import { TSharedFolder } from "@/shared/folder-type";
import { OCROutput } from "@/lib/bigmodel/ocr/types";
/** Props for the OCR client page component. */
interface OCRClientProps {
  /** Folders owned by the current user; the first one (if any) is preselected. */
  initialFolders: TSharedFolder[];
}
export function OCRClient({ initialFolders }: OCRClientProps) {
const t = useTranslations("ocr");
const fileInputRef = useRef<HTMLInputElement>(null);
const [selectedFile, setSelectedFile] = useState<File | null>(null);
const [previewUrl, setPreviewUrl] = useState<string | null>(null);
const [selectedFolderId, setSelectedFolderId] = useState<number | null>(
initialFolders.length > 0 ? initialFolders[0].id : null
);
const [sourceLanguage, setSourceLanguage] = useState<string>("");
const [targetLanguage, setTargetLanguage] = useState<string>("");
const [isProcessing, setIsProcessing] = useState(false);
const [ocrResult, setOcrResult] = useState<OCROutput | null>(null);
const handleFileChange = useCallback((file: File | null) => {
if (!file) return;
if (!file.type.startsWith("image/")) {
toast.error(t("processingFailed"));
return;
}
const url = URL.createObjectURL(file);
setPreviewUrl(url);
setSelectedFile(file);
setOcrResult(null);
}, [t]);
const handleDrop = useCallback((e: React.DragEvent<HTMLDivElement>) => {
e.preventDefault();
const file = e.dataTransfer.files[0];
handleFileChange(file);
}, [handleFileChange]);
const handleDragOver = useCallback((e: React.DragEvent<HTMLDivElement>) => {
e.preventDefault();
}, []);
const fileToBase64 = async (file: File): Promise<string> => {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = () => {
const result = reader.result as string;
const base64 = result.split(",")[1];
resolve(base64);
};
reader.onerror = reject;
reader.readAsDataURL(file);
});
};
const handleProcess = async () => {
if (!selectedFile) {
toast.error(t("noImage"));
return;
}
if (!selectedFolderId) {
toast.error(t("noFolder"));
return;
}
setIsProcessing(true);
setOcrResult(null);
try {
const base64 = await fileToBase64(selectedFile);
const result = await actionProcessOCR({
imageBase64: base64,
folderId: selectedFolderId,
sourceLanguage: sourceLanguage || undefined,
targetLanguage: targetLanguage || undefined,
});
if (result.success) {
const folderName = initialFolders.find(f => f.id === selectedFolderId)?.name || "";
toast.success(t("saved", { count: result.data?.pairsCreated ?? 0, folder: folderName }));
} else {
toast.error(result.message || t("processingFailed"));
}
} catch {
toast.error(t("processingFailed"));
} finally {
setIsProcessing(false);
}
};
const clearImage = () => {
if (previewUrl) {
URL.revokeObjectURL(previewUrl);
}
setPreviewUrl(null);
setSelectedFile(null);
setOcrResult(null);
if (fileInputRef.current) {
fileInputRef.current.value = "";
}
};
return (
<PageLayout>
<div className="text-center mb-6">
<h1 className="text-3xl font-bold text-gray-800 mb-2">{t("title")}</h1>
<p className="text-gray-600">{t("description")}</p>
</div>
<div className="space-y-6">
<Card variant="bordered" padding="lg">
<div className="space-y-4">
<div className="font-semibold text-gray-800 flex items-center gap-2">
<Upload className="w-5 h-5" />
{t("uploadImage")}
</div>
<div
className={`border-2 border-dashed rounded-lg p-8 text-center cursor-pointer transition-colors ${
previewUrl
? "border-primary-300 bg-primary-50"
: "border-gray-300 hover:border-primary-400 hover:bg-gray-50"
}`}
onDrop={handleDrop}
onDragOver={handleDragOver}
onClick={() => fileInputRef.current?.click()}
>
{previewUrl ? (
<div className="space-y-3">
<img
src={previewUrl}
alt="Preview"
className="max-h-64 mx-auto rounded-lg shadow-md"
/>
<div className="flex justify-center gap-2">
<LightButton
type="button"
onClick={(e) => {
e.stopPropagation();
clearImage();
}}
>
{t("uploadImage")}
</LightButton>
</div>
</div>
) : (
<div className="space-y-3 text-gray-500">
<FileImage className="w-12 h-12 mx-auto text-gray-400" />
<p>{t("dragDropHint")}</p>
<p className="text-sm">{t("supportedFormats")}</p>
</div>
)}
<input
ref={fileInputRef}
type="file"
accept="image/*"
className="hidden"
onChange={(e) => handleFileChange(e.target.files?.[0] || null)}
/>
</div>
</div>
</Card>
<Card variant="bordered" padding="lg">
<div className="space-y-4">
<div className="font-semibold text-gray-800">{t("selectFolder")}</div>
{initialFolders.length > 0 ? (
<Select
value={selectedFolderId?.toString() || ""}
onChange={(e) => setSelectedFolderId(Number(e.target.value))}
className="w-full"
>
{initialFolders.map((folder) => (
<option key={folder.id} value={folder.id}>
{folder.name}
</option>
))}
</Select>
) : (
<p className="text-gray-500 text-sm">{t("noFolders")}</p>
)}
</div>
</Card>
<Card variant="bordered" padding="lg">
<div className="space-y-4">
<div className="font-semibold text-gray-800">{t("languageHints")}</div>
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
<div>
<label className="text-sm text-gray-600 block mb-1">
{t("sourceLanguageHint")}
</label>
<Input
value={sourceLanguage}
onChange={(e) => setSourceLanguage(e.target.value)}
placeholder="English"
/>
</div>
<div>
<label className="text-sm text-gray-600 block mb-1">
{t("targetLanguageHint")}
</label>
<Input
value={targetLanguage}
onChange={(e) => setTargetLanguage(e.target.value)}
placeholder="Chinese"
/>
</div>
</div>
</div>
</Card>
<div className="flex justify-center">
<PrimaryButton
onClick={handleProcess}
disabled={isProcessing || !selectedFile || !selectedFolderId}
size="lg"
className="px-8"
>
{isProcessing ? (
<>
<Loader2 className="w-5 h-5 mr-2 animate-spin" />
{t("processing")}
</>
) : (
t("process")
)}
</PrimaryButton>
</div>
</div>
</PageLayout>
);
}

View File

@@ -0,0 +1,20 @@
import { OCRClient } from "./OCRClient";
import { auth } from "@/auth";
import { headers } from "next/headers";
import { actionGetFoldersByUserId } from "@/modules/folder/folder-action";
import { TSharedFolder } from "@/shared/folder-type";
/**
 * Server component for the OCR page. Loads the signed-in user's folders and
 * hands them to the client component; anonymous visitors (or a failed folder
 * lookup) simply get an empty folder list.
 */
export default async function OCRPage() {
  const session = await auth.api.getSession({ headers: await headers() });
  const userId = session?.user?.id;

  let folders: TSharedFolder[] = [];
  if (userId) {
    const foldersResult = await actionGetFoldersByUserId(userId as string);
    if (foldersResult.success && foldersResult.data) {
      folders = foldersResult.data;
    }
  }

  return <OCRClient initialFolders={folders} />;
}

View File

@@ -0,0 +1,152 @@
import OpenAI from "openai";
import { parseAIGeneratedJSON } from "@/utils/json";
import { createLogger } from "@/lib/logger";
import { OCRInput, OCROutput, OCRRawResponse } from "./types";
// Module-scoped logger for the OCR pipeline.
const log = createLogger("ocr-orchestrator");

// Zhipu (BigModel) exposes an OpenAI-compatible endpoint, so the OpenAI SDK is
// reused with a custom base URL. The client is created once at module load.
// NOTE(review): assumes ZHIPU_API_KEY is set — an undefined key only fails at
// request time; consider failing fast if that is preferable.
const openai = new OpenAI({
  apiKey: process.env.ZHIPU_API_KEY,
  baseURL: "https://open.bigmodel.cn/api/paas/v4",
});
/**
 * Executes OCR on an image to extract vocabulary word-definition pairs.
 *
 * Uses GLM-4.6V vision model to analyze vocabulary table images and
 * extract structured word-definition pairs.
 *
 * @param input - OCR input containing base64 image and optional language hints
 * @returns Structured output with extracted pairs and detected languages
 * @throws Error if OCR fails or response is malformed
 *
 * @example
 * ```typescript
 * const result = await executeOCR({
 *   imageBase64: "iVBORw0KGgo...",
 *   sourceLanguage: "English",
 *   targetLanguage: "Chinese"
 * });
 * // result.pairs: [{ word: "hello", definition: "你好" }, ...]
 * ```
 */
export async function executeOCR(input: OCRInput): Promise<OCROutput> {
  const { imageBase64, sourceLanguage, targetLanguage } = input;

  log.debug("Starting OCR", {
    hasSourceHint: !!sourceLanguage,
    hasTargetHint: !!targetLanguage,
    imageLength: imageBase64.length,
  });

  // Optional hint lines that get interpolated into the prompt below.
  const languageHints: string[] = [];
  if (sourceLanguage) {
    languageHints.push(`源语言提示: ${sourceLanguage}`);
  }
  if (targetLanguage) {
    languageHints.push(`目标语言提示: ${targetLanguage}`);
  }

  // Prompt text is part of the model contract — do not reword or reformat.
  const prompt = `
你是一个专业的OCR识别助手专门从词汇表截图中提取单词和释义。
${languageHints.length > 0 ? `语言提示:\n${languageHints.join("\n")}\n` : ""}
你的任务是分析图片中的词汇表,提取所有单词-释义对。
要求:
1. 识别图片中的词汇表结构(可能是两列或多列)
2. 提取每一行的单词和对应的释义/翻译
3. 自动检测源语言和目标语言
4. 保持原始大小写和拼写
5. 如果图片模糊或不清晰,尽力识别并标注置信度较低的项目
6. 忽略表头、页码等非词汇内容
返回 JSON 格式:
{
  "pairs": [
    { "word": "单词1", "definition": "释义1" },
    { "word": "单词2", "definition": "释义2" }
  ],
  "detectedSourceLanguage": "检测到的源语言",
  "detectedTargetLanguage": "检测到的目标语言"
}
只返回 JSON不要任何其他文字。
`.trim();

  try {
    // Vision request: image part first, then the instruction text.
    // NOTE(review): imageBase64 is sent as the image_url value directly, with
    // no data-URL prefix — verify this matches the BigModel vision API contract.
    const response = await openai.chat.completions.create({
      model: "glm-4.6v",
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image_url",
              image_url: {
                url: imageBase64,
              },
            },
            {
              type: "text",
              text: prompt,
            },
          ],
        },
      ],
      temperature: 0.1, // low temperature for deterministic extraction
    });

    const content = response.choices[0]?.message?.content;
    if (!content) {
      log.error("OCR returned empty response");
      throw new Error("OCR 返回空响应");
    }
    log.debug("Received OCR response", { contentLength: content.length });

    // Parse the model's JSON (tolerating wrappers like code fences, per helper).
    const parsed = parseAIGeneratedJSON<OCRRawResponse>(content);
    if (!parsed.pairs || !Array.isArray(parsed.pairs)) {
      log.error("Invalid OCR response: missing or invalid pairs array", { parsed });
      throw new Error("OCR 响应格式无效:缺少 pairs 数组");
    }

    // Keep only pairs where both fields are strings; log each rejection.
    const validPairs = parsed.pairs.filter((pair) => {
      const isValid = typeof pair.word === "string" && typeof pair.definition === "string";
      if (!isValid) {
        log.warn("Skipping invalid pair", { pair });
      }
      return isValid;
    });

    // The model returned JSON, but nothing usable survived validation.
    if (validPairs.length === 0) {
      log.error("No valid pairs extracted from image");
      throw new Error("未能从图片中提取有效的词汇对");
    }

    const result: OCROutput = {
      pairs: validPairs,
      detectedSourceLanguage: parsed.detectedSourceLanguage,
      detectedTargetLanguage: parsed.detectedTargetLanguage,
    };
    log.info("OCR completed successfully", {
      pairCount: result.pairs.length,
      sourceLanguage: result.detectedSourceLanguage,
      targetLanguage: result.detectedTargetLanguage,
    });
    return result;
  } catch (error) {
    // Errors raised above carry messages starting with "OCR"; rethrow those
    // untouched so callers see the specific reason instead of a generic wrap.
    if (error instanceof Error && error.message.startsWith("OCR")) {
      throw error;
    }
    log.error("OCR failed", { error });
    const errorMessage = error instanceof Error ? error.message : "未知错误";
    throw new Error(`OCR 处理失败: ${errorMessage}`);
  }
}

View File

@@ -0,0 +1,44 @@
/**
 * Input for the OCR pipeline.
 */
export interface OCRInput {
  /** Base64 encoded image (without data URL prefix) */
  imageBase64: string;
  /** Optional: hint about source language */
  sourceLanguage?: string;
  /** Optional: hint about target/translation language */
  targetLanguage?: string;
}

/**
 * Single word-definition pair extracted from image
 */
export interface VocabularyPair {
  /** The original word */
  word: string;
  /** The translation/definition */
  definition: string;
}

/**
 * Output from OCR pipeline
 */
export interface OCROutput {
  /** Extracted word-definition pairs */
  pairs: VocabularyPair[];
  /** Detected source language */
  detectedSourceLanguage?: string;
  /** Detected target/translation language */
  detectedTargetLanguage?: string;
}

/**
 * Internal structure for AI response parsing.
 *
 * Reuses {@link VocabularyPair} (instead of redeclaring the pair shape inline)
 * so the wire shape and the public output shape cannot silently drift apart.
 */
export interface OCRRawResponse {
  pairs: VocabularyPair[];
  detectedSourceLanguage?: string;
  detectedTargetLanguage?: string;
}

View File

@@ -0,0 +1,20 @@
import { z } from "zod";
/** Zod schema validating the untrusted input of `actionProcessOCR`. */
export const schemaActionInputProcessOCR = z.object({
  imageBase64: z.string().min(1, "Image is required"),
  folderId: z.number().int().positive("Folder ID must be positive"),
  sourceLanguage: z.string().optional(),
  targetLanguage: z.string().optional(),
});

/** Validated action input, derived from the schema. */
export type ActionInputProcessOCR = z.infer<typeof schemaActionInputProcessOCR>;

/** Result envelope returned to the client by the OCR action. */
export interface ActionOutputProcessOCR {
  success: boolean;
  message: string;
  // Present only on success.
  data?: {
    pairsCreated: number;
    sourceLanguage?: string;
    targetLanguage?: string;
  };
}

View File

@@ -0,0 +1,25 @@
"use server";
import { validate } from "@/utils/validate";
import { ValidateError } from "@/lib/errors";
import { createLogger } from "@/lib/logger";
import { serviceProcessOCR } from "./ocr-service";
import { schemaActionInputProcessOCR } from "./ocr-action-dto";
import type { ActionOutputProcessOCR } from "./ocr-action-dto";
// Module-scoped logger for the OCR action layer.
const log = createLogger("ocr-action");

/**
 * Server action entry point for OCR processing.
 *
 * Validates the untrusted input against the action schema, then delegates to
 * the service layer. Validation failures and unexpected errors are converted
 * into a failure envelope instead of being thrown to the client.
 *
 * @param input - Raw, untrusted input from the client
 * @returns Result envelope with an optional summary of created pairs
 */
export async function actionProcessOCR(
  input: unknown
): Promise<ActionOutputProcessOCR> {
  try {
    const validatedInput = validate(input, schemaActionInputProcessOCR);
    // `await` is required here: returning the bare promise would let a
    // rejection from the service escape this try/catch entirely, so the
    // client would see an unhandled rejection instead of a failure envelope.
    return await serviceProcessOCR(validatedInput);
  } catch (e) {
    if (e instanceof ValidateError) {
      return { success: false, message: e.message };
    }
    log.error("OCR action failed", { error: e });
    return { success: false, message: "Unknown error occurred." };
  }
}

View File

@@ -0,0 +1 @@
// The OCR repository layer reuses the folder module's pair-creation input
// type; re-exported here so OCR code only imports from its own dto files.
export type { RepoInputCreatePair } from "@/modules/folder/folder-repository-dto";

View File

@@ -0,0 +1,5 @@
// Thin repository facade: OCR persists pairs through the folder module's
// repository. Re-exporting keeps the action-service-repository layering
// uniform across modules without duplicating data-access code.
import { repoCreatePair, repoGetUserIdByFolderId } from "@/modules/folder/folder-repository";
import type { RepoInputCreatePair } from "./ocr-repository-dto";

export { repoCreatePair, repoGetUserIdByFolderId };
export type { RepoInputCreatePair };

View File

@@ -0,0 +1,20 @@
import { z } from "zod";
/** Zod schema describing the service-layer input for OCR processing. */
export const schemaServiceInputProcessOCR = z.object({
  imageBase64: z.string().min(1, "Image is required"),
  folderId: z.number().int().positive("Folder ID must be positive"),
  sourceLanguage: z.string().optional(),
  targetLanguage: z.string().optional(),
});

/** Validated service input, derived from the schema. */
export type ServiceInputProcessOCR = z.infer<typeof schemaServiceInputProcessOCR>;

/** Result envelope returned by the OCR service. */
export interface ServiceOutputProcessOCR {
  success: boolean;
  message: string;
  // Present only on success.
  data?: {
    pairsCreated: number;
    sourceLanguage?: string;
    targetLanguage?: string;
  };
}

View File

@@ -0,0 +1,96 @@
"use server";
import { executeOCR } from "@/lib/bigmodel/ocr/orchestrator";
import { repoCreatePair, repoGetUserIdByFolderId } from "@/modules/folder/folder-repository";
import { auth } from "@/auth";
import { headers } from "next/headers";
import { createLogger } from "@/lib/logger";
import type { ServiceInputProcessOCR, ServiceOutputProcessOCR } from "./ocr-service-dto";
// Module-scoped logger for the OCR service layer.
const log = createLogger("ocr-service");

/**
 * Processes an OCR request end to end: authenticates the caller, verifies
 * folder ownership, runs the vision OCR pipeline, and persists the extracted
 * word-definition pairs into the target folder.
 *
 * @param input - Validated OCR input (image, target folder, language hints)
 * @returns Result envelope; on success `data.pairsCreated` counts the pairs
 *          actually persisted (individual insert failures are skipped)
 */
export async function serviceProcessOCR(
  input: ServiceInputProcessOCR
): Promise<ServiceOutputProcessOCR> {
  log.info("Processing OCR request", { folderId: input.folderId });

  // Authentication is re-checked here; the action layer only validates shape.
  const session = await auth.api.getSession({ headers: await headers() });
  if (!session?.user?.id) {
    log.warn("Unauthorized OCR attempt");
    return { success: false, message: "Unauthorized" };
  }

  // Authorization: only the folder's owner may add pairs to it.
  const folderOwner = await repoGetUserIdByFolderId(input.folderId);
  if (folderOwner !== session.user.id) {
    log.warn("Folder ownership mismatch", {
      folderId: input.folderId,
      userId: session.user.id
    });
    return {
      success: false,
      message: "You don't have permission to modify this folder"
    };
  }

  let ocrResult;
  try {
    log.debug("Calling OCR pipeline");
    ocrResult = await executeOCR({
      imageBase64: input.imageBase64,
      sourceLanguage: input.sourceLanguage,
      targetLanguage: input.targetLanguage,
    });
  } catch (error) {
    // Pipeline errors are logged with detail but surfaced generically.
    log.error("OCR pipeline failed", { error });
    return {
      success: false,
      message: "Failed to process image. Please try again."
    };
  }

  if (!ocrResult.pairs || ocrResult.pairs.length === 0) {
    log.info("No vocabulary pairs extracted from image");
    return {
      success: false,
      message: "No vocabulary pairs could be extracted from the image"
    };
  }

  // Prefer the languages the model detected; fall back to caller hints.
  const sourceLanguage = ocrResult.detectedSourceLanguage || input.sourceLanguage || "Unknown";
  const targetLanguage = ocrResult.detectedTargetLanguage || input.targetLanguage || "Unknown";

  let pairsCreated = 0;
  // Inserts run sequentially and per-pair failures are skipped (logged), so
  // one bad row doesn't abort the rest of the batch.
  for (const pair of ocrResult.pairs) {
    try {
      await repoCreatePair({
        folderId: input.folderId,
        language1: sourceLanguage,
        language2: targetLanguage,
        text1: pair.word,
        text2: pair.definition,
      });
      pairsCreated++;
    } catch (error) {
      log.error("Failed to create pair", {
        word: pair.word,
        error
      });
    }
  }

  // If every insert failed, the operation did not accomplish anything —
  // report failure instead of "Successfully created 0 vocabulary pairs".
  if (pairsCreated === 0) {
    log.error("All pair inserts failed", { attempted: ocrResult.pairs.length });
    return {
      success: false,
      message: "Failed to save the extracted vocabulary pairs. Please try again."
    };
  }

  log.info("OCR processing complete", {
    pairsCreated,
    sourceLanguage,
    targetLanguage
  });
  return {
    success: true,
    message: `Successfully created ${pairsCreated} vocabulary pairs`,
    data: {
      pairsCreated,
      sourceLanguage,
      targetLanguage,
    },
  };
}