diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index c3a74070..3a729b02 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -402,13 +402,23 @@ private void addFileToDataset(String datasetId, List unpacked) for (FileUploadResult file : unpacked) { File savedFile = file.getSavedFile(); LocalDateTime currentTime = LocalDateTime.now(); + // 统一 fileName:无论是否通过文件夹/压缩包上传,都只保留纯文件名 + String originalFileName = file.getFileName(); + String baseFileName = originalFileName; + if (originalFileName != null) { + String normalized = originalFileName.replace("\\", "/"); + int lastSlash = normalized.lastIndexOf('/'); + if (lastSlash >= 0 && lastSlash + 1 < normalized.length()) { + baseFileName = normalized.substring(lastSlash + 1); + } + } DatasetFile datasetFile = DatasetFile.builder() .id(UUID.randomUUID().toString()) .datasetId(datasetId) .fileSize(savedFile.length()) .uploadTime(currentTime) .lastAccessTime(currentTime) - .fileName(file.getFileName()) + .fileName(baseFileName) .filePath(savedFile.getPath()) .fileType(AnalyzerUtils.getExtension(file.getFileName())) .build(); diff --git a/frontend/src/components/business/DatasetFileTransfer.tsx b/frontend/src/components/business/DatasetFileTransfer.tsx index b9431101..08aa64cb 100644 --- a/frontend/src/components/business/DatasetFileTransfer.tsx +++ b/frontend/src/components/business/DatasetFileTransfer.tsx @@ -21,6 +21,13 @@ interface DatasetFileTransferProps onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void; onDatasetSelect?: (dataset: Dataset | null) => void; datasetTypeFilter?: DatasetType; + /** + * 锁定的文件ID集合: + * - 在左侧文件列表中,这些文件的勾选框会变成灰色且不可交互; + * - 点击整行也不会改变其选中状态; + * - 主要用于“编辑任务数据集”场景下锁死任务初始文件。 + */ + lockedFileIds?: string[]; } const fileCols = [ @@ -52,6 +59,7 @@ const DatasetFileTransfer: React.FC = ({ onSelectedFilesChange, onDatasetSelect, datasetTypeFilter, + lockedFileIds, ...props }) => { const [datasets, setDatasets] = React.useState([]); @@ -79,6 +87,10 @@ const DatasetFileTransfer: React.FC = ({ ); const [selectingAll, setSelectingAll] = React.useState(false); + const lockedIdSet = React.useMemo(() => { + return new Set((lockedFileIds || []).map((id) => String(id))); + }, [lockedFileIds]); + const fetchDatasets = async () => { const { data } = await queryDatasetsUsingGet({ // Ant Design Table pagination.current is 1-based; ensure backend also receives 1-based value @@ -230,6 +242,10 @@ const DatasetFileTransfer: React.FC = ({ }, [selectedDataset, selectedFilesMap, onSelectedFilesChange]); const toggleSelectFile = (record: DatasetFile) => { + // 被锁定的文件不允许在此组件中被增删 + if (lockedIdSet.has(String(record.id))) { + return; + } if (!selectedFilesMap[record.id]) { onSelectedFilesChange({ ...selectedFilesMap, @@ -421,6 +437,7 @@ const DatasetFileTransfer: React.FC = ({ getCheckboxProps: (record: DatasetFile) => ({ name: record.fileName, + disabled: lockedIdSet.has(String(record.id)), }), }} /> diff --git a/frontend/src/pages/DataAnnotation/AutoAnnotation/AutoAnnotation.tsx b/frontend/src/pages/DataAnnotation/AutoAnnotation/AutoAnnotation.tsx index 75b3138e..589ca4ff 100644 
--- a/frontend/src/pages/DataAnnotation/AutoAnnotation/AutoAnnotation.tsx +++ b/frontend/src/pages/DataAnnotation/AutoAnnotation/AutoAnnotation.tsx @@ -1,13 +1,16 @@ import { useState, useEffect } from "react"; -import { Card, Button, Table, message, Modal, Tag, Progress, Space, Tooltip } from "antd"; +import { Card, Button, Table, message, Modal, Tag, Progress, Space, Tooltip, Dropdown } from "antd"; import { PlusOutlined, DeleteOutlined, DownloadOutlined, ReloadOutlined, EyeOutlined, - SyncOutlined, EditOutlined, + MoreOutlined, + SettingOutlined, + ExportOutlined, + ImportOutlined, } from "@ant-design/icons"; import type { ColumnType } from "antd/es/table"; import type { AutoAnnotationTask, AutoAnnotationStatus } from "../annotation.model"; @@ -19,6 +22,8 @@ import { syncAutoAnnotationTaskToLabelStudioUsingPost, } from "../annotation.api"; import CreateAutoAnnotationDialog from "./components/CreateAutoAnnotationDialog"; +import EditAutoAnnotationDatasetDialog from "./components/EditAutoAnnotationDatasetDialog"; +import ImportFromLabelStudioDialog from "./components/ImportFromLabelStudioDialog"; const STATUS_COLORS: Record = { pending: "default", @@ -51,6 +56,10 @@ export default function AutoAnnotation() { const [selectedRowKeys, setSelectedRowKeys] = useState([]); const [labelStudioBase, setLabelStudioBase] = useState(null); const [datasetProjectMap, setDatasetProjectMap] = useState>({}); + const [editingTask, setEditingTask] = useState(null); + const [showEditDatasetDialog, setShowEditDatasetDialog] = useState(false); + const [importingTask, setImportingTask] = useState(null); + const [showImportDialog, setShowImportDialog] = useState(false); useEffect(() => { fetchTasks(); @@ -106,6 +115,16 @@ export default function AutoAnnotation() { } }; + const handleEditTaskDataset = (task: AutoAnnotationTask) => { + setEditingTask(task); + setShowEditDatasetDialog(true); + }; + + const handleImportFromLabelStudio = (task: AutoAnnotationTask) => { + setImportingTask(task); + setShowImportDialog(true); + }; + const handleDelete = (task: AutoAnnotationTask) => { Modal.confirm({ title: `确认删除自动标注任务「${task.name}」吗?`, @@ -303,13 +322,46 @@ export default function AutoAnnotation() { { title: "操作", key: "actions", - width: 260, + width: 320, fixed: "right", render: (_: any, record: AutoAnnotationTask) => ( + {/* 一级功能菜单:前向同步 + 编辑(跳转 Label Studio) */} + + + + + + + + + + + {/* 已完成任务的查看/下载结果仍保留 */} {record.status === "completed" && ( <> - + + ), }, @@ -402,6 +456,37 @@ export default function AutoAnnotation() { fetchTasks(); }} /> + + {editingTask && ( + { + setShowEditDatasetDialog(false); + setEditingTask(null); + }} + onSuccess={() => { + setShowEditDatasetDialog(false); + setEditingTask(null); + fetchTasks(); + }} + /> + )} + + {importingTask && ( + { + setShowImportDialog(false); + setImportingTask(null); + }} + onSuccess={() => { + setShowImportDialog(false); + setImportingTask(null); + }} + /> + )} ); } \ No newline at end of file diff --git a/frontend/src/pages/DataAnnotation/AutoAnnotation/components/EditAutoAnnotationDatasetDialog.tsx b/frontend/src/pages/DataAnnotation/AutoAnnotation/components/EditAutoAnnotationDatasetDialog.tsx new file mode 100644 index 00000000..609dd0f6 --- /dev/null +++ b/frontend/src/pages/DataAnnotation/AutoAnnotation/components/EditAutoAnnotationDatasetDialog.tsx @@ -0,0 +1,201 @@ +import { useEffect, useState } from "react"; +import { Modal, Form, Input, message } from "antd"; +import type { AutoAnnotationTask } from "../../annotation.model"; +import { 
getAutoAnnotationTaskFilesUsingGet, updateAutoAnnotationTaskFilesUsingPut } from "../../annotation.api"; +import DatasetFileTransfer from "@/components/business/DatasetFileTransfer"; +import type { DatasetFile, Dataset } from "@/pages/DataManagement/dataset.model"; +import { DatasetType } from "@/pages/DataManagement/dataset.model"; + +interface EditAutoAnnotationDatasetDialogProps { + visible: boolean; + task: AutoAnnotationTask; + onCancel: () => void; + onSuccess: () => void; +} + +const imageExtensions = [ + ".jpg", + ".jpeg", + ".png", + ".bmp", + ".gif", + ".tiff", + ".webp", +]; + +export default function EditAutoAnnotationDatasetDialog({ + visible, + task, + onCancel, + onSuccess, +}: EditAutoAnnotationDatasetDialogProps) { + const [form] = Form.useForm(); + const [loading, setLoading] = useState(false); + const [selectedFilesMap, setSelectedFilesMap] = useState>({}); + const [initialFilesMap, setInitialFilesMap] = useState>({}); + const [initialFileIds, setInitialFileIds] = useState>(new Set()); + const [selectedDataset, setSelectedDataset] = useState(null); + const [imageFileCount, setImageFileCount] = useState(0); + + // 预计算当前已选中的图像文件数量 + useEffect(() => { + const count = Object.values(selectedFilesMap).filter((file) => { + const ext = file.fileName?.toLowerCase().match(/\.[^.]+$/)?.[0] || ""; + return imageExtensions.includes(ext); + }).length; + setImageFileCount(count); + }, [selectedFilesMap]); + + // 打开弹窗时,拉取任务当前关联的文件列表,作为默认选中项 + useEffect(() => { + if (!visible || !task?.id) { + return; + } + + let cancelled = false; + + (async () => { + try { + const resp = await getAutoAnnotationTaskFilesUsingGet(task.id); + const list: any[] = resp?.data || resp || []; + if (cancelled) return; + + const nextMap: Record = {}; + list.forEach((item) => { + if (!item || item.id == null) return; + const idStr = String(item.id); + nextMap[idStr] = { + // DatasetFile 接口字段与后端返回字段对齐,这里做最小映射 + id: idStr, + fileName: item.fileName, + filePath: item.filePath, + fileSize: item.fileSize, + fileType: item.fileType, + status: "ACTIVE", + // 额外附加 datasetId/datasetName 供 DatasetFileTransfer 使用 + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + datasetId: item.datasetId, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + datasetName: item.datasetName, + } as unknown as DatasetFile; + }); + + setSelectedFilesMap(nextMap); + setInitialFilesMap(nextMap); + setInitialFileIds(new Set(Object.keys(nextMap))); + } catch (e) { + console.error("Failed to fetch auto annotation task files:", e); + message.error("获取任务当前数据集文件失败"); + } + })(); + + return () => { + cancelled = true; + }; + }, [visible, task?.id]); + + useEffect(() => { + if (visible) { + form.setFieldsValue({ + name: task?.name, + }); + } + }, [visible, task?.name, form]); + + const handleSubmit = async () => { + try { + if (imageFileCount === 0) { + message.error("请至少选择一个图像文件"); + return; + } + + setLoading(true); + + const selectedFiles = Object.values(selectedFilesMap) as any[]; + const datasetIds = Array.from( + new Set( + selectedFiles + .map((file) => ( + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + file?.datasetId + )) + .filter((id) => id !== undefined && id !== null && id !== ""), + ), + ); + + // 主数据集:优先沿用任务原有 datasetId,其次取当前选择中的第一个 + const effectiveDatasetId = task.datasetId || datasetIds[0]; + + const imageFileIds = Object.values(selectedFilesMap) + .filter((file) => { + const ext = file.fileName?.toLowerCase().match(/\.[^.]+$/)?.[0] || ""; + 
return imageExtensions.includes(ext); + }) + .map((file) => file.id); + + const payload = { + datasetId: effectiveDatasetId, + fileIds: imageFileIds, + }; + + await updateAutoAnnotationTaskFilesUsingPut(task.id, payload); + message.success("任务数据集已更新,将仅对新增的图像执行自动标注"); + onSuccess(); + } catch (error: any) { + console.error("Failed to update auto annotation task files:", error); + message.error(error?.message || "更新任务数据集失败"); + } finally { + setLoading(false); + } + }; + + return ( + +
+ + + + + + { + // 不允许删除任务最初已包含的文件: + // 无论在 UI 中如何操作,这些初始文件都会被强制保留 + const merged: Record = { ...next }; + initialFileIds.forEach((id) => { + if (!merged[id] && initialFilesMap[id]) { + merged[id] = initialFilesMap[id]; + } + }); + setSelectedFilesMap(merged); + }} + onDatasetSelect={(dataset) => { + setSelectedDataset(dataset); + }} + datasetTypeFilter={DatasetType.IMAGE} + lockedFileIds={Array.from(initialFileIds)} + /> + {selectedDataset && ( +
+ 当前数据集:{selectedDataset.name} - 已选择 + {imageFileCount} 个图像文件 +
+ )} +
+
+
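The dialog above treats the task's original files as a locked base set: `lockedFileIds` disables their checkboxes inside `DatasetFileTransfer`, and the `onSelectedFilesChange` handler re-merges them into whatever the user selects so they can never be removed. A minimal sketch of that merge step, pulled out as a standalone helper (the name `mergeLockedFiles` is illustrative and not part of this PR; `DatasetFile` is the existing model type):

```typescript
import type { DatasetFile } from "@/pages/DataManagement/dataset.model";

// Re-add any locked (initial) file that the incoming selection dropped.
export function mergeLockedFiles(
  next: Record<string, DatasetFile>,
  lockedIds: Set<string>,
  lockedFiles: Record<string, DatasetFile>,
): Record<string, DatasetFile> {
  const merged: Record<string, DatasetFile> = { ...next };
  lockedIds.forEach((id) => {
    if (!merged[id] && lockedFiles[id]) {
      merged[id] = lockedFiles[id];
    }
  });
  return merged;
}
```

Inside the component this is equivalent to calling `setSelectedFilesMap(mergeLockedFiles(next, initialFileIds, initialFilesMap))` from the `onSelectedFilesChange` callback.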
+ ); +} diff --git a/frontend/src/pages/DataAnnotation/AutoAnnotation/components/ImportFromLabelStudioDialog.tsx b/frontend/src/pages/DataAnnotation/AutoAnnotation/components/ImportFromLabelStudioDialog.tsx new file mode 100644 index 00000000..ac9a2a0b --- /dev/null +++ b/frontend/src/pages/DataAnnotation/AutoAnnotation/components/ImportFromLabelStudioDialog.tsx @@ -0,0 +1,164 @@ +import { useEffect, useState } from "react"; +import { Modal, Form, Select, Input, message } from "antd"; +import type { AutoAnnotationTask } from "../../annotation.model"; +import { queryDatasetsUsingGet } from "@/pages/DataManagement/dataset.api"; +import type { Dataset } from "@/pages/DataManagement/dataset.model"; +import { importAutoAnnotationFromLabelStudioUsingPost } from "../../annotation.api"; + +interface ImportFromLabelStudioDialogProps { + visible: boolean; + task: AutoAnnotationTask | null; + onCancel: () => void; + onSuccess: () => void; +} + +const EXPORT_FORMAT_OPTIONS = [ + "JSON", + "JSON_MIN", + "CSV", + "TSV", + "COCO", + "YOLO", + "YOLOv8", +]; + +export default function ImportFromLabelStudioDialog({ + visible, + task, + onCancel, + onSuccess, +}: ImportFromLabelStudioDialogProps) { + const [form] = Form.useForm(); + const [datasets, setDatasets] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + if (!visible) return; + + let cancelled = false; + + (async () => { + try { + const resp: any = await queryDatasetsUsingGet({ page: 0, size: 1000 }); + const list: Dataset[] = resp?.content || resp?.data?.content || resp?.data || resp || []; + if (!cancelled && Array.isArray(list)) { + setDatasets(list); + } + } catch (e) { + console.error("Failed to fetch datasets for LS import:", e); + if (!cancelled) { + message.error("获取数据集列表失败"); + } + } + })(); + + return () => { + cancelled = true; + }; + }, [visible]); + + useEffect(() => { + if (visible && task) { + // 默认选中任务原始数据集和 JSON 导出格式 + form.setFieldsValue({ + targetDatasetId: task.datasetId, + exportFormat: "JSON", + }); + } + }, [visible, task, form]); + + const handleOk = async () => { + try { + const values = await form.validateFields(); + const targetDatasetId: string = values.targetDatasetId; + const exportFormat: string = values.exportFormat; + const fileName: string | undefined = values.fileName; + + if (!task?.id) { + message.error("未找到自动标注任务"); + return; + } + + setLoading(true); + await importAutoAnnotationFromLabelStudioUsingPost(task.id, { + targetDatasetId, + exportFormat, + // 后端会自动附加正确的扩展名 + fileName: fileName?.trim() || undefined, + }); + + message.success("已从 Label Studio 导出结果并保存到数据集"); + onSuccess(); + } catch (e: any) { + if (e?.errorFields) { + // 表单校验错误,忽略 + return; + } + console.error("Failed to import from Label Studio:", e); + message.error(e?.message || "后向同步失败,请稍后重试"); + } finally { + setLoading(false); + } + }; + + return ( + +
+ + {task?.name || "-"} + + + + ({ + label: fmt, + value: fmt, + }))} + /> + + + + + + +
+ 将从与该自动标注任务关联的 Label Studio 项目中, + 按所选格式导出完整标注结果,并作为一个文件保存到所选数据集中。 + 不会修改已有标签,仅追加一个导出工件文件。 +
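The note above summarizes what the OK handler does: it posts to the new back-sync endpoint added in `annotation.api.ts`. A hedged sketch of that call (the arguments are placeholders, and the import path assumes the project's `@/` alias; the backend `ImportFromLabelStudioRequest` schema additionally accepts an optional `fileName`, while the helper's declared payload type lists only `targetDatasetId` and `exportFormat`):

```typescript
import { importAutoAnnotationFromLabelStudioUsingPost } from "@/pages/DataAnnotation/annotation.api";

// POST /api/annotation/auto/{taskId}/sync-label-studio-back
async function backSyncAutoTask(taskId: string, targetDatasetId: string): Promise<void> {
  await importAutoAnnotationFromLabelStudioUsingPost(taskId, {
    targetDatasetId,        // dataset that receives the export artifact
    exportFormat: "COCO",   // one of EXPORT_FORMAT_OPTIONS; the backend defaults to "JSON"
  });
}
```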
+
+
+ ); +} diff --git a/frontend/src/pages/DataAnnotation/EditManualAnnotationDatasetDialog.tsx b/frontend/src/pages/DataAnnotation/EditManualAnnotationDatasetDialog.tsx new file mode 100644 index 00000000..c80b088f --- /dev/null +++ b/frontend/src/pages/DataAnnotation/EditManualAnnotationDatasetDialog.tsx @@ -0,0 +1,186 @@ +import { useEffect, useState } from "react"; +import { Modal, Form, Input, message } from "antd"; +import type { AnnotationTask } from "./annotation.model"; +import DatasetFileTransfer from "@/components/business/DatasetFileTransfer"; +import type { DatasetFile, Dataset } from "@/pages/DataManagement/dataset.model"; +import { getManualAnnotationMappingFilesUsingGet, updateManualAnnotationMappingFilesUsingPut } from "./annotation.api"; + +interface EditManualAnnotationDatasetDialogProps { + visible: boolean; + task: AnnotationTask; + onCancel: () => void; + onSuccess: () => void; +} + +export default function EditManualAnnotationDatasetDialog({ + visible, + task, + onCancel, + onSuccess, +}: EditManualAnnotationDatasetDialogProps) { + const [form] = Form.useForm(); + const [loading, setLoading] = useState(false); + const [selectedFilesMap, setSelectedFilesMap] = useState>({}); + const [initialFilesMap, setInitialFilesMap] = useState>({}); + const [initialFileIds, setInitialFileIds] = useState>(new Set()); + const [selectedDataset, setSelectedDataset] = useState(null); + const [selectedCount, setSelectedCount] = useState(0); + + // 统计当前选中的文件数量(所有类型) + useEffect(() => { + setSelectedCount(Object.keys(selectedFilesMap).length); + }, [selectedFilesMap]); + + // 打开弹窗时,拉取当前映射在 LS 中已有的文件列表,作为默认选中且锁定的文件 + useEffect(() => { + if (!visible || !task?.id) { + return; + } + + let cancelled = false; + + (async () => { + try { + const resp = await getManualAnnotationMappingFilesUsingGet(task.id); + const list: any[] = (resp as any)?.data || (resp as any) || []; + + const nextMap: Record = {}; + list.forEach((item) => { + if (!item || item.id == null) return; + const idStr = String(item.id); + nextMap[idStr] = { + // DatasetFile 接口字段与后端返回字段对齐,这里做最小映射 + // 类型上按需扩展,运行时以实际字段为准 + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + id: idStr, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + fileName: item.fileName, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + filePath: item.filePath, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + fileSize: item.fileSize, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + fileType: item.fileType, + // 附加 datasetId/datasetName 供 DatasetFileTransfer 使用 + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + datasetId: item.datasetId, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + datasetName: item.datasetName, + // 其余字段保持为空/默认值 + } as unknown as DatasetFile; + }); + + if (!cancelled) { + setSelectedFilesMap(nextMap); + setInitialFilesMap(nextMap); + setInitialFileIds(new Set(Object.keys(nextMap))); + } + } catch (e) { + console.error("Failed to fetch manual mapping files:", e); + if (!cancelled) { + message.error("获取任务当前数据集文件失败"); + } + } + })(); + + return () => { + cancelled = true; + }; + }, [visible, task?.id]); + + useEffect(() => { + if (visible) { + form.setFieldsValue({ + name: task?.name, + }); + } + }, [visible, task?.name, form]); + + const handleSubmit = async () => { + try { + if (selectedCount === 0) { + 
message.error("请至少选择一个文件"); + return; + } + + setLoading(true); + + const fileIds = Object.values(selectedFilesMap).map((file: any) => String(file.id)); + + const payload = { + // datasetId 字段在后端目前未强依赖,这里沿用任务主体数据集以保持一致 + datasetId: task.datasetId, + fileIds, + }; + + await updateManualAnnotationMappingFilesUsingPut(task.id, payload); + message.success("任务数据集已更新,将仅为新增文件在 Label Studio 中创建任务"); + onSuccess(); + } catch (error: any) { + console.error("Failed to update manual annotation mapping files:", error); + message.error(error?.message || "更新任务数据集失败"); + } finally { + setLoading(false); + } + }; + + return ( + +
+ + + + + + { + // 不允许删除任务最初已包含的文件: + // 无论在 UI 中如何操作,这些初始文件都会被强制保留 + const merged: Record = { ...next } as any; + initialFileIds.forEach((id) => { + if (!merged[id] && initialFilesMap[id]) { + merged[id] = initialFilesMap[id]; + } + }); + setSelectedFilesMap(merged); + }} + onDatasetSelect={(dataset) => { + setSelectedDataset(dataset as Dataset | null); + }} + // 手动标注支持所有数据集/文件类型,这里不设置 datasetTypeFilter + lockedFileIds={Array.from(initialFileIds)} + /> +
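The effect earlier in this dialog maps the `GET /api/annotation/project/{mappingId}/files` response into `DatasetFile`-shaped entries through a series of `@ts-ignore`s. A typed sketch of the same mapping; the `MappingFileItem` interface is an assumption based on the fields the backend route returns (id, datasetId, datasetName, fileName, filePath, fileSize, fileType):

```typescript
import type { DatasetFile } from "@/pages/DataManagement/dataset.model";

// Assumed shape of one item returned by the /files endpoints.
interface MappingFileItem {
  id: string;
  datasetId?: string;
  datasetName?: string;
  fileName?: string;
  filePath?: string;
  fileSize?: number;
  fileType?: string;
}

// Build the selectedFilesMap used by DatasetFileTransfer, keyed by stringified id.
export function toSelectedFilesMap(items: MappingFileItem[]): Record<string, DatasetFile> {
  const map: Record<string, DatasetFile> = {};
  items.forEach((item) => {
    if (!item || item.id == null) return;
    const id = String(item.id);
    // datasetId/datasetName ride along for DatasetFileTransfer even though they are
    // not declared on the DatasetFile type.
    map[id] = { ...item, id } as unknown as DatasetFile;
  });
  return map;
}
```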
+ {selectedDataset ? ( + <> + 当前数据集:{selectedDataset.name} - 已选择 + {selectedCount} 个文件 + + ) : ( + <> + 已选择 {selectedCount} 个文件 + + )} +
+
+
+
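The success message above (paraphrasing: "Label Studio tasks will only be created for newly added files") reflects the append-only contract of both `/files` PUT endpoints: the stored file set becomes the union of the old and requested ids, and only the difference is synced to Label Studio; nothing is deleted. A small sketch of that set arithmetic in plain TypeScript (independent of the Python implementation):

```typescript
// Append-only update plan: old ∪ requested is persisted, requested ∖ old is synced to LS.
function planFileUpdate(oldIds: string[], requestedIds: string[]) {
  const oldSet = new Set(oldIds.map(String));
  const requestedSet = new Set(requestedIds.map(String));
  const addedIds = [...requestedSet].filter((id) => !oldSet.has(id)).sort();
  const finalIds = [...new Set([...oldSet, ...requestedSet])].sort();
  return { addedIds, finalIds };
}

// planFileUpdate(["a", "b"], ["b", "c"]) -> { addedIds: ["c"], finalIds: ["a", "b", "c"] }
```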
+ ); +} diff --git a/frontend/src/pages/DataAnnotation/Home/DataAnnotation.tsx b/frontend/src/pages/DataAnnotation/Home/DataAnnotation.tsx index 84b811e7..038068a5 100644 --- a/frontend/src/pages/DataAnnotation/Home/DataAnnotation.tsx +++ b/frontend/src/pages/DataAnnotation/Home/DataAnnotation.tsx @@ -1,10 +1,14 @@ import { useState, useEffect } from "react"; -import { Card, Button, Table, message, Modal, Tabs, Tag, Progress, Tooltip } from "antd"; +import { Card, Button, Table, message, Modal, Tabs, Tag, Progress, Tooltip, Dropdown } from "antd"; import { PlusOutlined, EditOutlined, DeleteOutlined, SyncOutlined, + MoreOutlined, + SettingOutlined, + ExportOutlined, + ImportOutlined, } from "@ant-design/icons"; import { SearchControls } from "@/components/SearchControls"; import CardView from "@/components/CardView"; @@ -24,6 +28,10 @@ import { mapAnnotationTask } from "../annotation.const"; import CreateAnnotationTask from "../Create/components/CreateAnnotationTaskDialog"; import { ColumnType } from "antd/es/table"; import { TemplateList } from "../Template"; +import EditAutoAnnotationDatasetDialog from "../AutoAnnotation/components/EditAutoAnnotationDatasetDialog"; +import ImportFromLabelStudioDialog from "../AutoAnnotation/components/ImportFromLabelStudioDialog"; +import ManualImportFromLabelStudioDialog from "../ManualImportFromLabelStudioDialog"; +import EditManualAnnotationDatasetDialog from "../EditManualAnnotationDatasetDialog"; // Note: DevelopmentInProgress intentionally not used here const AUTO_STATUS_LABELS: Record = { @@ -63,6 +71,14 @@ export default function DataAnnotation() { const [selectedRowKeys, setSelectedRowKeys] = useState<(string | number)[]>([]); const [selectedRows, setSelectedRows] = useState([]); const [datasetProjectMap, setDatasetProjectMap] = useState>({}); + const [editingAutoTask, setEditingAutoTask] = useState(null); + const [showEditAutoDatasetDialog, setShowEditAutoDatasetDialog] = useState(false); + const [editingManualTask, setEditingManualTask] = useState(null); + const [showEditManualDatasetDialog, setShowEditManualDatasetDialog] = useState(false); + const [importingAutoTask, setImportingAutoTask] = useState(null); + const [showImportAutoDialog, setShowImportAutoDialog] = useState(false); + const [importingManualTask, setImportingManualTask] = useState(null); + const [showImportManualDialog, setShowImportManualDialog] = useState(false); // 拉取自动标注任务(供轮询和创建成功后立即刷新复用) const refreshAutoTasks = async (silent = false) => { @@ -211,6 +227,57 @@ export default function DataAnnotation() { }); }; + const handleEditAutoTaskDataset = (row: any) => { + if (!row?.id) { + message.error("未找到对应的自动标注任务"); + return; + } + + const full = autoTasks.find((t: any) => t.id === row.id); + if (!full) { + message.error("未找到对应的自动标注任务详情,请稍后重试"); + return; + } + + setEditingAutoTask(full); + setShowEditAutoDatasetDialog(true); + }; + + const handleEditManualTaskDataset = (task: AnnotationTask) => { + if (!task?.id) { + message.error("未找到对应的标注任务"); + return; + } + setEditingManualTask(task); + setShowEditManualDatasetDialog(true); + }; + + const handleImportManualFromLabelStudio = (task: AnnotationTask) => { + if (!task?.id) { + message.error("未找到标注任务"); + return; + } + + setImportingManualTask(task); + setShowImportManualDialog(true); + }; + + const handleImportAutoFromLabelStudio = (row: any) => { + if (!row?.id) { + message.error("未找到对应的自动标注任务"); + return; + } + + const full = autoTasks.find((t: any) => t.id === row.id); + if (!full) { + message.error("未找到对应的自动标注任务详情,请稍后重试"); + 
return; + } + + setImportingAutoTask(full); + setShowImportAutoDialog(true); + }; + const handleSyncAutoToLabelStudio = (task: any) => { if (task.autoStatus !== "completed") { message.warning("仅已完成的自动标注任务可以同步到 Label Studio"); @@ -392,10 +459,10 @@ export default function DataAnnotation() { onClick: handleAnnotate, }, { - key: "sync", - label: "同步", - icon: , - onClick: handleSync, + key: "back-sync", + label: "后向同步", + icon: , + onClick: handleImportManualFromLabelStudio, }, { key: "delete", @@ -569,40 +636,109 @@ export default function DataAnnotation() { title: "操作", key: "actions", fixed: "right" as const, - width: 220, + width: 260, dataIndex: "actions", render: (_: any, task: any) => (
- {task._kind === "manual" && - operations.map((operation) => ( + {task._kind === "manual" && ( + <> + + + , + onClick: () => handleEditManualTaskDataset(task), + }, + { + key: "delete", + label: "删除任务", + icon: , + danger: true, + onClick: () => handleDelete(task), + }, + ], + }} + trigger={["click"]} + > + + + + {/* 二级功能:编辑任务数据集 + 删除任务(折叠菜单) */} + , + onClick: () => handleEditAutoTaskDataset(task), + }, + { + key: "delete", + label: "删除任务", + icon: , + danger: true, + onClick: () => handleDeleteAuto(task), + }, + ], + }} + trigger={["click"]} + > +
@@ -719,6 +855,68 @@ export default function DataAnnotation() { }, ]} /> + + {editingAutoTask && ( + { + setShowEditAutoDatasetDialog(false); + setEditingAutoTask(null); + }} + onSuccess={() => { + setShowEditAutoDatasetDialog(false); + setEditingAutoTask(null); + refreshAutoTasks(); + }} + /> + )} + + {editingManualTask && ( + { + setShowEditManualDatasetDialog(false); + setEditingManualTask(null); + }} + onSuccess={() => { + setShowEditManualDatasetDialog(false); + setEditingManualTask(null); + }} + /> + )} + + {importingManualTask && ( + { + setShowImportManualDialog(false); + setImportingManualTask(null); + }} + onSuccess={() => { + setShowImportManualDialog(false); + setImportingManualTask(null); + }} + /> + )} + + {importingAutoTask && ( + { + setShowImportAutoDialog(false); + setImportingAutoTask(null); + }} + onSuccess={() => { + setShowImportAutoDialog(false); + setImportingAutoTask(null); + }} + /> + )} ); } diff --git a/frontend/src/pages/DataAnnotation/ManualImportFromLabelStudioDialog.tsx b/frontend/src/pages/DataAnnotation/ManualImportFromLabelStudioDialog.tsx new file mode 100644 index 00000000..9dbefcb6 --- /dev/null +++ b/frontend/src/pages/DataAnnotation/ManualImportFromLabelStudioDialog.tsx @@ -0,0 +1,161 @@ +import { useEffect, useState } from "react"; +import { Modal, Form, Select, Input, message } from "antd"; +import type { AnnotationTask } from "./annotation.model"; +import { queryDatasetsUsingGet } from "@/pages/DataManagement/dataset.api"; +import type { Dataset } from "@/pages/DataManagement/dataset.model"; +import { importManualAnnotationFromLabelStudioUsingPost } from "./annotation.api"; + +interface ManualImportFromLabelStudioDialogProps { + visible: boolean; + task: AnnotationTask | null; + onCancel: () => void; + onSuccess: () => void; +} + +const EXPORT_FORMAT_OPTIONS = [ + "JSON", + "JSON_MIN", + "CSV", + "TSV", + "COCO", + "YOLO", + "YOLOv8", +]; + +export default function ManualImportFromLabelStudioDialog({ + visible, + task, + onCancel, + onSuccess, +}: ManualImportFromLabelStudioDialogProps) { + const [form] = Form.useForm(); + const [datasets, setDatasets] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + if (!visible) return; + + let cancelled = false; + + (async () => { + try { + const resp: any = await queryDatasetsUsingGet({ page: 0, size: 1000 }); + const list: Dataset[] = resp?.content || resp?.data?.content || resp?.data || resp || []; + if (!cancelled && Array.isArray(list)) { + setDatasets(list); + } + } catch (e) { + console.error("Failed to fetch datasets for manual LS import:", e); + if (!cancelled) { + message.error("获取数据集列表失败"); + } + } + })(); + + return () => { + cancelled = true; + }; + }, [visible]); + + useEffect(() => { + if (visible && task) { + form.setFieldsValue({ + targetDatasetId: task.datasetId, + exportFormat: "JSON", + }); + } + }, [visible, task, form]); + + const handleOk = async () => { + try { + const values = await form.validateFields(); + const targetDatasetId: string = values.targetDatasetId; + const exportFormat: string = values.exportFormat; + const fileName: string | undefined = values.fileName; + + if (!task?.id) { + message.error("未找到标注任务"); + return; + } + + setLoading(true); + await importManualAnnotationFromLabelStudioUsingPost(task.id, { + targetDatasetId, + exportFormat, + fileName: fileName?.trim() || undefined, + }); + + message.success("已从 Label Studio 导出结果并保存到数据集"); + onSuccess(); + } catch (e: any) { + if (e?.errorFields) { + return; + } + console.error("Failed 
to import manual annotations from Label Studio:", e); + message.error(e?.message || "后向同步失败,请稍后重试"); + } finally { + setLoading(false); + } + }; + + return ( + +
+ + {task?.name || "-"} + + + + ({ + label: fmt, + value: fmt, + }))} + /> + + + + + + +
+ 将从与该标注任务关联的 Label Studio 项目中, + 按所选格式导出完整标注结果,并作为一个文件保存到所选数据集中。 + 不会修改已有标签,仅追加一个导出工件文件。 +
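As the note above says, the export is stored in the target dataset as a single artifact file. On the server the saved name is derived from the optional `fileName` (path segments and any user-supplied extension stripped) plus an extension inferred from the export format, falling back to a timestamped `ls_export_<projectId>_<timestamp>` name. A hedged TypeScript rendering of that naming rule; the actual logic is the `ext_map` / `_sanitize_base_name` code in the Python routes, and this sketch only mirrors it:

```typescript
// Sketch of the back-sync artifact naming rule (mirrors, but is not, the server code).
const EXTENSION_BY_FORMAT: Record<string, string> = {
  JSON: ".json",
  JSON_MIN: ".json",
  CSV: ".csv",
  TSV: ".tsv",
  COCO: ".json",
  YOLO: ".json",
  YOLOV8: ".json",
};

function exportArtifactName(projectId: string, exportFormat: string, fileName?: string): string {
  const ext = EXTENSION_BY_FORMAT[exportFormat.toUpperCase()] ?? ".json";
  // e.g. "20240102_030405" (UTC), matching the backend's %Y%m%d_%H%M%S format.
  const timestamp = new Date().toISOString().slice(0, 19).replace(/[-:]/g, "").replace("T", "_");
  // Keep only the last path segment and drop any user-supplied extension.
  let stem = (fileName ?? "").trim().replace(/\\/g, "/").split("/").pop() ?? "";
  if (stem.includes(".")) stem = stem.slice(0, stem.lastIndexOf("."));
  return `${stem || `ls_export_${projectId}_${timestamp}`}${ext}`;
}
```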
+
+
+ ); +} diff --git a/frontend/src/pages/DataAnnotation/annotation.api.ts b/frontend/src/pages/DataAnnotation/annotation.api.ts index 6b932129..38affaec 100644 --- a/frontend/src/pages/DataAnnotation/annotation.api.ts +++ b/frontend/src/pages/DataAnnotation/annotation.api.ts @@ -18,6 +18,15 @@ export function deleteAnnotationTaskByIdUsingDelete(mappingId: string) { return del(`/api/annotation/project/${mappingId}`); } +// 手动标注:查询/更新映射当前关联的 DM 文件列表(用于“编辑任务数据集”) +export function getManualAnnotationMappingFilesUsingGet(mappingId: string) { + return get(`/api/annotation/project/${mappingId}/files`); +} + +export function updateManualAnnotationMappingFilesUsingPut(mappingId: string, data: any) { + return put(`/api/annotation/project/${mappingId}/files`, data); +} + // 标签配置管理 export function getTagConfigUsingGet() { return get("/api/annotation/tags/config"); @@ -62,6 +71,14 @@ export function loginAnnotationUsingGet(mappingId: string) { return get(`/api/annotation/project/${mappingId}/login`); } +// 手动标注:从 Label Studio 导回标注结果到某个数据集(导出为文件写入数据集) +export function importManualAnnotationFromLabelStudioUsingPost( + mappingId: string, + data: { targetDatasetId: string; exportFormat?: string } +) { + return post(`/api/annotation/project/${mappingId}/sync-label-studio-back`, data); +} + export function downloadAutoAnnotationResultUsingGet(taskId: string) { return download(`/api/annotation/auto/${taskId}/download`); } @@ -71,7 +88,25 @@ export function syncAutoAnnotationTaskToLabelStudioUsingPost(taskId: string) { return post(`/api/annotation/auto/${taskId}/sync-label-studio`); } +// 从 Label Studio 导回自动标注任务的标注结果(导出为文件写入指定数据集) +export function importAutoAnnotationFromLabelStudioUsingPost( + taskId: string, + data: { targetDatasetId: string; exportFormat?: string } +) { + return post(`/api/annotation/auto/${taskId}/sync-label-studio-back`, data); +} + // 查询自动标注任务关联的 Label Studio 项目 export function getAutoAnnotationLabelStudioProjectUsingGet(taskId: string) { return get(`/api/annotation/auto/${taskId}/label-studio-project`); } + +// 查询自动标注任务当前关联的 DM 文件列表(用于编辑任务数据集弹窗预选) +export function getAutoAnnotationTaskFilesUsingGet(taskId: string) { + return get(`/api/annotation/auto/${taskId}/files`); +} + +// 更新自动标注任务所关联的数据集文件,并触发重新调度与 LS 同步 +export function updateAutoAnnotationTaskFilesUsingPut(taskId: string, data: any) { + return put(`/api/annotation/auto/${taskId}/files`, data); +} diff --git a/frontend/src/pages/DataAnnotation/annotation.model.ts b/frontend/src/pages/DataAnnotation/annotation.model.ts index bd60ad4d..ec4091db 100644 --- a/frontend/src/pages/DataAnnotation/annotation.model.ts +++ b/frontend/src/pages/DataAnnotation/annotation.model.ts @@ -121,6 +121,9 @@ export interface AutoAnnotationTask { datasetName?: string; sourceDatasets?: string[]; + // 当前任务关联的 DM 文件 ID 列表(由后端返回,可选) + fileIds?: string[]; + config: { modelSize: string; confThreshold: number; diff --git a/runtime/datamate-python/app/module/annotation/client/labelstudio/client.py b/runtime/datamate-python/app/module/annotation/client/labelstudio/client.py index ca9b8cfc..6f1139f3 100644 --- a/runtime/datamate-python/app/module/annotation/client/labelstudio/client.py +++ b/runtime/datamate-python/app/module/annotation/client/labelstudio/client.py @@ -438,6 +438,52 @@ async def delete_project(self, project_id: int) -> bool: logger.error(f"Error while deleting project {project_id}: {e}") return False + async def export_project( + self, + project_id: int, + export_type: str = "JSON", + ) -> Optional[bytes]: + """导出 Label Studio 项目数据。 + + 对应 Label 
Studio 的项目导出接口,支持多种 exportType, + 例如 JSON/JSON_MIN/CSV/TSV/COCO/YOLO/YOLOv8 等。 + + 返回导出文件的原始二进制内容,调用方负责将其写入本地文件。 + """ + + try: + logger.debug( + "Exporting Label Studio project %s with exportType=%s", + project_id, + export_type, + ) + + response = await self.client.get( + f"/api/projects/{project_id}/export", + params={"exportType": export_type}, + ) + response.raise_for_status() + + content = response.content or b"" + logger.debug( + "Exported project %s with %d bytes", + project_id, + len(content), + ) + return content + + except httpx.HTTPStatusError as e: + logger.error( + "Export project %s failed HTTP %s: %s", + project_id, + e.response.status_code, + e.response.text, + ) + return None + except Exception as e: + logger.error("Error while exporting project %s: %s", project_id, e) + return None + async def get_task_annotations( self, task_id: int diff --git a/runtime/datamate-python/app/module/annotation/interface/auto.py b/runtime/datamate-python/app/module/annotation/interface/auto.py index 9239eeb4..161dd6fd 100644 --- a/runtime/datamate-python/app/module/annotation/interface/auto.py +++ b/runtime/datamate-python/app/module/annotation/interface/auto.py @@ -26,6 +26,8 @@ CreateAutoAnnotationTaskRequest, AutoAnnotationTaskResponse, AutoAnnotationConfig, + UpdateAutoAnnotationTaskFilesRequest, + ImportFromLabelStudioRequest, ) from ..service.auto import AutoAnnotationTaskService from ..service.mapping import DatasetMappingService @@ -173,6 +175,7 @@ async def _ensure_ls_mapping_for_auto_task( task_name: str, file_ids: Optional[List[str]] = None, auto_task_id: Optional[str] = None, + delete_orphans: bool = False, ) -> Optional[str]: """确保给定数据集存在一个 Label Studio 项目映射,并按需同步子集文件。 @@ -305,7 +308,11 @@ async def _ensure_ls_mapping_for_auto_task( if not file_ids: # 兼容老逻辑:未指定 file_ids 时,同步整个主数据集 - await sync_service.sync_files(mapping, 100) + await sync_service.sync_files( + mapping, + 100, + delete_orphans=delete_orphans, + ) else: # 按 file_ids 反查所属数据集:dataset_id -> set(file_ids) stmt = ( @@ -375,7 +382,7 @@ async def _ensure_ls_mapping_for_auto_task( mapping, 100, allowed_file_ids={str(fid) for fid in file_ids}, - delete_orphans=False, + delete_orphans=delete_orphans, ) else: # 对每个涉及到的数据集,使用 override_dataset_id 将其文件同步到同一个项目 @@ -385,7 +392,7 @@ async def _ensure_ls_mapping_for_auto_task( 100, allowed_file_ids=ds_file_ids, override_dataset_id=ds_id, - delete_orphans=False, + delete_orphans=delete_orphans, ) except Exception as e: # pragma: no cover - 同步失败不影响项目创建 logger.warning( @@ -397,6 +404,8 @@ async def _ensure_ls_mapping_for_auto_task( return str(project_id) + + @router.get("", response_model=StandardResponse[List[AutoAnnotationTaskResponse]]) async def list_auto_annotation_tasks( db: AsyncSession = Depends(get_db), @@ -482,6 +491,91 @@ async def create_auto_annotation_task( ) +@router.put("/{task_id}/files", response_model=StandardResponse[AutoAnnotationTaskResponse]) +async def update_auto_annotation_task_files( + task_id: str = Path(..., description="任务ID"), + request: UpdateAutoAnnotationTaskFilesRequest = ..., # 通过 body 传入 datasetId 与 fileIds + db: AsyncSession = Depends(get_db), +): + """更新自动标注任务所关联的数据集文件,并同步到 Label Studio。 + + 最新约定: + 1. 创建任务时选择的文件集合视为“基础集合”,后续编辑时不允许将其移除,只能追加新文件; + 2. runtime worker 仅对新增文件执行自动标注,历史文件结果和输出数据集保持不变; + 3. 编辑任务数据集时,仅将新增文件同步到 Label Studio,不再删除已有任务。 + """ + + # 1. 
获取现有任务(响应模型)以便读取当前配置 + existing = await service.get_task(db, task_id) + if not existing: + raise HTTPException(status_code=404, detail="Task not found") + + # 1.1 计算旧文件集合与本次提交集合: + # - 旧集合视为“锁定”,不允许通过编辑接口移除; + # - 新集合与旧集合求并集后才会写回任务记录; + # - 仅对 (新集合 - 旧集合) 这一部分视为“新增文件”,用于后续 LS 同步。 + old_ids = {str(fid) for fid in (existing.file_ids or [])} + requested_ids = {str(fid) for fid in (request.file_ids or [])} + added_ids = sorted(requested_ids - old_ids) + final_ids = sorted(old_ids | requested_ids) + + # datasetId 若未显式传入,则沿用原任务值 + dataset_id = request.dataset_id or existing.dataset_id + + # 2. 更新底层任务记录(ORM),重置状态与文件列表(文件集合为旧集合 ∪ 本次提交集合) + updated = await service.update_task_files( + db, + task_id=task_id, + dataset_id=str(dataset_id), + file_ids=final_ids, + ) + if not updated: + raise HTTPException(status_code=404, detail="Task not found") + + # 3. 使用当前配置 + 新 file_ids 同步 LS 项目映射。 + # 解析失败时退回到一个默认配置(与其它接口保持一致)。 + try: + auto_config = AutoAnnotationConfig.model_validate(existing.config) # type: ignore[arg-type] + except Exception as e: # pragma: no cover - 降级使用默认配置 + logger.warning( + "Failed to parse auto task config when updating LS mapping for task %s: %s", + task_id, + e, + ) + auto_config = AutoAnnotationConfig( + model_size="l", + conf_threshold=0.5, + target_classes=[], + ) + + # 仅在存在“新增文件”时,才将这部分文件同步到 Label Studio; + # 不再删除 LS 中已有任务(delete_orphans=False)。 + if added_ids: + try: + await _ensure_ls_mapping_for_auto_task( + db, + dataset_id=str(dataset_id), + dataset_name=updated.dataset_name, + config=auto_config, + task_name=updated.name, + file_ids=added_ids, + auto_task_id=updated.id, + delete_orphans=False, + ) + except Exception as e: # pragma: no cover - 映射同步失败不阻塞前端 + logger.warning( + "Failed to sync Label Studio mapping when updating auto task %s: %s", + task_id, + e, + ) + + return StandardResponse( + code=200, + message="success", + data=updated, + ) + + @router.get("/{task_id}/status", response_model=StandardResponse[AutoAnnotationTaskResponse]) async def get_auto_annotation_task_status( task_id: str = Path(..., description="任务ID"), @@ -503,6 +597,82 @@ async def get_auto_annotation_task_status( ) +@router.get("/{task_id}/files", response_model=StandardResponse[List[Dict[str, Any]]]) +async def get_auto_annotation_task_files( + task_id: str = Path(..., description="任务ID"), + db: AsyncSession = Depends(get_db), +): + """查询自动标注任务当前关联的 DM 文件列表。 + + 该接口主要用于前端“编辑任务数据集”弹窗的初始选中状态: + - 若任务记录中存在 file_ids,则按这些 ID 精确查询; + - 若 file_ids 为空,则回退为查询主数据集下的全部 ACTIVE 文件。 + + 返回的每一项仅包含前端需要的基础字段,避免一次性返回过多无关信息。 + """ + + from sqlalchemy import select # 本地导入避免循环依赖 + from app.db.models.dataset_management import DatasetFiles, Dataset + + # 先获取任务,确保存在 + result = await db.execute( + select(AutoAnnotationTask).where( + AutoAnnotationTask.id == task_id, + AutoAnnotationTask.deleted_at.is_(None), + ) + ) + task_row = result.scalar_one_or_none() + if not task_row: + raise HTTPException(status_code=404, detail="Task not found") + + file_ids = getattr(task_row, "file_ids", None) or [] + dataset_id = getattr(task_row, "dataset_id", None) + + files_query = None + params: Dict[str, Any] = {} + + if file_ids: + # 按任务记录中的 file_ids 精确查询 + files_query = select(DatasetFiles).where(DatasetFiles.id.in_(file_ids)) + else: + # 未显式记录 file_ids 时,回退为主数据集下所有 ACTIVE 文件 + if not dataset_id: + return StandardResponse(code=200, message="success", data=[]) + files_query = select(DatasetFiles).where( + DatasetFiles.dataset_id == dataset_id, + DatasetFiles.status == "ACTIVE", + ) + + files_result = await db.execute(files_query) + files 
= list(files_result.scalars().all()) + + # 为涉及到的 dataset_id 一次性查询名称映射,方便前端展示 + dataset_ids = {str(f.dataset_id) for f in files if getattr(f, "dataset_id", None)} + dataset_name_map: Dict[str, str] = {} + if dataset_ids: + ds_result = await db.execute( + select(Dataset.id, Dataset.name).where(Dataset.id.in_(dataset_ids)) + ) + for ds_id, ds_name in ds_result.fetchall(): + dataset_name_map[str(ds_id)] = ds_name or "" + + data: List[Dict[str, Any]] = [] + for f in files: + fid = str(getattr(f, "id")) + ds_id = str(getattr(f, "dataset_id")) if getattr(f, "dataset_id", None) else None + item: Dict[str, Any] = { + "id": fid, + "datasetId": ds_id, + "datasetName": dataset_name_map.get(ds_id or "", ""), + "fileName": getattr(f, "file_name", ""), + "fileSize": int(getattr(f, "file_size", 0) or 0), + "filePath": getattr(f, "file_path", ""), + } + data.append(item) + + return StandardResponse(code=200, message="success", data=data) + + @router.get("/{task_id}/label-studio-project", response_model=StandardResponse[Dict[str, str]]) async def get_auto_annotation_label_studio_project( task_id: str = Path(..., description="任务ID"), @@ -856,3 +1026,160 @@ async def sync_auto_annotation_to_label_studio( message="success", data=created_count, ) + + +@router.post("/{task_id}/sync-label-studio-back", response_model=StandardResponse[bool]) +async def import_from_label_studio_to_dataset( + task_id: str = Path(..., description="任务ID"), + body: ImportFromLabelStudioRequest | None = None, + db: AsyncSession = Depends(get_db), +): + """将指定自动标注任务在 Label Studio 中的标注结果导回到某个数据集。 + + 行为说明: + - 基于自动标注任务找到/创建对应的 Label Studio 项目; + - 调用 Label Studio 项目导出接口,按 exportFormat 下载完整导出文件; + - 将该导出文件作为一个普通数据集文件写入 targetDatasetId 对应的数据集目录; + - 不解析每条标注、不修改现有 tags,仅作为“标注导出工件”落盘,方便后续人工使用。 + """ + + import os + import tempfile + from datetime import datetime + + if body is None: + raise HTTPException(status_code=400, detail="Request body is required") + + # 1. 获取并校验自动标注任务 + task = await service.get_task(db, task_id) + if not task: + raise HTTPException(status_code=404, detail="Task not found") + + # 2. 校验目标数据集是否存在 + dm_service = DatasetManagementService(db) + target_dataset_id = body.target_dataset_id + dataset = await dm_service.get_dataset(target_dataset_id) + if not dataset: + raise HTTPException(status_code=404, detail="Target dataset not found") + + # 3. 
查找或创建与该自动标注任务关联的 Label Studio 项目 + mapping_service = DatasetMappingService(db) + mappings = await mapping_service.get_mappings_by_dataset_id(task.dataset_id) + + project_id: Optional[str] = None + for m in mappings: + cfg = getattr(m, "configuration", None) or {} + if isinstance(cfg, dict) and cfg.get("autoTaskId") == task.id: + project_id = str(m.labeling_project_id) + break + + if project_id is None: + for m in mappings: + if m.name == task.name: + project_id = str(m.labeling_project_id) + break + + if project_id is None: + # 与前向同步逻辑保持一致:如无现成项目则按自动标注配置自动创建。 + try: + auto_config = AutoAnnotationConfig.model_validate(task.config) + except Exception as e: # pragma: no cover + logger.warning("Failed to parse auto task config when creating LS project for back sync: %s", e) + auto_config = AutoAnnotationConfig( + model_size="l", + conf_threshold=0.5, + target_classes=[], + ) + + project_id = await _ensure_ls_mapping_for_auto_task( + db, + dataset_id=task.dataset_id, + dataset_name=task.dataset_name, + config=auto_config, + task_name=task.name, + file_ids=[str(fid) for fid in (task.file_ids or [])], + auto_task_id=task.id, + ) + + if not project_id: + raise HTTPException( + status_code=500, + detail=( + "Failed to create or resolve Label Studio project for this auto task " + "when importing annotations back." + ), + ) + + # 4. 调用 Label Studio 导出接口 + ls_client = LabelStudioClient( + base_url=settings.label_studio_base_url, + token=settings.label_studio_user_token, + ) + + export_format = (body.export_format or "JSON").upper() + + # 简单根据导出格式推断文件扩展名,仅用于提高可读性 + ext_map = { + "JSON": ".json", + "JSON_MIN": ".json", + "CSV": ".csv", + "TSV": ".tsv", + "COCO": ".json", + "YOLO": ".json", + "YOLOV8": ".json", + } + file_ext = ext_map.get(export_format, ".json") + + content = await ls_client.export_project(int(project_id), export_type=export_format) + if content is None or len(content) == 0: + raise HTTPException(status_code=500, detail="Failed to export project from Label Studio") + + # 5. 将导出结果写入临时文件,再通过 DatasetManagementService 复制到数据集目录并注册记录 + tmp_fd, tmp_path = tempfile.mkstemp(suffix=file_ext) + os.close(tmp_fd) + + try: + with open(tmp_path, "wb") as f: + f.write(content) + + # 生成一个具备时间戳的文件名,避免与现有文件冲突; + # 若前端提供了自定义文件名,则优先使用其主体部分,再附加正确的扩展名。 + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + + def _sanitize_base_name(raw: str) -> str: + # 去掉路径分隔符,仅保留最后一段 + name = (raw or "").strip().replace("\\", "/").split("/")[-1] + # 去掉用户自带的扩展名,避免与服务器推断的后缀冲突 + if "." 
in name: + name = name.rsplit(".", 1)[0] + # 退回到默认前缀 + return name or f"ls_export_{project_id}_{timestamp}" + + if getattr(body, "file_name", None): + base_stem = _sanitize_base_name(body.file_name) # type: ignore[arg-type] + base_name = f"{base_stem}{file_ext}" + else: + base_name = f"ls_export_{project_id}_{timestamp}{file_ext}" + + # DatasetManagementService.add_files_to_dataset 会使用源文件名决定目标文件名, + # 因此这里将临时文件重命名为期望的可读名称后再导入。 + tmp_dir = os.path.dirname(tmp_path) + target_tmp_path = os.path.join(tmp_dir, base_name) + os.replace(tmp_path, target_tmp_path) + + await dm_service.add_files_to_dataset(target_dataset_id, [target_tmp_path]) + finally: + # 清理临时文件(若仍存在) + if os.path.exists(tmp_path): + try: + os.remove(tmp_path) + except Exception: + pass + # 也尝试清理重命名后的临时文件 + if "target_tmp_path" in locals() and os.path.exists(target_tmp_path): + try: + os.remove(target_tmp_path) + except Exception: + pass + + return StandardResponse(code=200, message="success", data=True) diff --git a/runtime/datamate-python/app/module/annotation/interface/project.py b/runtime/datamate-python/app/module/annotation/interface/project.py index 50425dda..c75318cd 100644 --- a/runtime/datamate-python/app/module/annotation/interface/project.py +++ b/runtime/datamate-python/app/module/annotation/interface/project.py @@ -23,6 +23,7 @@ DeleteDatasetResponse, DatasetMappingResponse, ) +from ..schema.auto import ImportFromLabelStudioRequest, UpdateAutoAnnotationTaskFilesRequest router = APIRouter( prefix="/project", @@ -80,80 +81,101 @@ async def create_mapping( """ try: dm_client = DatasetManagementService(db) - ls_client = LabelStudioClient(base_url=settings.label_studio_base_url, - token=settings.label_studio_user_token) + ls_client = LabelStudioClient( + base_url=settings.label_studio_base_url, + token=settings.label_studio_user_token, + ) mapping_service = DatasetMappingService(db) sync_service = SyncService(dm_client, ls_client, mapping_service) template_service = AnnotationTemplateService() - logger.info(f"Create dataset mapping request: dataset_id={request.dataset_id}, file_ids={request.file_ids}") + logger.info( + "Create dataset mapping request: dataset_id=%s, file_ids=%s", + request.dataset_id, + request.file_ids, + ) # 从DM服务获取数据集信息 dataset_info = await dm_client.get_dataset(request.dataset_id) if not dataset_info: raise HTTPException( status_code=404, - detail=f"Dataset not found in DM service: {request.dataset_id}" + detail=f"Dataset not found in DM service: {request.dataset_id}", ) - project_name = request.name or \ - dataset_info.name or \ - "A new project from DataMate" + project_name = ( + request.name + or dataset_info.name + or "A new project from DataMate" + ) - project_description = request.description or \ - dataset_info.description or \ - f"Imported from DM dataset {dataset_info.name} ({dataset_info.id})" + project_description = ( + request.description + or dataset_info.description + or f"Imported from DM dataset {dataset_info.name} ({dataset_info.id})" + ) # 如果提供了模板ID,获取模板配置 label_config = None if request.template_id: - logger.info(f"Using template: {request.template_id}") + logger.info("Using template: %s", request.template_id) template = await template_service.get_template(db, request.template_id) if not template: raise HTTPException( status_code=404, - detail=f"Template not found: {request.template_id}" + detail=f"Template not found: {request.template_id}", ) label_config = template.label_config - logger.debug(f"Template label config loaded for template: {template.name}") + logger.debug( + 
"Template label config loaded for template: %s", + template.name, + ) # 在Label Studio中创建项目 project_data = await ls_client.create_project( title=project_name, description=project_description, - label_config=label_config # 传递模板配置 + label_config=label_config, ) - if not project_data: raise HTTPException( status_code=500, - detail="Fail to create Label Studio project." + detail="Fail to create Label Studio project.", ) project_id = project_data["id"] # 配置主数据集的本地存储:dataset/ - local_storage_path = f"{settings.label_studio_local_document_root}/{request.dataset_id}" + local_storage_path = ( + f"{settings.label_studio_local_document_root}/{request.dataset_id}" + ) storage_result = await ls_client.create_local_storage( project_id=project_id, path=local_storage_path, title="Dataset_BLOB", use_blob_urls=True, - description=f"Local storage for dataset {dataset_info.name}" + description=f"Local storage for dataset {dataset_info.name}", ) if not storage_result: # 本地存储配置失败,记录警告但不中断流程 - logger.warning(f"Failed to configure local storage for project {project_id}") + logger.warning( + "Failed to configure local storage for project %s", + project_id, + ) else: - logger.info(f"Local storage configured for project {project_id}: {local_storage_path}") + logger.info( + "Local storage configured for project %s: %s", + project_id, + local_storage_path, + ) labeling_project = LabelingProject( - id=str(uuid.uuid4()), # Generate UUID here + id=str(uuid.uuid4()), dataset_id=request.dataset_id, labeling_project_id=str(project_id), name=project_name, - template_id=request.template_id, # Save template_id to database + template_id=request.template_id, ) # 创建映射关系,包含项目名称(先持久化映射以获得 mapping.id) @@ -195,23 +217,29 @@ async def create_mapping( "Some file_ids could not be resolved to dataset_id when syncing manual project files: %s", ",".join(sorted(unresolved_ids)), ) - grouped.setdefault(str(request.dataset_id), set()).update(unresolved_ids) + grouped.setdefault(str(request.dataset_id), set()).update( + unresolved_ids + ) - # 为所有涉及到的额外数据集提前配置本地存储,避免首次引用该数据集时 - # /data/local-files/?d=//... 
返回 404 的情况。 + # 为所有涉及到的额外数据集提前配置本地存储 try: for extra_ds_id in grouped.keys(): - # 主数据集已在上方配置过,这里只为额外数据集创建存储记录 + # 主数据集已在上方配置过 if str(extra_ds_id) == str(request.dataset_id): continue - extra_local_storage_path = f"{settings.label_studio_local_document_root}/{extra_ds_id}" + extra_local_storage_path = ( + f"{settings.label_studio_local_document_root}/{extra_ds_id}" + ) extra_storage_result = await ls_client.create_local_storage( project_id=project_id, path=extra_local_storage_path, title=f"Dataset_BLOB_{extra_ds_id}", use_blob_urls=True, - description=f"Local storage for dataset {extra_ds_id} (multi-dataset manual project)", + description=( + f"Local storage for dataset {extra_ds_id} " + "(multi-dataset manual project)" + ), ) if not extra_storage_result: logger.warning( @@ -225,7 +253,7 @@ async def create_mapping( project_id, extra_local_storage_path, ) - except Exception as e: + except Exception as e: # pragma: no cover - 容错 logger.warning( "Error while configuring extra local storage for project %s: %s", project_id, @@ -250,7 +278,7 @@ async def create_mapping( override_dataset_id=ds_id, delete_orphans=False, ) - except Exception as e: + except Exception as e: # pragma: no cover - 映射创建成功但首次文件同步失败 # 同步失败不影响项目和映射本身的创建,前端可通过“同步”按钮重试 logger.warning( "Failed to sync dataset files for manual LS project %s with file_ids filter: %s", @@ -261,21 +289,319 @@ async def create_mapping( response_data = DatasetMappingCreateResponse( id=mapping.id, labeling_project_id=str(mapping.labeling_project_id), - labeling_project_name=mapping.name or project_name + labeling_project_name=mapping.name or project_name, ) return StandardResponse( code=201, message="success", - data=response_data + data=response_data, ) except HTTPException: raise - except Exception as e: + except Exception as e: # pragma: no cover - 兜底错误 logger.error(f"Error while creating dataset mapping: {e}") raise HTTPException(status_code=500, detail="Internal server error") + +@router.get("/{mapping_id}/files", response_model=StandardResponse[list[dict]]) +async def get_manual_mapping_files( + mapping_id: str = Path(..., description="映射ID (mapping UUID)"), + db: AsyncSession = Depends(get_db), +): + """查询手动标注映射当前在 Label Studio 中已存在的 DM 文件列表。 + + 该接口主要用于前端“编辑任务数据集”弹窗的初始选中状态: + - 通过 Label Studio 任务反查当前已有关联的 DM 文件ID; + - 再到 DM 数据库中查询这些文件的基础信息与所属数据集。 + """ + + from sqlalchemy import select as _select # 本地导入避免循环依赖 + from app.db.models.dataset_management import DatasetFiles, Dataset + + mapping_service = DatasetMappingService(db) + mapping = await mapping_service.get_mapping_by_uuid(mapping_id) + if not mapping: + raise HTTPException(status_code=404, detail="Mapping not found") + + dm_client = DatasetManagementService(db) + ls_client = LabelStudioClient( + base_url=settings.label_studio_base_url, + token=settings.label_studio_user_token, + ) + sync_service = SyncService(dm_client, ls_client, mapping_service) + + existing_mapping = await sync_service.get_existing_dm_file_mapping( + mapping.labeling_project_id + ) + file_ids = list(existing_mapping.keys()) + if not file_ids: + return StandardResponse(code=200, message="success", data=[]) + + files_result = await db.execute( + _select(DatasetFiles).where(DatasetFiles.id.in_(file_ids)) + ) + files = list(files_result.scalars().all()) + + dataset_ids = { + str(getattr(f, "dataset_id")) + for f in files + if getattr(f, "dataset_id", None) + } + dataset_name_map: dict[str, str] = {} + if dataset_ids: + ds_result = await db.execute( + _select(Dataset.id, Dataset.name).where(Dataset.id.in_(dataset_ids)) + ) 
+ for ds_id, ds_name in ds_result.fetchall(): + dataset_name_map[str(ds_id)] = ds_name or "" + + data: list[dict] = [] + for f in files: + fid = str(getattr(f, "id")) + ds_id = ( + str(getattr(f, "dataset_id")) + if getattr(f, "dataset_id", None) + else None + ) + item: dict = { + "id": fid, + "datasetId": ds_id, + "datasetName": dataset_name_map.get(ds_id or "", ""), + "fileName": getattr(f, "file_name", ""), + "fileSize": int(getattr(f, "file_size", 0) or 0), + "filePath": getattr(f, "file_path", ""), + "fileType": getattr(f, "file_type", None), + } + data.append(item) + + return StandardResponse(code=200, message="success", data=data) + + +@router.put("/{mapping_id}/files", response_model=StandardResponse[bool]) +async def update_manual_mapping_files( + mapping_id: str = Path(..., description="映射ID (mapping UUID)"), + body: UpdateAutoAnnotationTaskFilesRequest = ..., # 复用通用结构:datasetId + fileIds + db: AsyncSession = Depends(get_db), +): + """更新手动标注映射所关联的 DM 文件集合(仅追加,不删除已有任务)。 + + 语义约定: + - 映射创建时所同步的文件集合视为“基础集合”,后续编辑不支持移除; + - 本接口只会为新增的 fileIds 在 Label Studio 中创建任务; + - 不会删除任何现有任务(delete_orphans=False)。 + """ + + from sqlalchemy import select as _select # 本地导入 + from typing import Set as _Set, Dict as _Dict + from app.db.models.dataset_management import DatasetFiles + + mapping_service = DatasetMappingService(db) + mapping = await mapping_service.get_mapping_by_uuid(mapping_id) + if not mapping: + raise HTTPException(status_code=404, detail="Mapping not found") + + dm_client = DatasetManagementService(db) + ls_client = LabelStudioClient( + base_url=settings.label_studio_base_url, + token=settings.label_studio_user_token, + ) + sync_service = SyncService(dm_client, ls_client, mapping_service) + + requested_ids = {str(fid) for fid in (body.file_ids or [])} + if not requested_ids: + # 不做任何变更,但认为成功 + return StandardResponse(code=200, message="success", data=True) + + existing_mapping = await sync_service.get_existing_dm_file_mapping( + mapping.labeling_project_id + ) + existing_ids = set(existing_mapping.keys()) + + # 仅对新增文件创建任务 + new_ids = sorted(requested_ids - existing_ids) + if not new_ids: + return StandardResponse(code=200, message="success", data=True) + + stmt = ( + _select(DatasetFiles.dataset_id, DatasetFiles.id) + .where(DatasetFiles.id.in_(new_ids)) + ) + result = await db.execute(stmt) + rows = result.fetchall() + + grouped: _Dict[str, _Set[str]] = {} + resolved_ids: _Set[str] = set() + + for ds_id, fid in rows: + if not ds_id or not fid: + continue + fid_str = str(fid) + grouped.setdefault(str(ds_id), set()).add(fid_str) + resolved_ids.add(fid_str) + + unresolved_ids = {str(fid) for fid in new_ids} - resolved_ids + if unresolved_ids: + logger.warning( + "Some file_ids could not be resolved to dataset_id when updating manual mapping files: %s", + ",".join(sorted(unresolved_ids)), + ) + grouped.setdefault(str(mapping.dataset_id), set()).update(unresolved_ids) + + # 为所有涉及到的额外数据集提前配置本地存储(与创建逻辑保持一致) + try: + for extra_ds_id in grouped.keys(): + if str(extra_ds_id) == str(mapping.dataset_id): + continue + + extra_local_storage_path = ( + f"{settings.label_studio_local_document_root}/{extra_ds_id}" + ) + extra_storage_result = await ls_client.create_local_storage( + project_id=int(mapping.labeling_project_id), + path=extra_local_storage_path, + title=f"Dataset_BLOB_{extra_ds_id}", + use_blob_urls=True, + description=( + f"Local storage for dataset {extra_ds_id} " + "(multi-dataset manual project, edit)" + ), + ) + if not extra_storage_result: + logger.warning( + "Failed to configure 
extra local storage for project %s (dataset %s) when updating manual mapping files", + mapping.labeling_project_id, + extra_ds_id, + ) + else: + logger.info( + "Extra local storage configured for project %s: %s (edit)", + mapping.labeling_project_id, + extra_local_storage_path, + ) + except Exception as e: # pragma: no cover + logger.warning( + "Error while configuring extra local storage for project %s during update: %s", + mapping.labeling_project_id, + e, + ) + + # 将新增文件按数据集分组,同步到 Label Studio;不删除已有任务 + for ds_id, ds_file_ids in grouped.items(): + await sync_service.sync_files( + mapping, + batch_size=100, + allowed_file_ids=ds_file_ids, + override_dataset_id=ds_id, + delete_orphans=False, + ) + + return StandardResponse(code=200, message="success", data=True) + + +@router.post("/{mapping_id}/sync-label-studio-back", response_model=StandardResponse[bool]) +async def import_manual_from_label_studio_to_dataset( + mapping_id: str = Path(..., description="映射ID (mapping UUID)"), + body: ImportFromLabelStudioRequest | None = None, + db: AsyncSession = Depends(get_db), +): + """将手动标注工程在 Label Studio 中的标注结果导回到某个数据集。 + + 行为与自动标注的后向同步保持一致: + - 按 mapping_id 定位 Label Studio 项目; + - 通过 exportType 导出项目完整结果(JSON/COCO/YOLO 等); + - 将导出文件作为一个普通文件保存到指定数据集目录,并注册到 t_dm_dataset_files; + - 不解析每条标注、不修改 tags,仅追加一个“导出工件”文件。 + """ + + import os + import tempfile + from datetime import datetime + + if body is None: + raise HTTPException(status_code=400, detail="Request body is required") + + # 1. 获取并校验映射(手动标注任务) + mapping_service = DatasetMappingService(db) + mapping = await mapping_service.get_mapping_by_uuid(mapping_id) + if not mapping: + raise HTTPException(status_code=404, detail="Mapping not found") + + # 2. 校验目标数据集是否存在 + dm_service = DatasetManagementService(db) + target_dataset_id = body.target_dataset_id + dataset = await dm_service.get_dataset(target_dataset_id) + if not dataset: + raise HTTPException(status_code=404, detail="Target dataset not found") + + # 3. 调用 Label Studio 导出接口 + ls_client = LabelStudioClient( + base_url=settings.label_studio_base_url, + token=settings.label_studio_user_token, + ) + + export_format = (body.export_format or "JSON").upper() + + ext_map = { + "JSON": ".json", + "JSON_MIN": ".json", + "CSV": ".csv", + "TSV": ".tsv", + "COCO": ".json", + "YOLO": ".json", + "YOLOV8": ".json", + } + file_ext = ext_map.get(export_format, ".json") + + project_id = mapping.labeling_project_id + content = await ls_client.export_project(int(project_id), export_type=export_format) + if content is None or len(content) == 0: + raise HTTPException(status_code=500, detail="Failed to export project from Label Studio") + + # 4. 将导出结果写入临时文件,再通过 DatasetManagementService 导入到数据集 + tmp_fd, tmp_path = tempfile.mkstemp(suffix=file_ext) + os.close(tmp_fd) + + try: + with open(tmp_path, "wb") as f: + f.write(content) + + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + + def _sanitize_base_name(raw: str) -> str: + # 去掉路径分隔符,仅保留最后一段 + name = (raw or "").strip().replace("\\", "/").split("/")[-1] + # 去掉用户自带的扩展名,避免与服务器推断的后缀冲突 + if "." 
+
 @router.get("", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
 async def list_mappings(
     page: int = Query(1, ge=1, description="Page number (1-based)"),
diff --git a/runtime/datamate-python/app/module/annotation/schema/auto.py b/runtime/datamate-python/app/module/annotation/schema/auto.py
index 4257b9c5..95f410df 100644
--- a/runtime/datamate-python/app/module/annotation/schema/auto.py
+++ b/runtime/datamate-python/app/module/annotation/schema/auto.py
@@ -76,3 +76,54 @@ class AutoAnnotationTaskListResponse(BaseModel):
     total: int = Field(..., description="Total count")
 
     model_config = ConfigDict(populate_by_name=True)
+
+
+class UpdateAutoAnnotationTaskFilesRequest(BaseModel):
+    """Update the dataset files associated with an auto-annotation task.
+
+    - dataset_id: primary dataset ID; if the frontend does not pass it explicitly, the API layer falls back to the task's existing dataset_id.
+    - file_ids: selected file IDs, which may span multiple datasets;
+      when the task actually runs, the worker only re-infers files that are not yet present in the task's output dataset.
+    """
+
+    dataset_id: Optional[str] = Field(
+        default=None,
+        alias="datasetId",
+        description="Primary dataset ID (optional; defaults to the task's existing datasetId when omitted)",
+    )
+    file_ids: List[str] = Field(
+        default_factory=list,
+        alias="fileIds",
+        description="Dataset file IDs to include in this auto-annotation task; may span multiple datasets",
+    )
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class ImportFromLabelStudioRequest(BaseModel):
+    """Request body for importing annotation results from Label Studio into a DM dataset.
+
+    - target_dataset_id: ID of the dataset the export result is written to;
+    - export_format: Label Studio export format (e.g. JSON/JSON_MIN/CSV/TSV/COCO/YOLO).
+    - file_name: optional custom name (without path) for the file saved into the dataset;
+      any extension or path separators are trimmed by the server, which keeps only the stem and appends the correct suffix.
+    """
+
+    target_dataset_id: str = Field(
+        ...,
+        alias="targetDatasetId",
+        description="Target dataset ID for the import",
+    )
+    export_format: str = Field(
+        default="JSON",
+        alias="exportFormat",
+        description="Label Studio export format (JSON/COCO/YOLO, etc.)",
+    )
+
+    file_name: Optional[str] = Field(
+        default=None,
+        alias="fileName",
+        description="Custom export file name (optional, without path; the extension is appended automatically based on the export format)",
+    )
+
+    model_config = ConfigDict(populate_by_name=True)
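
A short usage sketch of the two request models above (assumed to run where they are importable, not part of the patch): thanks to populate_by_name, both the camelCase aliases and the snake_case field names are accepted, and by_alias=True restores the keys the frontend sends.

# --- illustrative sketch ---
# Validation accepts the camelCase payload the frontend posts.
payload = {"datasetId": "ds-123", "fileIds": ["f-1", "f-2"]}
req = UpdateAutoAnnotationTaskFilesRequest.model_validate(payload)
assert req.dataset_id == "ds-123" and req.file_ids == ["f-1", "f-2"]

# Construction by snake_case field names also works because populate_by_name=True.
imp = ImportFromLabelStudioRequest(target_dataset_id="ds-out", export_format="COCO")
# Serializing with by_alias=True restores the camelCase keys expected on the wire.
assert imp.model_dump(by_alias=True)["targetDatasetId"] == "ds-out"
# --- end sketch ---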
diff --git a/runtime/datamate-python/app/module/annotation/service/auto.py b/runtime/datamate-python/app/module/annotation/service/auto.py
index 78b3dac7..2eb9ee4f 100644
--- a/runtime/datamate-python/app/module/annotation/service/auto.py
+++ b/runtime/datamate-python/app/module/annotation/service/auto.py
@@ -152,3 +152,52 @@ async def soft_delete_task(self, db: AsyncSession, task_id: str) -> bool:
         task.deleted_at = datetime.now()
         await db.commit()
         return True
+
+    async def update_task_files(
+        self,
+        db: AsyncSession,
+        task_id: str,
+        *,
+        dataset_id: str,
+        file_ids: List[str],
+    ) -> Optional[AutoAnnotationTaskResponse]:
+        """Update the dataset and file list associated with an auto-annotation task.
+
+        - Overwrite the task's dataset_id and file_ids;
+        - Reset the task to pending so the worker reschedules it;
+        - Keep the existing output_path and statistics so the same task always reuses the same output dataset;
+          the worker inspects the images already under output_path and only auto-annotates the newly added files.
+        """
+
+        result = await db.execute(
+            select(AutoAnnotationTask).where(
+                AutoAnnotationTask.id == task_id,
+                AutoAnnotationTask.deleted_at.is_(None),
+            )
+        )
+        task = result.scalar_one_or_none()
+        if not task:
+            return None
+
+        now = datetime.now()
+
+        task.dataset_id = dataset_id
+        task.file_ids = file_ids or []
+
+        # Mark the task as pending so the worker re-runs it against the same output directory, annotating only the newly added files
+        task.status = "pending"
+        task.progress = 0
+        task.updated_at = now
+
+        db.add(task)
+        await db.commit()
+        await db.refresh(task)
+
+        resp = AutoAnnotationTaskResponse.model_validate(task)
+        try:
+            resp.source_datasets = await self._compute_source_datasets(db, task)
+        except Exception:
+            fallback_name = getattr(task, "dataset_name", None)
+            fallback_id = getattr(task, "dataset_id", "")
+            resp.source_datasets = [fallback_name] if fallback_name else [fallback_id]
+        return resp
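
How a caller might use update_task_files (a sketch under assumptions, not part of the patch): `service` stands for the task service instance that owns the method, `db` for an AsyncSession, and `current_task` for the task row already loaded for the request; the datasetId fallback described in the schema docstring happens here, outside the service.

# --- illustrative sketch ---
async def reschedule_with_new_files(service, db, current_task,
                                    body: "UpdateAutoAnnotationTaskFilesRequest"):
    # Interface-layer fallback: reuse the task's current dataset when the client omits datasetId.
    dataset_id = body.dataset_id or current_task.dataset_id
    resp = await service.update_task_files(
        db, current_task.id, dataset_id=dataset_id, file_ids=body.file_ids
    )
    if resp is None:
        raise ValueError("task not found or already deleted")
    # The task is now "pending" again; the worker reuses its output_path and
    # only annotates files that are not yet present in the output dataset.
    return resp
# --- end sketch ---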
diff --git a/runtime/python-executor/datamate/auto_annotation_worker.py b/runtime/python-executor/datamate/auto_annotation_worker.py
index a270c5c9..90ccc56f 100644
--- a/runtime/python-executor/datamate/auto_annotation_worker.py
+++ b/runtime/python-executor/datamate/auto_annotation_worker.py
@@ -205,7 +205,10 @@ def _update_task_status(
 
 
 def _load_dataset_files(dataset_id: str) -> List[Tuple[str, str, str]]:
-    """Load all completed files in the given dataset."""
+    """Load all active files in the given dataset.
+
+    Files are no longer filtered on whether their tags are empty, so later new or repeated tasks are not affected.
+    """
 
     sql = text(
         """
@@ -370,7 +373,11 @@ def _register_output_dataset(
     *,
     tags_by_filename: Optional[Dict[str, List[Dict[str, Any]]]] = None,
 ) -> None:
-    """Register the auto-annotation results into a newly created dataset."""
+    """Register the auto-annotation results into the output dataset.
+
+    When the same auto-annotation task runs repeatedly, the same output dataset is reused and
+    duplicate records for the same dataset_id + file_name are skipped; only new files are appended.
+    """
 
     images_dir = os.path.join(output_dir, "images")
     if not os.path.isdir(images_dir):
@@ -438,7 +445,18 @@ def _register_output_dataset(
 
     with SQLManager.create_connect() as conn:
         added_count = 0
+        # Pre-fetch existing file names to avoid duplicate inserts
+        existing_names_sql = text(
+            """
+            SELECT file_name FROM t_dm_dataset_files WHERE dataset_id = :dataset_id
+            """
+        )
+        rows = conn.execute(existing_names_sql, {"dataset_id": output_dataset_id}).fetchall()
+        existing_names = {str(r[0]) for r in rows}
+
         for file_name, file_path, file_size in image_files:
+            if file_name in existing_names:
+                continue
             ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
             file_tags = None
             if tags_by_filename:
@@ -458,8 +476,11 @@ def _register_output_dataset(
                 },
             )
             added_count += 1
+            existing_names.add(file_name)
 
         for file_name, file_path, file_size in annotation_files:
+            if file_name in existing_names:
+                continue
             ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
             conn.execute(
                 insert_file_sql,
@@ -476,6 +497,7 @@ def _register_output_dataset(
                 },
             )
             added_count += 1
+            existing_names.add(file_name)
 
         if added_count > 0:
             conn.execute(
@@ -557,12 +579,82 @@ def _process_single_task(task: Dict[str, Any]) -> None:
     else:
         all_files = _load_dataset_files(dataset_id)
 
+    # Prefer reusing the task's existing output directory and dataset to avoid creating duplicate result datasets
+    existing_output_path = (task.get("output_path") or "").strip() or None
+    output_dataset_id: Optional[str] = None
+    output_dir: Optional[str] = None
+
+    if existing_output_path:
+        try:
+            # Make sure the directory exists
+            output_dir = _ensure_output_dir(existing_output_path)
+
+            # Look up an existing dataset record by path
+            find_dataset_sql = text(
+                """
+                SELECT id FROM t_dm_datasets
+                WHERE path = :path
+                ORDER BY created_at DESC
+                LIMIT 1
+                """
+            )
+            with SQLManager.create_connect() as conn:
+                row = conn.execute(find_dataset_sql, {"path": output_dir}).fetchone()
+                if row:
+                    output_dataset_id = str(row[0])
+                    logger.info(
+                        "Reuse existing output dataset for auto-annotation task: task_id={}, dataset_id={}, path={}",
+                        task_id,
+                        output_dataset_id,
+                        output_dir,
+                    )
+        except Exception as e:
+            logger.error(
+                "Failed to reuse existing output dataset for task {}: {}",
+                task_id,
+                e,
+            )
+
+    # If there is no reusable dataset, create a new result dataset
+    if not output_dataset_id or not output_dir:
+        output_dataset_id, new_output_dir = _create_output_dataset(
+            source_dataset_id=dataset_id,
+            source_dataset_name=source_dataset_name,
+            output_dataset_name=output_dataset_name,
+        )
+        output_dir = _ensure_output_dir(new_output_dir)
+
+    # Only auto-annotate the newly selected data:
+    # files whose names already exist under the output images/ directory are considered processed and are not annotated again
+    existing_image_names: Set[str] = set()
+    images_dir = os.path.join(output_dir, "images")
+    if os.path.isdir(images_dir):
+        try:
+            for name in os.listdir(images_dir):
+                if not os.path.isfile(os.path.join(images_dir, name)):
+                    continue
+                existing_image_names.add(name)
+        except Exception as e:
+            logger.error(
+                "Failed to list existing images for auto-annotation task {}: {}",
+                task_id,
+                e,
+            )
+
     # all_files: List[(file_id, file_path, file_name)]
-    files = all_files
+    files = [
+        (file_id, file_path, file_name)
+        for file_id, file_path, file_name in all_files
+        if os.path.basename(file_path) not in existing_image_names
+    ]
     total_images = len(files)
 
     if total_images == 0:
-        logger.warning("No files found for dataset {} when running auto-annotation task {}", dataset_id, task_id)
+        logger.info(
+            "No new files to process for auto-annotation task {}, reusing existing output at {}",
+            task_id,
+            output_dir,
+        )
         _update_task_status(
             task_id,
             status="completed",
@@ -571,17 +663,10 @@ def _process_single_task(task: Dict[str, Any]) -> None:
             processed_images=0,
             detected_objects=0,
             completed=True,
-            output_path=None,
+            output_path=output_dir,
         )
         return
 
-    output_dataset_id, output_dir = _create_output_dataset(
-        source_dataset_id=dataset_id,
-        source_dataset_name=source_dataset_name,
-        output_dataset_name=output_dataset_name,
-    )
-    output_dir = _ensure_output_dir(output_dir)
-
     try:
         detector = ImageObjectDetectionBoundingBox(
             modelSize=model_size,
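
The incremental-filter rule used by the worker above, isolated as a standalone sketch (not part of the patch): a source file counts as already processed when a file with the same basename exists under the output dataset's images/ directory, so basenames are effectively the dedup key.

# --- illustrative sketch ---
import os
from typing import List, Set, Tuple


def select_new_files(all_files: List[Tuple[str, str, str]], output_dir: str) -> List[Tuple[str, str, str]]:
    """Return only the (file_id, file_path, file_name) tuples not yet present in output_dir/images."""
    images_dir = os.path.join(output_dir, "images")
    existing: Set[str] = set()
    if os.path.isdir(images_dir):
        existing = {
            name for name in os.listdir(images_dir)
            if os.path.isfile(os.path.join(images_dir, name))
        }
    # Basename of the source path is the dedup key, mirroring the worker logic above;
    # two source files sharing a basename would therefore be treated as one.
    return [
        (fid, path, name)
        for fid, path, name in all_files
        if os.path.basename(path) not in existing
    ]
# --- end sketch ---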