diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/UserContextFilter.java b/backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/AuthFilter.java similarity index 79% rename from backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/UserContextFilter.java rename to backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/AuthFilter.java index 950dcb0ec..676ec1a09 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/UserContextFilter.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/common/filter/AuthFilter.java @@ -7,6 +7,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Value; import org.springframework.cloud.gateway.filter.GatewayFilterChain; import org.springframework.cloud.gateway.filter.GlobalFilter; @@ -22,17 +23,19 @@ import java.nio.charset.StandardCharsets; /** - * 用户信息过滤器 + * 鉴权过滤器 * */ @Slf4j @Component @RequiredArgsConstructor -public class UserContextFilter implements GlobalFilter { +public class AuthFilter implements GlobalFilter { private static final String AUTH_HEADER = "Authorization"; private static final String TOKEN_PREFIX = "Bearer "; + private static final String USER_HEADER = "User"; + private final UserService userService; @Value("${datamate.jwt.enable:false}") @@ -55,10 +58,22 @@ public Mono filter(ServerWebExchange exchange, GatewayFilterChain chain) { return sendUnauthorizedResponse(exchange); } String token = authHeader.substring(TOKEN_PREFIX.length()); - if (!userService.validateToken(token)) { + String user = userService.validateToken(token); + if (StringUtils.isBlank(user)) { return sendUnauthorizedResponse(exchange); } - return chain.filter(exchange); + // 4. 创建新的请求 + ServerHttpRequest mutatedRequest = request.mutate() + .headers(httpHeaders -> { + // 或者直接操作headers + httpHeaders.add(USER_HEADER, user); + }) + .build(); + // 5. 使用新的请求创建新的exchange + ServerWebExchange mutatedExchange = exchange.mutate() + .request(mutatedRequest) + .build(); + return chain.filter(mutatedExchange); } catch (Exception e) { log.error("get current user info error", e); return sendUnauthorizedResponse(exchange); diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/domain/service/UserService.java b/backend/api-gateway/src/main/java/com/datamate/gateway/domain/service/UserService.java index 938dfa575..0978563fc 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/domain/service/UserService.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/domain/service/UserService.java @@ -3,9 +3,7 @@ import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import com.datamate.gateway.domain.entity.User; import com.datamate.gateway.domain.repository.UserRepository; -import io.jsonwebtoken.JwtException; -import io.jsonwebtoken.Jwts; -import io.jsonwebtoken.SignatureAlgorithm; +import io.jsonwebtoken.*; import io.jsonwebtoken.security.Keys; import lombok.RequiredArgsConstructor; import org.springframework.beans.factory.annotation.Value; @@ -26,6 +24,8 @@ @Service @RequiredArgsConstructor public class UserService { + private static final String SYSTEM_USER = "system"; + private final UserRepository userRepository; @Value("${datamate.jwt.expiration-seconds:3600}") @@ -70,12 +70,12 @@ private String generateToken(User user) { .compact(); } - public boolean validateToken(String token) { + public String validateToken(String token) { try { - Jwts.parser().setSigningKey(secret.getBytes()).parseClaimsJws(token); - return true; + Jws claimsJws = Jwts.parserBuilder().setSigningKey(Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8))).build().parseClaimsJws(token); + return claimsJws.getBody().getSubject(); } catch (JwtException | IllegalArgumentException ex) { - return false; + return null; } } @@ -89,7 +89,7 @@ public Optional register(RegisterRequest registerRequest) { // Check if username already exists LambdaQueryWrapper usernameQuery = new LambdaQueryWrapper<>(); usernameQuery.eq(User::getUsername, registerRequest.getUsername()); - if (userRepository.getOne(usernameQuery) != null) { + if (userRepository.getOne(usernameQuery) != null || SYSTEM_USER.equals(registerRequest.getUsername())) { return Optional.empty(); } diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTask.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTask.java index f1f75a9ef..298e83c8a 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTask.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTask.java @@ -2,6 +2,7 @@ import com.baomidou.mybatisplus.annotation.TableName; import com.datamate.cleaning.common.enums.CleaningTaskStatusEnum; +import com.datamate.common.domain.model.base.BaseEntity; import lombok.Getter; import lombok.Setter; @@ -14,9 +15,7 @@ @Getter @Setter @TableName(value = "t_clean_task", autoResultMap = true) -public class CleaningTask { - private String id; - +public class CleaningTask extends BaseEntity { private String name; private String description; @@ -37,8 +36,6 @@ public class CleaningTask { private Integer fileCount; - private LocalDateTime createdAt; - private LocalDateTime startedAt; private LocalDateTime finishedAt; diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningResultMapper.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningResultMapper.java index 1b20746c5..455de26e5 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningResultMapper.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningResultMapper.java @@ -2,8 +2,10 @@ import com.baomidou.mybatisplus.core.mapper.BaseMapper; import com.datamate.cleaning.domain.model.entity.CleaningResult; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import org.apache.ibatis.annotations.Mapper; @Mapper +@IgnoreDataScopeAnnotation public interface CleaningResultMapper extends BaseMapper { } diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTemplateMapper.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTemplateMapper.java index d7493e10a..be5fca718 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTemplateMapper.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTemplateMapper.java @@ -5,6 +5,7 @@ import com.baomidou.mybatisplus.core.toolkit.Constants; import com.datamate.cleaning.domain.model.entity.TemplateWithInstance; import com.datamate.cleaning.domain.model.entity.CleaningTemplate; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import org.apache.ibatis.annotations.Mapper; import org.apache.ibatis.annotations.Param; import org.apache.ibatis.annotations.Select; @@ -12,6 +13,7 @@ import java.util.List; @Mapper +@IgnoreDataScopeAnnotation public interface CleaningTemplateMapper extends BaseMapper { @Select("SELECT t.id AS id, name, description, created_at, updated_at, created_by, operator_id, op_index, " + "settings_override FROM t_clean_template t LEFT JOIN t_operator_instance o ON t.id = o.instance_id " + diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/OperatorInstanceMapper.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/OperatorInstanceMapper.java index b7c07a876..01c838fa3 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/OperatorInstanceMapper.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/OperatorInstanceMapper.java @@ -2,6 +2,7 @@ import com.baomidou.mybatisplus.core.mapper.BaseMapper; import com.datamate.cleaning.domain.model.entity.OperatorInstance; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.operator.domain.model.OperatorView; import org.apache.ibatis.annotations.Mapper; import org.apache.ibatis.annotations.Select; @@ -10,6 +11,7 @@ @Mapper +@IgnoreDataScopeAnnotation public interface OperatorInstanceMapper extends BaseMapper { @Select("SELECT o.operator_id as id, o.operator_name as name, o.description, o.version, o.inputs, o.outputs, " + "o.runtime, o.settings, o.created_at, o.updated_at, " + diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java index 5fc1f6abe..5d4e6ffef 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java @@ -244,7 +244,7 @@ public AllDatasetStatisticsResponse getAllDatasetStatistics() { public void processDataSourceAsync(String datasetId, String dataSourceId) { try { log.info("Initiating data source file scanning, dataset ID: {}, collection task ID: {}", datasetId, dataSourceId); - List filePaths = getFilePaths(dataSourceId); + List filePaths = getFilePaths(dataSourceId, datasetRepository.getById(datasetId)); if (CollectionUtils.isEmpty(filePaths)) { return; } @@ -255,8 +255,8 @@ public void processDataSourceAsync(String datasetId, String dataSourceId) { } } - private List getFilePaths(String dataSourceId) { - CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData(); + private List getFilePaths(String dataSourceId, Dataset dataset) { + CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId, dataset.getCreatedBy()).getData(); if (taskDetail == null) { log.warn("Fail to get collection task detail, task ID: {}", dataSourceId); return Collections.emptyList(); diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index f671b02c9..c3a74070e 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -137,7 +137,7 @@ public PagedResponse getDatasetFilesWithDirectory(String datasetId, return new PagedResponse<>(page, size, total, totalPages, datasetFiles); } catch (IOException e) { - log.error("list dataset path error", e); + log.warn("list dataset path error"); return PagedResponse.of(new Page<>(page, size)); } } diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/CollectionTaskClient.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/CollectionTaskClient.java index b34522c47..73ab0c71b 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/CollectionTaskClient.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/CollectionTaskClient.java @@ -5,6 +5,7 @@ import org.springframework.cloud.openfeign.FeignClient; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.RequestHeader; /** * 数据归集服务 Feign Client @@ -18,5 +19,5 @@ public interface CollectionTaskClient { * @return 任务详情 */ @GetMapping("/api/data-collection/tasks/{id}") - Response getTaskDetail(@PathVariable("id") String taskId); + Response getTaskDetail(@PathVariable("id") String taskId, @RequestHeader("User") String authorization); } diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java index f67121c9c..831576859 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java @@ -1,6 +1,7 @@ package com.datamate.datamanagement.infrastructure.persistence.mapper; import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.datamanagement.domain.model.dataset.DatasetFile; import org.apache.ibatis.annotations.Mapper; import org.apache.ibatis.annotations.Param; @@ -9,6 +10,7 @@ import java.util.List; @Mapper +@IgnoreDataScopeAnnotation public interface DatasetFileMapper extends BaseMapper { DatasetFile findById(@Param("id") String id); List findByDatasetId(@Param("datasetId") String datasetId, RowBounds rowBounds); diff --git a/backend/services/main-application/src/main/java/com/datamate/main/filter/UserContextFilter.java b/backend/services/main-application/src/main/java/com/datamate/main/filter/UserContextFilter.java new file mode 100644 index 000000000..daff212f2 --- /dev/null +++ b/backend/services/main-application/src/main/java/com/datamate/main/filter/UserContextFilter.java @@ -0,0 +1,51 @@ +package com.datamate.main.filter; + +import com.datamate.common.infrastructure.common.Response; +import com.datamate.common.infrastructure.config.DataScopeHandle; +import com.datamate.common.infrastructure.exception.CommonErrorCode; +import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.servlet.*; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.io.IOException; + +/** + * 用户信息过滤器 + * + * @since 2026/1/19 + */ +@Slf4j +@Component +public class UserContextFilter implements Filter { + private static final String USER_HEADER = "User"; + + @Value("${datamate.jwt.enable:false}") + private Boolean jwtEnable; + + @Override + public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain filterChain) { + try { + HttpServletRequest httpRequest = (HttpServletRequest) servletRequest; + HttpServletResponse httpResponse = (HttpServletResponse) servletResponse; + String user = httpRequest.getHeader(USER_HEADER); + ObjectMapper objectMapper = new ObjectMapper(); + if (jwtEnable && StringUtils.isBlank(user)) { + httpResponse.setStatus(HttpServletResponse.SC_UNAUTHORIZED); + httpResponse.getWriter().write(objectMapper.writeValueAsString(Response.error(CommonErrorCode.UNAUTHORIZED))); + return; + } + DataScopeHandle.setUserInfo(user); + filterChain.doFilter(servletRequest, servletResponse); + } catch (IOException | ServletException e) { + log.error("Request failed!"); + throw new RuntimeException(e); + } finally { + DataScopeHandle.removeUserInfo(); + } + } +} diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Operator.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Operator.java index 4e1994aa5..9959f0076 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Operator.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Operator.java @@ -1,6 +1,7 @@ package com.datamate.operator.domain.model; import com.baomidou.mybatisplus.annotation.TableName; +import com.datamate.common.domain.model.base.BaseEntity; import lombok.Getter; import lombok.Setter; @@ -9,9 +10,7 @@ @Getter @Setter @TableName(value = "t_operator") -public class Operator { - private String id; - +public class Operator extends BaseEntity { private String name; private String description; @@ -35,9 +34,5 @@ public class Operator { private Integer usageCount; private Boolean isStar; - - private LocalDateTime createdAt; - - private LocalDateTime updatedAt; } diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorViewRepositoryImpl.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorViewRepositoryImpl.java index 370236aef..54a77775d 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorViewRepositoryImpl.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorViewRepositoryImpl.java @@ -78,7 +78,7 @@ private QueryWrapper getQueryWrapper(String keyword, List { } diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryRelationMapper.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryRelationMapper.java index 934b1d148..9237fcaf7 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryRelationMapper.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryRelationMapper.java @@ -1,9 +1,11 @@ package com.datamate.operator.infrastructure.persistence.mapper; import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.operator.domain.model.CategoryRelation; import org.apache.ibatis.annotations.Mapper; @Mapper +@IgnoreDataScopeAnnotation public interface CategoryRelationMapper extends BaseMapper { } diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorMapper.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorMapper.java index 7e91385d8..95ef7e1ad 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorMapper.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorMapper.java @@ -1,6 +1,7 @@ package com.datamate.operator.infrastructure.persistence.mapper; import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.operator.domain.model.Operator; import org.apache.ibatis.annotations.Mapper; import org.apache.ibatis.annotations.Select; @@ -8,10 +9,12 @@ @Mapper public interface OperatorMapper extends BaseMapper { + @IgnoreDataScopeAnnotation @Select("SELECT count(1) FROM t_operator_instance oi JOIN t_clean_template t ON oi.instance_id = t.id " + "WHERE oi.operator_id = #{operatorId}") int operatorInTemplate(String operatorId); + @IgnoreDataScopeAnnotation @Select("SELECT count(1) FROM t_operator_instance oi JOIN t_clean_task t ON oi.instance_id = t.id " + "WHERE oi.operator_id = #{operatorId} AND t.status != 'COMPLETED'") int operatorInUnstopTask(String operatorId); diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorReleaseMapper.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorReleaseMapper.java index 761c4be37..de9023d62 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorReleaseMapper.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorReleaseMapper.java @@ -1,9 +1,11 @@ package com.datamate.operator.infrastructure.persistence.mapper; import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.operator.domain.model.OperatorRelease; import org.apache.ibatis.annotations.Mapper; @Mapper +@IgnoreDataScopeAnnotation public interface OperatorReleaseMapper extends BaseMapper { } diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorViewMapper.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorViewMapper.java index c13b4a6d1..99ed5e4fa 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorViewMapper.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorViewMapper.java @@ -13,20 +13,21 @@ @Mapper public interface OperatorViewMapper extends BaseMapper { @Select("SELECT operator_id AS id, operator_name AS name, description, version, inputs, outputs, runtime, " + - "settings, is_star, file_size, usage_count, created_at, updated_at, " + + "settings, is_star, file_size, usage_count, created_at, updated_at, created_by, updated_by, " + "STRING_AGG(CAST(category_id AS TEXT), ',' ORDER BY created_at DESC) AS categories " + "FROM v_operator ${ew.customSqlSegment}") IPage findOperatorsByCriteria(IPage page, @Param(Constants.WRAPPER) Wrapper queryWrapper); - @Select("SELECT COUNT(1) FROM (SELECT 1 FROM v_operator ${ew.customSqlSegment}) AS t") + @Select("SELECT COUNT(1) FROM (SELECT operator_id AS id, operator_name AS name, description, version, inputs, outputs, runtime, " + + "settings, is_star, file_size, usage_count, created_at, updated_at, created_by, updated_by FROM v_operator ${ew.customSqlSegment}) AS t") Integer countOperatorsByCriteria(@Param(Constants.WRAPPER) Wrapper queryWrapper); @Select("SELECT operator_id AS id, operator_name AS name, description, version, inputs, outputs, runtime, " + - "settings, is_star, file_name, file_size, usage_count, metrics, created_at, updated_at, " + + "settings, is_star, file_name, file_size, usage_count, metrics, created_at, updated_at, created_by, updated_by, " + "STRING_AGG(category_name, ',' ORDER BY created_at DESC) AS categories " + "FROM v_operator WHERE operator_id = #{id} " + "GROUP BY operator_id, operator_name, description, version, inputs, outputs, runtime, settings, is_star, " + - "file_name, file_size, usage_count, metrics, created_at, updated_at") + "file_name, file_size, usage_count, metrics, created_at, updated_at, created_by, updated_by") OperatorView findOperatorById(@Param("id") String id); } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/client/GraphRagClient.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/client/GraphRagClient.java index 924f4e7d3..47f050f4c 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/client/GraphRagClient.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/client/GraphRagClient.java @@ -4,6 +4,7 @@ import org.springframework.cloud.openfeign.FeignClient; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestHeader; /** * 知识图谱RAG客户端 @@ -19,5 +20,5 @@ public interface GraphRagClient { * @return 任务详情 */ @PostMapping("/api/rag/process/{id}") - Response startGraphRagTask(@PathVariable("id") String knowledgeBaseId); + Response startGraphRagTask(@PathVariable("id") String knowledgeBaseId, @RequestHeader("User") String authorization); } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java index 4dbfac59b..c80e528d8 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java @@ -71,7 +71,7 @@ public void processAfterCommit(DataInsertedEvent event) { List ragFiles = ragFileRepository.findNotSuccessByKnowledgeBaseId(event.knowledgeBase().getId()); if (RagType.GRAPH.equals(event.knowledgeBase().getType())){ log.info("Knowledge base {} is of type GRAPH. Skipping RAG ETL processing.", event.knowledgeBase().getName()); - graphRagClient.startGraphRagTask(event.knowledgeBase().getId()); + graphRagClient.startGraphRagTask(event.knowledgeBase().getId(), event.knowledgeBase().getCreatedBy()); return; } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/persistence/mapper/RagFileMapper.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/persistence/mapper/RagFileMapper.java index e0f233f5a..941c27aff 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/persistence/mapper/RagFileMapper.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/persistence/mapper/RagFileMapper.java @@ -2,6 +2,7 @@ import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import com.datamate.rag.indexer.domain.model.RagFile; import org.apache.ibatis.annotations.Mapper; @@ -12,5 +13,6 @@ * @since 2025-10-24 */ @Mapper +@IgnoreDataScopeAnnotation public interface RagFileMapper extends BaseMapper { } diff --git a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/DataScopeHandle.java b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/DataScopeHandle.java new file mode 100644 index 000000000..58fac97ba --- /dev/null +++ b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/DataScopeHandle.java @@ -0,0 +1,107 @@ +package com.datamate.common.infrastructure.config; + +import com.baomidou.mybatisplus.extension.plugins.handler.DataPermissionHandler; +import com.datamate.common.domain.model.base.BaseEntity; +import lombok.extern.slf4j.Slf4j; +import net.sf.jsqlparser.expression.Expression; +import net.sf.jsqlparser.expression.StringValue; +import net.sf.jsqlparser.expression.operators.conditional.AndExpression; +import net.sf.jsqlparser.expression.operators.relational.InExpression; +import net.sf.jsqlparser.expression.operators.relational.ParenthesedExpressionList; +import net.sf.jsqlparser.schema.Column; +import org.apache.commons.lang3.ObjectUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.ibatis.executor.Executor; +import org.apache.ibatis.mapping.MappedStatement; +import org.apache.ibatis.mapping.SqlCommandType; +import org.apache.ibatis.plugin.Interceptor; +import org.apache.ibatis.plugin.Intercepts; +import org.apache.ibatis.plugin.Invocation; +import org.apache.ibatis.plugin.Signature; +import org.springframework.stereotype.Component; + +import java.lang.reflect.Method; +import java.time.LocalDateTime; +import java.util.Arrays; +import java.util.Objects; + +/** + * 数据隔离处理 + * + * @since 2026/1/19 + */ +@Slf4j +@Component +@Intercepts({ + @Signature(type = Executor.class, method = "update", + args = {MappedStatement.class, Object.class}) +}) +public class DataScopeHandle implements DataPermissionHandler, Interceptor { + private static final ThreadLocal userInfoHolder = new ThreadLocal<>(); + + private static final StringValue SYSTEM_USER = new StringValue("system"); + + private static final String FILTER_COLUMN_NAME = "created_by"; + + private static final String C_DOTS = "."; + + public static void setUserInfo(String user) { + userInfoHolder.set(user); + } + + public static void removeUserInfo() { + userInfoHolder.remove(); + } + + @Override + public Expression getSqlSegment(Expression where, String mappedStatementId) { + if (StringUtils.isBlank(userInfoHolder.get())) { + return where; + } + try { + String className = mappedStatementId.substring(0, mappedStatementId.lastIndexOf(C_DOTS)); + Class clazz = Class.forName(className); + IgnoreDataScopeAnnotation annotation = clazz.getAnnotation(IgnoreDataScopeAnnotation.class); + if (annotation != null) { + return where; + } + String methodName = mappedStatementId.substring(mappedStatementId.lastIndexOf(C_DOTS) + 1); + for (Method method : clazz.getMethods()) { + if (method.getName().equals(methodName)) { + annotation = method.getAnnotation(IgnoreDataScopeAnnotation.class); + if (annotation != null) { + return where; + } + break; + } + } + ParenthesedExpressionList valueList = + new ParenthesedExpressionList<>(Arrays.asList(new StringValue(userInfoHolder.get()), SYSTEM_USER)); + InExpression inExpression = new InExpression(); + inExpression.setLeftExpression(new Column(FILTER_COLUMN_NAME)); + inExpression.setRightExpression(valueList); + return ObjectUtils.isNotEmpty(where) ? new AndExpression(where, inExpression) : inExpression; + } catch (Exception e) { + log.warn(e.getMessage()); + } + return where; + } + + @Override + public Object intercept(Invocation invocation) throws Throwable { + MappedStatement ms = (MappedStatement) invocation.getArgs()[0]; + Object parameter = invocation.getArgs()[1]; + + if (parameter instanceof BaseEntity baseEntity) { + // 根据SQL命令类型设置审计字段 + log.info("current user {}, ms {}", userInfoHolder.get(), ms); + if (Objects.requireNonNull(ms.getSqlCommandType()) != SqlCommandType.UPDATE) { + baseEntity.setCreatedAt(LocalDateTime.now()); + baseEntity.setCreatedBy(userInfoHolder.get()); + } + baseEntity.setUpdatedAt(LocalDateTime.now()); + baseEntity.setUpdatedBy(userInfoHolder.get()); + } + return invocation.proceed(); + } +} diff --git a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/IgnoreDataScopeAnnotation.java b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/IgnoreDataScopeAnnotation.java new file mode 100644 index 000000000..838ebf821 --- /dev/null +++ b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/IgnoreDataScopeAnnotation.java @@ -0,0 +1,15 @@ +package com.datamate.common.infrastructure.config; + +import java.lang.annotation.*; + +/** + * 忽略数据隔离注解 + * 该注解添加到对应的mapper上,添加后该mapper的方法查询时就不需要进行用户过滤了 + * + * @since 2026/1/20 + */ +@Documented +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE, ElementType.METHOD}) +public @interface IgnoreDataScopeAnnotation { +} diff --git a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/MybatisPlusConfig.java b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/MybatisPlusConfig.java index e1f8f6566..5f7a08898 100644 --- a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/MybatisPlusConfig.java +++ b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/config/MybatisPlusConfig.java @@ -3,6 +3,7 @@ import com.baomidou.mybatisplus.annotation.DbType; import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler; import com.baomidou.mybatisplus.extension.plugins.MybatisPlusInterceptor; +import com.baomidou.mybatisplus.extension.plugins.inner.DataPermissionInterceptor; import com.baomidou.mybatisplus.extension.plugins.inner.PaginationInnerInterceptor; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; @@ -39,8 +40,9 @@ public JacksonTypeHandler jacksonTypeHandler() { * @return MybatisPlusInterceptor */ @Bean - public MybatisPlusInterceptor mybatisPlusInterceptor() { + public MybatisPlusInterceptor mybatisPlusInterceptor(DataScopeHandle dataScopeHandle) { MybatisPlusInterceptor interceptor = new MybatisPlusInterceptor(); + interceptor.addInnerInterceptor(new DataPermissionInterceptor(dataScopeHandle)); interceptor.addInnerInterceptor(new PaginationInnerInterceptor(DbType.POSTGRE_SQL)); return interceptor; } diff --git a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/mapper/ChunkUploadRequestMapper.java b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/mapper/ChunkUploadRequestMapper.java index 4722aed49..4b0665f5e 100644 --- a/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/mapper/ChunkUploadRequestMapper.java +++ b/backend/shared/domain-common/src/main/java/com/datamate/common/infrastructure/mapper/ChunkUploadRequestMapper.java @@ -1,6 +1,7 @@ package com.datamate.common.infrastructure.mapper; import com.datamate.common.domain.model.ChunkUploadPreRequest; +import com.datamate.common.infrastructure.config.IgnoreDataScopeAnnotation; import org.apache.ibatis.annotations.Mapper; import org.apache.ibatis.annotations.Param; @@ -10,6 +11,7 @@ * 文件切片上传请求Mapper */ @Mapper +@IgnoreDataScopeAnnotation public interface ChunkUploadRequestMapper { /** diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml index 6da18d02e..68085ec6e 100644 --- a/deployment/docker/datamate/docker-compose.yml +++ b/deployment/docker/datamate/docker-compose.yml @@ -9,7 +9,8 @@ services: restart: on-failure privileged: true environment: - DB_PASSWORD: ${DB_PASSWORD:-password} + - DB_PASSWORD=${DB_PASSWORD:-password} + - datamate.jwt.enable=${DATAMATE_JWT_ENABLE:-false} volumes: - dataset_volume:/dataset - flow_volume:/flow @@ -28,6 +29,7 @@ services: environment: - log_level=DEBUG - pgsql_password=${DB_PASSWORD:-password} + - datamate_jwt_enable=${DATAMATE_JWT_ENABLE:-false} volumes: - dataset_volume:/dataset - flow_volume:/flow @@ -46,6 +48,7 @@ services: - '8080:8080' environment: - JWT_SECRET=default-insecure-key-change-in-production + - datamate.jwt.enable=${DATAMATE_JWT_ENABLE:-false} networks: [ datamate ] datamate-frontend: diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index a3caa8dd8..9ba257005 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -113,6 +113,8 @@ backend: key: DB_PASSWORD - name: datamate.rag.milvus-uri value: "http://milvus:19530" + - name: datamate.jwt.enable + value: &DATAMATE_JWT_ENABLE 'false' volumes: - *datasetVolume - *flowVolume @@ -135,6 +137,8 @@ backend-python: secretKeyRef: name: datamate-conf key: DB_PASSWORD + - name: datamate_jwt_enable + value: *DATAMATE_JWT_ENABLE volumes: - *datasetVolume - *flowVolume @@ -151,6 +155,8 @@ gateway: env: - name: JWT_SECRET value: "default-insecure-key-change-in-production" + - name: datamate.jwt.enable + value: *DATAMATE_JWT_ENABLE volumes: - *logVolume volumeMounts: diff --git a/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts b/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts index 69ef19ee3..daaa06497 100644 --- a/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts +++ b/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts @@ -14,6 +14,7 @@ import { deleteDirectoryUsingDelete, renameDatasetFileUsingPut, renameDirectoryUsingPut, + getDatasetFileByIdUsingGet, } from "../dataset.api"; import { useParams } from "react-router"; @@ -111,21 +112,17 @@ export function useFilesOperation(dataset: Dataset) { setPreviewFileDetail(undefined); try { // 获取文件元信息(来自 t_dm_dataset_files) - const detailRes: any = await (await import("../dataset.api")).getDatasetFileByIdUsingGet( - datasetId, - file.id - ); + const detailRes: any = await getDatasetFileByIdUsingGet(datasetId, file.id); const detail = detailRes?.data || detailRes; setPreviewFileDetail(detail); - const downloadUrl = `/api/data-management/datasets/${datasetId}/files/${file.id}/download`; const image = isImageFile(detail?.fileName || file.fileName, detail?.fileType); + const { blob, blobUrl } = await downloadFileByIdUsingGet(datasetId, file.id, file.fileName, "preview"); if (image) { - setPreviewUrl(downloadUrl); + setPreviewUrl(blobUrl); } else { - const res = await fetch(downloadUrl); - const text = await res.text(); + const text = await blob.text(); setPreviewContent(text); } } catch (error) { diff --git a/frontend/src/pages/DataManagement/dataset.api.ts b/frontend/src/pages/DataManagement/dataset.api.ts index f6d4e3f37..a7f36fd03 100644 --- a/frontend/src/pages/DataManagement/dataset.api.ts +++ b/frontend/src/pages/DataManagement/dataset.api.ts @@ -104,12 +104,14 @@ export function renameDirectoryUsingPut( export function downloadFileByIdUsingGet( id: string | number, fileId: string | number, - fileName: string + fileName: string, + action: string = "download" ) { return download( `/api/data-management/datasets/${id}/files/${fileId}/download`, null, - fileName + fileName, + action ); } diff --git a/frontend/src/utils/request.ts b/frontend/src/utils/request.ts index a25c00c84..c5ff30444 100644 --- a/frontend/src/utils/request.ts +++ b/frontend/src/utils/request.ts @@ -104,14 +104,6 @@ class Request { } }); - // 请求完成 - // xhr.addEventListener("load", function () { - // if (xhr.status >= 200 && xhr.status < 300) { - // const response = JSON.parse(xhr.responseText); - // resolve(xhr); - // } - // }); - // 请求完成处理 xhr.addEventListener("load", () => { if (xhr.status >= 200 && xhr.status < 300) { @@ -400,9 +392,10 @@ class Request { * @param {string} url - 请求URL * @param {object} params - 查询参数 * @param {string} filename - 下载文件名 + * @param {string} action - 行为,包括下载文件和预览文件 * @param {object} options - 额外的fetch选项,包括showLoading, onDownloadProgress */ - async download(url, params = null, filename = "", options = {}) { + async download(url, params = null, filename = "", action = "download", options = {}) { const fullURL = this.buildURL(url, params); const config = { @@ -456,19 +449,34 @@ class Request { `download_${Date.now()}`; } - // 创建下载链接 - const downloadUrl = window.URL.createObjectURL(blob); - const link = document.createElement("a"); - link.href = downloadUrl; - link.download = filename ?? name; - - // 添加到DOM并触发下载 - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - - // 清理URL对象 - window.URL.revokeObjectURL(downloadUrl); + if (action === "download") { + // 创建下载链接 + const downloadUrl = window.URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = downloadUrl; + link.download = filename ?? name; + + // 添加到DOM并触发下载 + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + + // 清理URL对象 + window.URL.revokeObjectURL(downloadUrl); + } else if (action === "preview") { + // 预览逻辑 - 返回Blob URL和相关信息 + const blobUrl = window.URL.createObjectURL(blob); + + // 可以返回更多信息用于预览 + return { + blob, + blobUrl, + filename: name, + size: blob.size, + // 自动清理的钩子 + revoke: () => window.URL.revokeObjectURL(blobUrl) + }; + } return blob; } diff --git a/runtime/datamate-python/.env.example b/runtime/datamate-python/.env.example index 07b3221f0..d18838398 100644 --- a/runtime/datamate-python/.env.example +++ b/runtime/datamate-python/.env.example @@ -8,13 +8,15 @@ LOG_FILE_DIR=./logs RAG_STORAGE_DIR=./rag_storage # DataBase -MYSQL_HOST=localhost -MYSQL_PORT=3306 -MYSQL_USER=root -MYSQL_PASSWORD=password -MYSQL_DATABASE=datamate +PGSQL_HOST=localhost +PGSQL_PORT=5432 +PGSQL_USER=postgres +PGSQL_PASSWORD=password +PGSQL_DATABASE=datamate # Label Studio settings -LABEL_STUDIO_BASE_URL=http://localhost:8080 +LABEL_STUDIO_BASE_URL=http://localhost:30001 LABEL_STUDIO_USER_TOKEN="demo_dev_token" + +DATAMATE_JWT_ENABLE=false diff --git a/runtime/datamate-python/app/core/config.py b/runtime/datamate-python/app/core/config.py index 0e8a3c1d5..edaae3b74 100644 --- a/runtime/datamate-python/app/core/config.py +++ b/runtime/datamate-python/app/core/config.py @@ -75,5 +75,7 @@ def build_database_url(self): # DataMate dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀 + datamate_jwt_enable: bool = False + # 全局设置实例 settings = Settings() diff --git a/runtime/datamate-python/app/db/datascope.py b/runtime/datamate-python/app/db/datascope.py new file mode 100644 index 000000000..a8d19541f --- /dev/null +++ b/runtime/datamate-python/app/db/datascope.py @@ -0,0 +1,37 @@ +from contextvars import ContextVar +from typing import List, Optional + +_current_user: ContextVar[Optional[str]] = ContextVar("_current_user", default=None) +SYSTEM_USER = "system" + + +class DataScopeHandle: + """ + Hold current user info in a ContextVar and provide helpers for SQLAlchemy filters. + """ + + @staticmethod + def set_user_info(user: Optional[str]) -> None: + if user is None or user == "": + # set explicit None + _current_user.set(None) + else: + _current_user.set(user) + + @staticmethod + def remove_user_info() -> None: + _current_user.set(None) + + @staticmethod + def get_user_info() -> Optional[str]: + return _current_user.get() + + @staticmethod + def allowed_users() -> List[str]: + """ + Return list of allowed creators: current user + system. + """ + user = DataScopeHandle.get_user_info() + if not user: + return [] + return [user, SYSTEM_USER] diff --git a/runtime/datamate-python/app/db/models/annotation_management.py b/runtime/datamate-python/app/db/models/annotation_management.py index 81589f032..10163af75 100644 --- a/runtime/datamate-python/app/db/models/annotation_management.py +++ b/runtime/datamate-python/app/db/models/annotation_management.py @@ -4,13 +4,14 @@ from sqlalchemy import Column, String, Boolean, TIMESTAMP, Text, Integer, JSON, ForeignKey from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import BaseEntity -class AnnotationTemplate(Base): +class AnnotationTemplate(BaseEntity): """标注配置模板模型""" - + __tablename__ = "t_dm_annotation_templates" - + __ignore_data_scope__ = True + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") name = Column(String(100), nullable=False, comment="模板名称") description = Column(String(500), nullable=True, comment="模板描述") @@ -21,23 +22,21 @@ class AnnotationTemplate(Base): category = Column(String(50), default='custom', comment="模板分类: medical/general/custom/system") built_in = Column(Boolean, default=False, comment="是否系统内置模板") version = Column(String(20), default='1.0', comment="模板版本") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)") - + def __repr__(self): return f"" - + @property def is_deleted(self) -> bool: """检查是否已被软删除""" return self.deleted_at is not None - -class LabelingProject(Base): + +class LabelingProject(BaseEntity): """标注项目模型""" - + __tablename__ = "t_dm_labeling_projects" - + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") dataset_id = Column(String(36), nullable=False, comment="数据集ID") name = Column(String(100), nullable=False, comment="项目名称") @@ -45,20 +44,18 @@ class LabelingProject(Base): template_id = Column(String(36), ForeignKey('t_dm_annotation_templates.id', ondelete='SET NULL'), nullable=True, comment="使用的模板ID") configuration = Column(JSON, nullable=True, comment="项目配置(可能包含对模板的自定义修改)") progress = Column(JSON, nullable=True, comment="项目进度信息") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)") - + def __repr__(self): return f"" - + @property def is_deleted(self) -> bool: """检查是否已被软删除""" return self.deleted_at is not None -class AutoAnnotationTask(Base): +class AutoAnnotationTask(BaseEntity): """自动标注任务模型,对应表 t_dm_auto_annotation_tasks""" __tablename__ = "t_dm_auto_annotation_tasks" @@ -76,13 +73,6 @@ class AutoAnnotationTask(Base): detected_objects = Column(Integer, default=0, comment="检测到的对象总数") output_path = Column(String(500), nullable=True, comment="输出路径") error_message = Column(Text, nullable=True, comment="错误信息") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column( - TIMESTAMP, - server_default=func.current_timestamp(), - onupdate=func.current_timestamp(), - comment="更新时间", - ) completed_at = Column(TIMESTAMP, nullable=True, comment="完成时间") deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)") @@ -92,4 +82,4 @@ def __repr__(self) -> str: # pragma: no cover - repr 简单返回 @property def is_deleted(self) -> bool: """检查是否已被软删除""" - return self.deleted_at is not None \ No newline at end of file + return self.deleted_at is not None diff --git a/runtime/datamate-python/app/db/models/base_entity.py b/runtime/datamate-python/app/db/models/base_entity.py new file mode 100644 index 000000000..9d31203a6 --- /dev/null +++ b/runtime/datamate-python/app/db/models/base_entity.py @@ -0,0 +1,22 @@ +from sqlalchemy import Column, String, TIMESTAMP +from sqlalchemy.orm import declarative_base +from sqlalchemy.sql import func + +Base = declarative_base() + + +class BaseEntity(Base): + """ + Common base entity with audit fields. + Subclasses may set `__ignore_data_scope__ = True` to opt-out of data-scope filtering. + """ + __abstract__ = True + + created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") + updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), + comment="更新时间") + created_by = Column(String(255), nullable=True, comment="创建者") + updated_by = Column(String(255), nullable=True, comment="更新者") + + # default: do enforce data scope unless subclass sets this to True + __ignore_data_scope__ = False diff --git a/runtime/datamate-python/app/db/models/data_collection.py b/runtime/datamate-python/app/db/models/data_collection.py index 547e2ddfb..5dd292bec 100644 --- a/runtime/datamate-python/app/db/models/data_collection.py +++ b/runtime/datamate-python/app/db/models/data_collection.py @@ -2,12 +2,14 @@ from sqlalchemy import Column, String, Text, TIMESTAMP, Integer, BigInteger, Numeric, JSON, Boolean from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import BaseEntity -class CollectionTemplate(Base): + +class CollectionTemplate(BaseEntity): """归集模板表(UUID 主键) -> t_dc_collection_templates""" __tablename__ = "t_dc_collection_templates" + __ignore_data_scope__ = True id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="模板ID(UUID)") name = Column(String(255), nullable=False, comment="模板名称") @@ -18,12 +20,8 @@ class CollectionTemplate(Base): target_name = Column(String(64), nullable=False, comment="目标数据源名称") template_content = Column(JSON, nullable=False, comment="模板内容") built_in = Column(Boolean, default=False, comment="是否系统内置模板") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") -class CollectionTask(Base): +class CollectionTask(BaseEntity): """归集任务表(UUID 主键) -> t_dc_collection_tasks""" __tablename__ = "t_dc_collection_tasks" @@ -41,12 +39,8 @@ class CollectionTask(Base): retry_count = Column(Integer, nullable=True, server_default="3", comment="重试次数") timeout_seconds = Column(Integer, nullable=True, server_default="3600", comment="超时时间(秒)") last_execution_id = Column(String(36), nullable=True, comment="最后执行ID(UUID)") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") -class TaskExecution(Base): +class TaskExecution(BaseEntity): """任务执行记录表(UUID 主键) -> t_dc_task_executions""" __tablename__ = "t_dc_task_executions" @@ -60,7 +54,3 @@ class TaskExecution(Base): completed_at = Column(TIMESTAMP, nullable=True, comment="完成时间") duration_seconds = Column(Integer, nullable=True, server_default="0", comment="执行时长(秒)") error_message = Column(Text, nullable=True, comment="错误信息") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") diff --git a/runtime/datamate-python/app/db/models/data_evaluation.py b/runtime/datamate-python/app/db/models/data_evaluation.py index 7657d5e74..c029ba936 100644 --- a/runtime/datamate-python/app/db/models/data_evaluation.py +++ b/runtime/datamate-python/app/db/models/data_evaluation.py @@ -10,10 +10,10 @@ from sqlalchemy import Column, String, Text, Float, TIMESTAMP, ForeignKey, Integer from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import BaseEntity -class EvaluationTask(Base): +class EvaluationTask(BaseEntity): """评估任务表(UUID 主键) -> t_de_eval_task Columns per data-evaluation-init.sql: @@ -36,16 +36,13 @@ class EvaluationTask(Base): eval_process = Column(Float, nullable=False, server_default="0", comment="评估进度") eval_prompt = Column(Text, nullable=True, comment="评估提示词") eval_config = Column(Text, nullable=True, comment="评估配置") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") -class EvaluationFile(Base): +class EvaluationFile(BaseEntity): """评估条目表(UUID 主键) -> t_de_eval_file""" __tablename__ = "t_de_eval_file" + __ignore_data_scope__ = True id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") task_id = Column(String(36), ForeignKey('t_de_eval_task.id'), nullable=False, comment="评估任务ID") @@ -54,13 +51,9 @@ class EvaluationFile(Base): error_message = Column(Text, nullable=True, comment="错误信息") total_count = Column(Integer, nullable=False, default=0, comment="总数") evaluated_count = Column(Integer, nullable=False, default=0, comment="已评估数") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") -class EvaluationItem(Base): +class EvaluationItem(BaseEntity): """评估条目表(UUID 主键) -> t_de_eval_item Columns per data-evaluation-init.sql: @@ -68,6 +61,7 @@ class EvaluationItem(Base): """ __tablename__ = "t_de_eval_item" + __ignore_data_scope__ = True id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") task_id = Column(String(36), ForeignKey('t_de_eval_task.id'), nullable=False, comment="评估任务ID") @@ -77,7 +71,3 @@ class EvaluationItem(Base): eval_score = Column(Float, nullable=False, server_default="0", comment="评估分数") eval_result = Column(Text, nullable=True, comment="评估结果") status = Column(String(50), server_default="PENDING", nullable=False, comment="状态:PENDING/EVALUATED") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") diff --git a/runtime/datamate-python/app/db/models/data_synthesis.py b/runtime/datamate-python/app/db/models/data_synthesis.py index b294a1bc4..e74446c06 100644 --- a/runtime/datamate-python/app/db/models/data_synthesis.py +++ b/runtime/datamate-python/app/db/models/data_synthesis.py @@ -2,7 +2,7 @@ from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func -from app.db.session import Base +from app.db.models.base_entity import Base, BaseEntity from app.module.generation.schema.generation import CreateSynthesisTaskRequest @@ -43,7 +43,7 @@ async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskReq return synth_task_instance -class DataSynthInstance(Base): +class DataSynthInstance(BaseEntity): """数据合成任务表,对应表 t_data_synth_instances create table if not exists t_data_synth_instances @@ -82,19 +82,9 @@ class DataSynthInstance(Base): total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数") processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数") total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量") - created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间") - updated_at = Column( - TIMESTAMP, - nullable=False, - default=func.now(), - onupdate=func.now(), - comment="更新时间", - ) - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") -class DataSynthesisFileInstance(Base): +class DataSynthesisFileInstance(BaseEntity): """数据合成文件任务表,对应表 t_data_synthesis_file_instances create table if not exists t_data_synthesis_file_instances ( @@ -129,17 +119,6 @@ class DataSynthesisFileInstance(Base): total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数") processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间") - updated_at = Column( - TIMESTAMP, - server_default=func.current_timestamp(), - onupdate=func.current_timestamp(), - nullable=True, - comment="更新时间", - ) - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") - class DataSynthesisChunkInstance(Base): """数据合成分块任务表,对应表 t_data_synthesis_chunk_instances diff --git a/runtime/datamate-python/app/db/models/dataset_management.py b/runtime/datamate-python/app/db/models/dataset_management.py index f0ed15967..702663321 100644 --- a/runtime/datamate-python/app/db/models/dataset_management.py +++ b/runtime/datamate-python/app/db/models/dataset_management.py @@ -6,13 +6,14 @@ from sqlalchemy import Column, String, BigInteger, Boolean, TIMESTAMP, Text, Integer, JSON, Date from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import Base, BaseEntity -class Dataset(Base): + +class Dataset(BaseEntity): """数据集模型(支持医学影像、文本、问答等多种类型)""" - + __tablename__ = "t_dm_datasets" - + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") name = Column(String(255), nullable=False, comment="数据集名称") description = Column(Text, nullable=True, comment="数据集描述") @@ -31,31 +32,26 @@ class Dataset(Base): is_public = Column(Boolean, default=False, comment="是否公开") is_featured = Column(Boolean, default=False, comment="是否推荐") version = Column(BigInteger, nullable=False, default=0, comment="版本号") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") - + def __repr__(self): return f"" -class DatasetTag(Base): +class DatasetTag(BaseEntity): """数据集标签关联模型""" - + __tablename__ = "t_dm_dataset_tags" - + dataset_id = Column(String(36), primary_key=True, comment="数据集ID(UUID)") tag_id = Column(String(36), primary_key=True, comment="标签ID(UUID)") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - + def __repr__(self): return f"" class DatasetFiles(Base): """DM数据集文件模型""" - + __tablename__ = "t_dm_dataset_files" - + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") dataset_id = Column(String(36), nullable=False, comment="所属数据集ID(UUID)") file_name = Column(String(255), nullable=False, comment="文件名") @@ -71,15 +67,15 @@ class DatasetFiles(Base): last_access_time = Column(TIMESTAMP, nullable=True, comment="最后访问时间") created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - + def __repr__(self): return f"" - + class DatasetStatistics(Base): """数据集统计信息模型""" - + __tablename__ = "t_dm_dataset_statistics" - + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") dataset_id = Column(String(36), nullable=False, comment="数据集ID(UUID)") stat_date = Column(Date, nullable=False, comment="统计日期") @@ -92,15 +88,15 @@ class DatasetStatistics(Base): quality_metrics = Column(JSON, nullable=True, comment="质量指标") created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - + def __repr__(self): return f"" class Tag(Base): """标签集合模型""" - + __tablename__ = "t_dm_tags" - + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") name = Column(String(100), nullable=False, unique=True, comment="标签名称") description = Column(Text, nullable=True, comment="标签描述") @@ -109,6 +105,6 @@ class Tag(Base): usage_count = Column(BigInteger, default=0, comment="使用次数") created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - + def __repr__(self): - return f"" \ No newline at end of file + return f"" diff --git a/runtime/datamate-python/app/db/models/knowledge_gen.py b/runtime/datamate-python/app/db/models/knowledge_gen.py index 423ae2246..daa553630 100644 --- a/runtime/datamate-python/app/db/models/knowledge_gen.py +++ b/runtime/datamate-python/app/db/models/knowledge_gen.py @@ -4,10 +4,10 @@ import uuid from sqlalchemy import Column, String, TIMESTAMP, Text, Integer, JSON from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import BaseEntity -class RagKnowledgeBase(Base): +class RagKnowledgeBase(BaseEntity): """知识库模型""" __tablename__ = "t_rag_knowledge_base" @@ -17,19 +17,15 @@ class RagKnowledgeBase(Base): description = Column(String(512), nullable=True, comment="知识库描述") embedding_model = Column(String(255), nullable=False, comment="嵌入模型") chat_model = Column(String(255), nullable=True, comment="聊天模型") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), - comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") def __repr__(self): return f"" -class RagFile(Base): +class RagFile(BaseEntity): """知识库文件模型""" __tablename__ = "t_rag_file" + __ignore_data_scope__ = True id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID") knowledge_base_id = Column(String(36), nullable=False, comment="知识库ID") @@ -39,9 +35,4 @@ class RagFile(Base): file_metadata = Column("metadata", JSON, nullable=True, comment="元数据") status = Column(String(50), nullable=True, comment="文件状态") err_msg = Column(Text, nullable=True, comment="错误信息") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), - comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") diff --git a/runtime/datamate-python/app/db/models/model_config.py b/runtime/datamate-python/app/db/models/model_config.py index be75043f9..bbea7bed1 100644 --- a/runtime/datamate-python/app/db/models/model_config.py +++ b/runtime/datamate-python/app/db/models/model_config.py @@ -1,6 +1,6 @@ from sqlalchemy import Column, String, Integer, TIMESTAMP, select -from app.db.session import Base +from app.db.models.base_entity import BaseEntity async def get_model_by_id(db_session, model_id: str): @@ -9,7 +9,7 @@ async def get_model_by_id(db_session, model_id: str): model_config = result.scalar_one_or_none() return model_config -class ModelConfig(Base): +class ModelConfig(BaseEntity): """模型配置表,对应表 t_model_config CREATE TABLE IF NOT EXISTS t_model_config ( @@ -42,11 +42,6 @@ class ModelConfig(Base): is_enabled = Column(Integer, nullable=False, default=1, comment="是否启用:1-启用,0-禁用") is_default = Column(Integer, nullable=False, default=0, comment="是否默认:1-默认,0-非默认") - created_at = Column(TIMESTAMP, nullable=True, comment="创建时间") - updated_at = Column(TIMESTAMP, nullable=True, comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") - __table_args__ = ( # 与 DDL 中的 uk_model_provider 保持一致 { diff --git a/runtime/datamate-python/app/db/models/ratio_task.py b/runtime/datamate-python/app/db/models/ratio_task.py index 5b5e30072..a91f6e5a3 100644 --- a/runtime/datamate-python/app/db/models/ratio_task.py +++ b/runtime/datamate-python/app/db/models/ratio_task.py @@ -11,10 +11,10 @@ from sqlalchemy.orm import relationship from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import BaseEntity -class RatioInstance(Base): +class RatioInstance(BaseEntity): """配比实例表(UUID 主键) -> t_st_ratio_instances Columns per data-ratio-init.sql: @@ -32,16 +32,12 @@ class RatioInstance(Base): merge_method = Column(String(50), nullable=True, comment="合并方式") status = Column(String(20), nullable=True, comment="状态") totals = Column(BigInteger, nullable=True, comment="总数") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column( TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") def __repr__(self) -> str: return f"" -class RatioRelation(Base): +class RatioRelation(BaseEntity): """配比关系表(UUID 主键) -> t_st_ratio_relations Columns per data-ratio-init.sql: @@ -57,10 +53,6 @@ class RatioRelation(Base): ratio_value = Column(String(256), nullable=True) counts = Column(BigInteger, nullable=True, comment="条数") filter_conditions = Column(Text, nullable=True, comment="过滤条件") - created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") - updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - created_by = Column(String(255), nullable=True, comment="创建者") - updated_by = Column(String(255), nullable=True, comment="更新者") def __repr__(self) -> str: return ( diff --git a/runtime/datamate-python/app/db/models/user_management.py b/runtime/datamate-python/app/db/models/user_management.py index e0d6d6b54..a8c1f583f 100644 --- a/runtime/datamate-python/app/db/models/user_management.py +++ b/runtime/datamate-python/app/db/models/user_management.py @@ -5,13 +5,13 @@ from sqlalchemy import Column, String, BigInteger, Boolean, TIMESTAMP from sqlalchemy.sql import func -from app.db.session import Base +from app.db.models.base_entity import Base class User(Base): """用户模型""" - + __tablename__ = "users" - + id = Column(BigInteger, primary_key=True, autoincrement=True, comment="用户ID") username = Column(String(255), nullable=False, unique=True, comment="用户名") email = Column(String(255), nullable=False, unique=True, comment="邮箱") @@ -24,6 +24,6 @@ class User(Base): last_login_at = Column(TIMESTAMP, nullable=True, comment="最后登录时间") created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") - + def __repr__(self): return f"" diff --git a/runtime/datamate-python/app/db/session.py b/runtime/datamate-python/app/db/session.py index bd19b1e4d..d2bd55ed4 100644 --- a/runtime/datamate-python/app/db/session.py +++ b/runtime/datamate-python/app/db/session.py @@ -1,9 +1,15 @@ +from datetime import datetime +from sqlalchemy import event, true from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker -from sqlalchemy.orm import declarative_base +from sqlalchemy.orm import Session, declarative_base, with_loader_criteria +from sqlalchemy.orm.attributes import flag_modified from app.core.config import settings from app.core.logging import get_logger from typing import AsyncGenerator +from app.db.datascope import DataScopeHandle +from app.db.models.base_entity import BaseEntity + logger = get_logger(__name__) # 创建数据库引擎 @@ -22,8 +28,62 @@ expire_on_commit=False ) -# 创建基础模型类 -Base = declarative_base() +@event.listens_for(Session, "do_orm_execute") +def _apply_data_scope(orm_execute_state): + # only act on SELECT queries generated by ORM + if not orm_execute_state.is_select: + return + + allowed = DataScopeHandle.allowed_users() + if not allowed or len(allowed) == 0: + return + + # predicate builder: return None to skip for classes that opt-out + def criteria_fn(cls): + # skip if the mapped class explicitly disables data-scope + if getattr(cls, "__ignore_data_scope__", False): + return true() + # some classes may not have created_by column; guard dynamically + col = getattr(cls, "created_by", None) + if col is None: + return true() + return col.in_(allowed) + + # apply loader-level criteria to all subclasses of BaseEntity + orm_execute_state.statement = orm_execute_state.statement.options( + with_loader_criteria(BaseEntity, criteria_fn, include_aliases=True) + ) + +@event.listens_for(Session, "before_flush") +def _audit_before_flush(session, flush_context, instances): + user = DataScopeHandle.get_user_info() + now = datetime.now() + + # new -> set created_* and updated_* + for obj in list(session.new): + if isinstance(obj, BaseEntity): + if getattr(obj, "created_at", None) is None: + obj.created_at = now + if getattr(obj, "created_by", None) is None: + obj.created_by = user + obj.updated_at = now + obj.updated_by = user + # ensure SQLAlchemy sees changes + try: + flag_modified(obj, "created_by") + except Exception: + pass + + # dirty -> set updated_* + for obj in list(session.dirty): + if isinstance(obj, BaseEntity): + obj.updated_at = now + obj.updated_by = user + try: + flag_modified(obj, "updated_by") + except Exception: + pass + async def get_db() -> AsyncGenerator[AsyncSession, None]: """获取数据库会话""" async with AsyncSessionLocal() as session: diff --git a/runtime/datamate-python/app/main.py b/runtime/datamate-python/app/main.py index c373edff9..a49d77fc5 100644 --- a/runtime/datamate-python/app/main.py +++ b/runtime/datamate-python/app/main.py @@ -8,6 +8,7 @@ from sqlalchemy import text from starlette.exceptions import HTTPException as StarletteHTTPException +from app.middleware import UserContextMiddleware from .core.config import settings from .core.logging import setup_logging, get_logger from .db.session import AsyncSessionLocal @@ -69,6 +70,7 @@ def mask_db_url(url: str) -> Literal[b""] | str: lifespan=lifespan ) +app.add_middleware(UserContextMiddleware) # CORS Middleware # app.add_middleware( # CORSMiddleware, diff --git a/runtime/datamate-python/app/middleware.py b/runtime/datamate-python/app/middleware.py new file mode 100644 index 000000000..06e228ed4 --- /dev/null +++ b/runtime/datamate-python/app/middleware.py @@ -0,0 +1,35 @@ +from fastapi import Request, Response +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.status import HTTP_401_UNAUTHORIZED +import json +from typing import Optional + +from app.core.config import settings +from app.core.logging import get_logger +from app.db.datascope import DataScopeHandle + +logger = get_logger(__name__) + +class UserContextMiddleware(BaseHTTPMiddleware): + """ + FastAPI middleware that reads `User` header and sets DataScopeHandle. + If `jwt_enable` is True, missing header returns 401. + """ + + def __init__(self, app): + super().__init__(app) + self.jwt_enable = settings.datamate_jwt_enable + + async def dispatch(self, request: Request, call_next): + user: Optional[str] = request.headers.get("User") + logger.info(f"start filter, current user: {user}, need filter: {self.jwt_enable}") + if self.jwt_enable and (user is None or user.strip() == ""): + payload = {"code": HTTP_401_UNAUTHORIZED, "message": "unauthorized"} + return Response(content=json.dumps(payload), status_code=HTTP_401_UNAUTHORIZED, media_type="application/json") + + DataScopeHandle.set_user_info(user) + try: + response = await call_next(request) + return response + finally: + DataScopeHandle.remove_user_info() diff --git a/runtime/datamate-python/app/module/generation/service/export_service.py b/runtime/datamate-python/app/module/generation/service/export_service.py index fd8037729..ee3eb24e1 100644 --- a/runtime/datamate-python/app/module/generation/service/export_service.py +++ b/runtime/datamate-python/app/module/generation/service/export_service.py @@ -87,7 +87,7 @@ async def export_task_to_dataset( file_path=file_path, file_type="jsonl", file_size=file_size, - last_access_time=datetime.datetime.now(datetime.UTC), + last_access_time=datetime.datetime.now(), ) self._db.add(df) created_files.append(df) diff --git a/scripts/db/data-annotation-init.sql b/scripts/db/data-annotation-init.sql index 05c10a184..c83eb8822 100644 --- a/scripts/db/data-annotation-init.sql +++ b/scripts/db/data-annotation-init.sql @@ -13,6 +13,8 @@ CREATE TABLE IF NOT EXISTS t_dm_annotation_templates ( category VARCHAR(50) DEFAULT 'custom', built_in BOOLEAN DEFAULT FALSE, version VARCHAR(20) DEFAULT '1.0', + created_by VARCHAR(256), + updated_by VARCHAR(256), created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, deleted_at TIMESTAMP @@ -30,6 +32,8 @@ COMMENT ON COLUMN t_dm_annotation_templates.style IS '样式配置: horizontal/v COMMENT ON COLUMN t_dm_annotation_templates.category IS '模板分类: medical/general/custom/system'; COMMENT ON COLUMN t_dm_annotation_templates.built_in IS '是否系统内置模板'; COMMENT ON COLUMN t_dm_annotation_templates.version IS '模板版本'; +COMMENT ON COLUMN t_dm_annotation_templates.created_by IS '创建者'; +COMMENT ON COLUMN t_dm_annotation_templates.updated_by IS '更新者'; COMMENT ON COLUMN t_dm_annotation_templates.created_at IS '创建时间'; COMMENT ON COLUMN t_dm_annotation_templates.updated_at IS '更新时间'; COMMENT ON COLUMN t_dm_annotation_templates.deleted_at IS '删除时间(软删除)'; @@ -50,6 +54,8 @@ CREATE TABLE IF NOT EXISTS t_dm_labeling_projects ( template_id VARCHAR(36), configuration JSONB, progress JSONB, + created_by VARCHAR(256), + updated_by VARCHAR(256), created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, deleted_at TIMESTAMP @@ -64,6 +70,8 @@ COMMENT ON COLUMN t_dm_labeling_projects.labeling_project_id IS 'Label Studio项 COMMENT ON COLUMN t_dm_labeling_projects.template_id IS '使用的模板ID'; COMMENT ON COLUMN t_dm_labeling_projects.configuration IS '项目配置(可能包含对模板的自定义修改)'; COMMENT ON COLUMN t_dm_labeling_projects.progress IS '项目进度信息'; +COMMENT ON COLUMN t_dm_labeling_projects.created_by IS '创建者'; +COMMENT ON COLUMN t_dm_labeling_projects.updated_by IS '更新者'; COMMENT ON COLUMN t_dm_labeling_projects.created_at IS '创建时间'; COMMENT ON COLUMN t_dm_labeling_projects.updated_at IS '更新时间'; COMMENT ON COLUMN t_dm_labeling_projects.deleted_at IS '删除时间(软删除)'; @@ -93,6 +101,8 @@ CREATE TABLE IF NOT EXISTS t_dm_auto_annotation_tasks ( detected_objects INTEGER DEFAULT 0, output_path VARCHAR(500), error_message TEXT, + created_by VARCHAR(256), + updated_by VARCHAR(256), created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, completed_at TIMESTAMP, @@ -114,6 +124,8 @@ COMMENT ON COLUMN t_dm_auto_annotation_tasks.processed_images IS '已处理图 COMMENT ON COLUMN t_dm_auto_annotation_tasks.detected_objects IS '检测到的对象总数'; COMMENT ON COLUMN t_dm_auto_annotation_tasks.output_path IS '输出路径'; COMMENT ON COLUMN t_dm_auto_annotation_tasks.error_message IS '错误信息'; +COMMENT ON COLUMN t_dm_auto_annotation_tasks.created_by IS '创建者'; +COMMENT ON COLUMN t_dm_auto_annotation_tasks.updated_by IS '更新者'; COMMENT ON COLUMN t_dm_auto_annotation_tasks.created_at IS '创建时间'; COMMENT ON COLUMN t_dm_auto_annotation_tasks.updated_at IS '更新时间'; COMMENT ON COLUMN t_dm_auto_annotation_tasks.completed_at IS '完成时间'; @@ -431,4 +443,4 @@ VALUES ('tpl-image-classification-001', '图像分类', '简单的多标签图 category = EXCLUDED.category, built_in = EXCLUDED.built_in, version = EXCLUDED.version, - updated_at = CURRENT_TIMESTAMP; \ No newline at end of file + updated_at = CURRENT_TIMESTAMP; diff --git a/scripts/db/data-cleaning-init.sql b/scripts/db/data-cleaning-init.sql index c8ffd8d3b..8181c4ec5 100644 --- a/scripts/db/data-cleaning-init.sql +++ b/scripts/db/data-cleaning-init.sql @@ -34,11 +34,13 @@ CREATE TABLE IF NOT EXISTS t_clean_task before_size BIGINT, after_size BIGINT, file_count INTEGER, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, started_at TIMESTAMP, finished_at TIMESTAMP, - created_by VARCHAR(256) - ); + created_by VARCHAR(256), + updated_by VARCHAR(256), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); COMMENT ON TABLE t_clean_task IS '清洗任务表'; COMMENT ON COLUMN t_clean_task.id IS '主键ID'; @@ -52,10 +54,12 @@ COMMENT ON COLUMN t_clean_task.dest_dataset_name IS '目标数据集名称'; COMMENT ON COLUMN t_clean_task.before_size IS '清洗前大小'; COMMENT ON COLUMN t_clean_task.after_size IS '清洗后大小'; COMMENT ON COLUMN t_clean_task.file_count IS '文件数量'; -COMMENT ON COLUMN t_clean_task.created_at IS '创建时间'; COMMENT ON COLUMN t_clean_task.started_at IS '开始时间'; COMMENT ON COLUMN t_clean_task.finished_at IS '完成时间'; +COMMENT ON COLUMN t_clean_task.created_at IS '创建时间'; +COMMENT ON COLUMN t_clean_task.updated_at IS '更新时间'; COMMENT ON COLUMN t_clean_task.created_by IS '创建者'; +COMMENT ON COLUMN t_clean_task.updated_by IS '更新者'; -- 操作员实例表 CREATE TABLE IF NOT EXISTS t_operator_instance diff --git a/scripts/db/data-management-init.sql b/scripts/db/data-management-init.sql index 0d8980248..4a6316cf1 100644 --- a/scripts/db/data-management-init.sql +++ b/scripts/db/data-management-init.sql @@ -190,7 +190,9 @@ CREATE TABLE IF NOT EXISTS t_dm_tags ( color VARCHAR(7), usage_count BIGINT DEFAULT 0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + created_by VARCHAR(255), + updated_by VARCHAR(255) ); COMMENT ON TABLE t_dm_tags IS '标签表(UUID 主键)'; @@ -202,6 +204,8 @@ COMMENT ON COLUMN t_dm_tags.color IS '标签颜色(十六进制)'; COMMENT ON COLUMN t_dm_tags.usage_count IS '使用次数'; COMMENT ON COLUMN t_dm_tags.created_at IS '创建时间'; COMMENT ON COLUMN t_dm_tags.updated_at IS '更新时间'; +COMMENT ON COLUMN t_dm_tags.created_by IS '创建者'; +COMMENT ON COLUMN t_dm_tags.updated_by IS '更新者'; -- 创建索引 CREATE INDEX IF NOT EXISTS idx_dm_tag_category ON t_dm_tags(category); diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index 4abcdfa9e..93e650da7 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -37,6 +37,8 @@ COMMENT ON COLUMN t_operator.file_size IS '文件大小'; COMMENT ON COLUMN t_operator.metrics IS '性能指标'; COMMENT ON COLUMN t_operator.usage_count IS '使用次数'; COMMENT ON COLUMN t_operator.is_star IS '是否收藏'; +COMMENT ON COLUMN t_operator.created_by IS '创建者'; +COMMENT ON COLUMN t_operator.updated_by IS '更新者'; COMMENT ON COLUMN t_operator.created_at IS '创建时间'; COMMENT ON COLUMN t_operator.updated_at IS '更新时间'; @@ -127,6 +129,8 @@ SELECT o.metrics, o.created_at, o.updated_at, + o.created_by, + o.updated_by, toc.id AS category_id, toc.name AS category_name FROM t_operator_category_relation tocr @@ -154,50 +158,50 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined' ON CONFLICT DO NOTHING; INSERT INTO t_operator -(id, name, description, version, inputs, outputs, runtime, settings, file_name, file_size, is_star) +(id, name, description, version, inputs, outputs, runtime, settings, file_name, file_size, is_star, created_by, updated_by) VALUES -('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"mineruApi":{"name":"Mineru Api地址","description":"指定mineru服务的api地址,默认为本地同一集群内地址。","type":"input","defaultVal":"http://datamate-mineru:8000","required":false},"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', 12288, 'false'), -('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 16384, 'false'), -('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 8192, 'false'), -('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 5120, 'false'), -('DuplicateFilesFilter', '相似文档去除', '相似文档去除。', '1.0.0', 'text', 'text', null, '{"fileDuplicateThreshold": {"name": "文档相似度", "description": "基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 13312, 'false'), -('FileWithManySensitiveWordsFilter', '文档敏感词率检查', '去除敏感词过多的文档。', '1.0.0', 'text', 'text', null, '{"sensitiveWordsRate": {"name": "文档敏感词率", "description": "敏感词的字数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.01, "min": 0, "max": 1, "step": 0.01}}', '', 29696, 'false'), -('FileWithShortOrLongLengthFilter', '文档字数检查', '字数不在指定范围会被过滤掉。', '1.0.0', 'text', 'text', null, '{"fileLength": {"name": "文档字数", "description": "过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。", "type": "range", "defaultVal": [10, 10000000], "min": 0, "max": 10000000000000000, "step": 1}}', '', 8192, 'false'), -('ContentCleaner', '文档目录去除', '去除文档中的目录。', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('AnonymizedCreditCardNumber', '信用卡号匿名化', '信用卡号匿名化', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('EmailNumberCleaner', '邮件地址匿名化', '邮件地址匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('EmojiCleaner', '文档表情去除', '去除文档中表情字符或者emoji符号。', '1.0.0', 'text', 'text', null, null, '', 5120, 'false'), -('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 、

等。', '1.0.0', 'text', 'text', null, '{"removeTableTags":{"name":"是否去除表格标签","description":"若为是,则会去除表格标签等。","type":"switch","defaultVal":"false","required":false,"checkedLabel":"是","unCheckedLabel":"否"}}', '', 12288, 'false'), -('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', 36864, 'false'), -('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', 5120, 'false'), -('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('LegendCleaner', '图注表注去除', '去除文档中的图注、表注等内容。', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('AnonymizedPhoneNumber', '电话号码匿名化', '电话号码匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('PoliticalWordCleaner', '政治文本匿名化', '将政治文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('DuplicateSentencesFilter', '文档局部内容去重', '文档局部内容去重。', '1.0.0', 'text', 'text', null, null, '', 5120, 'false'), -('SexualAndViolentWordCleaner', '暴力色情文本匿名化', '将暴力、色情文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', 20480, 'false'), -('TraditionalChineseCleaner', '繁体转简体', '将繁体转换为简体。', '1.0.0', 'text', 'text', null, null, '', 5120, 'false'), -('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', 4096, 'false'), -('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "description": "梯度函数值取值越小,图片模糊度越高。", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', 5120, 'false'), -('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', null, null, '', 8192, 'false'), -('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', null, null, '', 8192, 'false'), -('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', 8192, 'false'), -('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, 'false'), -('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 14336, 'false'), -('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 5120, 'false'), -('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 8192, 'false'), -('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 8192, 'false'), -('ObjectDetectionRectangle', '图像目标检测与预标注', '基于 YOLOv8 的图像目标检测算子。对输入图像进行目标检测,输出带矩形框与类别标签的标注图像,并生成结构化标注 JSON(包含类别、置信度与边界框坐标)。支持将检测结果导出为 Label Studio 兼容的 predictions 预标注格式(rectanglelabels),可在标注任务中直接加载并进行人工校正,从而显著降低人工标注成本并提升标注效率。', '1.0.0', 'image', 'image,json', null, null, '', 12288, 'false') +('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"mineruApi":{"name":"Mineru Api地址","description":"指定mineru服务的api地址,默认为本地同一集群内地址。","type":"input","defaultVal":"http://datamate-mineru:8000","required":false},"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', 12288, false, 'system', 'system'), +('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 16384, false, 'system', 'system'), +('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 8192, false, 'system', 'system'), +('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 5120, false, 'system', 'system'), +('DuplicateFilesFilter', '相似文档去除', '相似文档去除。', '1.0.0', 'text', 'text', null, '{"fileDuplicateThreshold": {"name": "文档相似度", "description": "基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 13312, false, 'system', 'system'), +('FileWithManySensitiveWordsFilter', '文档敏感词率检查', '去除敏感词过多的文档。', '1.0.0', 'text', 'text', null, '{"sensitiveWordsRate": {"name": "文档敏感词率", "description": "敏感词的字数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.01, "min": 0, "max": 1, "step": 0.01}}', '', 29696, false, 'system', 'system'), +('FileWithShortOrLongLengthFilter', '文档字数检查', '字数不在指定范围会被过滤掉。', '1.0.0', 'text', 'text', null, '{"fileLength": {"name": "文档字数", "description": "过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。", "type": "range", "defaultVal": [10, 10000000], "min": 0, "max": 10000000000000000, "step": 1}}', '', 8192, false, 'system', 'system'), +('ContentCleaner', '文档目录去除', '去除文档中的目录。', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('AnonymizedCreditCardNumber', '信用卡号匿名化', '信用卡号匿名化', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('EmailNumberCleaner', '邮件地址匿名化', '邮件地址匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('EmojiCleaner', '文档表情去除', '去除文档中表情字符或者emoji符号。', '1.0.0', 'text', 'text', null, null, '', 5120, false, 'system', 'system'), +('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 、

等。', '1.0.0', 'text', 'text', null, '{"removeTableTags":{"name":"是否去除表格标签","description":"若为是,则会去除表格标签等。","type":"switch","defaultVal":"false","required":false,"checkedLabel":"是","unCheckedLabel":"否"}}', '', 12288, false, 'system', 'system'), +('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', 36864, false, 'system', 'system'), +('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', 5120, false, 'system', 'system'), +('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('LegendCleaner', '图注表注去除', '去除文档中的图注、表注等内容。', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('AnonymizedPhoneNumber', '电话号码匿名化', '电话号码匿名化', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('PoliticalWordCleaner', '政治文本匿名化', '将政治文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('DuplicateSentencesFilter', '文档局部内容去重', '文档局部内容去重。', '1.0.0', 'text', 'text', null, null, '', 5120, false, 'system', 'system'), +('SexualAndViolentWordCleaner', '暴力色情文本匿名化', '将暴力、色情文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', 20480, false, 'system', 'system'), +('TraditionalChineseCleaner', '繁体转简体', '将繁体转换为简体。', '1.0.0', 'text', 'text', null, null, '', 5120, false, 'system', 'system'), +('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', 4096, false, 'system', 'system'), +('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "description": "梯度函数值取值越小,图片模糊度越高。", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', 5120, false, 'system', 'system'), +('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'), +('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'), +('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', 8192, false, 'system', 'system'), +('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'), +('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 14336, false, 'system', 'system'), +('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 5120, false, 'system', 'system'), +('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'), +('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 8192, false, 'system', 'system'), +('ObjectDetectionRectangle', '图像目标检测与预标注', '基于 YOLOv8 的图像目标检测算子。对输入图像进行目标检测,输出带矩形框与类别标签的标注图像,并生成结构化标注 JSON(包含类别、置信度与边界框坐标)。支持将检测结果导出为 Label Studio 兼容的 predictions 预标注格式(rectanglelabels),可在标注任务中直接加载并进行人工校正,从而显著降低人工标注成本并提升标注效率。', '1.0.0', 'image', 'image,json', null, null, '', 12288, false, 'system', 'system') ON CONFLICT DO NOTHING; INSERT INTO t_operator_release(id, version, release_date, changelog) @@ -231,177 +235,177 @@ ON CONFLICT DO NOTHING; INSERT INTO t_operator -(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star) +(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star, created_by, updated_by) VALUES - ('entity_attribute_aggregator', '实体属性聚合器', 'Summarizes a given attribute of an entity from a set of documents. 汇总一组文档中实体的给定属性。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('meta_tags_aggregator', '元标签聚合器', 'Merge similar meta tags into a single, unified tag. 将类似的元标记合并到一个统一的标记中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('most_relevant_entities_aggregator', '最相关实体聚合器', 'Extracts and ranks entities closely related to a given entity from provided texts. 从提供的文本中提取与给定实体密切相关的实体并对其进行排名。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('nested_aggregator', '嵌套聚合器', 'Aggregates nested content from multiple samples into a single summary. 将多个示例中的嵌套内容聚合到单个摘要中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('document_deduplicator', '文档去重器', 'Deduplicates samples at the document level using exact matching. 使用完全匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('document_minhash_deduplicator', '文档MinHash去重器', 'Deduplicates samples at the document level using MinHash LSH. 使用MinHash LSH在文档级别删除重复样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('document_simhash_deduplicator', '文档SimHash去重器', 'Deduplicates samples at the document level using SimHash. 使用SimHash在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('image_deduplicator', '图像去重器', 'Deduplicates samples at the document level by exact matching of images. 通过图像的精确匹配在文档级别删除重复的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('ray_basic_deduplicator', 'Ray基础去重器', 'Backend for deduplicator. deduplicator的后端。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('ray_bts_minhash_deduplicator', 'Ray BTS MinHash去重器', 'A distributed implementation of Union-Find with load balancing. 具有负载平衡的Union-Find的分布式实现。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('ray_document_deduplicator', 'Ray文档去重器', 'Deduplicates samples at the document level using exact matching in Ray distributed mode. 在Ray分布式模式下使用精确匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('ray_image_deduplicator', 'Ray图像去重器', 'Deduplicates samples at the document level using exact matching of images in Ray distributed mode. 在光线分布模式下使用图像的精确匹配在文档级别删除重复样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('ray_video_deduplicator', 'Ray视频去重器', 'Deduplicates samples at document-level using exact matching of videos in Ray distributed mode. 在Ray分布式模式下使用视频的精确匹配在文档级删除重复样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_deduplicator', '视频去重器', 'Deduplicates samples at the document level using exact matching of videos. 使用视频的精确匹配在文档级别删除重复的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('alphanumeric_filter', '字母数字过滤器', 'Filter to keep samples with an alphabet/numeric ratio within a specific range. 过滤器,以保持具有特定范围内的字母/数字比率的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('audio_duration_filter', '音频时长过滤器', 'Keep data samples whose audio durations are within a specified range. 保留音频持续时间在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), - ('audio_nmf_snr_filter', '音频NMF信噪比过滤器', 'Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified range. 保留音频信噪比 (snr) 在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), - ('audio_size_filter', '音频大小过滤器', 'Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), - ('average_line_length_filter', '平均行长过滤器', 'Filter to keep samples with average line length within a specific range. 过滤器,以保持平均线长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('character_repetition_filter', '字符重复过滤器', 'Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('flagged_words_filter', '标记词过滤器', 'Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('general_field_filter', '通用字段过滤器', 'Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('image_aesthetics_filter', '图像美学过滤器', 'Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_aspect_ratio_filter', '图像长宽比过滤器', 'Filter to keep samples with image aspect ratio within a specific range. 过滤器,以保持样本的图像纵横比在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_face_count_filter', '图像人脸计数过滤器', 'Filter to keep samples with the number of faces within a specific range. 过滤以保持样本的面数在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_face_ratio_filter', '图像人脸占比过滤器', 'Filter to keep samples with face area ratios within a specific range. 过滤以保持面面积比在特定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_nsfw_filter', '图像NSFW过滤器', 'Filter to keep samples whose images have nsfw scores in a specified range. 过滤器保留其图像的nsfw分数在指定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_pair_similarity_filter', '图像对相似度过滤器', 'Filter to keep image pairs with similarities between images within a specific range. 过滤器将图像之间具有相似性的图像对保持在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_shape_filter', '图像形状过滤器', 'Filter to keep samples with image shape (width, height) within specific ranges. 过滤器,以保持样本的图像形状 (宽度,高度) 在特定的范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_size_filter', '图像大小过滤器', 'Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range. 保留图像大小 (以字节/KB/MB/... 为单位) 在特定范围内的数据样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_text_matching_filter', '图文匹配过滤器', 'Filter to keep samples with image-text matching scores within a specific range. 过滤器将图像文本匹配分数的样本保持在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('image_text_similarity_filter', '图文相似度过滤器', 'Filter to keep samples with image-text similarity within a specified range. 过滤器将具有图像-文本相似性的样本保持在指定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('image_watermark_filter', '图像水印过滤器', 'Filter to keep samples whose images have no watermark with high probability. 过滤器以保持其图像没有水印的样本具有高概率。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('in_context_influence_filter', '上下文影响过滤器', 'Filter to keep texts based on their in-context influence on a validation set. 过滤以根据文本在上下文中对验证集的影响来保留文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('instruction_following_difficulty_filter', '指令跟随难度过滤器', 'Filter to keep texts based on their instruction following difficulty (IFD, https://arxiv.org/abs/2308.12032) score. 过滤以保持文本基于他们的指令跟随难度 (IFD, https://arxiv.org/abs/ 2308.12032) 分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('language_id_score_filter', '语种识别得分过滤器', 'Filter to keep samples in a specific language with a confidence score above a threshold. 过滤器以保留置信度高于阈值的特定语言的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('llm_analysis_filter', 'LLM分析过滤器', 'Base filter class for leveraging LLMs to analyze and filter data samples. 用于利用LLMs分析和过滤数据样本的基本筛选器类。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('llm_difficulty_score_filter', 'LLM难度得分过滤器', 'Filter to keep samples with high difficulty scores estimated by an LLM. 过滤器以保留由LLM估计的高难度分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('llm_perplexity_filter', 'LLM困惑度过滤器', 'Filter to keep samples with perplexity scores within a specified range, computed using a specified LLM. 过滤器将困惑分数的样本保留在指定范围内,使用指定的LLM计算。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('llm_quality_score_filter', 'LLM质量得分过滤器', 'Filter to keep samples with a high quality score estimated by a language model. 过滤器,以保留具有语言模型估计的高质量分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('llm_task_relevance_filter', 'LLM任务相关性过滤器', 'Filter to keep samples with high relevance scores to validation tasks estimated by an LLM. 过滤器以保留与LLM估计的验证任务具有高相关性分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('maximum_line_length_filter', '最大行长过滤器', 'Filter to keep samples with a maximum line length within a specified range. 筛选器将最大行长度的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('perplexity_filter', '困惑度过滤器', 'Filter to keep samples with perplexity score in a specified range. 过滤以保持困惑分数在指定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('phrase_grounding_recall_filter', '短语定位召回过滤器', 'Filter to keep samples based on the phrase grounding recall of phrases extracted from text in images. 根据从图像中的文本中提取的短语接地召回来过滤以保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('special_characters_filter', '特殊字符过滤器', 'Filter to keep samples with special-character ratio within a specific range. 过滤器,以将具有特殊字符比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('specified_field_filter', '指定字段过滤器', 'Filter samples based on the specified field information. 根据指定的字段信息筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('specified_numeric_field_filter', '指定数值字段过滤器', 'Filter samples based on a specified numeric field value. 根据指定的数值字段值筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('stopwords_filter', '停用词过滤器', 'Filter to keep samples with stopword ratio within a specified range. 过滤器将停止词比率的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('suffix_filter', '后缀过滤器', 'Filter to keep samples with specified suffix. 过滤器以保留具有指定后缀的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_action_filter', '文本动作过滤器', 'Filter to keep texts that contain a minimum number of actions. 过滤以保留包含最少数量操作的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_embd_similarity_filter', '文本嵌入相似度过滤器', 'Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range. 过滤器,以保留与一组给定验证文本的平均嵌入相似度在特定范围内的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_entity_dependency_filter', '文本实体依赖过滤器', 'Identify and filter text samples based on entity dependencies. 根据实体依赖关系识别和过滤文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_length_filter', '文本长度过滤器', 'Filter to keep samples with total text length within a specific range. 过滤以保持文本总长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_pair_similarity_filter', '文本对相似度过滤器', 'Filter to keep text pairs with similarities within a specific range. 过滤以将具有相似性的文本对保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('token_num_filter', 'Token数量过滤器', 'Filter to keep samples with a total token number within a specified range. 筛选器将总令牌数的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('video_aesthetics_filter', '视频美学过滤器', 'Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range. 过滤器将视频中指定帧的美学得分数据样本保留在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_aspect_ratio_filter', '视频长宽比过滤器', 'Filter to keep samples with video aspect ratio within a specific range. 过滤器将视频纵横比的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_duration_filter', '视频时长过滤器', 'Keep data samples whose videos'' durations are within a specified range. 保留视频持续时间在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_frames_text_similarity_filter', '视频帧文本相似度过滤器', 'Filter to keep samples based on the similarity between video frame images and text within a specific range. 根据视频帧图像和文本之间的相似性进行过滤,以保持样本在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_motion_score_filter', '视频运动得分过滤器', 'Filter to keep samples with video motion scores within a specific range. 过滤器将视频运动分数的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_motion_score_raft_filter', '视频RAFT运动得分过滤器', 'Filter to keep samples with video motion scores within a specified range. 过滤器将视频运动分数的样本保持在指定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_nsfw_filter', '视频NSFW过滤器', 'Filter to keep samples whose videos have nsfw scores in a specified range. 过滤器以保留其视频的nsfw分数在指定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_ocr_area_ratio_filter', '视频OCR面积占比过滤器', 'Keep data samples whose detected text area ratios for specified frames in the video are within a specified range. 保留检测到的视频中指定帧的文本面积比率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_resolution_filter', '视频分辨率过滤器', 'Keep data samples whose videos'' resolutions are within a specified range. 保留视频分辨率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_tagging_from_frames_filter', '视频帧标签过滤器', 'Filter to keep samples whose videos contain specified tags. 过滤器以保留其视频包含指定标签的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_watermark_filter', '视频水印过滤器', 'Filter to keep samples whose videos have no watermark with high probability. 过滤器以保持其视频具有高概率没有水印的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('word_repetition_filter', '单词重复过滤器', 'Filter to keep samples with word-level n-gram repetition ratio within a specific range. 过滤器将单词级n-gram重复比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('words_num_filter', '词数过滤器', 'Filter to keep samples with a total word count within a specified range. 过滤器将样本的总字数保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('key_value_grouper', '键值分组器', 'Groups samples into batches based on values in specified keys. 根据指定键中的值将样本分组为批处理。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('naive_grouper', '朴素分组器', 'Group all samples in a dataset into a single batched sample. 将数据集中的所有样本分组为单个批处理样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('naive_reverse_grouper', '朴素反向分组器', 'Split batched samples into individual samples. 将批处理的样品分成单个样品。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('audio_add_gaussian_noise_mapper', '音频高斯噪声添加映射器', 'Mapper to add Gaussian noise to audio samples. 映射器将高斯噪声添加到音频样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), - ('audio_ffmpeg_wrapped_mapper', '音频FFmpeg封装映射器', 'Wraps FFmpeg audio filters for processing audio files in a dataset. 包装FFmpeg音频过滤器,用于处理数据集中的音频文件。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), - ('calibrate_qa_mapper', 'QA校准映射器', 'Calibrates question-answer pairs based on reference text using an API model. 使用API模型根据参考文本校准问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('calibrate_query_mapper', '查询校准映射器', 'Calibrate query in question-answer pairs based on reference text. 基于参考文本校准问答对中的查询。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('calibrate_response_mapper', '回复校准映射器', 'Calibrate response in question-answer pairs based on reference text. 根据参考文本校准问答对中的回答。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('chinese_convert_mapper', '中文简繁转换映射器', 'Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. 映射器在繁体、简体和日文汉字之间转换中文文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('clean_copyright_mapper', '版权清洗映射器', 'Cleans copyright comments at the beginning of text samples. 清除文本示例开头的版权注释。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('clean_email_mapper', '邮箱清洗映射器', 'Cleans email addresses from text samples using a regular expression. 使用正则表达式从文本示例中清除电子邮件地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('clean_html_mapper', 'HTML清洗映射器', 'Cleans HTML code from text samples, converting HTML to plain text. 从文本示例中清除HTML代码,将HTML转换为纯文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('clean_ip_mapper', 'IP清洗映射器', 'Cleans IPv4 and IPv6 addresses from text samples. 从文本示例中清除IPv4和IPv6地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('clean_links_mapper', '链接清洗映射器', 'Mapper to clean links like http/https/ftp in text samples. 映射器来清理链接,如文本示例中的http/https/ftp。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('detect_character_attributes_mapper', '角色属性检测映射器', 'Takes an image, a caption, and main character names as input to extract the characters'' attributes. 根据给定的图像、图像描述信息和(多个)角色名称,提取图像中主要角色的属性。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('detect_character_locations_mapper', '角色位置检测映射器', 'Given an image and a list of main character names, extract the bounding boxes for each present character. 给定一张图像和主要角色的名称列表,提取每个在场角色的边界框。(YOLOE + MLLM)', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('detect_main_character_mapper', '主要角色检测映射器', 'Extract all main character names based on the given image and its caption. 根据给定的图像及其图像描述,提取所有主要角色的名字。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('dialog_intent_detection_mapper', '对话意图检测映射器', 'Generates user''s intent labels in a dialog by analyzing the history, query, and response. 通过分析历史记录、查询和响应,在对话框中生成用户的意图标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('dialog_sentiment_detection_mapper', '对话情感检测映射器', 'Generates sentiment labels and analysis for user queries in a dialog. 在对话框中为用户查询生成情绪标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('dialog_sentiment_intensity_mapper', '对话情感强度映射器', 'Mapper to predict user''s sentiment intensity in a dialog, ranging from -5 to 5. Mapper预测用户在对话框中的情绪强度,范围从-5到5。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('dialog_topic_detection_mapper', '对话主题检测映射器', 'Generates user''s topic labels and analysis in a dialog. 在对话框中生成用户的主题标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('download_file_mapper', '文件下载映射器', 'Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('expand_macro_mapper', '宏展开映射器', 'Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_entity_attribute_mapper', '实体属性提取映射器', 'Extracts attributes for given entities from the text and stores them in the sample''s metadata. 从文本中提取给定实体的属性,并将其存储在示例的元数据中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_entity_relation_mapper', '实体关系提取映射器', 'Extracts entities and relations from text to build a knowledge graph. 从文本中提取实体和关系以构建知识图谱。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_event_mapper', '事件提取映射器', 'Extracts events and relevant characters from the text. 从文本中提取事件和相关字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_keyword_mapper', '关键词提取映射器', 'Generate keywords for the text. 为文本生成关键字。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_nickname_mapper', '昵称提取映射器', 'Extracts nickname relationships in the text using a language model. 使用语言模型提取文本中的昵称关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_support_text_mapper', '支撑文本提取映射器', 'Extracts a supporting sub-text from the original text based on a given summary. 根据给定的摘要从原始文本中提取支持子文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('extract_tables_from_html_mapper', 'HTML表格提取映射器', 'Extracts tables from HTML content and stores them in a specified field. 从HTML内容中提取表并将其存储在指定字段中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('fix_unicode_mapper', 'Unicode修复映射器', 'Fixes unicode errors in text samples. 修复文本示例中的unicode错误。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('generate_qa_from_examples_mapper', '示例生成QA映射器', 'Generates question and answer pairs from examples using a Hugging Face model. 使用拥抱面部模型从示例生成问题和答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('generate_qa_from_text_mapper', '文本生成QA映射器', 'Generates question and answer pairs from text using a specified model. 使用指定的模型从文本生成问题和答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('image_blur_mapper', '图像模糊映射器', 'Blurs images in the dataset with a specified probability and blur type. 使用指定的概率和模糊类型对数据集中的图像进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_captioning_from_gpt4v_mapper', 'GPT4V图像描述映射器', 'Generates text captions for images using the GPT-4 Vision model. 使用GPT-4视觉模型为图像生成文本标题。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('image_captioning_mapper', '图像描述映射器', 'Generates image captions using a Hugging Face model and appends them to samples. 使用拥抱面部模型生成图像标题,并将其附加到样本中。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('image_detection_yolo_mapper', 'YOLO图像检测映射器', 'Perform object detection using YOLO on images and return bounding boxes and class labels. 使用YOLO对图像执行对象检测,并返回边界框和类标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_diffusion_mapper', '图像扩散生成映射器', 'Generate images using a diffusion model based on provided captions. 使用基于提供的字幕的扩散模型生成图像。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('image_face_blur_mapper', '图像人脸模糊映射器', 'Mapper to blur faces detected in images. 映射器模糊图像中检测到的人脸。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_remove_background_mapper', '图像去背景映射器', 'Mapper to remove the background of images. 映射器删除图像的背景。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_segment_mapper', '图像分割映射器', 'Perform segment-anything on images and return the bounding boxes. 对图像执行segment-任何操作并返回边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('image_tagging_mapper', '图像打标映射器', 'Generates image tags for each image in the sample. 为样本中的每个图像生成图像标记。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('imgdiff_difference_area_generator_mapper', 'ImgDiff差异区域生成映射器', 'Generates and filters bounding boxes for image pairs based on similarity, segmentation, and text matching. 根据相似性、分割和文本匹配生成和过滤图像对的边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false), - ('imgdiff_difference_caption_generator_mapper', 'ImgDiff差异描述生成映射器', 'Generates difference captions for bounding box regions in two images. 为两个图像中的边界框区域生成差异字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('mllm_mapper', 'MLLM视觉问答映射器', 'Mapper to use MLLMs for visual question answering tasks. Mapper使用MLLMs进行视觉问答任务。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('nlpaug_en_mapper', 'NLPAug英语增强映射器', 'Augments English text samples using various methods from the nlpaug library. 使用nlpaug库中的各种方法增强英语文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('nlpcda_zh_mapper', 'NLPCDA中文增强映射器', 'Augments Chinese text samples using the nlpcda library. 使用nlpcda库扩充中文文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('optimize_prompt_mapper', 'Prompt优化映射器', 'Optimize prompts based on existing ones in the same batch. 根据同一批次中的现有提示优化提示。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('optimize_qa_mapper', 'QA优化映射器', 'Mapper to optimize question-answer pairs. 映射器来优化问题-答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('optimize_query_mapper', '查询优化映射器', 'Optimize queries in question-answer pairs to make them more specific and detailed. 优化问答对中的查询,使其更加具体和详细。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('optimize_response_mapper', '回复优化映射器', 'Optimize response in question-answer pairs to be more detailed and specific. 优化问答对中的响应,使其更加详细和具体。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('pair_preference_mapper', '配对偏好映射器', 'Mapper to construct paired preference samples by generating a rejected response and its reason. Mapper通过生成拒绝响应及其原因来构造成对的偏好样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('punctuation_normalization_mapper', '标点归一化映射器', 'Normalizes unicode punctuations to their English equivalents in text samples. 将unicode标点规范化为文本示例中的英语等效项。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('python_file_mapper', 'Python文件映射器', 'Executes a Python function defined in a file on input data. 对输入数据执行文件中定义的Python函数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('python_lambda_mapper', 'Python Lambda映射器', 'Mapper for applying a Python lambda function to data samples. Mapper,用于将Python lambda函数应用于数据样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('query_intent_detection_mapper', '查询意图检测映射器', 'Predicts the user''s intent label and corresponding score for a given query. 为给定查询预测用户的意图标签和相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('query_sentiment_detection_mapper', '查询情感检测映射器', 'Predicts user''s sentiment label (''negative'', ''neutral'', ''positive'') in a query. 在查询中预测用户的情绪标签 (“负面” 、 “中性” 、 “正面”)。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('query_topic_detection_mapper', '查询主题检测映射器', 'Predicts the topic label and its corresponding score for a given query. 预测给定查询的主题标签及其相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('relation_identity_mapper', '关系识别映射器', 'Identify the relation between two entities in a given text. 确定给定文本中两个实体之间的关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_bibliography_mapper', '参考书目移除映射器', 'Removes bibliography sections at the end of LaTeX documents. 删除LaTeX文档末尾的参考书目部分。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_comments_mapper', '注释移除映射器', 'Removes comments from documents, currently supporting only ''tex'' format. 从文档中删除注释,当前仅支持 “文本” 格式。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_header_mapper', '页眉移除映射器', 'Removes headers at the beginning of documents in LaTeX samples. 删除LaTeX示例中文档开头的标题。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_long_words_mapper', '长词移除映射器', 'Mapper to remove long words within a specific range. 映射器删除特定范围内的长词。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_non_chinese_character_mapper', '非中文字符移除映射器', 'Removes non-Chinese characters from text samples. 从文本样本中删除非中文字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_repeat_sentences_mapper', '重复句移除映射器', 'Mapper to remove repeat sentences in text samples. 映射器删除文本样本中的重复句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_specific_chars_mapper', '指定字符移除映射器', 'Removes specific characters from text samples. 从文本示例中删除特定字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_table_text_mapper', '表格文本移除映射器', 'Mapper to remove table texts from text samples. 映射器从文本样本中删除表文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('remove_words_with_incorrect_substrings_mapper', '错误子串单词移除映射器', 'Mapper to remove words containing specified incorrect substrings. 映射程序删除包含指定的不正确子字符串的单词。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('replace_content_mapper', '内容替换映射器', 'Replaces content in the text that matches a specific regular expression pattern with a designated replacement string. 用指定的替换字符串替换与特定正则表达式模式匹配的文本中的内容。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('sdxl_prompt2prompt_mapper', 'SDXL Prompt2Prompt映射器', 'Generates pairs of similar images using the SDXL model. 使用SDXL模型生成成对的相似图像。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('sentence_augmentation_mapper', '句子增强映射器', 'Augments sentences by generating enhanced versions using a Hugging Face model. 通过使用拥抱面部模型生成增强版本来增强句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('sentence_split_mapper', '句子切分映射器', 'Splits text samples into individual sentences based on the specified language. 根据指定的语言将文本样本拆分为单个句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_chunk_mapper', '文本分块映射器', 'Split input text into chunks based on specified criteria. 根据指定的条件将输入文本拆分为块。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('text_tagging_by_prompt_mapper', 'Prompt文本打标映射器', 'Mapper to generate text tags using prompt with LLM. Mapper使用带有LLM的prompt生成文本标记。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('vggt_mapper', 'VGGT视频提取映射器', 'Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_captioning_from_audio_mapper', '音频生成视频描述映射器', 'Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器根据基于qwen-audio模型的音频流为视频添加字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_captioning_from_frames_mapper', '帧生成视频描述映射器', 'Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_captioning_from_summarizer_mapper', '摘要生成视频描述映射器', 'Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结几种生成的文本 (来自视频/音频/帧的字幕,来自音频/帧的标签,...) 来生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_captioning_from_video_mapper', '视频生成视频描述映射器', 'Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使用拥抱面部视频到文本模型和采样视频帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_captioning_from_vlm_mapper', 'VLM视频描述映射器', 'Generates video captions using a VLM that accepts videos as inputs. 使用接受视频作为输入的VLM生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_depth_estimation_mapper', '视频深度估计映射器', 'Perform depth estimation on the video. 对视频进行深度估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_extract_frames_mapper', '视频抽帧映射器', 'Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_face_blur_mapper', '视频人脸模糊映射器', 'Mapper to blur faces detected in videos. 映射器模糊在视频中检测到的人脸。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_ffmpeg_wrapped_mapper', '视频FFmpeg封装映射器', 'Wraps FFmpeg video filters for processing video files in a dataset. 包装FFmpeg视频过滤器,用于处理数据集中的视频文件。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_hand_reconstruction_mapper', '视频手部重建映射器', 'Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_object_segmenting_mapper', '视频对象分割映射器', 'Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). 在整个视频中对有效对象进行文本引导的语义分割 (YOLOE SAM2)。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_remove_watermark_mapper', '视频去水印映射器', 'Remove watermarks from videos based on specified regions. 根据指定区域从视频中删除水印。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_resize_aspect_ratio_mapper', '视频宽高比调整映射器', 'Resizes videos to fit within a specified aspect ratio range. 调整视频大小以适应指定的宽高比范围。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_resize_resolution_mapper', '视频分辨率调整映射器', 'Resizes video resolution based on specified width and height constraints. 根据指定的宽度和高度限制调整视频分辨率。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_split_by_duration_mapper', '视频按时长切分映射器', 'Splits videos into segments based on a specified duration. 根据指定的持续时间将视频拆分为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_split_by_key_frame_mapper', '视频关键帧切分映射器', 'Splits a video into segments based on key frames. 根据关键帧将视频分割为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_split_by_scene_mapper', '视频场景切分映射器', 'Splits videos into scene clips based on detected scene changes. 根据检测到的场景变化将视频拆分为场景剪辑。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), - ('video_tagging_from_audio_mapper', '音频视频打标映射器', 'Generates video tags from audio streams using the Audio Spectrogram Transformer. 使用音频频谱图转换器从音频流生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_tagging_from_frames_mapper', '帧视频打标映射器', 'Generates video tags from frames extracted from videos. 从视频中提取的帧生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('video_whole_body_pose_estimation_mapper', '视频全身姿态估计映射器', 'Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人的视频,并使用DWPose模型来提取视频中人类主体的身体、手、脚和面部关键点,即2D全身姿态估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false), - ('whitespace_normalization_mapper', '空白字符归一化映射器', 'Normalizes various types of whitespace characters to standard spaces in text samples. 将文本样本中各种类型的空白字符规范化为标准空格。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('frequency_specified_field_selector', '频率指定字段选择器', 'Selector to filter samples based on the frequency of a specified field. 选择器根据指定字段的频率过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('random_selector', '随机选择器', 'Randomly selects a subset of samples from the dataset. 从数据集中随机选择样本子集。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('range_specified_field_selector', '范围指定字段选择器', 'Selects a range of samples based on the sorted values of a specified field. 根据指定字段的排序值选择采样范围。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('tags_specified_field_selector', '标签指定字段选择器', 'Selector to filter samples based on the tags of a specified field. 选择器根据指定字段的标签过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), - ('topk_specified_field_selector', 'TopK指定字段选择器', 'Selects top samples based on the sorted values of a specified field. 根据指定字段的排序值选择顶部样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false) + ('entity_attribute_aggregator', '实体属性聚合器', 'Summarizes a given attribute of an entity from a set of documents. 汇总一组文档中实体的给定属性。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('meta_tags_aggregator', '元标签聚合器', 'Merge similar meta tags into a single, unified tag. 将类似的元标记合并到一个统一的标记中。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('most_relevant_entities_aggregator', '最相关实体聚合器', 'Extracts and ranks entities closely related to a given entity from provided texts. 从提供的文本中提取与给定实体密切相关的实体并对其进行排名。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('nested_aggregator', '嵌套聚合器', 'Aggregates nested content from multiple samples into a single summary. 将多个示例中的嵌套内容聚合到单个摘要中。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('document_deduplicator', '文档去重器', 'Deduplicates samples at the document level using exact matching. 使用完全匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('document_minhash_deduplicator', '文档MinHash去重器', 'Deduplicates samples at the document level using MinHash LSH. 使用MinHash LSH在文档级别删除重复样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('document_simhash_deduplicator', '文档SimHash去重器', 'Deduplicates samples at the document level using SimHash. 使用SimHash在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('image_deduplicator', '图像去重器', 'Deduplicates samples at the document level by exact matching of images. 通过图像的精确匹配在文档级别删除重复的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('ray_basic_deduplicator', 'Ray基础去重器', 'Backend for deduplicator. deduplicator的后端。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('ray_bts_minhash_deduplicator', 'Ray BTS MinHash去重器', 'A distributed implementation of Union-Find with load balancing. 具有负载平衡的Union-Find的分布式实现。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('ray_document_deduplicator', 'Ray文档去重器', 'Deduplicates samples at the document level using exact matching in Ray distributed mode. 在Ray分布式模式下使用精确匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('ray_image_deduplicator', 'Ray图像去重器', 'Deduplicates samples at the document level using exact matching of images in Ray distributed mode. 在光线分布模式下使用图像的精确匹配在文档级别删除重复样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('ray_video_deduplicator', 'Ray视频去重器', 'Deduplicates samples at document-level using exact matching of videos in Ray distributed mode. 在Ray分布式模式下使用视频的精确匹配在文档级删除重复样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_deduplicator', '视频去重器', 'Deduplicates samples at the document level using exact matching of videos. 使用视频的精确匹配在文档级别删除重复的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('alphanumeric_filter', '字母数字过滤器', 'Filter to keep samples with an alphabet/numeric ratio within a specific range. 过滤器,以保持具有特定范围内的字母/数字比率的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('audio_duration_filter', '音频时长过滤器', 'Keep data samples whose audio durations are within a specified range. 保留音频持续时间在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false, 'system', 'system'), + ('audio_nmf_snr_filter', '音频NMF信噪比过滤器', 'Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified range. 保留音频信噪比 (snr) 在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false, 'system', 'system'), + ('audio_size_filter', '音频大小过滤器', 'Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false, 'system', 'system'), + ('average_line_length_filter', '平均行长过滤器', 'Filter to keep samples with average line length within a specific range. 过滤器,以保持平均线长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('character_repetition_filter', '字符重复过滤器', 'Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('flagged_words_filter', '标记词过滤器', 'Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('general_field_filter', '通用字段过滤器', 'Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('image_aesthetics_filter', '图像美学过滤器', 'Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_aspect_ratio_filter', '图像长宽比过滤器', 'Filter to keep samples with image aspect ratio within a specific range. 过滤器,以保持样本的图像纵横比在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_face_count_filter', '图像人脸计数过滤器', 'Filter to keep samples with the number of faces within a specific range. 过滤以保持样本的面数在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_face_ratio_filter', '图像人脸占比过滤器', 'Filter to keep samples with face area ratios within a specific range. 过滤以保持面面积比在特定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_nsfw_filter', '图像NSFW过滤器', 'Filter to keep samples whose images have nsfw scores in a specified range. 过滤器保留其图像的nsfw分数在指定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_pair_similarity_filter', '图像对相似度过滤器', 'Filter to keep image pairs with similarities between images within a specific range. 过滤器将图像之间具有相似性的图像对保持在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_shape_filter', '图像形状过滤器', 'Filter to keep samples with image shape (width, height) within specific ranges. 过滤器,以保持样本的图像形状 (宽度,高度) 在特定的范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_size_filter', '图像大小过滤器', 'Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range. 保留图像大小 (以字节/KB/MB/... 为单位) 在特定范围内的数据样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_text_matching_filter', '图文匹配过滤器', 'Filter to keep samples with image-text matching scores within a specific range. 过滤器将图像文本匹配分数的样本保持在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('image_text_similarity_filter', '图文相似度过滤器', 'Filter to keep samples with image-text similarity within a specified range. 过滤器将具有图像-文本相似性的样本保持在指定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('image_watermark_filter', '图像水印过滤器', 'Filter to keep samples whose images have no watermark with high probability. 过滤器以保持其图像没有水印的样本具有高概率。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('in_context_influence_filter', '上下文影响过滤器', 'Filter to keep texts based on their in-context influence on a validation set. 过滤以根据文本在上下文中对验证集的影响来保留文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('instruction_following_difficulty_filter', '指令跟随难度过滤器', 'Filter to keep texts based on their instruction following difficulty (IFD, https://arxiv.org/abs/2308.12032) score. 过滤以保持文本基于他们的指令跟随难度 (IFD, https://arxiv.org/abs/ 2308.12032) 分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('language_id_score_filter', '语种识别得分过滤器', 'Filter to keep samples in a specific language with a confidence score above a threshold. 过滤器以保留置信度高于阈值的特定语言的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('llm_analysis_filter', 'LLM分析过滤器', 'Base filter class for leveraging LLMs to analyze and filter data samples. 用于利用LLMs分析和过滤数据样本的基本筛选器类。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('llm_difficulty_score_filter', 'LLM难度得分过滤器', 'Filter to keep samples with high difficulty scores estimated by an LLM. 过滤器以保留由LLM估计的高难度分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('llm_perplexity_filter', 'LLM困惑度过滤器', 'Filter to keep samples with perplexity scores within a specified range, computed using a specified LLM. 过滤器将困惑分数的样本保留在指定范围内,使用指定的LLM计算。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('llm_quality_score_filter', 'LLM质量得分过滤器', 'Filter to keep samples with a high quality score estimated by a language model. 过滤器,以保留具有语言模型估计的高质量分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('llm_task_relevance_filter', 'LLM任务相关性过滤器', 'Filter to keep samples with high relevance scores to validation tasks estimated by an LLM. 过滤器以保留与LLM估计的验证任务具有高相关性分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('maximum_line_length_filter', '最大行长过滤器', 'Filter to keep samples with a maximum line length within a specified range. 筛选器将最大行长度的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('perplexity_filter', '困惑度过滤器', 'Filter to keep samples with perplexity score in a specified range. 过滤以保持困惑分数在指定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('phrase_grounding_recall_filter', '短语定位召回过滤器', 'Filter to keep samples based on the phrase grounding recall of phrases extracted from text in images. 根据从图像中的文本中提取的短语接地召回来过滤以保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('special_characters_filter', '特殊字符过滤器', 'Filter to keep samples with special-character ratio within a specific range. 过滤器,以将具有特殊字符比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('specified_field_filter', '指定字段过滤器', 'Filter samples based on the specified field information. 根据指定的字段信息筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('specified_numeric_field_filter', '指定数值字段过滤器', 'Filter samples based on a specified numeric field value. 根据指定的数值字段值筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('stopwords_filter', '停用词过滤器', 'Filter to keep samples with stopword ratio within a specified range. 过滤器将停止词比率的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('suffix_filter', '后缀过滤器', 'Filter to keep samples with specified suffix. 过滤器以保留具有指定后缀的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_action_filter', '文本动作过滤器', 'Filter to keep texts that contain a minimum number of actions. 过滤以保留包含最少数量操作的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_embd_similarity_filter', '文本嵌入相似度过滤器', 'Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range. 过滤器,以保留与一组给定验证文本的平均嵌入相似度在特定范围内的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_entity_dependency_filter', '文本实体依赖过滤器', 'Identify and filter text samples based on entity dependencies. 根据实体依赖关系识别和过滤文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_length_filter', '文本长度过滤器', 'Filter to keep samples with total text length within a specific range. 过滤以保持文本总长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_pair_similarity_filter', '文本对相似度过滤器', 'Filter to keep text pairs with similarities within a specific range. 过滤以将具有相似性的文本对保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('token_num_filter', 'Token数量过滤器', 'Filter to keep samples with a total token number within a specified range. 筛选器将总令牌数的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('video_aesthetics_filter', '视频美学过滤器', 'Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range. 过滤器将视频中指定帧的美学得分数据样本保留在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_aspect_ratio_filter', '视频长宽比过滤器', 'Filter to keep samples with video aspect ratio within a specific range. 过滤器将视频纵横比的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_duration_filter', '视频时长过滤器', 'Keep data samples whose videos'' durations are within a specified range. 保留视频持续时间在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_frames_text_similarity_filter', '视频帧文本相似度过滤器', 'Filter to keep samples based on the similarity between video frame images and text within a specific range. 根据视频帧图像和文本之间的相似性进行过滤,以保持样本在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_motion_score_filter', '视频运动得分过滤器', 'Filter to keep samples with video motion scores within a specific range. 过滤器将视频运动分数的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_motion_score_raft_filter', '视频RAFT运动得分过滤器', 'Filter to keep samples with video motion scores within a specified range. 过滤器将视频运动分数的样本保持在指定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_nsfw_filter', '视频NSFW过滤器', 'Filter to keep samples whose videos have nsfw scores in a specified range. 过滤器以保留其视频的nsfw分数在指定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_ocr_area_ratio_filter', '视频OCR面积占比过滤器', 'Keep data samples whose detected text area ratios for specified frames in the video are within a specified range. 保留检测到的视频中指定帧的文本面积比率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_resolution_filter', '视频分辨率过滤器', 'Keep data samples whose videos'' resolutions are within a specified range. 保留视频分辨率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_tagging_from_frames_filter', '视频帧标签过滤器', 'Filter to keep samples whose videos contain specified tags. 过滤器以保留其视频包含指定标签的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_watermark_filter', '视频水印过滤器', 'Filter to keep samples whose videos have no watermark with high probability. 过滤器以保持其视频具有高概率没有水印的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('word_repetition_filter', '单词重复过滤器', 'Filter to keep samples with word-level n-gram repetition ratio within a specific range. 过滤器将单词级n-gram重复比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('words_num_filter', '词数过滤器', 'Filter to keep samples with a total word count within a specified range. 过滤器将样本的总字数保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('key_value_grouper', '键值分组器', 'Groups samples into batches based on values in specified keys. 根据指定键中的值将样本分组为批处理。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('naive_grouper', '朴素分组器', 'Group all samples in a dataset into a single batched sample. 将数据集中的所有样本分组为单个批处理样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('naive_reverse_grouper', '朴素反向分组器', 'Split batched samples into individual samples. 将批处理的样品分成单个样品。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('audio_add_gaussian_noise_mapper', '音频高斯噪声添加映射器', 'Mapper to add Gaussian noise to audio samples. 映射器将高斯噪声添加到音频样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false, 'system', 'system'), + ('audio_ffmpeg_wrapped_mapper', '音频FFmpeg封装映射器', 'Wraps FFmpeg audio filters for processing audio files in a dataset. 包装FFmpeg音频过滤器,用于处理数据集中的音频文件。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false, 'system', 'system'), + ('calibrate_qa_mapper', 'QA校准映射器', 'Calibrates question-answer pairs based on reference text using an API model. 使用API模型根据参考文本校准问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('calibrate_query_mapper', '查询校准映射器', 'Calibrate query in question-answer pairs based on reference text. 基于参考文本校准问答对中的查询。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('calibrate_response_mapper', '回复校准映射器', 'Calibrate response in question-answer pairs based on reference text. 根据参考文本校准问答对中的回答。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('chinese_convert_mapper', '中文简繁转换映射器', 'Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. 映射器在繁体、简体和日文汉字之间转换中文文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('clean_copyright_mapper', '版权清洗映射器', 'Cleans copyright comments at the beginning of text samples. 清除文本示例开头的版权注释。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('clean_email_mapper', '邮箱清洗映射器', 'Cleans email addresses from text samples using a regular expression. 使用正则表达式从文本示例中清除电子邮件地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('clean_html_mapper', 'HTML清洗映射器', 'Cleans HTML code from text samples, converting HTML to plain text. 从文本示例中清除HTML代码,将HTML转换为纯文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('clean_ip_mapper', 'IP清洗映射器', 'Cleans IPv4 and IPv6 addresses from text samples. 从文本示例中清除IPv4和IPv6地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('clean_links_mapper', '链接清洗映射器', 'Mapper to clean links like http/https/ftp in text samples. 映射器来清理链接,如文本示例中的http/https/ftp。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('detect_character_attributes_mapper', '角色属性检测映射器', 'Takes an image, a caption, and main character names as input to extract the characters'' attributes. 根据给定的图像、图像描述信息和(多个)角色名称,提取图像中主要角色的属性。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('detect_character_locations_mapper', '角色位置检测映射器', 'Given an image and a list of main character names, extract the bounding boxes for each present character. 给定一张图像和主要角色的名称列表,提取每个在场角色的边界框。(YOLOE + MLLM)', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('detect_main_character_mapper', '主要角色检测映射器', 'Extract all main character names based on the given image and its caption. 根据给定的图像及其图像描述,提取所有主要角色的名字。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('dialog_intent_detection_mapper', '对话意图检测映射器', 'Generates user''s intent labels in a dialog by analyzing the history, query, and response. 通过分析历史记录、查询和响应,在对话框中生成用户的意图标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('dialog_sentiment_detection_mapper', '对话情感检测映射器', 'Generates sentiment labels and analysis for user queries in a dialog. 在对话框中为用户查询生成情绪标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('dialog_sentiment_intensity_mapper', '对话情感强度映射器', 'Mapper to predict user''s sentiment intensity in a dialog, ranging from -5 to 5. Mapper预测用户在对话框中的情绪强度,范围从-5到5。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('dialog_topic_detection_mapper', '对话主题检测映射器', 'Generates user''s topic labels and analysis in a dialog. 在对话框中生成用户的主题标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('download_file_mapper', '文件下载映射器', 'Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('expand_macro_mapper', '宏展开映射器', 'Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_entity_attribute_mapper', '实体属性提取映射器', 'Extracts attributes for given entities from the text and stores them in the sample''s metadata. 从文本中提取给定实体的属性,并将其存储在示例的元数据中。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_entity_relation_mapper', '实体关系提取映射器', 'Extracts entities and relations from text to build a knowledge graph. 从文本中提取实体和关系以构建知识图谱。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_event_mapper', '事件提取映射器', 'Extracts events and relevant characters from the text. 从文本中提取事件和相关字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_keyword_mapper', '关键词提取映射器', 'Generate keywords for the text. 为文本生成关键字。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_nickname_mapper', '昵称提取映射器', 'Extracts nickname relationships in the text using a language model. 使用语言模型提取文本中的昵称关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_support_text_mapper', '支撑文本提取映射器', 'Extracts a supporting sub-text from the original text based on a given summary. 根据给定的摘要从原始文本中提取支持子文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('extract_tables_from_html_mapper', 'HTML表格提取映射器', 'Extracts tables from HTML content and stores them in a specified field. 从HTML内容中提取表并将其存储在指定字段中。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('fix_unicode_mapper', 'Unicode修复映射器', 'Fixes unicode errors in text samples. 修复文本示例中的unicode错误。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('generate_qa_from_examples_mapper', '示例生成QA映射器', 'Generates question and answer pairs from examples using a Hugging Face model. 使用拥抱面部模型从示例生成问题和答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('generate_qa_from_text_mapper', '文本生成QA映射器', 'Generates question and answer pairs from text using a specified model. 使用指定的模型从文本生成问题和答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('image_blur_mapper', '图像模糊映射器', 'Blurs images in the dataset with a specified probability and blur type. 使用指定的概率和模糊类型对数据集中的图像进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_captioning_from_gpt4v_mapper', 'GPT4V图像描述映射器', 'Generates text captions for images using the GPT-4 Vision model. 使用GPT-4视觉模型为图像生成文本标题。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('image_captioning_mapper', '图像描述映射器', 'Generates image captions using a Hugging Face model and appends them to samples. 使用拥抱面部模型生成图像标题,并将其附加到样本中。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('image_detection_yolo_mapper', 'YOLO图像检测映射器', 'Perform object detection using YOLO on images and return bounding boxes and class labels. 使用YOLO对图像执行对象检测,并返回边界框和类标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_diffusion_mapper', '图像扩散生成映射器', 'Generate images using a diffusion model based on provided captions. 使用基于提供的字幕的扩散模型生成图像。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('image_face_blur_mapper', '图像人脸模糊映射器', 'Mapper to blur faces detected in images. 映射器模糊图像中检测到的人脸。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_remove_background_mapper', '图像去背景映射器', 'Mapper to remove the background of images. 映射器删除图像的背景。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_segment_mapper', '图像分割映射器', 'Perform segment-anything on images and return the bounding boxes. 对图像执行segment-任何操作并返回边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('image_tagging_mapper', '图像打标映射器', 'Generates image tags for each image in the sample. 为样本中的每个图像生成图像标记。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('imgdiff_difference_area_generator_mapper', 'ImgDiff差异区域生成映射器', 'Generates and filters bounding boxes for image pairs based on similarity, segmentation, and text matching. 根据相似性、分割和文本匹配生成和过滤图像对的边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false, 'system', 'system'), + ('imgdiff_difference_caption_generator_mapper', 'ImgDiff差异描述生成映射器', 'Generates difference captions for bounding box regions in two images. 为两个图像中的边界框区域生成差异字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('mllm_mapper', 'MLLM视觉问答映射器', 'Mapper to use MLLMs for visual question answering tasks. Mapper使用MLLMs进行视觉问答任务。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('nlpaug_en_mapper', 'NLPAug英语增强映射器', 'Augments English text samples using various methods from the nlpaug library. 使用nlpaug库中的各种方法增强英语文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('nlpcda_zh_mapper', 'NLPCDA中文增强映射器', 'Augments Chinese text samples using the nlpcda library. 使用nlpcda库扩充中文文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('optimize_prompt_mapper', 'Prompt优化映射器', 'Optimize prompts based on existing ones in the same batch. 根据同一批次中的现有提示优化提示。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('optimize_qa_mapper', 'QA优化映射器', 'Mapper to optimize question-answer pairs. 映射器来优化问题-答案对。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('optimize_query_mapper', '查询优化映射器', 'Optimize queries in question-answer pairs to make them more specific and detailed. 优化问答对中的查询,使其更加具体和详细。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('optimize_response_mapper', '回复优化映射器', 'Optimize response in question-answer pairs to be more detailed and specific. 优化问答对中的响应,使其更加详细和具体。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('pair_preference_mapper', '配对偏好映射器', 'Mapper to construct paired preference samples by generating a rejected response and its reason. Mapper通过生成拒绝响应及其原因来构造成对的偏好样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('punctuation_normalization_mapper', '标点归一化映射器', 'Normalizes unicode punctuations to their English equivalents in text samples. 将unicode标点规范化为文本示例中的英语等效项。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('python_file_mapper', 'Python文件映射器', 'Executes a Python function defined in a file on input data. 对输入数据执行文件中定义的Python函数。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('python_lambda_mapper', 'Python Lambda映射器', 'Mapper for applying a Python lambda function to data samples. Mapper,用于将Python lambda函数应用于数据样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('query_intent_detection_mapper', '查询意图检测映射器', 'Predicts the user''s intent label and corresponding score for a given query. 为给定查询预测用户的意图标签和相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('query_sentiment_detection_mapper', '查询情感检测映射器', 'Predicts user''s sentiment label (''negative'', ''neutral'', ''positive'') in a query. 在查询中预测用户的情绪标签 (“负面” 、 “中性” 、 “正面”)。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('query_topic_detection_mapper', '查询主题检测映射器', 'Predicts the topic label and its corresponding score for a given query. 预测给定查询的主题标签及其相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('relation_identity_mapper', '关系识别映射器', 'Identify the relation between two entities in a given text. 确定给定文本中两个实体之间的关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_bibliography_mapper', '参考书目移除映射器', 'Removes bibliography sections at the end of LaTeX documents. 删除LaTeX文档末尾的参考书目部分。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_comments_mapper', '注释移除映射器', 'Removes comments from documents, currently supporting only ''tex'' format. 从文档中删除注释,当前仅支持 “文本” 格式。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_header_mapper', '页眉移除映射器', 'Removes headers at the beginning of documents in LaTeX samples. 删除LaTeX示例中文档开头的标题。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_long_words_mapper', '长词移除映射器', 'Mapper to remove long words within a specific range. 映射器删除特定范围内的长词。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_non_chinese_character_mapper', '非中文字符移除映射器', 'Removes non-Chinese characters from text samples. 从文本样本中删除非中文字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_repeat_sentences_mapper', '重复句移除映射器', 'Mapper to remove repeat sentences in text samples. 映射器删除文本样本中的重复句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_specific_chars_mapper', '指定字符移除映射器', 'Removes specific characters from text samples. 从文本示例中删除特定字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_table_text_mapper', '表格文本移除映射器', 'Mapper to remove table texts from text samples. 映射器从文本样本中删除表文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('remove_words_with_incorrect_substrings_mapper', '错误子串单词移除映射器', 'Mapper to remove words containing specified incorrect substrings. 映射程序删除包含指定的不正确子字符串的单词。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('replace_content_mapper', '内容替换映射器', 'Replaces content in the text that matches a specific regular expression pattern with a designated replacement string. 用指定的替换字符串替换与特定正则表达式模式匹配的文本中的内容。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('sdxl_prompt2prompt_mapper', 'SDXL Prompt2Prompt映射器', 'Generates pairs of similar images using the SDXL model. 使用SDXL模型生成成对的相似图像。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('sentence_augmentation_mapper', '句子增强映射器', 'Augments sentences by generating enhanced versions using a Hugging Face model. 通过使用拥抱面部模型生成增强版本来增强句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('sentence_split_mapper', '句子切分映射器', 'Splits text samples into individual sentences based on the specified language. 根据指定的语言将文本样本拆分为单个句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_chunk_mapper', '文本分块映射器', 'Split input text into chunks based on specified criteria. 根据指定的条件将输入文本拆分为块。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('text_tagging_by_prompt_mapper', 'Prompt文本打标映射器', 'Mapper to generate text tags using prompt with LLM. Mapper使用带有LLM的prompt生成文本标记。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('vggt_mapper', 'VGGT视频提取映射器', 'Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_captioning_from_audio_mapper', '音频生成视频描述映射器', 'Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器根据基于qwen-audio模型的音频流为视频添加字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_captioning_from_frames_mapper', '帧生成视频描述映射器', 'Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_captioning_from_summarizer_mapper', '摘要生成视频描述映射器', 'Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结几种生成的文本 (来自视频/音频/帧的字幕,来自音频/帧的标签,...) 来生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_captioning_from_video_mapper', '视频生成视频描述映射器', 'Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使用拥抱面部视频到文本模型和采样视频帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_captioning_from_vlm_mapper', 'VLM视频描述映射器', 'Generates video captions using a VLM that accepts videos as inputs. 使用接受视频作为输入的VLM生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_depth_estimation_mapper', '视频深度估计映射器', 'Perform depth estimation on the video. 对视频进行深度估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_extract_frames_mapper', '视频抽帧映射器', 'Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_face_blur_mapper', '视频人脸模糊映射器', 'Mapper to blur faces detected in videos. 映射器模糊在视频中检测到的人脸。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_ffmpeg_wrapped_mapper', '视频FFmpeg封装映射器', 'Wraps FFmpeg video filters for processing video files in a dataset. 包装FFmpeg视频过滤器,用于处理数据集中的视频文件。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_hand_reconstruction_mapper', '视频手部重建映射器', 'Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_object_segmenting_mapper', '视频对象分割映射器', 'Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). 在整个视频中对有效对象进行文本引导的语义分割 (YOLOE SAM2)。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_remove_watermark_mapper', '视频去水印映射器', 'Remove watermarks from videos based on specified regions. 根据指定区域从视频中删除水印。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_resize_aspect_ratio_mapper', '视频宽高比调整映射器', 'Resizes videos to fit within a specified aspect ratio range. 调整视频大小以适应指定的宽高比范围。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_resize_resolution_mapper', '视频分辨率调整映射器', 'Resizes video resolution based on specified width and height constraints. 根据指定的宽度和高度限制调整视频分辨率。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_split_by_duration_mapper', '视频按时长切分映射器', 'Splits videos into segments based on a specified duration. 根据指定的持续时间将视频拆分为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_split_by_key_frame_mapper', '视频关键帧切分映射器', 'Splits a video into segments based on key frames. 根据关键帧将视频分割为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_split_by_scene_mapper', '视频场景切分映射器', 'Splits videos into scene clips based on detected scene changes. 根据检测到的场景变化将视频拆分为场景剪辑。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false, 'system', 'system'), + ('video_tagging_from_audio_mapper', '音频视频打标映射器', 'Generates video tags from audio streams using the Audio Spectrogram Transformer. 使用音频频谱图转换器从音频流生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_tagging_from_frames_mapper', '帧视频打标映射器', 'Generates video tags from frames extracted from videos. 从视频中提取的帧生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('video_whole_body_pose_estimation_mapper', '视频全身姿态估计映射器', 'Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人的视频,并使用DWPose模型来提取视频中人类主体的身体、手、脚和面部关键点,即2D全身姿态估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false, 'system', 'system'), + ('whitespace_normalization_mapper', '空白字符归一化映射器', 'Normalizes various types of whitespace characters to standard spaces in text samples. 将文本样本中各种类型的空白字符规范化为标准空格。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('frequency_specified_field_selector', '频率指定字段选择器', 'Selector to filter samples based on the frequency of a specified field. 选择器根据指定字段的频率过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('random_selector', '随机选择器', 'Randomly selects a subset of samples from the dataset. 从数据集中随机选择样本子集。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('range_specified_field_selector', '范围指定字段选择器', 'Selects a range of samples based on the sorted values of a specified field. 根据指定字段的排序值选择采样范围。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('tags_specified_field_selector', '标签指定字段选择器', 'Selector to filter samples based on the tags of a specified field. 选择器根据指定字段的标签过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system'), + ('topk_specified_field_selector', 'TopK指定字段选择器', 'Selects top samples based on the sorted values of a specified field. 根据指定字段的排序值选择顶部样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false, 'system', 'system') ON CONFLICT DO NOTHING; INSERT INTO t_operator_category_relation(category_id, operator_id)