diff --git a/Makefile b/Makefile index fb3bfff5f..772afa19b 100644 --- a/Makefile +++ b/Makefile @@ -152,7 +152,7 @@ endef # ========== Build Targets ========== # Valid build targets -VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu gateway label-studio +VALID_BUILD_TARGETS := frontend backend gateway database runtime backend-python deer-flow label-studio mineru mineru-910B mineru-910C mineru-310P # Generic docker build target with service name as parameter # Automatically prefixes image names with "datamate-" unless it's deer-flow @@ -169,6 +169,12 @@ VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flo @if [ "$*" = "deer-flow" ]; then \ $(call docker-build,deer-flow-backend,deer-flow-backend); \ $(call docker-build,deer-flow-frontend,deer-flow-frontend); \ + elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ]; then \ + wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | docker build --network=host -t datamate-mineru -f - .; \ + elif [ "$*" = "mineru-910C" ]; then \ + wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | sed 's/v0.11.0/v0.11.0-a3/g' | docker build --network=host -t datamate-mineru -f - .; \ + elif [ "$*" = "mineru-310P" ]; then \ + wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | sed 's/v0.11.0/v0.10.0rc1-310p/g' | docker build --network=host -t datamate-mineru -f - .; \ else \ $(call docker-build,$*,datamate-$*); \ fi @@ -235,7 +241,7 @@ endif # ========== Docker Install/Uninstall Targets ========== # Valid service targets for docker install/uninstall -VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python database gateway redis mineru deer-flow milvus label-studio data-juicer dj +VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python database gateway redis deer-flow milvus label-studio data-juicer mineru mineru-910B mineru-910C mineru-310P # Generic docker service install target .PHONY: %-docker-install @@ -252,9 +258,11 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile label-studio up -d; \ elif [ "$*" = "datamate" ]; then \ REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml up -d; \ - elif [ "$*" = "mineru" ]; then \ + elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \ REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d datamate-mineru; \ - elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \ + elif [ "$*" = "mineru-310P" ]; then \ + REGISTRY=$(REGISTRY) EXTRA_ARGS="--enforce-eager --dtype float16" docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d datamate-mineru; \ + elif [ "$*" = "data-juicer" ]; then \ REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile data-juicer up -d datamate-data-juicer; \ elif [ "$*" = "redis" ]; then \ REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile redis up -d datamate-redis; \ @@ -281,7 +289,7 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa fi @if [ "$*" = "label-studio" ]; then \ docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s label-studio; \ - elif [ "$*" = "mineru" ]; then \ + elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ] || [ "$*" = "mineru-310P" ]; then \ docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s datamate-mineru; \ elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \ docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s datamate-data-juicer; \ @@ -304,7 +312,7 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa # ========== Kubernetes Install/Uninstall Targets ========== # Valid k8s targets -VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer dj +VALID_K8S_TARGETS := datamate deer-flow milvus label-studio data-juicer mineru mineru-910B mineru-910C mineru-310P # Generic k8s install target .PHONY: %-k8s-install @@ -319,8 +327,10 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d fi @if [ "$*" = "label-studio" ]; then \ helm upgrade label-studio deployment/helm/label-studio/ -n $(NAMESPACE) --install; \ - elif [ "$*" = "mineru" ]; then \ - kubectl apply -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE); \ + elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \ + kubectl apply -f deployment/kubernetes/mineru/deploy-910.yaml -n $(NAMESPACE); \ + elif [ "$*" = "mineru-310P" ]; then \ + kubectl apply -f deployment/kubernetes/mineru/deploy-310.yaml -n $(NAMESPACE); \ elif [ "$*" = "datamate" ]; then \ helm upgrade datamate deployment/helm/datamate/ -n $(NAMESPACE) --install --set global.image.repository=$(REGISTRY); \ elif [ "$*" = "deer-flow" ]; then \ @@ -346,8 +356,10 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d done; \ exit 1; \ fi - @if [ "$*" = "mineru" ]; then \ - kubectl delete -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE); \ + @if [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \ + kubectl delete -f deployment/kubernetes/mineru/deploy-910.yaml -n $(NAMESPACE); \ + elif [ "$*" = "mineru-310P" ]; then \ + kubectl delete -f deployment/kubernetes/mineru/deploy-310.yaml -n $(NAMESPACE); \ elif [ "$*" = "datamate" ]; then \ helm uninstall datamate -n $(NAMESPACE) --ignore-not-found; \ elif [ "$*" = "deer-flow" ]; then \ @@ -360,26 +372,6 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d kubectl delete -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \ fi -# ========== Upgrade Targets ========== - -# Valid upgrade targets -VALID_UPGRADE_TARGETS := datamate - -# Generic docker upgrade target -.PHONY: %-docker-upgrade -%-docker-upgrade: - @if ! echo " $(VALID_UPGRADE_TARGETS) " | grep -q " $* "; then \ - echo "Error: Unknown upgrade target '$*'"; \ - echo "Valid upgrade targets are:"; \ - for target in $(VALID_UPGRADE_TARGETS); do \ - echo " - $$target"; \ - done; \ - exit 1; \ - fi - @if [ "$*" = "datamate" ]; then \ - docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d --force-recreate --remove-orphans; \ - fi - # ========== Download Targets ========== # List of all images to download diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml index 932dcd5a0..6da18d02e 100644 --- a/deployment/docker/datamate/docker-compose.yml +++ b/deployment/docker/datamate/docker-compose.yml @@ -42,6 +42,8 @@ services: image: ${REGISTRY:-}datamate-gateway restart: on-failure privileged: true + ports: + - '8080:8080' environment: - JWT_SECRET=default-insecure-key-change-in-production networks: [ datamate ] @@ -116,9 +118,8 @@ services: --engine vllm --host 0.0.0.0 --port 8000 + ${EXTRA_ARGS:-} volumes: - - dataset_volume:/dataset - - mineru_log_volume:/var/log/datamate/mineru - /var/log/npu/:/usr/slog - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi @@ -334,8 +335,6 @@ volumes: name: datamate-operator-runtime-volume operator-packages-volume: name: datamate-operator-packages-volume - mineru_log_volume: - name: datamate-mineru_log_volume graph_data_volume: name: datamate-graph-data-volume diff --git a/deployment/kubernetes/mineru/deploy-310.yaml b/deployment/kubernetes/mineru/deploy-310.yaml new file mode 100644 index 000000000..c914b799b --- /dev/null +++ b/deployment/kubernetes/mineru/deploy-310.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: datamate-mineru + labels: + app: datamate + tier: mineru +spec: + replicas: 1 + selector: + matchLabels: + app: datamate + tier: mineru + template: + metadata: + labels: + app: datamate + tier: mineru + spec: + containers: + - name: mineru + image: datamate-mineru + imagePullPolicy: IfNotPresent + command: + - mineru-openai-server + args: + - --engine + - vllm + - --host + - 0.0.0.0 + - --port + - "8000" + - --enforce-eager + - --dtype + - float16 + env: + - name: MINERU_MODEL_SOURCE + value: local + - name: MINERU_DEVICE_MODE + value: npu + - name: VLLM_WORKER_MULTIPROC_METHOD + value: spawn + ports: + - containerPort: 8000 + resources: + limits: + cpu: 8 + memory: 32Gi + huawei.com/Ascend310P: 1 + requests: + cpu: 100m + memory: 100Mi + huawei.com/Ascend310P: 1 + +--- +apiVersion: v1 +kind: Service +metadata: + name: datamate-mineru + labels: + app: datamate + tier: mineru +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP + selector: + app: datamate + tier: mineru diff --git a/deployment/kubernetes/mineru/deploy.yaml b/deployment/kubernetes/mineru/deploy-910.yaml similarity index 76% rename from deployment/kubernetes/mineru/deploy.yaml rename to deployment/kubernetes/mineru/deploy-910.yaml index c4b1491f7..c031f1e37 100644 --- a/deployment/kubernetes/mineru/deploy.yaml +++ b/deployment/kubernetes/mineru/deploy-910.yaml @@ -48,19 +48,6 @@ spec: cpu: 100m memory: 100Mi huawei.com/Ascend910: 1 - volumeMounts: - - name: dataset-volume - mountPath: /dataset - - name: log-volume - mountPath: /var/log/datamate/mineru - subPath: mineru - volumes: - - name: dataset-volume - persistentVolumeClaim: - claimName: datamate-dataset-pvc - - name: log-volume - persistentVolumeClaim: - claimName: datamate-log-pvc --- apiVersion: v1 diff --git a/runtime/mineru/mineru_api.py b/runtime/mineru/mineru_api.py deleted file mode 100644 index 5ad9c1855..000000000 --- a/runtime/mineru/mineru_api.py +++ /dev/null @@ -1,112 +0,0 @@ -import shutil -import time -import uuid -import os - -import click -import uvicorn -from pydantic import BaseModel -from pathlib import Path -from fastapi import FastAPI -from fastapi.responses import JSONResponse -from loguru import logger -from mineru.cli.common import aio_do_parse, read_fn -from mineru.cli.fast_api import get_infer_result - -# 日志配置 -LOG_DIR = "/var/log/datamate/mineru" -os.makedirs(LOG_DIR, exist_ok=True) -logger.add( - f"{LOG_DIR}/mineru.log", - format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}", - level="DEBUG", - enqueue=True -) - -app = FastAPI() -class PDFParseRequest(BaseModel): - source_path: str - export_path: str - -@app.post(path="/api/pdf-extract") -async def parse_pdf(request: PDFParseRequest): - try: - start = time.time() - # 创建唯一的输出目录 - unique_id = str(uuid.uuid4()) - unique_dir = os.path.join(request.export_path, unique_id) - os.makedirs(unique_dir, exist_ok=True) - - # 如果是PDF,使用read_fn处理 - file_path = Path(request.source_path) - file_suffix = file_path.suffix.lower() - if file_suffix == ".pdf": - try: - pdf_bytes = read_fn(file_path) - pdf_name = file_path.stem - pdf_bytes_list = [pdf_bytes] - pdf_file_names = [pdf_name] - except Exception as e: - return JSONResponse( - status_code=400, - content={"error": f"Failed to load file: {str(e)}"} - ) - else: - return JSONResponse( - status_code=400, - content={"error": f"Unsupported file type: {file_suffix}"} - ) - - # 调用异步处理函数 - await aio_do_parse( - output_dir=unique_dir, - pdf_file_names=pdf_file_names, - pdf_bytes_list=pdf_bytes_list, - p_lang_list=["ch"], - f_draw_layout_bbox=False, - f_draw_span_bbox=False, - f_dump_orig_pdf=False, - ) - - if os.getenv("MINERU_BACKEND_MODE").startswith("pipeline"): - parse_dir = os.path.join(unique_dir, pdf_name, "auto") - else: - parse_dir = os.path.join(unique_dir, pdf_name, "vlm") - - content = "" - if os.path.exists(parse_dir): - content = get_infer_result(".md", pdf_name, parse_dir) - - if os.path.exists(unique_dir): - try: - shutil.rmtree(unique_dir) - except Exception as e: - logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}") - - logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s") - - return JSONResponse(status_code=200, content={"result": content}) - except Exception as e: - logger.exception(e) - return JSONResponse( - status_code=500, - content={"error": f"Failed to process file: {str(e)}"} - ) - - -@click.command() -@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.') -@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 8082.') -def main(ip, port): - """Create API for Submitting Job to MinerU""" - logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}") - uvicorn.run( - app, - host=ip, - port=port - ) - - -if __name__ == "__main__": - main() - diff --git a/runtime/ops/formatter/mineru_formatter/metadata.yml b/runtime/ops/formatter/mineru_formatter/metadata.yml index 81af9ec97..3211c7789 100644 --- a/runtime/ops/formatter/mineru_formatter/metadata.yml +++ b/runtime/ops/formatter/mineru_formatter/metadata.yml @@ -15,6 +15,12 @@ effect: inputs: 'text' outputs: 'text' settings: + mineruApi: + name: 'Mineru Api地址' + description: '指定mineru服务的api地址,默认为本地同一集群内地址。' + type: 'input' + defaultVal: 'http://datamate-mineru:8000' + required: false exportType: name: '导出类型' description: '指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。' diff --git a/runtime/ops/formatter/mineru_formatter/process.py b/runtime/ops/formatter/mineru_formatter/process.py index ffa9356f7..db3de63ae 100644 --- a/runtime/ops/formatter/mineru_formatter/process.py +++ b/runtime/ops/formatter/mineru_formatter/process.py @@ -27,7 +27,7 @@ class MineruFormatter(Mapper): def __init__(self, *args, **kwargs): super(MineruFormatter, self).__init__(*args, **kwargs) - self.server_url = "http://datamate-mineru:8000" + self.server_url = kwargs.get("mineruApi", "http://datamate-mineru:8000") self.backend = "vlm-http-client" self.output_dir = "/dataset/outputs" self.max_retries = 3 diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index 5f2e4d4b7..9bf6ccd17 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -134,7 +134,7 @@ ON CONFLICT DO NOTHING; INSERT INTO t_operator (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star) -VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false), +VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"mineruApi":{"name":"Mineru Api地址","description":"指定mineru服务的api地址,默认为本地同一集群内地址。","type":"input","defaultVal":"http://datamate-mineru:8000","required":false},"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false), ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'), ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'), ('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'), diff --git a/scripts/images/mineru-npu/Dockerfile b/scripts/images/mineru-npu/Dockerfile deleted file mode 100644 index f80721ed9..000000000 --- a/scripts/images/mineru-npu/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -# 基础镜像配置 vLLM 或 LMDeploy ,请根据实际需要选择其中一个,要求 ARM(AArch64) CPU + Ascend NPU。 -# Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU. -FROM quay.io/ascend/vllm-ascend:v0.11.0rc2 -# Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU. -# FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2 - - -# Install libgl for opencv support & Noto fonts for Chinese characters -RUN apt-get update && \ - apt-get install -y \ - fonts-noto-core \ - fonts-noto-cjk \ - fontconfig \ - libgl1 \ - libglib2.0-0 && \ - fc-cache -fv && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install mineru latest -RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \ - python3 -m pip install 'mineru[core]>=2.6.5' \ - numpy==1.26.4 \ - opencv-python==4.11.0.86 \ - -i https://mirrors.aliyun.com/pypi/simple && \ - python3 -m pip cache purge - -# Download models and update the configuration file -RUN TORCH_DEVICE_BACKEND_AUTOLOAD=0 /bin/bash -c "mineru-models-download -s modelscope -m all" - -# Set the entry point to activate the virtual environment and run the command line tool -ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"] \ No newline at end of file diff --git a/scripts/images/mineru/Dockerfile b/scripts/images/mineru/Dockerfile deleted file mode 100644 index bb6166d94..000000000 --- a/scripts/images/mineru/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM python:3.11-slim - -COPY runtime/mineru /opt/runtime/datamate/mineru - -RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \ - && apt-get update \ - && apt-get install -y curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \ - pip config --user set global.trusted-host mirrors.huaweicloud.com && \ - pip install --upgrade setuptools && \ - pip install -U 'mineru[core]==2.5.4' --break-system-packages && \ - pip cache purge - -ENV CURL_CA_BUNDLE="" -ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0 - -RUN /bin/bash -c "mineru-models-download -s modelscope -m all" - -ENV MINERU_MODEL_SOURCE=local