Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 22 additions & 30 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ endef
# ========== Build Targets ==========

# Valid build targets
VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu gateway label-studio
VALID_BUILD_TARGETS := frontend backend gateway database runtime backend-python deer-flow label-studio mineru mineru-910B mineru-910C mineru-310P

# Generic docker build target with service name as parameter
# Automatically prefixes image names with "datamate-" unless it's deer-flow
Expand All @@ -169,6 +169,12 @@ VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flo
@if [ "$*" = "deer-flow" ]; then \
$(call docker-build,deer-flow-backend,deer-flow-backend); \
$(call docker-build,deer-flow-frontend,deer-flow-frontend); \
elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ]; then \
wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | docker build --network=host -t datamate-mineru -f - .; \
elif [ "$*" = "mineru-910C" ]; then \
wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | sed 's/v0.11.0/v0.11.0-a3/g' | docker build --network=host -t datamate-mineru -f - .; \
elif [ "$*" = "mineru-310P" ]; then \
wget -qO - https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/npu.Dockerfile | sed 's/v0.11.0/v0.10.0rc1-310p/g' | docker build --network=host -t datamate-mineru -f - .; \
else \
$(call docker-build,$*,datamate-$*); \
fi
Expand Down Expand Up @@ -235,7 +241,7 @@ endif
# ========== Docker Install/Uninstall Targets ==========

# Valid service targets for docker install/uninstall
VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python database gateway redis mineru deer-flow milvus label-studio data-juicer dj
VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python database gateway redis deer-flow milvus label-studio data-juicer mineru mineru-910B mineru-910C mineru-310P

# Generic docker service install target
.PHONY: %-docker-install
Expand All @@ -252,9 +258,11 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile label-studio up -d; \
elif [ "$*" = "datamate" ]; then \
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml up -d; \
elif [ "$*" = "mineru" ]; then \
elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d datamate-mineru; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
elif [ "$*" = "mineru-310P" ]; then \
REGISTRY=$(REGISTRY) EXTRA_ARGS="--enforce-eager --dtype float16" docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d datamate-mineru; \
elif [ "$*" = "data-juicer" ]; then \
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile data-juicer up -d datamate-data-juicer; \
elif [ "$*" = "redis" ]; then \
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml --profile redis up -d datamate-redis; \
Expand All @@ -281,7 +289,7 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa
fi
@if [ "$*" = "label-studio" ]; then \
docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s label-studio; \
elif [ "$*" = "mineru" ]; then \
elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ] || [ "$*" = "mineru-310P" ]; then \
docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s datamate-mineru; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
docker compose -f deployment/docker/datamate/docker-compose.yml rm -f -s datamate-data-juicer; \
Expand All @@ -304,7 +312,7 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime backend-python databa
# ========== Kubernetes Install/Uninstall Targets ==========

# Valid k8s targets
VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer dj
VALID_K8S_TARGETS := datamate deer-flow milvus label-studio data-juicer mineru mineru-910B mineru-910C mineru-310P

# Generic k8s install target
.PHONY: %-k8s-install
Expand All @@ -319,8 +327,10 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d
fi
@if [ "$*" = "label-studio" ]; then \
helm upgrade label-studio deployment/helm/label-studio/ -n $(NAMESPACE) --install; \
elif [ "$*" = "mineru" ]; then \
kubectl apply -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE); \
elif [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \
kubectl apply -f deployment/kubernetes/mineru/deploy-910.yaml -n $(NAMESPACE); \
elif [ "$*" = "mineru-310P" ]; then \
kubectl apply -f deployment/kubernetes/mineru/deploy-310.yaml -n $(NAMESPACE); \
elif [ "$*" = "datamate" ]; then \
helm upgrade datamate deployment/helm/datamate/ -n $(NAMESPACE) --install --set global.image.repository=$(REGISTRY); \
elif [ "$*" = "deer-flow" ]; then \
Expand All @@ -346,8 +356,10 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d
done; \
exit 1; \
fi
@if [ "$*" = "mineru" ]; then \
kubectl delete -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE); \
@if [ "$*" = "mineru" ] || [ "$*" = "mineru-910B" ] || [ "$*" = "mineru-910C" ]; then \
kubectl delete -f deployment/kubernetes/mineru/deploy-910.yaml -n $(NAMESPACE); \
elif [ "$*" = "mineru-310P" ]; then \
kubectl delete -f deployment/kubernetes/mineru/deploy-310.yaml -n $(NAMESPACE); \
elif [ "$*" = "datamate" ]; then \
helm uninstall datamate -n $(NAMESPACE) --ignore-not-found; \
elif [ "$*" = "deer-flow" ]; then \
Expand All @@ -360,26 +372,6 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer d
kubectl delete -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \
fi

# ========== Upgrade Targets ==========

# Valid upgrade targets
VALID_UPGRADE_TARGETS := datamate

# Generic docker upgrade target
.PHONY: %-docker-upgrade
%-docker-upgrade:
@if ! echo " $(VALID_UPGRADE_TARGETS) " | grep -q " $* "; then \
echo "Error: Unknown upgrade target '$*'"; \
echo "Valid upgrade targets are:"; \
for target in $(VALID_UPGRADE_TARGETS); do \
echo " - $$target"; \
done; \
exit 1; \
fi
@if [ "$*" = "datamate" ]; then \
docker compose -f deployment/docker/datamate/docker-compose.yml --profile mineru up -d --force-recreate --remove-orphans; \
fi

# ========== Download Targets ==========

# List of all images to download
Expand Down
7 changes: 3 additions & 4 deletions deployment/docker/datamate/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ services:
image: ${REGISTRY:-}datamate-gateway
restart: on-failure
privileged: true
ports:
- '8080:8080'
environment:
- JWT_SECRET=default-insecure-key-change-in-production
networks: [ datamate ]
Expand Down Expand Up @@ -116,9 +118,8 @@ services:
--engine vllm
--host 0.0.0.0
--port 8000
${EXTRA_ARGS:-}
volumes:
- dataset_volume:/dataset
- mineru_log_volume:/var/log/datamate/mineru
- /var/log/npu/:/usr/slog
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
Expand Down Expand Up @@ -334,8 +335,6 @@ volumes:
name: datamate-operator-runtime-volume
operator-packages-volume:
name: datamate-operator-packages-volume
mineru_log_volume:
name: datamate-mineru_log_volume
graph_data_volume:
name: datamate-graph-data-volume

Expand Down
71 changes: 71 additions & 0 deletions deployment/kubernetes/mineru/deploy-310.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# MinerU inference server for Ascend 310P nodes.
# Defines a single-replica Deployment that runs `mineru-openai-server` and a
# ClusterIP Service that exposes it on port 8000 under the name
# `datamate-mineru`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: datamate-mineru
  labels:
    app: datamate
    tier: mineru
spec:
  replicas: 1
  selector:
    # Must match the pod template labels below.
    matchLabels:
      app: datamate
      tier: mineru
  template:
    metadata:
      labels:
        app: datamate
        tier: mineru
    spec:
      containers:
        - name: mineru
          # No registry prefix and IfNotPresent: a locally present image
          # named `datamate-mineru` is used when one exists on the node.
          image: datamate-mineru
          imagePullPolicy: IfNotPresent
          command:
            - mineru-openai-server
          args:
            - --engine
            - vllm
            - --host
            - 0.0.0.0
            - --port
            # Quoted so the port is passed as a string argument.
            - "8000"
            # Extra flags used only by the 310P variant of this manifest.
            - --enforce-eager
            - --dtype
            - float16
          env:
            - name: MINERU_MODEL_SOURCE
              value: local
            - name: MINERU_DEVICE_MODE
              value: npu
            - name: VLLM_WORKER_MULTIPROC_METHOD
              value: spawn
          ports:
            - containerPort: 8000
          resources:
            limits:
              cpu: 8
              memory: 32Gi
              # One Ascend 310P device; the request below names the same count.
              huawei.com/Ascend310P: 1
            requests:
              cpu: 100m
              memory: 100Mi
              huawei.com/Ascend310P: 1

---
# In-cluster endpoint for the Deployment above, reachable at
# datamate-mineru:8000 from the same namespace.
apiVersion: v1
kind: Service
metadata:
  name: datamate-mineru
  labels:
    app: datamate
    tier: mineru
spec:
  type: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000
      protocol: TCP
  # Routes to pods carrying the same labels as the Deployment's pod template.
  selector:
    app: datamate
    tier: mineru
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,6 @@ spec:
cpu: 100m
memory: 100Mi
huawei.com/Ascend910: 1
volumeMounts:
- name: dataset-volume
mountPath: /dataset
- name: log-volume
mountPath: /var/log/datamate/mineru
subPath: mineru
volumes:
- name: dataset-volume
persistentVolumeClaim:
claimName: datamate-dataset-pvc
- name: log-volume
persistentVolumeClaim:
claimName: datamate-log-pvc

---
apiVersion: v1
Expand Down
112 changes: 0 additions & 112 deletions runtime/mineru/mineru_api.py

This file was deleted.

6 changes: 6 additions & 0 deletions runtime/ops/formatter/mineru_formatter/metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ effect:
inputs: 'text'
outputs: 'text'
settings:
mineruApi:
name: 'Mineru Api地址'
description: '指定mineru服务的api地址,默认为本地同一集群内地址。'
type: 'input'
defaultVal: 'http://datamate-mineru:8000'
required: false
exportType:
name: '导出类型'
description: '指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。'
Expand Down
2 changes: 1 addition & 1 deletion runtime/ops/formatter/mineru_formatter/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class MineruFormatter(Mapper):

def __init__(self, *args, **kwargs):
super(MineruFormatter, self).__init__(*args, **kwargs)
self.server_url = "http://datamate-mineru:8000"
self.server_url = kwargs.get("mineruApi", "http://datamate-mineru:8000")
self.backend = "vlm-http-client"
self.output_dir = "/dataset/outputs"
self.max_retries = 3
Expand Down
2 changes: 1 addition & 1 deletion scripts/db/data-operator-init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ ON CONFLICT DO NOTHING;

INSERT INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false),
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"mineruApi":{"name":"Mineru Api地址","description":"指定mineru服务的api地址,默认为本地同一集群内地址。","type":"input","defaultVal":"http://datamate-mineru:8000","required":false},"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
Expand Down
Loading
Loading