Skip to content

Commit c1af7bb

Browse files
committed
chore: enhance Markdown table handling with cell content escaping
--bug=1065801@tapd-62980211 --user=刘瑞斌 【知识库】导出的zip文件导入工作流知识库后不显示内容 https://www.tapd.cn/62980211/s/1836723
1 parent 59eaeac commit c1af7bb

File tree

2 files changed

+28
-4
lines changed

2 files changed

+28
-4
lines changed

apps/common/handle/impl/qa/md_parse_qa_handle.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"""
99
import re
1010
import traceback
11+
from typing import Any
1112

1213
from charset_normalizer import detect
1314

@@ -47,7 +48,7 @@ def parse_markdown_table(self, content):
4748
line = lines[i].strip()
4849
if not line.startswith('|'):
4950
break
50-
row = [cell.strip() for cell in line.split('|')[1:-1]]
51+
row = [self._unescape_cell_content(cell) for cell in line.split('|')[1:-1]]
5152
if len(row) > 0:
5253
table_data.append(row)
5354
i += 1
@@ -59,6 +60,11 @@ def parse_markdown_table(self, content):
5960

6061
return tables
6162

63+
def _unescape_cell_content(self, cell) -> Any:
64+
text = cell.strip().replace('|', '|')
65+
text = text.replace('|<br>|', '|\n|')
66+
return text
67+
6268
def handle(self, file, get_buffer, save_image):
6369
buffer = get_buffer(file)
6470
try:

apps/common/handle/impl/text/xlsx_split_handle.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,8 @@ def get_content(self, file, save_image):
153153
md_table = '| ' + ' | '.join(headers) + ' |\n'
154154
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
155155
for row in rows:
156-
r = [f'{value}' if value is not None else '' for key, value in row.items()]
157-
md_table += '| ' + ' | '.join(
158-
[str(cell).replace('\n', '<br>') if cell is not None else '' for cell in r]) + ' |\n'
156+
r = [self._escape_cell_content(value) for key, value in row.items()]
157+
md_table += '| ' + ' | '.join(r) + ' |\n'
159158

160159
md_tables += md_table + '\n\n'
161160

@@ -164,6 +163,25 @@ def get_content(self, file, save_image):
164163
maxkb_logger.error(f'excel split handle error: {e}')
165164
return f'error: {e}'
166165

166+
def _escape_cell_content(self, cell_value):
167+
"""转义单元格内容,避免破坏 Markdown 表格结构"""
168+
if cell_value is None:
169+
return ''
170+
171+
cell_str = str(cell_value)
172+
173+
# 替换换行符为 <br>
174+
cell_str = cell_str.replace('\n', '<br>')
175+
176+
# 转义管道符 | 为 HTML 实体
177+
cell_str = cell_str.replace('|', '&#124;')
178+
179+
# 如果内容包含反引号,需要转义
180+
if '`' in cell_str:
181+
cell_str = cell_str.replace('`', '&#96;')
182+
183+
return cell_str
184+
167185
def support(self, file, get_buffer):
168186
file_name: str = file.name.lower()
169187
if file_name.endswith(".xlsx"):

0 commit comments

Comments
 (0)