diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..aac8e09
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,12 @@
+[flake8]
+max-line-length = 120
+max-complexity = 12
+select = E,F,W,C90
+extend-ignore = F403,F405
+exclude =
+ .git,
+ __pycache__,
+ venv,
+ build,
+ dist,
+ sdiff.egg-info
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bf9a1e8
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: CI
+
+on:
+ workflow_dispatch:
+ pull_request:
+ types: [opened, synchronize, reopened, ready_for_review]
+ push:
+ branches: [master]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+ cache: "pip"
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[tests]
+
+ - name: Format check
+ run: python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+
+ - name: Lint
+ run: python -m flake8 --config .flake8 sdiff tests
+
+ - name: Test
+ run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005
+
+ - name: Coverage report
+ run: python -m coverage report -m
diff --git a/.husky/pre-commit b/.husky/pre-commit
new file mode 100755
index 0000000..bc7696e
--- /dev/null
+++ b/.husky/pre-commit
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+. "$(dirname -- "$0")/_/husky.sh"
+
+python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+python -m flake8 --config .flake8 sdiff tests
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index df31221..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-language: python
-dist: jammy
-python:
- - "3.11"
-# command to install dependencies
-install:
- - make dev
-# command to run tests
-script:
- - make test
- - make coverage
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..526549e
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,32 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases.
+
+## Build, Test, and Development Commands
+- `make env` creates the local `venv/` (Python 3.11+).
+- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv.
+- `make test` runs linting and the full pytest suite with coverage.
+- `make vtest` runs pytest verbosely.
+- `make flake` runs the autopep8 format check and flake8 on `sdiff/` and `tests/`.
+- `make format` applies autopep8 formatting to `sdiff/` and `tests/`.
+- `make cov` prints the coverage report.
+- `make clean` removes build artifacts and the venv.
+- `make hooks` installs Husky git hooks (requires Node/npm; `make dev` runs this).
+
+Lint parity: CI and the Husky pre-commit hook both run the same checks as `make flake` (autopep8 check + flake8). Run `make flake` or `make test` locally to mirror CI.
+
+Example flow:
+```sh
+make dev
+make test
+```
+
+## Coding Style & Naming Conventions
+Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. Flake8 enforces a 120-character line limit (see `.flake8`). `autopep8` is available for formatting. Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_*.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (current config fails under 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/Makefile b/Makefile
index 6eeb1e2..4be00c9 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ env:
dev: env update
$(PIP) install .[tests,devtools]
+ @$(MAKE) hooks
install: env update
@@ -28,8 +29,20 @@ publish:
$(TWINE) upload --verbose --sign --username developer --repository-url http://$(PYPICLOUD_HOST)/simple/ dist/*.whl
flake:
+ $(PYTHON) -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
$(FLAKE) sdiff tests
+format:
+ $(PYTHON) -m autopep8 --in-place --max-line-length 120 -r sdiff tests
+
+hooks:
+ @if command -v npm >/dev/null 2>&1; then \
+ npm install --no-package-lock --silent; \
+ npm run --silent prepare; \
+ else \
+ echo "npm not found; skipping husky install"; \
+ fi
+
test: flake
$(COVERAGE) run -m pytest $(TEST_RUNNER_FLAGS)
@@ -57,4 +70,4 @@ clean:
rm -rf venv
-.PHONY: all build env linux run pep test vtest testloop cov clean
+.PHONY: all build env linux run pep test vtest testloop cov clean hooks format
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
# md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0]) # "There is a missing element `li`."
+```
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`.
+
+## One-off usage
+```sh
+python - <<'PY'
+from sdiff import diff, TextRenderer
+
+left = open("left.md", "r", encoding="utf-8").read()
+right = open("right.md", "r", encoding="utf-8").read()
+_, _, errors = diff(left, right, renderer=TextRenderer())
+
+for err in errors:
+ print(err)
+PY
+```
+
+## Notes
+This project is a library (no CLI). If you need different token handling, you can provide a custom parser class that extends `MdParser`.
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..d682872
--- /dev/null
+++ b/package.json
@@ -0,0 +1,10 @@
+{
+ "name": "html-structure-diff",
+ "private": true,
+ "devDependencies": {
+ "husky": "^9.0.0"
+ },
+ "scripts": {
+ "prepare": "husky install"
+ }
+}
diff --git a/requirements.txt b/requirements.txt
index 1f202e5..a234623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-mistune==0.8.1
+mistune==3.2.0
diff --git a/sdiff/__init__.py b/sdiff/__init__.py
index 853d12c..85b6af4 100644
--- a/sdiff/__init__.py
+++ b/sdiff/__init__.py
@@ -4,13 +4,21 @@
def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParser):
+ """Compare two Markdown strings by structure and return rendered outputs + errors.
+
+ Args:
+ md1: Left Markdown string.
+ md2: Right Markdown string.
+ renderer: Renderer instance used to format the output (TextRenderer by default).
+ parser_cls: Parser class to use (MdParser by default).
+
+ Returns:
+ (rendered_left, rendered_right, errors)
+ """
tree1 = parse(md1, parser_cls)
tree2 = parse(md2, parser_cls)
tree1, tree2, struct_errors = diff_struct(tree1, tree2)
- # tree1, tree2, links_errors = diff_links(tree1, tree2)
-
- # errors = struct_errors + links_errors
errors = struct_errors
return renderer.render(tree1), renderer.render(tree2), errors
diff --git a/sdiff/compare.py b/sdiff/compare.py
index 5958ada..34d75ca 100644
--- a/sdiff/compare.py
+++ b/sdiff/compare.py
@@ -44,8 +44,10 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None):
def diff_links(tree1, tree2):
+ """Diff only link-relevant structure (paragraphs/headers/lists/links)."""
return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a'])
def diff_struct(tree1, tree2):
+ """Diff overall structure, ignoring link and image content."""
return _diff(tree1, tree2, exclude_symbols=['a', 'i'])
diff --git a/sdiff/parser.py b/sdiff/parser.py
index 93a4736..112831b 100644
--- a/sdiff/parser.py
+++ b/sdiff/parser.py
@@ -1,207 +1,508 @@
-from re import Match
-
-import mistune
import re
+import textwrap
+from typing import Iterable
-from .model import *
-
-
-class InlineLexer(mistune.BlockLexer):
- grammar_class = mistune.InlineGrammar
-
- default_rules = [
- 'linebreak', 'link',
- 'reflink', 'text',
- ]
-
- def __init__(self):
- self.links = {}
- self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)')
- super().__init__()
-
- def parse_autolink(self, m):
- self.tokens.append(Link(m.group(0)))
-
- def parse_url(self, m):
- self.tokens.append(Link(m.group(0)))
-
- def parse_link(self, m):
- return self._process_link(m)
-
- def parse_reflink(self, m):
- # TODO skip this check for now
- # key = mistune._keyify(m.group(2) or m.group(1))
- # if key not in self.links:
- # return None
- # ret = self.links[key]
- return self._process_link(m)
-
- def _process_link(self, m):
- line = m.group(0)
- if line[0] == '!':
- node = Image(line)
- else:
- node = Link(line)
-
- self.tokens.append(node)
+import mistune
+from mistune import block_parser
- def parse_linebreak(self, m):
- node = NewLine()
- self.tokens.append(node)
+from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root,
+ Text, Header, ZendeskHelpCallout, ZendeskHelpSteps,
+ ZendeskHelpTabs)
- def parse_text(self, m):
- text = m.group(0)
- if text.strip():
- escaped_text = mistune.escape(text)
- node = Text(escaped_text)
- self.tokens.append(node)
+_BLOCK_TAGS = {tag.lower() for tag in block_parser.BLOCK_TAGS}
+_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)')
+_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\s*\[[^\]]*\]')
+_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+')
+_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})')
+_INLINE_MARKERS = {
+ 'strong': '**',
+ 'emphasis': '*',
+ 'strikethrough': '~~',
+}
-class MdParser(mistune.BlockLexer):
- default_rules = [
- 'newline', 'list_block', 'block_html',
- 'heading', 'lheading',
- 'paragraph', 'text',
- ]
+class MdParser:
+ """Markdown parser that builds a lightweight structural tree.
- list_rules = (
- 'newline', 'heading', 'lheading',
- 'hrule', 'list_block', 'text',
- )
+ Uses Mistune AST tokens to build sdiff Node objects.
+ """
+ list_rules = None
@classmethod
def get_lexer(cls):
return cls()
def __init__(self):
- super().__init__()
- self.grammar_class.block_html = re.compile(
- r'^\s* *(?:{}|{}|{}) *(?:\n{{1,}}|\s*$)'.format(
- r'<!--[\s\S]*?-->',
- r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr),
- r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr),
- )
- )
-
- def _parse_inline(self, text):
- inline = InlineLexer()
- return inline.parse(text)
-
- def parse_newline(self, m):
- length = len(m.group(0))
- if length > 1:
- self.tokens.append(NewLine())
-
- def parse_heading(self, m):
- level = len(m.group(1))
- node = Header(level)
- node.add_nodes(self._parse_inline(m.group(2)))
- self.tokens.append(node)
-
- def parse_lheading(self, m):
- level = 1 if m.group(2) == '=' else 2
- text = m.group(1)
- node = Header(level)
- node.add_nodes(self._parse_inline(text))
- self.tokens.append(node)
-
- def parse_block_html(self, m):
- text = m.group(0)
- html = Html(text)
- self.tokens.append(html)
-
- def parse_paragraph(self, m):
- text = m.group(1).rstrip('\n')
- node = Paragraph()
- node.add_nodes(self._parse_inline(text))
- self.tokens.append(node)
-
- def parse_text(self, m):
- text = m.group(0)
- escaped_text = mistune.escape(text)
- node = Text(escaped_text)
- self.tokens.append(node)
-
- def parse_list_block(self, m):
- bull = m.group(2)
- cap = m.group(0)
- ordered = '.' in bull
- node = List(ordered)
- node.add_nodes(self._process_list_item(cap, bull))
- self.tokens.append(node)
-
- def _process_list_item(self, cap, bull):
- result = []
- cap = self.rules.list_item.findall(cap)
-
- _next = False
- length = len(cap)
-
- for i in range(length):
- item = cap[i][0]
-
- # remove the bullet
- space = len(item)
- item = self.rules.list_bullet.sub('', item)
-
- # outdent
- if '\n ' in item:
- space = space - len(item)
- pattern = re.compile(r'^ {1,%d}' % space, flags=re.M)
- item = pattern.sub('', item)
-
- # determine whether item is loose or not
- loose = _next
- if not loose and re.search(r'\n\n(?!\s*$)', item):
- loose = True
-
- rest = len(item)
- if i != length - 1 and rest:
- _next = item[rest - 1] == '\n'
- if not loose:
- loose = _next
-
- node = ListItem()
- block_lexer = self.get_lexer()
- nodes = block_lexer.parse(item, self.list_rules)
- node.add_nodes(nodes)
- result.append(node)
- return result
+ self._markdown = mistune.create_markdown(renderer='ast')
+ self._reference_definitions = {}
+
+ def parse(self, text, rules=None):
+ """Parse Markdown text into a list of Node objects.
+
+ Args:
+ text: Markdown string.
+ rules: Optional rules argument kept for compatibility.
+
+ Returns:
+ list[Node]
+ """
+ tokens = self._markdown(text)
+ return self._convert_block_tokens(tokens)
+
+ def _set_reference_definitions(self, definitions):
+ self._reference_definitions = definitions
+
+ def _convert_block_tokens(self, tokens: Iterable[dict]):
+ nodes = []
+ for token in tokens:
+ nodes.extend(self._convert_block_token(token))
+ return nodes
+
+ def _convert_block_token(self, token):
+ token_type = token.get('type')
+ if token_type == 'paragraph':
+ return [self._convert_paragraph_or_heading(token.get('children', []))]
+ if token_type == 'heading':
+ return [self._convert_heading(token)]
+ if token_type == 'list':
+ return [self._convert_list(token)]
+ if token_type == 'list_item':
+ return [self._convert_list_item(token)]
+ if token_type == 'block_text':
+ return [self._convert_paragraph_or_heading(token.get('children', []))]
+ if token_type == 'block_html':
+ return self._convert_block_html(token)
+ if token_type == 'block_quote':
+ return self._convert_block_quote(token)
+ if token_type == 'block_code':
+ return self._convert_block_code(token)
+ if token_type == 'thematic_break':
+ return self._convert_passthrough_block(token)
+ return self._convert_passthrough_block(token)
+
+ def _convert_heading(self, token):
+ level = token.get('level') or token.get('attrs', {}).get('level', 1)
+ header = Header(level)
+ header.add_nodes(self._convert_inline_tokens(token.get('children', [])))
+ return header
+
+ def _convert_list(self, token):
+ ordered = token.get('ordered')
+ if ordered is None:
+ ordered = token.get('attrs', {}).get('ordered', False)
+ list_node = List(bool(ordered))
+ for item in token.get('children', []):
+ list_node.add_node(self._convert_list_item(item))
+ return list_node
+
+ def _convert_block_html(self, token):
+ raw = token.get('raw', '')
+ if _is_block_html(raw):
+ return [Html(raw)]
+ text = mistune.escape(raw)
+ if text.strip():
+ return [Paragraph([Text(text)])]
+ return []
+
+ def _convert_passthrough_block(self, token):
+ child_nodes = self._convert_block_tokens(token.get('children', []))
+ if child_nodes:
+ return child_nodes
+ raw = token.get('raw') or token.get('text') or ''
+ if raw.strip():
+ return [Paragraph([Text(mistune.escape(raw))])]
+ return []
+
+ def _convert_block_quote(self, token):
+ children = token.get('children', [])
+ if not children:
+ return []
+ content = self._render_inline_children(children)
+ if not content.strip():
+ return []
+ lines = content.splitlines()
+ quoted = '\n'.join([f'> {line}' if line.strip() else '>' for line in lines])
+ return [Paragraph([Text(mistune.escape(quoted))])]
+
+ def _convert_block_code(self, token):
+ raw = token.get('raw') or ''
+ marker = token.get('marker') or '```'
+ fence = marker if marker else '```'
+ content = raw.rstrip('\n')
+ code_block = f'{fence}\n{content}\n{fence}'
+ return [Paragraph([Text(mistune.escape(code_block))])]
+
+ def _render_inline_children(self, children):
+ parts = []
+ for child in children:
+ child_type = child.get('type')
+ if child_type in {'paragraph', 'block_text'}:
+ parts.append(self._flatten_inline_text(child.get('children', [])))
+ else:
+ raw = child.get('raw') or child.get('text') or ''
+ if raw:
+ parts.append(raw)
+ return '\n'.join([part for part in parts if part is not None])
+
+ def _convert_list_item(self, token):
+ item = ListItem()
+ for child in token.get('children', []):
+ child_type = child.get('type')
+ if child_type in {'block_text', 'paragraph'}:
+ item.add_nodes(self._convert_list_block_nodes(child.get('children', [])))
+ else:
+ item.add_nodes(self._convert_block_tokens([child]))
+ return item
+
+ def _convert_inline_tokens(self, tokens: Iterable[dict]):
+ nodes = []
+ buffer = ''
+
+ def flush_buffer():
+ nonlocal buffer
+ if buffer:
+ self._split_reference_links(buffer, nodes)
+ buffer = ''
+
+ handlers = {
+ 'text': self._handle_inline_text,
+ 'inline_html': self._handle_inline_text,
+ 'block_html': self._handle_inline_text,
+ 'codespan': self._handle_inline_codespan,
+ 'softbreak': self._handle_inline_softbreak,
+ 'linebreak': self._handle_inline_linebreak,
+ 'link': self._handle_inline_link,
+ 'image': self._handle_inline_image,
+ 'strong': self._handle_inline_marker,
+ 'emphasis': self._handle_inline_marker,
+ 'strikethrough': self._handle_inline_marker,
+ }
+
+ for token in tokens:
+ token_type = token.get('type')
+ handler = handlers.get(token_type)
+ if handler:
+ buffer = handler(token, nodes, buffer, flush_buffer)
+ else:
+ buffer = self._handle_inline_other(token, nodes, buffer, flush_buffer)
+
+ flush_buffer()
+ return nodes
+
+ def _handle_inline_text(self, token, nodes, buffer, flush_buffer):
+ raw = token.get('raw', '')
+ buffer += self._reference_definitions.get(raw, raw)
+ return buffer
+
+ def _handle_inline_codespan(self, token, nodes, buffer, flush_buffer):
+ buffer += f"`{token.get('raw') or token.get('text') or ''}`"
+ return buffer
+
+ def _handle_inline_softbreak(self, token, nodes, buffer, flush_buffer):
+ buffer += ' '
+ return buffer
+
+ def _handle_inline_linebreak(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ nodes.append(NewLine())
+ return buffer
+
+ def _handle_inline_link(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ text = self._flatten_inline_text(token.get('children', []))
+ attrs = token.get('attrs', {})
+ url = attrs.get('url', '')
+ title = attrs.get('title')
+ nodes.append(Link(_format_link_markup(text, url, title)))
+ return buffer
+
+ def _handle_inline_image(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', []))
+ attrs = token.get('attrs', {})
+ url = attrs.get('url', '')
+ title = attrs.get('title')
+ nodes.append(Image(_format_image_markup(alt, url, title)))
+ return buffer
+
+ def _handle_inline_marker(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ marker = _INLINE_MARKERS[token.get('type')]
+ _append_text(nodes, marker)
+ children = token.get('children', [])
+ if children:
+ nodes.extend(self._convert_inline_tokens(children))
+ _append_text(nodes, marker)
+ return buffer
+
+ def _handle_inline_other(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ children = token.get('children', [])
+ if children:
+ nodes.extend(self._convert_inline_tokens(children))
+ else:
+ raw = token.get('raw') or token.get('text') or ''
+ if raw.strip():
+ _append_text(nodes, mistune.escape(raw))
+ return buffer
+
+ def _flatten_inline_text(self, tokens: Iterable[dict]):
+ parts = []
+ for token in tokens:
+ token_type = token.get('type')
+ if token_type in {'text', 'inline_html', 'block_html'}:
+ raw = token.get('raw') or token.get('text') or ''
+ parts.append(self._reference_definitions.get(raw, raw))
+ elif token_type == 'codespan':
+ parts.append(f"`{token.get('raw') or token.get('text') or ''}`")
+ elif token_type in _INLINE_MARKERS:
+ marker = _INLINE_MARKERS[token_type]
+ inner = self._flatten_inline_text(token.get('children', []))
+ parts.append(f'{marker}{inner}{marker}')
+ elif token_type in {'linebreak', 'softbreak'}:
+ parts.append(' ')
+ else:
+ children = token.get('children', [])
+ if children:
+ parts.append(self._flatten_inline_text(children))
+ else:
+ parts.append(token.get('raw') or token.get('text') or '')
+ return ''.join(parts).strip()
+
+ def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]):
+ ref_text = self._reference_definition_text(inline_tokens)
+ if ref_text is not None:
+ return Paragraph([Text(ref_text)])
+ heading = self._heading_from_inline(inline_tokens)
+ if heading:
+ return heading
+ return Paragraph(self._convert_inline_tokens(inline_tokens))
+
+ def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]):
+ ref_text = self._reference_definition_text(inline_tokens)
+ if ref_text is not None:
+ return [Text(ref_text)]
+ heading = self._heading_from_inline(inline_tokens)
+ if heading:
+ return [heading]
+ return self._convert_inline_tokens(inline_tokens)
+
+ def _heading_from_inline(self, inline_tokens: Iterable[dict]):
+ if len(inline_tokens) != 1:
+ return None
+ token = inline_tokens[0]
+ if token.get('type') != 'text':
+ return None
+ raw = token.get('raw', '')
+ match = _HEADING_LINE_RE.match(raw)
+ if not match:
+ return None
+ level = len(match.group(2))
+ content = raw[match.end(2):].lstrip()
+ heading_tokens = self._markdown(f"{'#' * level} {content}")
+ if heading_tokens and heading_tokens[0].get('type') == 'heading':
+ children = heading_tokens[0].get('children', [])
+ else:
+ children = [{'type': 'text', 'raw': content}]
+ header = Header(level)
+ header.add_nodes(self._convert_inline_tokens(children))
+ return header
+
+ def _reference_definition_text(self, inline_tokens: Iterable[dict]):
+ if len(inline_tokens) != 1:
+ return None
+ token = inline_tokens[0]
+ if token.get('type') != 'text':
+ return None
+ raw = token.get('raw', '')
+ return self._reference_definitions.get(raw)
+
+ def _split_reference_links(self, raw: str, nodes):
+ last = 0
+ for match in _REF_LINK_OR_IMAGE_RE.finditer(raw):
+ if match.start() > last:
+ _append_text(nodes, mistune.escape(raw[last:match.start()]))
+ snippet = match.group(0)
+ if snippet.startswith('!['):
+ nodes.append(Image(snippet))
+ else:
+ nodes.append(Link(snippet))
+ last = match.end()
+ if last < len(raw):
+ _append_text(nodes, mistune.escape(raw[last:]))
+ return nodes
class ZendeskHelpMdParser(MdParser):
- TAG_CONTENT_GROUP = 'tag_content'
- TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?)</{tag_name}>)\s*$' % TAG_CONTENT_GROUP
- CALLOUT_STYLE_GROUP = 'style'
- CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP
-
- def __init__(self):
- super().__init__()
- self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout',
- attr_re=self.CALLOUT_ATTR_PATTERN))
- self.default_rules.insert(0, 'callout')
-
- self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re=''))
- self.default_rules.insert(0, 'steps')
-
- self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re=''))
- self.default_rules.insert(0, 'tabs')
-
- def parse_callout(self, m: Match[str]) -> None:
- style = m.group(self.CALLOUT_STYLE_GROUP)
- self._parse_nested(ZendeskHelpCallout(style), m)
-
- def parse_steps(self, m: Match[str]) -> None:
- self._parse_nested(ZendeskHelpSteps(), m)
-
- def parse_tabs(self, m: Match[str]) -> None:
- self._parse_nested(ZendeskHelpTabs(), m)
-
- def _parse_nested(self, node: Node, m: Match[str]) -> None:
- nested_content = m.group(self.TAG_CONTENT_GROUP)
- nested_nodes = self.get_lexer().parse(nested_content)
- node.add_nodes(nested_nodes)
- self.tokens.append(node)
+ _CALLOUT_PATTERN = re.compile(
+ r'(?s)<callout(?:\s+(?P<style>green|red|yellow))?>(?P<content>.*?)</callout>'
+ )
+ _STEPS_PATTERN = re.compile(r'(?s)<steps>(?P<content>.*?)</steps>')
+ _TABS_PATTERN = re.compile(r'(?s)<tabs>(?P<content>.*?)</tabs>')
+
+ def parse(self, text, rules=None):
+ """Parse Markdown with Zendesk tag support into a list of Node objects."""
+ nodes = self._parse_nodes(text)
+ return nodes
+
+ def _parse_nodes(self, text: str):
+ nodes = []
+ remaining = text
+ while remaining:
+ tag_name, match = self._find_next_tag(remaining)
+ if not match:
+ nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining)))
+ break
+
+ if match.start() > 0:
+ prefix = remaining[:match.start()]
+ nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix)))
+
+ content = match.group('content')
+ if tag_name == 'callout':
+ node = ZendeskHelpCallout(match.group('style'))
+ elif tag_name == 'steps':
+ node = ZendeskHelpSteps()
+ else:
+ node = ZendeskHelpTabs()
+
+ node.add_nodes(self._parse_nodes(content))
+ nodes.append(node)
+
+ remaining = remaining[match.end():]
+ return nodes
+
+ def _find_next_tag(self, text: str):
+ matches = []
+ for name, pattern in (
+ ('callout', self._CALLOUT_PATTERN),
+ ('steps', self._STEPS_PATTERN),
+ ('tabs', self._TABS_PATTERN),
+ ):
+ match = pattern.search(text)
+ if match:
+ matches.append((match.start(), name, match))
+ if not matches:
+ return None, None
+ matches.sort(key=lambda item: item[0])
+ for _, name, match in matches:
+ if not _is_inside_fenced_block(text, match.start()):
+ return name, match
+ return None, None
+
+ def _parse_markdown(self, text: str):
+ normalized = _remove_spaces_from_empty_lines(text)
+ normalized = _remove_ltr_rtl_marks(normalized)
+ return self._convert_block_tokens(self._markdown(normalized))
+
+
+def _append_text(nodes, text):
+ if not text:
+ return
+ if nodes and isinstance(nodes[-1], Text):
+ nodes[-1].text += text
+ else:
+ nodes.append(Text(text))
+
+
+def _format_title(title: str) -> str:
+ if title is None:
+ return ''
+ escaped = title.replace('"', '\\"')
+ return f' "{escaped}"'
+
+
+def _format_link_markup(text: str, url: str, title: str | None) -> str:
+ return f'[{text}]({url}{_format_title(title)})'
+
+
+def _format_image_markup(alt: str, url: str, title: str | None) -> str:
+ return f'![{alt}]({url}{_format_title(title)})'
+
+
+def _is_block_html(raw: str) -> bool:
+ stripped = raw.lstrip()
+ if stripped.startswith('