diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..aac8e09
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,12 @@
+[flake8]
+max-line-length = 120
+max-complexity = 12
+select = E,F,W,C90
+extend-ignore = F403,F405
+exclude =
+    .git,
+    __pycache__,
+    venv,
+    build,
+    dist,
+    sdiff.egg-info
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bf9a1e8
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: CI
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+  push:
+    branches: [master]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .[tests]
+
+      - name: Format check
+        run: python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+
+      - name: Lint
+        run: python -m flake8 --config .flake8 sdiff tests
+
+      - name: Test
+        run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005
+
+      - name: Coverage report
+        run: python -m coverage report -m
diff --git a/.husky/pre-commit b/.husky/pre-commit
new file mode 100755
index 0000000..bc7696e
--- /dev/null
+++ b/.husky/pre-commit
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+. "$(dirname -- "$0")/_/husky.sh"
+
+python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+python -m flake8 --config .flake8 sdiff tests
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index df31221..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-language: python
-dist: jammy
-python:
-  - "3.11"
-# command to install dependencies
-install:
-  - make dev
-# command to run tests
-script:
-  - make test
-  - make coverage
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..526549e
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,32 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases.
+
+## Build, Test, and Development Commands
+- `make env` creates the local `venv/` (Python 3.11+).
+- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv.
+- `make test` runs linting and the full pytest suite with coverage.
+- `make vtest` runs pytest verbosely.
+- `make flake` runs the autopep8 format check and flake8 on `sdiff/` and `tests/`.
+- `make format` applies autopep8 formatting to `sdiff/` and `tests/`.
+- `make cov` prints the coverage report.
+- `make clean` removes build artifacts and the venv.
+- `make hooks` installs Husky git hooks (requires Node/npm; `make dev` runs this).
+
+Lint parity: CI and the Husky pre-commit hook both run the same checks as `make flake` (autopep8 check + flake8). Run `make flake` or `make test` locally to mirror CI.
+
+Example flow:
+```sh
+make dev
+make test
+```
+
+## Coding Style & Naming Conventions
+Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. Flake8 enforces a 120-character line limit (see `.flake8`). `autopep8` is available for formatting.
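+A hedged sketch of these conventions (the module and names below are hypothetical, for illustration only):
+```python
+# sdiff/sample_renderer.py -- snake_case module name (hypothetical)
+MAX_LINE_LENGTH = 120  # UPPER_CASE for module-level constants
+
+
+class SampleRenderer:  # PascalCase for classes
+    def render_node(self, node):  # snake_case for methods
+        node_text = str(node)  # snake_case for variables
+        return node_text[:MAX_LINE_LENGTH]
+```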
+Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_*.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (current config fails under 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/Makefile b/Makefile
index 6eeb1e2..4be00c9 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ env:
 
 dev: env update
 	$(PIP) install .[tests,devtools]
+	@$(MAKE) hooks
 
 install: env update
@@ -28,8 +29,20 @@ publish:
 	$(TWINE) upload --verbose --sign --username developer --repository-url http://$(PYPICLOUD_HOST)/simple/ dist/*.whl
 
 flake:
+	$(PYTHON) -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
 	$(FLAKE) sdiff tests
 
+format:
+	$(PYTHON) -m autopep8 --in-place --max-line-length 120 -r sdiff tests
+
+hooks:
+	@if command -v npm >/dev/null 2>&1; then \
+		npm install --no-package-lock --silent; \
+		npm run --silent prepare; \
+	else \
+		echo "npm not found; skipping husky install"; \
+	fi
+
 test: flake
 	$(COVERAGE) run -m pytest $(TEST_RUNNER_FLAGS)
@@ -57,4 +70,4 @@ clean:
 	rm -rf venv
 
 
-.PHONY: all build env linux run pep test vtest testloop cov clean
+.PHONY: all build env linux run pep test vtest testloop cov clean hooks format
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0])  # "There is a missing element `li`."
+```
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`.
+
+## One-off usage
+```sh
+python - <<'PY'
+from sdiff import diff, TextRenderer
+
+left = open("left.md", "r", encoding="utf-8").read()
+right = open("right.md", "r", encoding="utf-8").read()
+_, _, errors = diff(left, right, renderer=TextRenderer())
+
+for err in errors:
+    print(err)
+PY
+```
+
+## Notes
+This project is a library (no CLI).
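+For example, a hedged sketch of the HTML renderer combined with the Zendesk parser (assuming both are importable from the package root, as `TextRenderer` is above):
+```python
+from sdiff import diff, HtmlRenderer, ZendeskHelpMdParser
+
+left = "<callout green># Note</callout>"
+right = "<callout green># Note\n\nExtra paragraph</callout>"
+
+_, rendered_right, errors = diff(left, right, renderer=HtmlRenderer(), parser_cls=ZendeskHelpMdParser)
+# Per the Renderers note above, the structural addition in the right-hand
+# output is wrapped in <ins>; deletions would be wrapped in <del>.
+```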
+If you need different token handling, you can provide a custom parser class that extends `MdParser`.
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..d682872
--- /dev/null
+++ b/package.json
@@ -0,0 +1,10 @@
+{
+  "name": "html-structure-diff",
+  "private": true,
+  "devDependencies": {
+    "husky": "^9.0.0"
+  },
+  "scripts": {
+    "prepare": "husky install"
+  }
+}
diff --git a/requirements.txt b/requirements.txt
index 1f202e5..a234623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-mistune==0.8.1
+mistune==3.2.0
diff --git a/sdiff/__init__.py b/sdiff/__init__.py
index 853d12c..85b6af4 100644
--- a/sdiff/__init__.py
+++ b/sdiff/__init__.py
@@ -4,13 +4,21 @@
 
 
 def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParser):
+    """Compare two Markdown strings by structure and return rendered outputs + errors.
+
+    Args:
+        md1: Left Markdown string.
+        md2: Right Markdown string.
+        renderer: Renderer instance used to format the output (TextRenderer by default).
+        parser_cls: Parser class to use (MdParser by default).
+
+    Returns:
+        (rendered_left, rendered_right, errors)
+    """
     tree1 = parse(md1, parser_cls)
     tree2 = parse(md2, parser_cls)
 
     tree1, tree2, struct_errors = diff_struct(tree1, tree2)
-    # tree1, tree2, links_errors = diff_links(tree1, tree2)
-
-    # errors = struct_errors + links_errors
     errors = struct_errors
 
     return renderer.render(tree1), renderer.render(tree2), errors
diff --git a/sdiff/compare.py b/sdiff/compare.py
index 5958ada..34d75ca 100644
--- a/sdiff/compare.py
+++ b/sdiff/compare.py
@@ -44,8 +44,10 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None):
 
 
 def diff_links(tree1, tree2):
+    """Diff only link-relevant structure (paragraphs/headers/lists/links)."""
     return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a'])
 
 
 def diff_struct(tree1, tree2):
+    """Diff overall structure, ignoring link and image content."""
     return _diff(tree1, tree2, exclude_symbols=['a', 'i'])
diff --git a/sdiff/parser.py b/sdiff/parser.py
index 93a4736..112831b 100644
--- a/sdiff/parser.py
+++ b/sdiff/parser.py
@@ -1,207 +1,508 @@
-from re import Match
-
-import mistune
 import re
+import textwrap
+from typing import Iterable
 
-from .model import *
-
-
-class InlineLexer(mistune.BlockLexer):
-    grammar_class = mistune.InlineGrammar
-
-    default_rules = [
-        'linebreak', 'link',
-        'reflink', 'text',
-    ]
-
-    def __init__(self):
-        self.links = {}
-        self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)')
-        super().__init__()
-
-    def parse_autolink(self, m):
-        self.tokens.append(Link(m.group(0)))
-
-    def parse_url(self, m):
-        self.tokens.append(Link(m.group(0)))
-
-    def parse_link(self, m):
-        return self._process_link(m)
-
-    def parse_reflink(self, m):
-        # TODO skip this check for now
-        # key = mistune._keyify(m.group(2) or m.group(1))
-        # if key not in self.links:
-        #     return None
-        # ret = self.links[key]
-        return self._process_link(m)
-
-    def _process_link(self, m):
-        line = m.group(0)
-        if line[0] == '!':
-            node = Image(line)
-        else:
-            node = Link(line)
-
-        self.tokens.append(node)
+import mistune
+from mistune import block_parser
 
-    def parse_linebreak(self, m):
-        node = NewLine()
-        self.tokens.append(node)
+from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root,
+                    Text, Header, ZendeskHelpCallout, ZendeskHelpSteps,
+                    ZendeskHelpTabs)
 
-    def parse_text(self, m):
-        text = m.group(0)
-        if text.strip():
-            escaped_text = mistune.escape(text)
-            node = Text(escaped_text)
-            self.tokens.append(node)
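+# NOTE (illustrative, not part of the public API): the classes below consume
+# mistune 3 AST tokens, which are plain dicts. A hedged sketch of the shape
+# this module expects (field names can vary slightly across mistune versions):
+#
+#   mistune.create_markdown(renderer='ast')("# Hi\n\nSome *text*")
+#   # -> [{'type': 'heading', 'attrs': {'level': 1},
+#   #      'children': [{'type': 'text', 'raw': 'Hi'}]},
+#   #     {'type': 'paragraph', 'children': [...]}]
+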
+_BLOCK_TAGS = {tag.lower() for tag in block_parser.BLOCK_TAGS}
+_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)')
+_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\s*\[[^\]]*\]')
+_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+')
+_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})')
+_INLINE_MARKERS = {
+    'strong': '**',
+    'emphasis': '*',
+    'strikethrough': '~~',
+}
+
+
-class MdParser(mistune.BlockLexer):
-    default_rules = [
-        'newline', 'list_block', 'block_html',
-        'heading', 'lheading',
-        'paragraph', 'text',
-    ]
+class MdParser:
+    """Markdown parser that builds a lightweight structural tree.
-    list_rules = (
-        'newline', 'heading', 'lheading',
-        'hrule', 'list_block', 'text',
-    )
+
+    Uses Mistune AST tokens to build sdiff Node objects.
+    """
+    list_rules = None
 
     @classmethod
     def get_lexer(cls):
         return cls()
 
     def __init__(self):
-        super().__init__()
-        self.grammar_class.block_html = re.compile(
-            r'^\s* *(?:{}|{}|{}) *(?:\n{{1,}}|\s*$)'.format(
-                r'<!--[\s\S]*?-->',
-                r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr),
-                r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr),
-            )
-        )
-
-    def _parse_inline(self, text):
-        inline = InlineLexer()
-        return inline.parse(text)
-
-    def parse_newline(self, m):
-        length = len(m.group(0))
-        if length > 1:
-            self.tokens.append(NewLine())
-
-    def parse_heading(self, m):
-        level = len(m.group(1))
-        node = Header(level)
-        node.add_nodes(self._parse_inline(m.group(2)))
-        self.tokens.append(node)
-
-    def parse_lheading(self, m):
-        level = 1 if m.group(2) == '=' else 2
-        text = m.group(1)
-        node = Header(level)
-        node.add_nodes(self._parse_inline(text))
-        self.tokens.append(node)
-
-    def parse_block_html(self, m):
-        text = m.group(0)
-        html = Html(text)
-        self.tokens.append(html)
-
-    def parse_paragraph(self, m):
-        text = m.group(1).rstrip('\n')
-        node = Paragraph()
-        node.add_nodes(self._parse_inline(text))
-        self.tokens.append(node)
-
-    def parse_text(self, m):
-        text = m.group(0)
-        escaped_text = mistune.escape(text)
-        node = Text(escaped_text)
-        self.tokens.append(node)
-
-    def parse_list_block(self, m):
-        bull = m.group(2)
-        cap = m.group(0)
-        ordered = '.' in bull
-        node = List(ordered)
-        node.add_nodes(self._process_list_item(cap, bull))
-        self.tokens.append(node)
-
-    def _process_list_item(self, cap, bull):
-        result = []
-        cap = self.rules.list_item.findall(cap)
-
-        _next = False
-        length = len(cap)
-
-        for i in range(length):
-            item = cap[i][0]
-
-            # remove the bullet
-            space = len(item)
-            item = self.rules.list_bullet.sub('', item)
-
-            # outdent
-            if '\n ' in item:
-                space = space - len(item)
-                pattern = re.compile(r'^ {1,%d}' % space, flags=re.M)
-                item = pattern.sub('', item)
-
-            # determine whether item is loose or not
-            loose = _next
-            if not loose and re.search(r'\n\n(?!\s*$)', item):
-                loose = True
-
-            rest = len(item)
-            if i != length - 1 and rest:
-                _next = item[rest - 1] == '\n'
-                if not loose:
-                    loose = _next
-
-            node = ListItem()
-            block_lexer = self.get_lexer()
-            nodes = block_lexer.parse(item, self.list_rules)
-            node.add_nodes(nodes)
-            result.append(node)
-        return result
+        self._markdown = mistune.create_markdown(renderer='ast')
+        self._reference_definitions = {}
+
+    def parse(self, text, rules=None):
+        """Parse Markdown text into a list of Node objects.
+
+        Args:
+            text: Markdown string.
+            rules: Optional rules argument kept for compatibility.
+
+        Returns:
+            list[Node]
+        """
+        tokens = self._markdown(text)
+        return self._convert_block_tokens(tokens)
+
+    def _set_reference_definitions(self, definitions):
+        self._reference_definitions = definitions
+
+    def _convert_block_tokens(self, tokens: Iterable[dict]):
+        nodes = []
+        for token in tokens:
+            nodes.extend(self._convert_block_token(token))
+        return nodes
+
+    def _convert_block_token(self, token):
+        token_type = token.get('type')
+        if token_type == 'paragraph':
+            return [self._convert_paragraph_or_heading(token.get('children', []))]
+        if token_type == 'heading':
+            return [self._convert_heading(token)]
+        if token_type == 'list':
+            return [self._convert_list(token)]
+        if token_type == 'list_item':
+            return [self._convert_list_item(token)]
+        if token_type == 'block_text':
+            return [self._convert_paragraph_or_heading(token.get('children', []))]
+        if token_type == 'block_html':
+            return self._convert_block_html(token)
+        if token_type == 'block_quote':
+            return self._convert_block_quote(token)
+        if token_type == 'block_code':
+            return self._convert_block_code(token)
+        if token_type == 'thematic_break':
+            return self._convert_passthrough_block(token)
+        return self._convert_passthrough_block(token)
+
+    def _convert_heading(self, token):
+        level = token.get('level') or token.get('attrs', {}).get('level', 1)
+        header = Header(level)
+        header.add_nodes(self._convert_inline_tokens(token.get('children', [])))
+        return header
+
+    def _convert_list(self, token):
+        ordered = token.get('ordered')
+        if ordered is None:
+            ordered = token.get('attrs', {}).get('ordered', False)
+        list_node = List(bool(ordered))
+        for item in token.get('children', []):
+            list_node.add_node(self._convert_list_item(item))
+        return list_node
+
+    def _convert_block_html(self, token):
+        raw = token.get('raw', '')
+        if _is_block_html(raw):
+            return [Html(raw)]
+        text = mistune.escape(raw)
+        if text.strip():
+            return [Paragraph([Text(text)])]
+        return []
+
+    def _convert_passthrough_block(self, token):
+        child_nodes = self._convert_block_tokens(token.get('children', []))
+        if child_nodes:
+            return child_nodes
+        raw = token.get('raw') or token.get('text') or ''
+        if raw.strip():
+            return [Paragraph([Text(mistune.escape(raw))])]
+        return []
+
+    def _convert_block_quote(self, token):
+        children = token.get('children', [])
+        if not children:
+            return []
+        content = self._render_inline_children(children)
+        if not content.strip():
+            return []
+        lines = content.splitlines()
+        quoted = '\n'.join([f'> {line}' if line.strip() else '>' for line in lines])
+        return [Paragraph([Text(mistune.escape(quoted))])]
+
+    def _convert_block_code(self, token):
+        raw = token.get('raw') or ''
+        marker = token.get('marker') or '```'
+        fence = marker if marker else '```'
+        content = raw.rstrip('\n')
+        code_block = f'{fence}\n{content}\n{fence}'
+        return [Paragraph([Text(mistune.escape(code_block))])]
+
+    def _render_inline_children(self, children):
+        parts = []
+        for child in children:
+            child_type = child.get('type')
+            if child_type in {'paragraph', 'block_text'}:
+                parts.append(self._flatten_inline_text(child.get('children', [])))
+            else:
+                raw = child.get('raw') or child.get('text') or ''
+                if raw:
+                    parts.append(raw)
+        return '\n'.join([part for part in parts if part is not None])
+
+    def _convert_list_item(self, token):
+        item = ListItem()
+        for child in token.get('children', []):
+            child_type = child.get('type')
+            if child_type in {'block_text', 'paragraph'}:
+                item.add_nodes(self._convert_list_block_nodes(child.get('children', [])))
+            else:
+                item.add_nodes(self._convert_block_tokens([child]))
+        return item
+
+    def _convert_inline_tokens(self, tokens: Iterable[dict]):
+        nodes = []
+        buffer = ''
+
+        def flush_buffer():
+            nonlocal buffer
+            if buffer:
+                self._split_reference_links(buffer, nodes)
+                buffer = ''
+
+        handlers = {
+            'text': self._handle_inline_text,
+            'inline_html': self._handle_inline_text,
+            'block_html': self._handle_inline_text,
+            'codespan': self._handle_inline_codespan,
+            'softbreak': self._handle_inline_softbreak,
+            'linebreak': self._handle_inline_linebreak,
+            'link': self._handle_inline_link,
+            'image': self._handle_inline_image,
+            'strong': self._handle_inline_marker,
+            'emphasis': self._handle_inline_marker,
+            'strikethrough': self._handle_inline_marker,
+        }
+
+        for token in tokens:
+            token_type = token.get('type')
+            handler = handlers.get(token_type)
+            if handler:
+                buffer = handler(token, nodes, buffer, flush_buffer)
+            else:
+                buffer = self._handle_inline_other(token, nodes, buffer, flush_buffer)
+
+        flush_buffer()
+        return nodes
+
+    def _handle_inline_text(self, token, nodes, buffer, flush_buffer):
+        raw = token.get('raw', '')
+        buffer += self._reference_definitions.get(raw, raw)
+        return buffer
+
+    def _handle_inline_codespan(self, token, nodes, buffer, flush_buffer):
+        buffer += f"`{token.get('raw') or token.get('text') or ''}`"
+        return buffer
+
+    def _handle_inline_softbreak(self, token, nodes, buffer, flush_buffer):
+        buffer += ' '
+        return buffer
+
+    def _handle_inline_linebreak(self, token, nodes, buffer, flush_buffer):
+        flush_buffer()
+        nodes.append(NewLine())
+        return buffer
+
+    def _handle_inline_link(self, token, nodes, buffer, flush_buffer):
+        flush_buffer()
+        text = self._flatten_inline_text(token.get('children', []))
+        attrs = token.get('attrs', {})
+        url = attrs.get('url', '')
+        title = attrs.get('title')
+        nodes.append(Link(_format_link_markup(text, url, title)))
+        return buffer
+
+    def _handle_inline_image(self, token, nodes, buffer, flush_buffer):
+        flush_buffer()
+        alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', []))
+        attrs = token.get('attrs', {})
+        url = attrs.get('url', '')
+        title = attrs.get('title')
+        nodes.append(Image(_format_image_markup(alt, url, title)))
+        return buffer
+
+    def _handle_inline_marker(self, token, nodes, buffer, flush_buffer):
+        flush_buffer()
+        marker = _INLINE_MARKERS[token.get('type')]
+        _append_text(nodes, marker)
+        children = token.get('children', [])
+        if children:
+            nodes.extend(self._convert_inline_tokens(children))
+        _append_text(nodes, marker)
+        return buffer
+
+    def _handle_inline_other(self, token, nodes, buffer, flush_buffer):
+        flush_buffer()
+        children = token.get('children', [])
+        if children:
+            nodes.extend(self._convert_inline_tokens(children))
+        else:
+            raw = token.get('raw') or token.get('text') or ''
+            if raw.strip():
+                _append_text(nodes, mistune.escape(raw))
+        return buffer
+
+    def _flatten_inline_text(self, tokens: Iterable[dict]):
+        parts = []
+        for token in tokens:
+            token_type = token.get('type')
+            if token_type in {'text', 'inline_html', 'block_html'}:
+                raw = token.get('raw') or token.get('text') or ''
+                parts.append(self._reference_definitions.get(raw, raw))
+            elif token_type == 'codespan':
+                parts.append(f"`{token.get('raw') or token.get('text') or ''}`")
+            elif token_type in _INLINE_MARKERS:
+                marker = _INLINE_MARKERS[token_type]
+                inner = self._flatten_inline_text(token.get('children', []))
+                parts.append(f'{marker}{inner}{marker}')
+            elif token_type in {'linebreak', 'softbreak'}:
+                parts.append(' ')
+            else:
+                children = token.get('children', [])
+                if children:
+                    parts.append(self._flatten_inline_text(children))
+                else:
+                    parts.append(token.get('raw') or token.get('text') or '')
+        return ''.join(parts).strip()
+
+    def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]):
+        ref_text = self._reference_definition_text(inline_tokens)
+        if ref_text is not None:
+            return Paragraph([Text(ref_text)])
+        heading = self._heading_from_inline(inline_tokens)
+        if heading:
+            return heading
+        return Paragraph(self._convert_inline_tokens(inline_tokens))
+
+    def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]):
+        ref_text = self._reference_definition_text(inline_tokens)
+        if ref_text is not None:
+            return [Text(ref_text)]
+        heading = self._heading_from_inline(inline_tokens)
+        if heading:
+            return [heading]
+        return self._convert_inline_tokens(inline_tokens)
+
+    def _heading_from_inline(self, inline_tokens: Iterable[dict]):
+        if len(inline_tokens) != 1:
+            return None
+        token = inline_tokens[0]
+        if token.get('type') != 'text':
+            return None
+        raw = token.get('raw', '')
+        match = _HEADING_LINE_RE.match(raw)
+        if not match:
+            return None
+        level = len(match.group(2))
+        content = raw[match.end(2):].lstrip()
+        heading_tokens = self._markdown(f"{'#' * level} {content}")
+        if heading_tokens and heading_tokens[0].get('type') == 'heading':
+            children = heading_tokens[0].get('children', [])
+        else:
+            children = [{'type': 'text', 'raw': content}]
+        header = Header(level)
+        header.add_nodes(self._convert_inline_tokens(children))
+        return header
+
+    def _reference_definition_text(self, inline_tokens: Iterable[dict]):
+        if len(inline_tokens) != 1:
+            return None
+        token = inline_tokens[0]
+        if token.get('type') != 'text':
+            return None
+        raw = token.get('raw', '')
+        return self._reference_definitions.get(raw)
+
+    def _split_reference_links(self, raw: str, nodes):
+        last = 0
+        for match in _REF_LINK_OR_IMAGE_RE.finditer(raw):
+            if match.start() > last:
+                _append_text(nodes, mistune.escape(raw[last:match.start()]))
+            snippet = match.group(0)
+            if snippet.startswith('!['):
+                nodes.append(Image(snippet))
+            else:
+                nodes.append(Link(snippet))
+            last = match.end()
+        if last < len(raw):
+            _append_text(nodes, mistune.escape(raw[last:]))
+        return nodes
 
 
 class ZendeskHelpMdParser(MdParser):
-    TAG_CONTENT_GROUP = 'tag_content'
-    TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?))\s*$' % TAG_CONTENT_GROUP
-    CALLOUT_STYLE_GROUP = 'style'
-    CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP
-
-    def __init__(self):
-        super().__init__()
-        self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout',
-                                                                        attr_re=self.CALLOUT_ATTR_PATTERN))
-        self.default_rules.insert(0, 'callout')
-
-        self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re=''))
-        self.default_rules.insert(0, 'steps')
-
-        self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re=''))
-        self.default_rules.insert(0, 'tabs')
-
-    def parse_callout(self, m: Match[str]) -> None:
-        style = m.group(self.CALLOUT_STYLE_GROUP)
-        self._parse_nested(ZendeskHelpCallout(style), m)
-
-    def parse_steps(self, m: Match[str]) -> None:
-        self._parse_nested(ZendeskHelpSteps(), m)
-
-    def parse_tabs(self, m: Match[str]) -> None:
-        self._parse_nested(ZendeskHelpTabs(), m)
-
-    def _parse_nested(self, node: Node, m: Match[str]) -> None:
-        nested_content = m.group(self.TAG_CONTENT_GROUP)
-        nested_nodes = self.get_lexer().parse(nested_content)
-        node.add_nodes(nested_nodes)
-        self.tokens.append(node)
+    _CALLOUT_PATTERN = re.compile(
+        r'(?s)<callout(?: (?P<style>green|red|yellow))?>(?P<content>.*?)</callout>'
+    )
+    _STEPS_PATTERN = re.compile(r'(?s)<steps>(?P<content>.*?)</steps>')
+    _TABS_PATTERN = re.compile(r'(?s)<tabs>(?P<content>.*?)</tabs>')
+
+    def parse(self, text, rules=None):
+        """Parse Markdown with Zendesk tag support into a list of Node objects."""
+        nodes = self._parse_nodes(text)
+        return nodes
+
+    def _parse_nodes(self, text: str):
+        nodes = []
+        remaining = text
+        while remaining:
+            tag_name, match = self._find_next_tag(remaining)
+            if not match:
+                nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining)))
+                break
+
+            if match.start() > 0:
+                prefix = remaining[:match.start()]
+                nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix)))
+
+            content = match.group('content')
+            if tag_name == 'callout':
+                node = ZendeskHelpCallout(match.group('style'))
+            elif tag_name == 'steps':
+                node = ZendeskHelpSteps()
+            else:
+                node = ZendeskHelpTabs()
+
+            node.add_nodes(self._parse_nodes(content))
+            nodes.append(node)
+
+            remaining = remaining[match.end():]
+        return nodes
+
+    def _find_next_tag(self, text: str):
+        matches = []
+        for name, pattern in (
+            ('callout', self._CALLOUT_PATTERN),
+            ('steps', self._STEPS_PATTERN),
+            ('tabs', self._TABS_PATTERN),
+        ):
+            match = pattern.search(text)
+            if match:
+                matches.append((match.start(), name, match))
+        if not matches:
+            return None, None
+        matches.sort(key=lambda item: item[0])
+        for _, name, match in matches:
+            if not _is_inside_fenced_block(text, match.start()):
+                return name, match
+        return None, None
+
+    def _parse_markdown(self, text: str):
+        normalized = _remove_spaces_from_empty_lines(text)
+        normalized = _remove_ltr_rtl_marks(normalized)
+        return self._convert_block_tokens(self._markdown(normalized))
+
+
+def _append_text(nodes, text):
+    if not text:
+        return
+    if nodes and isinstance(nodes[-1], Text):
+        nodes[-1].text += text
+    else:
+        nodes.append(Text(text))
+
+
+def _format_title(title: str) -> str:
+    if title is None:
+        return ''
+    escaped = title.replace('"', '\\"')
+    return f' "{escaped}"'
+
+
+def _format_link_markup(text: str, url: str, title: str | None) -> str:
+    return f'[{text}]({url}{_format_title(title)})'
+
+
+def _format_image_markup(alt: str, url: str, title: str | None) -> str:
+    return f'![{alt}]({url}{_format_title(title)})'
+
+
+def _is_block_html(raw: str) -> bool:
+    stripped = raw.lstrip()
+    if stripped.startswith('