From a4b21c17bd79cde344df6d857390c6850402e192 Mon Sep 17 00:00:00 2001
From: manuzhang
Date: Thu, 22 Jan 2026 23:53:35 +0800
Subject: [PATCH] feat: strip HTML comments from RSS feed descriptions

Add automatic HTML comment removal from all feed item descriptions to ensure
clean content in RSS and JSON feeds. Comments are stripped from all content
sources including markdown, page.meta descriptions, and full page content.

- Add strip_html_comments() static method to Util class
- Update get_description_or_abstract() to strip comments in all code paths
- Handle both inline and multiline HTML comments
- Preserve content structure while removing comment artifacts

This prevents HTML comments (including TODO notes, analytics codes, and
other internal annotations) from appearing in published feeds.

Co-Authored-By: Claude <>
---
 mkdocs_rss_plugin/util.py | 42 +++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/mkdocs_rss_plugin/util.py b/mkdocs_rss_plugin/util.py
index 332a769..40b9cfc 100644
--- a/mkdocs_rss_plugin/util.py
+++ b/mkdocs_rss_plugin/util.py
@@ -11,6 +11,8 @@
 from functools import lru_cache
 from mimetypes import guess_type
 from pathlib import Path
+from re import DOTALL
+from re import sub as re_sub
 from typing import Any, Literal
 from urllib.parse import urlencode, urlparse, urlunparse
 
@@ -486,6 +488,18 @@ def get_date_from_meta(
 
         return out_date
 
+    @staticmethod
+    def strip_html_comments(html_content: str) -> str:
+        """Remove HTML comments from content.
+
+        Args:
+            html_content (str): HTML content potentially containing comments
+
+        Returns:
+            str: HTML content with comments removed
+        """
+        return re_sub(r"<!--.*?-->", "", html_content, flags=DOTALL)
+
     def get_description_or_abstract(
         self,
         in_page: Page,
@@ -514,12 +528,14 @@ def get_description_or_abstract(
         # If the full page is wanted (unlimited chars count)
         if chars_count == -1 and (in_page.content or in_page.markdown):
             if in_page.content:
-                return in_page.content
+                return self.strip_html_comments(in_page.content)
             else:
-                return markdown.markdown(in_page.markdown, output_format="html5")
+                return self.strip_html_comments(
+                    markdown.markdown(in_page.markdown, output_format="html5")
+                )
         # If the description is explicitly given
         elif description:
-            return description
+            return self.strip_html_comments(description)
         # If the abstract is cut by the delimiter
         elif (
             abstract_delimiter
@@ -528,18 +544,24 @@ def get_description_or_abstract(
             )
             > -1
         ):
-            return markdown.markdown(
-                in_page.markdown[:excerpt_separator_position],
-                output_format="html5",
+            return self.strip_html_comments(
+                markdown.markdown(
+                    in_page.markdown[:excerpt_separator_position],
+                    output_format="html5",
+                )
             )
         # Use first chars_count from the markdown
         elif chars_count > 0 and in_page.markdown:
             if len(in_page.markdown) <= chars_count:
-                return markdown.markdown(in_page.markdown, output_format="html5")
+                return self.strip_html_comments(
+                    markdown.markdown(in_page.markdown, output_format="html5")
+                )
             else:
-                return markdown.markdown(
-                    f"{in_page.markdown[: chars_count - 3]}...",
-                    output_format="html5",
+                return self.strip_html_comments(
+                    markdown.markdown(
+                        f"{in_page.markdown[: chars_count - 3]}...",
+                        output_format="html5",
+                    )
                 )
         # No explicit description and no (or empty) abstract found
         else: