From 6b7a125cf31fd425871ad871075f93d1dba6c003 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 15 Jul 2025 11:37:37 -0500
Subject: [PATCH 01/10] HTML API: Reliably parse HTML in `wp_html_split()`.

Trac ticket: Core-63694

This probably improves the performance in terms of both CPU time and
memory compared to the old PCRE-based approach.
---
 src/wp-includes/formatting.php | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index bd2d349fa20c1..13e2c0c3b370c 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -605,21 +605,41 @@ function wpautop( $text, $br = true ) {
 }
 
 /**
- * Separates HTML elements and comments from the text.
+ * Splits an HTML input into an array of raw strings, where each token
+ * represents a tag, a comment, a text node, etc…
+ *
+ * No effort is made to clean up, sanitize, or normalize the segments
+ * of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
  *
  * @since 4.2.4
+ * @since {WP_VERSION} Reliably parses HTML via the HTML API.
  *
  * @param string $input The text which has to be formatted.
  * @return string[] Array of the formatted text.
  */
 function wp_html_split( $input ) {
-	return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE );
+	$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
+		public function extract_raw_token() {
+			$this->set_bookmark( 'here' );
+			$here = $this->bookmarks['here'];
+
+			return substr( $this->html, $here->start, $here->length );
+		}
+	};
+
+	$tokens = array();
+	while ( $token_reporter->next_token() ) {
+		$tokens[] = $token_reporter->extract_raw_token();
+	}
+
+	return $tokens;
 }
 
 /**
  * Retrieves the regular expression for an HTML element.
  *
  * @since 4.4.0
+ * @deprecated {WP_VERSION} Use the HTML API instead.
  *
  * @return string The regular expression.
  */

From d50fd255136a751349cc03594cdecfbf52186ca6 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 11 Sep 2025 17:18:58 -0500
Subject: [PATCH 02/10] Deprecation notice for html split and version

---
 src/wp-includes/formatting.php | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 13e2c0c3b370c..de14253716402 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -639,13 +639,19 @@ public function extract_raw_token() {
  * Retrieves the regular expression for an HTML element.
  *
  * @since 4.4.0
- * @deprecated {WP_VERSION} Use the HTML API instead.
+ * @deprecated 6.9.0 Use the HTML API instead.
  *
  * @return string The regular expression.
  */
 function get_html_split_regex() {
 	static $regex;
 
+	_deprecated_function(
+		__FUNCTION__,
+		'6.9.0',
+		'Use the HTML API instead.'
+	);
+
 	if ( ! isset( $regex ) ) {
 		// phpcs:disable Squiz.Strings.ConcatenationSpacing.PaddingFound -- don't remove regex indentation
 		$comments =

From 1eb9f574fd50f47707259828f6ff68dea9d966dc Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 15 Jul 2025 15:49:45 -0500
Subject: [PATCH 03/10] Fix two broken tests on CDATA behavior

---
 tests/phpunit/tests/formatting/wpAutop.php     | 2 +-
 tests/phpunit/tests/formatting/wpHtmlSplit.php | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php
index ee4d90645d09c..cd868bf7ffd2c 100644
--- a/tests/phpunit/tests/formatting/wpAutop.php
+++ b/tests/phpunit/tests/formatting/wpAutop.php
@@ -538,7 +538,7 @@ public function data_element_sanity() {
 			),
 			array(
 				"Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a\n9 ]]>",
-				"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a\n9 ]]></p>\n",
+				"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a<br />\n9 ]]></p>\n",
 			),
 		);
 	}
diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php
index 750ad3821cc54..f7d922d114f6b 100644
--- a/tests/phpunit/tests/formatting/wpHtmlSplit.php
+++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php
@@ -32,7 +32,7 @@ public function data_basic_features() {
 			),
 			array(
 				'abcd <![CDATA[ <html> ]]> efgh',
-				array( 'abcd ', '<![CDATA[ <html> ]]>', ' efgh' ),
+				array( 'abcd ', '<![CDATA[ <html>', ' ]]> efgh' ),
 			),
 		);
 	}

From a8ae28f0faba9c22189bcb3996dc899326149353 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 11 Sep 2025 17:22:05 -0500
Subject: [PATCH 04/10] Explain tests that were previously asserting invalid
 behaviors. (formatting)

---
 tests/phpunit/tests/formatting/wpAutop.php     | 9 +++++++++
 tests/phpunit/tests/formatting/wpHtmlSplit.php | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php
index cd868bf7ffd2c..b97468e7830f8 100644
--- a/tests/phpunit/tests/formatting/wpAutop.php
+++ b/tests/phpunit/tests/formatting/wpAutop.php
@@ -536,6 +536,15 @@ public function data_element_sanity() {
 				"Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 ]]> -->",
 				"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 ]]> --></p>\n",
 			),
+			/*
+			 * CDATA sections do not exist within HTML, so even though it looks
+			 * like this should be escaping the entire “inner comment” span, there’s
+			 * actually an invalid comment starting at `<![CDATA[` and ending at
+			 * the very first `>` character, placing the end of the comment at
+			 * what looks like the end of the “normal comment” — the ` -->`. Everything
+			 * afterward is normal HTML data so the `<br>` is a real `BR` element and
+			 * the `]]>` is normal plaintext, not the CDATA terminator.
+			 */
 			array(
 				"Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a\n9 ]]>",
 				"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a<br />\n9 ]]></p>\n",
diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php
index f7d922d114f6b..048dedfd081a7 100644
--- a/tests/phpunit/tests/formatting/wpHtmlSplit.php
+++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php
@@ -30,6 +30,13 @@ public function data_basic_features() {
 				'abcd <!-- <html> --> efgh',
 				array( 'abcd ', '<!-- <html> -->', ' efgh' ),
 			),
+			/*
+			 * CDATA sections do not exist within HTML, so even though it looks
+			 * like this should be escaping the entire ` <html> ` span, there’s
+			 * actually an invalid comment starting at `<![CDATA[` and ending at
+			 * the very first `>` character, placing the end of the comment at
+			 * the end of `html>`. The rest is normal plaintext content.
+			 */
 			array(
 				'abcd <![CDATA[ <html> ]]> efgh',
 				array( 'abcd ', '<![CDATA[ <html>', ' ]]> efgh' ),

From d6e05851ccd97694e427ee3755fe6422f88d59c0 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 11 Sep 2025 17:45:25 -0500
Subject: [PATCH 05/10] HTML Split: Match legacy behavior from preg_split

---
 src/wp-includes/formatting.php | 67 +++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index de14253716402..608f66fcf03a9 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -611,11 +611,44 @@ function wpautop( $text, $br = true ) {
  * No effort is made to clean up, sanitize, or normalize the segments
  * of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
  *
+ * Consider using the HTML API directly instead of relying on this
+ * legacy function: it bloats memory by default and provides a text
+ * interface for working with HTML whereas the HTML API provides a
+ * low-overhead and convenient structural interface.
+ *
+ * ## Output format:
+ *
+ * To maintain legacy behaviors with this function from when it
+ * operated via {@see preg_split()}, the output array injects text
+ * nodes which do not appear in the source HTML. That is, the original
+ * array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
+ * span on each side of every tag-like or comment-like “delimiter” in
+ * the matched string.
+ *
+ * Therefore, the output array will always start and end with text nodes
+ * and will separate every non-text node with a text node. If there is no
+ * actual content in the interstitial space between tokens in the source
+ * document, an empty text node will be created.
+ *
+ * Example:
+ *
+ *     array( '', '<img>', '' )    === wp_html_split( '<img>' );
+ *     array( 'test' )             === wp_html_split( 'test' );
+ *     array( '', '<p>', 'test' )  === wp_html_split( '<p>test' );
+ *     array( 'test', '</p>', '' ) === wp_html_split( 'test</p>' );
+ *
+ *     array( '', '<br>', '', '<!-- comment -->', '' ) === wp_html_split( '<br><!-- comment -->' );
+ *
+ *     // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded.
+ *     array( '&#60;3' ) === wp_html_split( '<3' );
+ *
  * @since 4.2.4
- * @since {WP_VERSION} Reliably parses HTML via the HTML API.
+ * @since 6.9.0 Reliably parses HTML via the HTML API.
  *
- * @param string $input The text which has to be formatted.
- * @return string[] Array of the formatted text.
+ * @param string $input HTML document to split, one item for every token.
+ *                      These can be text nodes, tags, comments, or doctype declarations.
+ * @return string[] Tokens from input; starting and ending in a text node, and with text
+ *                  nodes between every non-text node (see docblock note).
  */
 function wp_html_split( $input ) {
 	$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
@@ -627,9 +660,33 @@ public function extract_raw_token() {
 		}
 	};
 
-	$tokens = array();
+	$tokens   = array();
+	$was_text = false;
 	while ( $token_reporter->next_token() ) {
-		$tokens[] = $token_reporter->extract_raw_token();
+		$raw_token = $token_reporter->extract_raw_token();
+		$is_text   = '#text' === $token_reporter->get_token_name();
+
+		if ( ! $is_text && ! $was_text ) {
+			$tokens[] = '';
+		}
+
+		/*
+		 * Some legacy code assumes that text nodes will never start with a
+		 * less-than sign (<) but this isn’t the case, as some text nodes do
+		 * if the less-than sign doesn’t introduce a syntax token. To avoid
+		 * further corruption a leading less-than sign is replaced by its
+		 * encoded equivalent numeric character reference.
+		 */
+		if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) {
+			$raw_token = '&#60;' . substr( $raw_token, 1 );
+		}
+
+		$tokens[] = $raw_token;
+		$was_text = $is_text;
+	}
+
+	if ( ! $was_text ) {
+		$tokens[] = '';
 	}
 
 	return $tokens;

From d091ce5da3e98e5687f92b6194ab66d7d935332e Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 15 Jul 2025 15:50:03 -0500
Subject: [PATCH 06/10] Fix broken comments controller test:
 wp_replace_in_html_tags()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function was detecting a non-escaped `<` as the start of an
“element” and then replaced a newline in the text with `<!-- wpnl -->`
since it thought it was replacing inside a tag. That placeholder was
ultimately translated back into a raw `\n`.
---
 .../rest-api/rest-comments-controller.php     | 40 +++++++++++++++----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/tests/phpunit/tests/rest-api/rest-comments-controller.php b/tests/phpunit/tests/rest-api/rest-comments-controller.php
index 8542bcd42af24..c553019eb28c9 100644
--- a/tests/phpunit/tests/rest-api/rest-comments-controller.php
+++ b/tests/phpunit/tests/rest-api/rest-comments-controller.php
@@ -3152,24 +3152,50 @@ public function test_comment_roundtrip_as_editor_unfiltered_html() {
 		}
 	}
 
+	/**
+	 * Ensures that saving a comment as a super-admin does not corrupt the
+	 * comment content when presented with common edge cases.
+	 *
+	 * Note that this test used to assert the wrong behavior due to a bug
+	 * in {@see wp_html_split()}: the unescaped `<` used to be mistakenly
+	 * identified as the start of an HTML tag or comment, which led to
+	 * accidental replacement “inside” the mistaken tag. The test
+	 * has been updated with `wp_html_split()` in accordance with the
+	 * HTML5 living specification.
+	 *
+	 * @ticket 63694
+	 */
 	public function test_comment_roundtrip_as_superadmin() {
 		wp_set_current_user( self::$superadmin_id );
 
+		$raw_content       = <<<'HTML'
+\\&\\ &amp; &invalid; < &lt; &amp;lt;
+HTML;
+		$rendered          = <<<'HTML'
+<p>\\&#038;\\ &amp; &invalid; < &lt; &amp;lt;</p>
+HTML;
+		$author_name       = <<<'HTML'
+\\&amp;\\ &amp; &amp;invalid; &lt; &lt; &amp;lt;
+HTML;
+		$author_user_agent = <<<'HTML'
+\\&\\ &amp; &invalid; &lt; &lt; &amp;lt;
+HTML;
+
 		$this->assertTrue( current_user_can( 'unfiltered_html' ) );
 		$this->verify_comment_roundtrip(
 			array(
-				'content'           => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
-				'author_name'       => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
-				'author_user_agent' => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
+				'content'           => $raw_content,
+				'author_name'       => $raw_content,
+				'author_user_agent' => $raw_content,
 				'author'            => self::$superadmin_id,
 			),
 			array(
 				'content'           => array(
-					'raw'      => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
-					'rendered' => '<p>\\\&#038;\\\ &amp; &invalid; < &lt; &amp;lt;' . "\n</p>",
+					'raw'      => $raw_content,
+					'rendered' => $rendered,
 				),
-				'author_name'       => '\\\&amp;\\\ &amp; &amp;invalid; &lt; &lt; &amp;lt;',
-				'author_user_agent' => '\\\&\\\ &amp; &invalid; &lt; &lt; &amp;lt;',
+				'author_name'       => $author_name,
+				'author_user_agent' => $author_user_agent,
 				'author'            => self::$superadmin_id,
 			)
 		);

From d5b789c4793e659161eb7707cf973046ca372697 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 11 Sep 2025 18:00:07 -0500
Subject: [PATCH 07/10] Fixup! Tests

---
 tests/phpunit/tests/formatting/wpHtmlSplit.php | 13 -------------
 tests/phpunit/tests/shortcode.php              |  2 +-
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php
index 048dedfd081a7..ad982059d9b2a 100644
--- a/tests/phpunit/tests/formatting/wpHtmlSplit.php
+++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php
@@ -44,19 +44,6 @@ public function data_basic_features() {
 		);
 	}
 
-	/**
-	 * Automated performance testing of the main regex.
-	 *
-	 * @dataProvider data_whole_posts
-	 *
-	 * @covers ::get_html_split_regex
-	 */
-	public function test_pcre_performance( $input ) {
-		$regex  = get_html_split_regex();
-		$result = benchmark_pcre_backtracking( $regex, $input, 'split' );
-		return $this->assertLessThan( 200, $result );
-	}
-
 	public function data_whole_posts() {
 		require_once DIR_TESTDATA . '/formatting/whole-posts.php';
 		return data_whole_posts();
diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php
index 269da2b05e34d..470ddb527ce0a 100644
--- a/tests/phpunit/tests/shortcode.php
+++ b/tests/phpunit/tests/shortcode.php
@@ -622,7 +622,7 @@ public function data_escaping() {
 	 * @dataProvider data_escaping2
 	 */
 	public function test_escaping2( $input, $output ) {
-		return $this->assertSame( $output, strip_shortcodes( $input ) );
+		return $this->assertEqualHTML( $output, strip_shortcodes( $input ) );
 	}
 
 	public function data_escaping2() {

From 556b3086044cd464cf8dea765fd1136274e31d86 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 11 Sep 2025 20:26:34 -0500
Subject: [PATCH 08/10] Process incomplete HTML tokens as raw text anyway

---
 src/wp-includes/formatting.php    | 18 ++++++++++++++++++
 tests/phpunit/tests/shortcode.php |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 608f66fcf03a9..228257c591689 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -662,8 +662,10 @@ public function extract_raw_token() {
 
 	$tokens   = array();
 	$was_text = false;
+	$next_at  = 0;
 	while ( $token_reporter->next_token() ) {
 		$raw_token = $token_reporter->extract_raw_token();
+		$next_at  += strlen( $raw_token );
 		$is_text   = '#text' === $token_reporter->get_token_name();
 
 		if ( ! $is_text && ! $was_text ) {
@@ -685,6 +687,22 @@ public function extract_raw_token() {
 		$was_text = $is_text;
 	}
 
+	/*
+	 * The HTML API aborts when a string ends with the start of a
+	 * token which isn’t complete, such as an un-closed comment.
+	 * Typically it’s best to avoid processing or passing along
+	 * that content because it could impact any HTML which follows
+	 * it. However, to maintain backwards compatibility this last
+	 * segment needs to appear.
+	 */
+	if ( $token_reporter->paused_at_incomplete_token() ) {
+		if ( ! $was_text ) {
+			$tokens[] = '';
+		}
+		$was_text = false;
+		$tokens[] = substr( $input, $next_at );
+	}
+
 	if ( ! $was_text ) {
 		$tokens[] = '';
 	}
diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php
index 470ddb527ce0a..d78fa98f07aaa 100644
--- a/tests/phpunit/tests/shortcode.php
+++ b/tests/phpunit/tests/shortcode.php
@@ -544,7 +544,7 @@ public function test_spaces_around_shortcodes() {
 	 * @dataProvider data_escaping
 	 */
 	public function test_escaping( $input, $output ) {
-		return $this->assertSame( $output, do_shortcode( $input ) );
+		return $this->assertEqualHTML( $output, do_shortcode( $input ) );
 	}
 
 	public function data_escaping() {

From a8bd5321818991f4dd5f802c2863c015e8f0c7ba Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 21 Oct 2025 02:04:02 -0700
Subject: [PATCH 09/10] wp_html_split() match legacy shortcode-tags

---
 src/wp-includes/formatting.php | 47 +++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 228257c591689..75ecf0adba18b 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -668,23 +668,46 @@ public function extract_raw_token() {
 		$next_at  += strlen( $raw_token );
 		$is_text   = '#text' === $token_reporter->get_token_name();
 
-		if ( ! $is_text && ! $was_text ) {
-			$tokens[] = '';
+		if ( ! $is_text ) {
+			if ( ! $was_text ) {
+				$tokens[] = '';
+			}
+
+			$tokens[] = $raw_token;
+			$was_text = false;
+			continue;
 		}
 
 		/*
-		 * Some legacy code assumes that text nodes will never start with a
-		 * less-than sign (<) but this isn’t the case, as some text nodes do
-		 * if the less-than sign doesn’t introduce a syntax token. To avoid
-		 * further corruption a leading less-than sign is replaced by its
-		 * encoded equivalent numeric character reference.
+		 * WordPress looks for shortcodes and escaped shortcodes within the HTML
+		 * where they look like tags but HTML wouldn’t consider them tags, such
+		 * as in "<[header level=2]>". Look for these and artificially split the
+		 * text nodes where it looks like shortcodes reside inside.
 		 */
-		if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) {
-			$raw_token = '&#60;' . substr( $raw_token, 1 );
-		}
+		$shortcode_pattern = get_shortcode_regex();
+		$text_chunks       = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE );
+		foreach ( $text_chunks as $i => $token ) {
+			// Odd indices are assumed to hold the captured delimiters (TODO confirm inner groups); `===` binds tighter than `&`.
+			$is_shortcode_tag = 0x01 === ( $i & 0x01 );
+
+			if ( $is_shortcode_tag && ! $was_text ) {
+				$tokens[] = '';
+			}
 
-		$tokens[] = $raw_token;
-		$was_text = $is_text;
+			/*
+			 * Some legacy code assumes that text nodes will never start with a
+			 * less-than sign (<) but this isn’t the case, as some text nodes do
+			 * if the less-than sign doesn’t introduce a syntax token. To avoid
+			 * further corruption a leading less-than sign is replaced by its
+			 * encoded equivalent numeric character reference.
+			 */
+			if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
+				$token = '&#60;' . substr( $token, 1 );
+			}
+
+			$was_text = ! $is_shortcode_tag;
+			$tokens[] = $token;
+		}
 	}
 
 	/*

From 638e6659eb3e8605aeb15498acebb1d7775f105b Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 15 Jul 2025 14:28:54 -0500
Subject: [PATCH 10/10] Refactor `wp_replace_in_html_tags()`

---
 src/wp-includes/formatting.php | 81 +++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 22 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 75ecf0adba18b..36c1f3f2a9296 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -872,16 +872,44 @@ function _get_wptexturize_shortcode_regex( $tagnames ) {
 /**
  * Replaces characters or phrases within HTML elements only.
  *
+ * This is a dangerous function which can break HTML syntax,
+ * consider using methods from the HTML API instead.
+ *
+ * Example:
+ *
+ *     '<p class="test">data-class</p>' === wp_replace_in_html_tags(
+ *         '<p data-class="test">data-class</p>',
+ *         array( 'data-class' => 'class' )
+ *     );
+ *
  * @since 4.2.3
+ * @since 6.9.0 Reliably parses HTML via the HTML API.
  *
- * @param string $haystack      The text which has to be formatted.
+ * @param string $html          Replace matches inside the tags of this HTML.
  * @param array  $replace_pairs In the form array('from' => 'to', ...).
- * @return string The formatted text.
+ * @return string HTML after replacing the `$replace_pairs` matches, but only those
+ *                matches which appear inside HTML opening and closing tags.
  */
-function wp_replace_in_html_tags( $haystack, $replace_pairs ) {
-	// Find all elements.
-	$textarr = wp_html_split( $haystack );
-	$changed = false;
+function wp_replace_in_html_tags( $html, $replace_pairs ) {
+	$token_updater = new class( $html ) extends WP_HTML_Tag_Processor {
+		public function extract_raw_token() {
+			$this->set_bookmark( 'here' );
+			$here = $this->bookmarks['here'];
+
+			return substr( $this->html, $here->start, $here->length );
+		}
+
+		public function replace_raw_token( $new_raw_html ) {
+			$this->set_bookmark( 'here' );
+			$here = $this->bookmarks['here'];
+
+			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+				$here->start,
+				$here->length,
+				$new_raw_html
+			);
+		}
+	};
 
 	// Optimize when searching for one item.
 	if ( 1 === count( $replace_pairs ) ) {
@@ -889,35 +917,44 @@ function wp_replace_in_html_tags( $haystack, $replace_pairs ) {
 		$needle  = array_key_first( $replace_pairs );
 		$replace = $replace_pairs[ $needle ];
 
-		// Loop through delimiters (elements) only.
-		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) {
-			if ( str_contains( $textarr[ $i ], $needle ) ) {
-				$textarr[ $i ] = str_replace( $needle, $replace, $textarr[ $i ] );
-				$changed       = true;
+		while ( $token_updater->next_token() ) {
+			if ( '#text' === $token_updater->get_token_name() ) {
+				continue;
+			}
+
+			$token   = $token_updater->extract_raw_token();
+			$updated = str_replace( $needle, $replace, $token );
+
+			if ( $token !== $updated ) {
+				$token_updater->replace_raw_token( $updated );
 			}
 		}
 	} else {
 		// Extract all $needles.
 		$needles = array_keys( $replace_pairs );
 
-		// Loop through delimiters (elements) only.
-		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) {
+		while ( $token_updater->next_token() ) {
+			if ( '#text' === $token_updater->get_token_name() ) {
+				continue;
+			}
+
+			$token   = $token_updater->extract_raw_token();
+			$updated = $token;
+
 			foreach ( $needles as $needle ) {
-				if ( str_contains( $textarr[ $i ], $needle ) ) {
-					$textarr[ $i ] = strtr( $textarr[ $i ], $replace_pairs );
-					$changed       = true;
-					// After one strtr() break out of the foreach loop and look at next element.
+				if ( str_contains( $token, $needle ) ) {
+					$updated = strtr( $updated, $replace_pairs );
 					break;
 				}
 			}
-		}
-	}
 
-	if ( $changed ) {
-		$haystack = implode( $textarr );
+			if ( $token !== $updated ) {
+				$token_updater->replace_raw_token( $updated );
+			}
+		}
 	}
 
-	return $haystack;
+	return $token_updater->get_updated_html();
 }
 
 /**