diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index bd2d349fa20c1..36c1f3f2a9296 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -605,27 +605,151 @@ function wpautop( $text, $br = true ) {
}
/**
- * Separates HTML elements and comments from the text.
+ * Splits an HTML input into an array of raw strings, where each string
+ * is the source text of one token: a tag, a comment, a text node, etc.
+ *
+ * No effort is made to clean up, sanitize, or normalize the segments
+ * of HTML. See {@see WP_HTML_Processor::normalize()} for normalization.
+ *
+ * Consider using the HTML API directly instead of relying on this
+ * legacy function: it bloats memory by default and provides a text
+ * interface for working with HTML whereas the HTML API provides a
+ * low-overhead and convenient structural interface.
+ *
+ * ## Output format:
+ *
+ * To maintain legacy behaviors with this function from when it
+ * operated via {@see preg_split()}, the output array injects text
+ * nodes which do not appear in the source HTML. That is, the original
+ * array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
+ * span on each side of every tag-like or comment-like “delimiter” in
+ * the matched string.
+ *
+ * Therefore, the output array always starts and ends with a text node,
+ * and a text node sits between every pair of adjacent non-text nodes. If
+ * the source document has no content between two tokens (or before the
+ * first or after the last), an empty text node is created in that spot.
+ *
+ * Example:
+ *
+ * array( '', '', '' ) === wp_html_split( '
' );
+ * array( 'test' ) === wp_html_split( 'test' );
+ * array( '', '
', 'test' ) === wp_html_split( '
test' ); + * array( 'test', '
', '' ) === wp_html_split( 'test' ); + * + * array( '', 'data-class
' === wp_replace_in_html_tags( + * 'data-class
', + * array( 'data-class' => 'class' ) + * ); + * * @since 4.2.3 + * @since {WP_VERSION} Reliably parses HTML via the HTML API. * - * @param string $haystack The text which has to be formatted. + * @param string $html Replace matches inside the tags of this HTML. * @param array $replace_pairs In the form array('from' => 'to', ...). - * @return string The formatted text. + * @return string HTML after replacing the `$replace_pairs` matches, but only those + * matches which appear inside HTML opening and closing tags. */ -function wp_replace_in_html_tags( $haystack, $replace_pairs ) { - // Find all elements. - $textarr = wp_html_split( $haystack ); - $changed = false; +function wp_replace_in_html_tags( $html, $replace_pairs ) { + $token_updater = new class( $html ) extends WP_HTML_Tag_Processor { + public function extract_raw_token() { + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + return substr( $this->html, $here->start, $here->length ); + } + + public function replace_raw_token( $new_raw_html ) { + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $here->start, + $here->length, + $new_raw_html + ); + } + }; // Optimize when searching for one item. if ( 1 === count( $replace_pairs ) ) { @@ -765,35 +917,44 @@ function wp_replace_in_html_tags( $haystack, $replace_pairs ) { $needle = array_key_first( $replace_pairs ); $replace = $replace_pairs[ $needle ]; - // Loop through delimiters (elements) only. 
- for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { - if ( str_contains( $textarr[ $i ], $needle ) ) { - $textarr[ $i ] = str_replace( $needle, $replace, $textarr[ $i ] ); - $changed = true; + while ( $token_updater->next_token() ) { + if ( '#text' === $token_updater->get_token_name() ) { + continue; + } + + $token = $token_updater->extract_raw_token(); + $updated = str_replace( $needle, $replace, $token ); + + if ( $token !== $updated ) { + $token_updater->replace_raw_token( $updated ); } } } else { // Extract all $needles. $needles = array_keys( $replace_pairs ); - // Loop through delimiters (elements) only. - for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { + while ( $token_updater->next_token() ) { + if ( '#text' === $token_updater->get_token_name() ) { + continue; + } + + $token = $token_updater->extract_raw_token(); + $updated = $token; + foreach ( $needles as $needle ) { - if ( str_contains( $textarr[ $i ], $needle ) ) { - $textarr[ $i ] = strtr( $textarr[ $i ], $replace_pairs ); - $changed = true; - // After one strtr() break out of the foreach loop and look at next element. + if ( str_contains( $token, $needle ) ) { + $updated = strtr( $updated, $replace_pairs ); break; } } - } - } - if ( $changed ) { - $haystack = implode( $textarr ); + if ( $token !== $updated ) { + $token_updater->replace_raw_token( $updated ); + } + } } - return $haystack; + return $token_updater->get_updated_html(); } /** diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php index ee4d90645d09c..b97468e7830f8 100644 --- a/tests/phpunit/tests/formatting/wpAutop.php +++ b/tests/phpunit/tests/formatting/wpAutop.php @@ -536,9 +536,18 @@ public function data_element_sanity() { "Hello -->", "Hello -->
\n", ), + /* + * CDATA sections do not exist within HTML, so even though it looks + * like this should be escaping the entire “inner comment” span, there’s + * actually an invalid comment starting at `` character, placing the end of the comment at + * what looks like the end of the “normal comment” — the ` -->`. Everything + * afterward is normal HTML data so the `Hello a\n9 ]]>
\n", + "Hello a
\n9 ]]>
\\&\\ & &invalid; < < <
+HTML; + $author_name = <<<'HTML' +\\&\\ & &invalid; < < < +HTML; + $author_user_agent = <<<'HTML' +\\&\\ & &invalid; < < < +HTML; + $this->assertTrue( current_user_can( 'unfiltered_html' ) ); $this->verify_comment_roundtrip( array( - 'content' => '\\\&\\\ & &invalid; < < <', - 'author_name' => '\\\&\\\ & &invalid; < < <', - 'author_user_agent' => '\\\&\\\ & &invalid; < < <', + 'content' => $raw_content, + 'author_name' => $raw_content, + 'author_user_agent' => $raw_content, 'author' => self::$superadmin_id, ), array( 'content' => array( - 'raw' => '\\\&\\\ & &invalid; < < <', - 'rendered' => '\\\&\\\ & &invalid; < < <' . "\n
", + 'raw' => $raw_content, + 'rendered' => $rendered, ), - 'author_name' => '\\\&\\\ & &invalid; < < <', - 'author_user_agent' => '\\\&\\\ & &invalid; < < <', + 'author_name' => $author_name, + 'author_user_agent' => $author_user_agent, 'author' => self::$superadmin_id, ) ); diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php index 269da2b05e34d..d78fa98f07aaa 100644 --- a/tests/phpunit/tests/shortcode.php +++ b/tests/phpunit/tests/shortcode.php @@ -544,7 +544,7 @@ public function test_spaces_around_shortcodes() { * @dataProvider data_escaping */ public function test_escaping( $input, $output ) { - return $this->assertSame( $output, do_shortcode( $input ) ); + return $this->assertEqualHTML( $output, do_shortcode( $input ) ); } public function data_escaping() { @@ -622,7 +622,7 @@ public function data_escaping() { * @dataProvider data_escaping2 */ public function test_escaping2( $input, $output ) { - return $this->assertSame( $output, strip_shortcodes( $input ) ); + return $this->assertEqualHTML( $output, strip_shortcodes( $input ) ); } public function data_escaping2() {