Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 161 additions & 4 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -605,27 +605,184 @@ function wpautop( $text, $br = true ) {
}

/**
* Separates HTML elements and comments from the text.
* Splits an HTML input into an array of raw strings, where each token
* represents a tag, a comment, a text node, etc…
*
* No effort is made to clean up, sanitize, or normalize the segments
* of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
*
* Consider using the HTML API directly instead of relying on this
* legacy function: it bloats memory by default and provides a text
* interface for working with HTML whereas the HTML API provides a
* low-overhead and convenient structural interface.
*
* ## Output format:
*
* To maintain legacy behaviors with this function from when it
* operated via {@see preg_split()}, the output array injects text
* nodes which do not appear in the source HTML. That is, the original
* array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
* span on each side of every tag-like or comment-like “delimiter” in
* the matched string.
*
* Therefore, the output array will always start and end with text nodes
* and will separate every non-text node with a text node. If there is no
* actual content in the interstitial space between tokens in the source
* document, an empty text node will be created.
*
* Example:
*
* array( '', '<img>', '' ) === wp_html_split( '<img>' );
* array( 'test' ) === wp_html_split( 'test' );
* array( '', '<p>', 'test' ) === wp_html_split( '<p>test' );
* array( 'test', '</p>', '' ) === wp_html_split( 'test</p>' );
*
* array( '', '<br>', '', '<!-- comment -->', '' ) === wp_html_split( '<br><!-- comment -->' );
*
* // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded.
* array( '&#60;3' ) === wp_html_split( '<3' );
*
* @since 4.2.4
* @since 7.0.0 Reliably parses HTML via the HTML API.
*
* @param string $input The text which has to be formatted.
* @return string[] Array of the formatted text.
* @param string $input HTML document to split, one item for every token.
* These can be text nodes, tags, comments, or doctype declarations.
* @return string[] Tokens from input; starting and ending in a text node, and with text
* nodes between every non-text node (see docblock note).
*/
function wp_html_split( $input ) {
	/*
	 * This anonymous subclass exists only to report the raw source text of
	 * the token the processor currently sits on. It does so by bookmarking
	 * the token and slicing the processor's own `$html` with the bookmark's
	 * `start` and `length`.
	 */
	$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
		/**
		 * Returns the raw source span of the currently-matched token.
		 *
		 * @return string Unmodified slice of the input document.
		 */
		public function extract_raw_token() {
			$this->set_bookmark( 'here' );
			$here = $this->bookmarks['here'];

			return substr( $this->html, $here->start, $here->length );
		}
	};

	$tokens   = array();
	$was_text = false;

	// Byte offset just past the last token reported by the processor.
	$next_at = 0;

	/*
	 * Pattern matching shortcode-like "tags" such as "<[header level=2]>".
	 * It is built on first use and then reused, because the shortcode regex
	 * does not depend on the token being examined.
	 */
	$shortcode_tag_pattern = null;

	while ( $token_reporter->next_token() ) {
		$raw_token = $token_reporter->extract_raw_token();
		$next_at  += strlen( $raw_token );
		$is_text   = '#text' === $token_reporter->get_token_name();

		// This is a tag, comment, DOCTYPE declaration, malformed comment, etc…
		if ( ! $is_text ) {
			/*
			 * Code relies on the fact that this function always returns text
			 * in even indices and non-text tokens in odd indices. If there
			 * isn't preceding text then an artificial and empty span needs
			 * to be added.
			 */
			if ( ! $was_text ) {
				$tokens[] = '';
			}

			$tokens[] = $raw_token;
			$was_text = false;
			continue;
		}

		/*
		 * WordPress looks for shortcodes and escaped shortcodes within the HTML
		 * where they look like tags but HTML wouldn't consider them tags, such
		 * as in "<[header level=2]>".
		 *
		 * This means that something WordPress wants to consider a tag might
		 * appear in the middle of a larger text span. To preserve that behavior
		 * it's essential to look inside text nodes for these shortcode instances,
		 * and if found, split the string around them.
		 *
		 * Example:
		 *
		 *     // HTML sees a single text span here.
		 *     "This is <[tag-name]>important!"
		 *
		 *     // It needs to break into three segments.
		 *     "This is ", "<[tag-name]>", "important!"
		 *
		 * As with the rest of this function, text nodes must appear between these,
		 * implying the creation of empty nodes where they don't already exist.
		 *
		 * The text is cut at the offsets of the whole-pattern matches instead of
		 * using preg_split() with PREG_SPLIT_DELIM_CAPTURE: the shortcode regex
		 * carries its own capture groups, each of which preg_split() would emit
		 * as an additional array item, and wrapping it in one more group would
		 * renumber the back-reference it uses to find the closing shortcode tag.
		 */
		if ( null === $shortcode_tag_pattern ) {
			$shortcode_tag_pattern = '~<' . get_shortcode_regex() . '>~';
		}

		// Alternating list: text, shortcode tag, text, …, always ending in text.
		$text_chunks = array();
		$chunk_at    = 0;
		if ( preg_match_all( $shortcode_tag_pattern, $raw_token, $matches, PREG_OFFSET_CAPTURE ) ) {
			foreach ( $matches[0] as $match ) {
				list( $shortcode_tag, $tag_at ) = $match;

				$text_chunks[] = (string) substr( $raw_token, $chunk_at, $tag_at - $chunk_at );
				$text_chunks[] = $shortcode_tag;
				$chunk_at      = $tag_at + strlen( $shortcode_tag );
			}
		}
		$text_chunks[] = (string) substr( $raw_token, $chunk_at );

		foreach ( $text_chunks as $i => $token ) {
			// Shortcode tags sit in the odd indices of the alternating list.
			$is_shortcode_tag = 1 === $i % 2;

			if ( $is_shortcode_tag && ! $was_text ) {
				$tokens[] = '';
			}

			/*
			 * Some legacy code assumes that text nodes will never start with a
			 * less-than sign (<) but this isn't the case, as some text nodes do
			 * if the less-than sign doesn't introduce a syntax token. To avoid
			 * further corruption, a leading less-than sign is replaced by its
			 * equivalent numeric character reference.
			 *
			 * Example:
			 *
			 *      input: "<3 the shortcodes like <[emoji-tag name=heart]>"
			 *     output: "&#60;3 the shortcodes like <[emoji-tag name=heart]>"
			 */
			if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
				$token = '&#60;' . substr( $token, 1 );
			}

			$was_text = ! $is_shortcode_tag;
			$tokens[] = $token;
		}
	}

	/*
	 * The HTML API aborts when a string ends with the start of a
	 * token which isn't complete, such as an un-closed comment.
	 * Typically it's best to avoid processing or passing along
	 * that content because it could impact any HTML which follows
	 * it. However, to maintain backwards compatibility this last
	 * segment needs to appear.
	 */
	if ( $token_reporter->paused_at_incomplete_token() ) {
		$tail = (string) substr( $input, $next_at );
		if ( '<' === ( $tail[0] ?? '' ) ) {
			$tail = '&#60;' . substr( $tail, 1 );
		}

		if ( $was_text ) {
			$tokens[ count( $tokens ) - 1 ] .= $tail;
		} else {
			$tokens[] = $tail;
		}

		/*
		 * The unfinished tail is reported as text, so the list now ends in
		 * a text node. Recording that prevents a second, empty text node
		 * from being appended below, which would place text in an odd index.
		 */
		$was_text = true;
	}

	// The output must always end with a text node, even an empty one.
	if ( ! $was_text ) {
		$tokens[] = '';
	}

	return $tokens;
}

/**
* Retrieves the regular expression for an HTML element.
*
* @since 4.4.0
* @deprecated 6.9.0 Use the HTML API instead.
*
* @return string The regular expression.
*/
function get_html_split_regex() {
static $regex;

_deprecated_function(
__FUNCTION__,
'6.9.0',
'Use the HTML API instead.'
);

if ( ! isset( $regex ) ) {
// phpcs:disable Squiz.Strings.ConcatenationSpacing.PaddingFound -- don't remove regex indentation
$comments =
Expand Down
11 changes: 10 additions & 1 deletion tests/phpunit/tests/formatting/wpAutop.php
Original file line number Diff line number Diff line change
Expand Up @@ -536,9 +536,18 @@ public function data_element_sanity() {
"Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 ]]> -->",
"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 ]]> --></p>\n",
),
/*
* CDATA sections do not exist within HTML, so even though it looks
* like this should be escaping the entire “inner comment” span, there’s
* actually an invalid comment starting at `<![CDATA[` and ending at
* the very first `>` character, placing the end of the comment at
* what looks like the end of the “normal comment” — the ` -->`. Everything
* afterward is normal HTML data so the `<br>` is a real `BR` element and
* the `]]>` is normal plaintext, not the CDATA terminator.
*/
array(
"Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a\n9 ]]>",
"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a\n9 ]]></p>\n",
"<p>Hello <![CDATA[ <!-- a\nhttps://youtu.be/jgz0uSaOZbE\n a\n9 --> a<br />\n9 ]]></p>\n",
),
);
}
Expand Down
22 changes: 8 additions & 14 deletions tests/phpunit/tests/formatting/wpHtmlSplit.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,20 @@ public function data_basic_features() {
'abcd <!-- <html> --> efgh',
array( 'abcd ', '<!-- <html> -->', ' efgh' ),
),
/*
* CDATA sections do not exist within HTML, so even though it looks
* like this should be escaping the entire ` <html> ` span, there’s
* actually an invalid comment starting at `<![CDATA[` and ending at
* the very first `>` character, placing the end of the comment at
* the end of `html>`. The rest is normal plaintext content.
*/
array(
'abcd <![CDATA[ <html> ]]> efgh',
array( 'abcd ', '<![CDATA[ <html> ]]>', ' efgh' ),
array( 'abcd ', '<![CDATA[ <html>', ' ]]> efgh' ),
),
);
}

/**
* Automated performance testing of the main regex.
*
* @dataProvider data_whole_posts
*
* @covers ::get_html_split_regex
*/
public function test_pcre_performance( $input ) {
	// Run the split regex against the supplied post content in 'split' mode.
	$backtrack_score = benchmark_pcre_backtracking( get_html_split_regex(), $input, 'split' );

	// NOTE(review): 200 is presumably the allowed backtracking budget — confirm against benchmark_pcre_backtracking().
	return $this->assertLessThan( 200, $backtrack_score );
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no longer a PCRE used in wp_html_split() and therefore no backtracking.


public function data_whole_posts() {
require_once DIR_TESTDATA . '/formatting/whole-posts.php';
return data_whole_posts();
Expand Down
40 changes: 33 additions & 7 deletions tests/phpunit/tests/rest-api/rest-comments-controller.php
Original file line number Diff line number Diff line change
Expand Up @@ -3152,24 +3152,50 @@ public function test_comment_roundtrip_as_editor_unfiltered_html() {
}
}

/**
* Ensures that saving a comment as a super-admin does not corrupt the
* comment content when presented with common edge cases.
*
* Note that this test used to assert the wrong behavior due to a bug
* in {@see wp_html_split()}: the unescaped `<` used to be mistakenly
* identified as the start of an HTML tag or comment, which led to
* accidental replacement “inside” the mistaken tag. The test has been
* updated, together with `wp_html_split()`, in accordance with the
* HTML5 living specification.
*
* @ticket {TICKET_NUMBER}
*/
public function test_comment_roundtrip_as_superadmin() {
wp_set_current_user( self::$superadmin_id );

$raw_content = <<<'HTML'
\\&\\ &amp; &invalid; < &lt; &amp;lt;
HTML;
$rendered = <<<'HTML'
<p>\\&#038;\\ &amp; &invalid; < &lt; &amp;lt;</p>
HTML;
$author_name = <<<'HTML'
\\&amp;\\ &amp; &amp;invalid; &lt; &lt; &amp;lt;
HTML;
$author_user_agent = <<<'HTML'
\\&\\ &amp; &invalid; &lt; &lt; &amp;lt;
HTML;

$this->assertTrue( current_user_can( 'unfiltered_html' ) );
$this->verify_comment_roundtrip(
array(
'content' => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
'author_name' => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
'author_user_agent' => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
'content' => $raw_content,
'author_name' => $raw_content,
'author_user_agent' => $raw_content,
'author' => self::$superadmin_id,
),
array(
'content' => array(
'raw' => '\\\&\\\ &amp; &invalid; < &lt; &amp;lt;',
'rendered' => '<p>\\\&#038;\\\ &amp; &invalid; < &lt; &amp;lt;' . "\n</p>",
'raw' => $raw_content,
'rendered' => $rendered,
),
'author_name' => '\\\&amp;\\\ &amp; &amp;invalid; &lt; &lt; &amp;lt;',
'author_user_agent' => '\\\&\\\ &amp; &invalid; &lt; &lt; &amp;lt;',
'author_name' => $author_name,
'author_user_agent' => $author_user_agent,
'author' => self::$superadmin_id,
)
);
Expand Down
4 changes: 2 additions & 2 deletions tests/phpunit/tests/shortcode.php
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ public function test_spaces_around_shortcodes() {
* @dataProvider data_escaping
*/
public function test_escaping( $input, $output ) {
	/*
	 * Compare as HTML rather than byte-for-byte, so that equivalent
	 * spellings of the same markup (for example a character reference
	 * in place of a literal character) do not fail the test.
	 */
	return $this->assertEqualHTML( $output, do_shortcode( $input ) );
}

public function data_escaping() {
Expand Down Expand Up @@ -622,7 +622,7 @@ public function data_escaping() {
* @dataProvider data_escaping2
*/
public function test_escaping2( $input, $output ) {
	/*
	 * Compare as HTML rather than byte-for-byte, so that equivalent
	 * spellings of the same markup (for example a character reference
	 * in place of a literal character) do not fail the test.
	 */
	return $this->assertEqualHTML( $output, strip_shortcodes( $input ) );
}

public function data_escaping2() {
Expand Down
Loading