From 549ee7f7dea63d350d1901e3ff83480d27b32902 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 6 Oct 2025 14:14:05 -0700 Subject: [PATCH] Blocks: Introduce WP_Block_Processor for efficiently parsing blocks. (#9105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Block Processor follows the HTML API in providing a streaming, near-zero-overhead, lazy, re-entrant parser for traversing block structure. This class provides an alternate interface to `parse_blocks()` which is more amenable to a number of common server-side operations on posts, such as: - Generating an excerpt from only the first N blocks in a post. - Determining which block types are present in a post. - Determining which posts contain a block of a given type. - Generating block supports content for a post. - Modifying a single block, or only blocks of a given kind in a post. Co-authored-by: Sören Wünsch Co-authored-by: Tom J Nowell Co-authored-by: Weston Ruter Co-authored-by: Jon Surrell Co-authored-by: Greg Ziółkowski Github-PR: 9105 Github-PR-URL: https://github.com/WordPress/wordpress-develop/pull/9105 Trac-Ticket: 61401 Trac-Ticket-URL: https://core.trac.wordpress.org/ticket/61401 --- phpcs.xml.dist | 5 +- src/wp-includes/blocks.php | 11 + src/wp-includes/class-wp-block-processor.php | 1983 +++++++++++++++++ src/wp-settings.php | 1 + .../wpBlockProcessor-BlockProcessing.php | 134 ++ .../block-processor/wpBlockProcessor.php | 1238 ++++++++++ 6 files changed, 3370 insertions(+), 2 deletions(-) create mode 100644 src/wp-includes/class-wp-block-processor.php create mode 100644 tests/phpunit/tests/block-processor/wpBlockProcessor-BlockProcessing.php create mode 100644 tests/phpunit/tests/block-processor/wpBlockProcessor.php diff --git a/phpcs.xml.dist b/phpcs.xml.dist index f6bc4482156d9..a8387b3604c9b 100644 --- a/phpcs.xml.dist +++ b/phpcs.xml.dist @@ -229,7 +229,7 @@ ############################################################################# 
SELECTIVE EXCLUSIONS Exclude specific files for specific sniffs and/or exclude sub-groups in sniffs. - + These exclusions are listed ordered by alphabetic sniff name. ############################################################################# --> @@ -266,8 +266,9 @@ + and risky handling code. Exclude forbidding goto in parser code. --> /wp-includes/compat-utf8\.php + /wp-includes/class-wp-block-processor\.php ` delimiter it will return `core/group` as the + * block type. + * + * There are two special block types that change the behavior of the processor: + * + * - The wildcard `*` represents _any block_. In addition to matching all block types, + * it also represents top-level freeform HTML whose block type is reported as `null`. + * + * - The `core/freeform` block type is a pseudo-block type which explicitly matches + * top-level freeform HTML. + * + * These special block types can be passed into any method which searches for blocks. + * + * There is one additional special block type which may be returned from + * {@see self::get_printable_block_type()}. This is the `#innerHTML` type, which + * indicates that the HTML span on which the processor is paused is inner HTML for + * a containing block. + * + * ### Spans of HTML + * + * Non-block content plays a complicated role in processing block documents. This + * processor exposes tools to help work with these spans of HTML. + * + * - {@see self::is_html()} indicates if the processor is paused at a span of + * HTML but does not differentiate between top-level freeform content and inner HTML. + * - {@see self::is_non_whitespace_html()} indicates not only if the processor + * is paused at a span of HTML, but also whether that span incorporates more than + * whitespace characters. Because block serialization often inserts newlines between + * block comment delimiters, this is useful for distinguishing “real” freeform + * content from purely aesthetic syntax. 
+ * - {@see self::is_block_type()} matches top-level freeform HTML content when + * provided one of the special block types described above. + * + * ### Block structure + * + * As the processor traverses block delimiters it maintains a stack of which blocks are + * open at the given place in the document where it’s paused. This stack represents the + * block structure of a document and is used to determine where blocks end, which blocks + * represent inner blocks, whether a span of HTML is top-level freeform content, and + * more. Investigate the stack with {@see self::get_breadcrumbs()}, which returns an + * array of block types starting at the outermost-open block and descending to the + * currently-visited block. + * + * Unlike {@see \parse_blocks()}, spans of HTML appear in this structure as the special + * reported block type `#html`. Such a span represents inner HTML for a block if the + * depth reported by {@see self::get_depth()} is greater than one. + * + * It will generally not be necessary to inspect the stack of open blocks, though + * depth may be important for finding where blocks end. When visiting a block opener, + * the depth will have been increased before pausing; in contrast the depth is + * decremented before visiting a closer. This makes the following an easy way to + * determine if a block is still open. + * + * Example: + * + * $depth = $processor->get_depth(); + * while ( $processor->next_token() && $processor->get_depth() > $depth ) { + * continue; + * } + * // Processor is now paused at the token immediately following the closed block. + * + * #### Extracting blocks + * + * A unique feature of this processor is the ability to return the same output as + * {@see \parse_blocks()} would produce, but for a subset of the input document. + * For example, it’s possible to extract an image block, manipulate that parsed + * block, and re-serialize it into the original document.
It’s possible to do so + * while skipping over the parse of the rest of the document. + * + * {@see self::extract_block()} will scan forward from the current block opener + * and build the parsed block structure until the current block is closed. It will + * include all inner HTML and inner blocks, and parse all of the inner blocks. It + * can be used to extract a block at any depth in the document, helpful for operating + * on blocks within nested structure. + * + * Example: + * + * if ( ! $processor->next_block( 'gallery' ) ) { + * return $post_content; + * } + * + * $gallery_at = $processor->get_span()->start; + * $gallery_block = $processor->extract_block(); + * $after_gallery = $processor->get_span()->start; + * return ( + * substr( $post_content, 0, $gallery_at ) . + * serialize_block( modify_gallery( $gallery_block ) ) . + * substr( $post_content, $after_gallery ) + * ); + * + * #### Handling of malformed structure + * + * There are situations where closing block delimiters appear for which no open block + * exists, or where a document ends before a block is closed, or where a closing block + * delimiter appears but references a different block type than the most-recently + * opened block does. In all of these cases, the stack of open blocks should mirror + * the behavior in {@see \parse_blocks()}. + * + * Unlike {@see \parse_blocks()}, however, this processor can still operate on the + * invalid block delimiters. It provides a few functions which can be used for building + * custom and non-spec-compliant error handling. + * + * - {@see self::has_closing_flag()} indicates if the block delimiter contains the + * closing flag (the `/` that precedes the `wp:` prefix). Some invalid block delimiters might contain both the + * void and closing flag, in which case {@see self::get_delimiter_type()} will + * report that it’s a void block. + * - {@see static::get_last_error()} indicates if the processor reached an invalid + * block closing.
Depending on the context, {@see \parse_blocks()} might instead + * ignore the token or treat it as freeform HTML content. + * + * ## Static helpers + * + * This class provides helpers for performing semantic block-related operations. + * + * - {@see self::normalize_block_type()} takes a block type with or without the + * implicit `core` namespace and returns a fully-qualified block type. + * - {@see self::are_equal_block_types()} indicates if two spans across one or + * more input texts represent the same fully-qualified block type. + * + * ## Subclassing + * + * This processor is designed to accurately parse a block document. Therefore, many + * of its methods are not meant for subclassing. However, overall this class supports + * building higher-level convenience classes which may choose to subclass it. For those + * classes, avoid re-implementing methods except for the list below. Instead, create + * new names representing the higher-level concepts being introduced. For example, instead + * of creating a new method named `next_block()` which only advances to blocks of a given + * kind, consider creating a new method named something like `next_layout_block()` which + * won’t interfere with the base class method. + * + * - {@see static::get_last_error()} may be reimplemented to report new errors in the subclass + * which aren’t intrinsic to block parsing. + * - {@see static::get_attributes()} may be reimplemented to provide a streaming interface + * to reading and modifying a block’s JSON attributes. It should be fast and memory efficient. + * - {@see static::get_last_json_error()} may be reimplemented to report new errors introduced + * with a reimplementation of {@see static::get_attributes()}. + * + * @since 6.9.0 + */ +class WP_Block_Processor { + /** + * Indicates if the last operation failed, otherwise + * will be `null` for success. 
+ * + * @since 6.9.0 + * + * @var string|null + */ + private $last_error = null; + + /** + * Indicates failures from decoding JSON attributes. + * + * @since 6.9.0 + * + * @see \json_last_error() + * + * @var int + */ + private $last_json_error = JSON_ERROR_NONE; + + /** + * Source text provided to processor. + * + * @since 6.9.0 + * + * @var string + */ + protected $source_text; + + /** + * Byte offset into source text where a matched delimiter starts. + * + * Example: + * + * 5 10 15 20 25 30 35 40 45 50 + * + * ╰─ Starts at byte offset 17. + * + * @since 6.9.0 + * + * @var int + */ + private $matched_delimiter_at = 0; + + /** + * Byte length of full span of a matched delimiter. + * + * Example: + * + * 5 10 15 20 25 30 35 40 45 50 + * + * ╰───────────────╯ + * 17 bytes long. + * + * @since 6.9.0 + * + * @var int + */ + private $matched_delimiter_length = 0; + + /** + * First byte offset into source text following any previously-matched delimiter. + * Used to indicate where an HTML span starts. + * + * Example: + * + * 5 10 15 20 25 30 35 40 45 50 55 + *

Content

<⃨!⃨-⃨-⃨ ⃨/⃨w⃨p⃨:⃨p⃨a⃨r⃨a⃨g⃨r⃨a⃨p⃨h⃨ ⃨-⃨-⃨>⃨ + * │ ╰─ This delimiter was matched, and after matching, + * │ revealed the preceding HTML span. + * │ + * ╰─ The first byte offset after the previous matched delimiter + * is 21. Because the matched delimiter starts at 55, which is after + * this, a span of HTML must exist between these boundaries. + * + * @since 6.9.0 + * + * @var int + */ + private $after_previous_delimiter = 0; + + /** + * Byte offset where namespace span begins. + * + * When no namespace is present, this will be the same as the starting + * byte offset for the block name. + * + * Example: + * + * + * │ ╰─ Name starts here. + * ╰─ Namespace starts here. + * + * + * ├─ The namespace would start here but is implied as “core.” + * ╰─ The name starts here. + * + * @since 6.9.0 + * + * @var int + */ + private $namespace_at = 0; + + /** + * Byte offset where block name span begins. + * + * When no namespace is present, this will be the same as the starting + * byte offset for the block namespace. + * + * Example: + * + * + * │ ╰─ Name starts here. + * ╰─ Namespace starts here. + * + * + * ├─ The namespace would start here but is implied as “core.” + * ╰─ The name starts here. + * + * @since 6.9.0 + * + * @var int + */ + private $name_at = 0; + + /** + * Byte length of block name span. + * + * Example: + * + * 5 10 15 20 25 + * + * ╰─────╯ + * 7 bytes long. + * + * @since 6.9.0 + * + * @var int + */ + private $name_length = 0; + + /** + * Whether the delimiter contains the block-closing flag. + * + * This may be erroneous if present within a void block, + * therefore the {@see self::has_closing_flag()} can be used by + * calling code to perform custom error-handling. + * + * @since 6.9.0 + * + * @var bool + */ + private $has_closing_flag = false; + + /** + * Byte offset where JSON attributes span begins. + * + * Example: + * + * 5 10 15 20 25 30 35 40 + * + * ╰─ Starts at byte offset 18. 
+ * + * @since 6.9.0 + * + * @var int + */ + private $json_at; + + /** + * Byte length of JSON attributes span, or 0 if none are present. + * + * Example: + * + * 5 10 15 20 25 30 35 40 + * + * ╰───────────────╯ + * 17 bytes long. + * + * @since 6.9.0 + * + * @var int + */ + private $json_length = 0; + + /** + * Internal parser state, differentiating whether the instance is currently matched, + * on an implicit freeform node, in error, or ready to begin parsing. + * + * @see self::READY + * @see self::MATCHED + * @see self::HTML_SPAN + * @see self::INCOMPLETE_INPUT + * @see self::COMPLETE + * + * @since 6.9.0 + * + * @var string + */ + protected $state = self::READY; + + /** + * Indicates what kind of block comment delimiter was matched. + * + * One of: + * + * - {@see self::OPENER} If the delimiter is opening a block. + * - {@see self::CLOSER} If the delimiter is closing an open block. + * - {@see self::VOID} If the delimiter represents a void block with no inner content. + * + * If a parsed comment delimiter contains both the closing and the void + * flags then it will be interpreted as a void block to match the behavior + * of the official block parser, however, this is a syntax error and probably + * the block ought to close an open block of the same name, if one is open. + * + * @since 6.9.0 + * + * @var string + */ + private $type; + + /** + * Whether the last-matched delimiter acts like a void block and should be + * popped from the stack of open blocks as soon as the parser advances. + * + * This applies to void block delimiters and to HTML spans. + * + * @since 6.9.0 + * + * @var bool + */ + private $was_void = false; + + /** + * For every open block, in hierarchical order, this stores the byte offset + * into the source text where the block type starts, including for HTML spans. 
+ * + * To avoid allocating and normalizing block names when they aren’t requested, + * the stack of open blocks is stored as the byte offsets and byte lengths of + * each open block’s block type. This allows for minimal tracking and quick + * reading or comparison of block types when requested. + * + * @since 6.9.0 + * + * @see self::$open_blocks_length + * + * @var int[] + */ + private $open_blocks_at = array(); + + /** + * For every open block, in hierarchical order, this stores the byte length + * of the block’s block type in the source text. For HTML spans this is 0. + * + * @since 6.9.0 + * + * @see self::$open_blocks_at + * + * @var int[] + */ + private $open_blocks_length = array(); + + /** + * Indicates which operation should apply to the stack of open blocks after + * processing any pending spans of HTML. + * + * Since HTML spans are discovered after matching block delimiters, those + * delimiters need to defer modifying the stack of open blocks. This value, + * if set, indicates what operation should be applied. The properties + * associated with token boundaries still point to the delimiters even + * when processing HTML spans, so there’s no need to track them independently. + * + * @var 'push'|'void'|'pop'|null + */ + private $next_stack_op = null; + + /** + * Creates a new block processor. + * + * Example: + * + * $processor = new WP_Block_Processor( $post_content ); + * if ( $processor->next_block( 'core/image' ) ) { + * echo "Found an image!\n"; + * } + * + * @see self::next_block() to advance to the start of the next block (skips closers). + * @see self::next_delimiter() to advance to the next explicit block delimiter. + * @see self::next_token() to advance to the next block delimiter or HTML span. + * + * @since 6.9.0 + * + * @param string $source_text Input document potentially containing block content. 
+ */ + public function __construct( string $source_text ) { + $this->source_text = $source_text; + } + + /** + * Advance to the next block delimiter which opens a block, indicating if one was found. + * + * Delimiters which open blocks include opening and void block delimiters. To visit + * freeform HTML content, pass the wildcard “*” as the block type. + * + * Use this function to walk through the blocks in a document, pausing where they open. + * + * Example blocks: + * + * // The first delimiter opens the paragraph block. + * <⃨!⃨-⃨-⃨ ⃨w⃨p⃨:⃨p⃨a⃨r⃨a⃨g⃨r⃨a⃨p⃨h⃨ ⃨-⃨-⃨>⃨

Content

+ * + * // The void block is the first opener in this sequence of closers. + * <⃨!⃨-⃨-⃨ ⃨w⃨p⃨:⃨s⃨p⃨a⃨c⃨e⃨r⃨ ⃨{⃨"⃨h⃨e⃨i⃨g⃨h⃨t⃨"⃨:⃨"⃨2⃨0⃨0⃨p⃨x⃨"⃨}⃨ ⃨/⃨-⃨-⃨>⃨ + * + * // If, however, `*` is provided as the block type, freeform content is matched. + * <⃨h⃨2⃨>⃨M⃨y⃨ ⃨s⃨y⃨n⃨o⃨p⃨s⃨i⃨s⃨<⃨/⃨h⃨2⃨>⃨\⃨n⃨ + * + * // Inner HTML is never freeform content, and will not be matched even with the wildcard. + * <⃨!⃨-⃨-⃨ ⃨w⃨p⃨:⃨p⃨a⃨r⃨a⃨g⃨r⃨a⃨p⃨h⃨ ⃨-⃨>⃨

+ * + * Example: + * + * // Find all textual ranges of image block opening delimiters. + * $images = array(); + * $processor = new WP_Block_Processor( $html ); + * while ( $processor->next_block( 'core/image' ) ) { + * $images[] = $processor->get_span(); + * } + * + * In some cases it may be useful to conditionally visit the implicit freeform + * blocks, such as when determining if a post contains freeform content that + * isn’t purely whitespace. + * + * Example: + * + * $seen_block_types = []; + * $block_type = '*'; + * $processor = new WP_Block_Processor( $html ); + * while ( $processor->next_block( $block_type ) ) { + * // Stop wasting time visiting freeform blocks after one has been found. + * if ( + * '*' === $block_type && + * null === $processor->get_block_type() && + * $processor->is_non_whitespace_html() + * ) { + * $block_type = null; + * $seen_block_types['core/freeform'] = true; + * continue; + * } + * + * $seen_block_types[ $processor->get_block_type() ] = true; + * } + * + * @since 6.9.0 + * + * @see self::next_delimiter() to advance to the next explicit block delimiter. + * @see self::next_token() to advance to the next block delimiter or HTML span. + * + * @param string|null $block_type Optional. If provided, advance until a block of this type is found. + * Default is to stop at any block regardless of its type. + * @return bool Whether an opening delimiter for a block was found. + */ + public function next_block( ?string $block_type = null ): bool { + while ( $this->next_delimiter( $block_type ) ) { + if ( self::CLOSER !== $this->get_delimiter_type() ) { + return true; + } + } + + return false; + } + + /** + * Advance to the next block delimiter in a document, indicating if one was found. + * + * Delimiters may include invalid JSON. This parser does not attempt to parse the + * JSON attributes until requested; when invalid, the attributes will be null. This + * matches the behavior of {@see \parse_blocks()}.
To visit freeform HTML content, + * pass the wildcard “*” as the block type. + * + * Use this function to walk through the block delimiters in a document. + * + * Example delimiters: + * + * + * + * + * + * // If the wildcard `*` is provided as the block type, freeform content is matched. + * <⃨h⃨2⃨>⃨M⃨y⃨ ⃨s⃨y⃨n⃨o⃨p⃨s⃨i⃨s⃨<⃨/⃨h⃨2⃨>⃨\⃨n⃨ + * + * // Inner HTML is never freeform content, and will not be matched even with the wildcard. + * ...<⃨!⃨-⃨-⃨ ⃨/⃨w⃨p⃨:⃨l⃨i⃨s⃨t⃨ ⃨-⃨-⃨>⃨

+ * + * Example: + * + * $html = '<!-- wp:void /-->\n<!-- wp:void /-->'; + * $processor = new WP_Block_Processor( $html ); + * while ( $processor->next_delimiter() ) { + * // Runs twice, seeing both void blocks of type “core/void.” + * } + * + * $processor = new WP_Block_Processor( $html ); + * while ( $processor->next_delimiter( '*' ) ) { + * // Runs thrice, seeing the void block, the newline span, and the void block. + * } + * + * @since 6.9.0 + * + * @param string|null $block_name Optional. Keep searching until a block of this name is found. + * Defaults to visit every block regardless of type. + * @return bool Whether a block delimiter was matched. + */ + public function next_delimiter( ?string $block_name = null ): bool { + if ( ! isset( $block_name ) ) { + while ( $this->next_token() ) { + if ( ! $this->is_html() ) { + return true; + } + } + + return false; + } + + while ( $this->next_token() ) { + if ( $this->is_block_type( $block_name ) ) { + return true; + } + } + + return false; + } + + /** + * Advance to the next block delimiter or HTML span in a document, indicating if one was found. + * + * This function steps through every syntactic chunk in a document. This includes explicit + * block comment delimiters, freeform non-block content, and inner HTML segments. + * + * Example tokens: + * + * + * + * + *

Normal HTML content

+ * Plaintext content too! + * + * Example: + * + * // Find span containing wrapping HTML element surrounding inner blocks. + * $processor = new WP_Block_Processor( $html ); + * if ( ! $processor->next_block( 'gallery' ) ) { + * return null; + * } + * + * $containing_span = null; + * while ( $processor->next_token() && $processor->is_html() ) { + * $containing_span = $processor->get_span(); + * } + * + * This method will visit all HTML spans including those forming freeform non-block + * content as well as those which are part of a block’s inner HTML. + * + * @since 6.9.0 + * + * @return bool Whether a token was matched or the end of the document was reached without finding any. + */ + public function next_token(): bool { + if ( $this->last_error || self::COMPLETE === $this->state || self::INCOMPLETE_INPUT === $this->state ) { + return false; + } + + // Void tokens automatically pop off the stack of open blocks. + if ( $this->was_void ) { + array_pop( $this->open_blocks_at ); + array_pop( $this->open_blocks_length ); + $this->was_void = false; + } + + $text = $this->source_text; + $end = strlen( $text ); + + /* + * Because HTML spans are inferred after finding the next delimiter, it means that + * the parser must transition out of that HTML state and reuse the token boundaries + * it found after the HTML span. If those boundaries are before the end of the + * document it implies that a real delimiter was found; otherwise this must be the + * terminating HTML span and the parsing is complete. 
+ */ + if ( self::HTML_SPAN === $this->state ) { + if ( $this->matched_delimiter_at >= $end ) { + $this->state = self::COMPLETE; + return false; + } + + switch ( $this->next_stack_op ) { + case 'void': + $this->was_void = true; + $this->open_blocks_at[] = $this->namespace_at; + $this->open_blocks_length[] = $this->name_at + $this->name_length - $this->namespace_at; + break; + + case 'push': + $this->open_blocks_at[] = $this->namespace_at; + $this->open_blocks_length[] = $this->name_at + $this->name_length - $this->namespace_at; + break; + + case 'pop': + array_pop( $this->open_blocks_at ); + array_pop( $this->open_blocks_length ); + break; + } + + $this->next_stack_op = null; + $this->state = self::MATCHED; + return true; + } + + $this->state = self::READY; + $after_prev_delimiter = $this->matched_delimiter_at + $this->matched_delimiter_length; + $at = $after_prev_delimiter; + + while ( $at < $end ) { + /* + * Find the next possible start of a delimiter. + * + * This follows the behavior in the official block parser, which segments a post + * by the block comment delimiters. It is possible for an HTML attribute to contain + * what looks like a block comment delimiter but which is actually an HTML attribute + * value. In such a case, the parser here will break apart the HTML and create the + * block boundary inside the HTML attribute. In other words, the block parser + * isolates sections of HTML from each other, even if that leads to malformed markup. + * + * For a more robust parse, scan through the document with the HTML API and parse + * comments once they are matched to see if they are also block delimiters. In + * practice, this nuance has not caused any known problems since developing blocks. 
+ * + * <⃨!⃨-⃨-⃨ /wp:core/paragraph {"dropCap":true} /--> + */ + $comment_opening_at = strpos( $text, ' + $opening_whitespace_at = $comment_opening_at + 4; + if ( $opening_whitespace_at >= $end ) { + goto incomplete; + } + + $opening_whitespace_length = strspn( $text, " \t\f\r\n", $opening_whitespace_at ); + + /* + * The `wp` prefix cannot come before this point, but it may come after it + * depending on the presence of the closer. This is detected next. + */ + $wp_prefix_at = $opening_whitespace_at + $opening_whitespace_length; + if ( $wp_prefix_at >= $end ) { + goto incomplete; + } + + if ( 0 === $opening_whitespace_length ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + // + $has_closer = false; + if ( '/' === $text[ $wp_prefix_at ] ) { + $has_closer = true; + ++$wp_prefix_at; + } + + // + if ( $wp_prefix_at < $end && 0 !== substr_compare( $text, 'wp:', $wp_prefix_at, 3 ) ) { + if ( + ( $wp_prefix_at + 2 >= $end && str_ends_with( $text, 'wp' ) ) || + ( $wp_prefix_at + 1 >= $end && str_ends_with( $text, 'w' ) ) + ) { + goto incomplete; + } + + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + /* + * If the block contains no namespace, this will end up masquerading with + * the block name. It’s easier to first detect the span and then determine + * if it’s a namespace of a name. + * + * + */ + $namespace_at = $wp_prefix_at + 3; + if ( $namespace_at >= $end ) { + goto incomplete; + } + + $start_of_namespace = $text[ $namespace_at ]; + + // The namespace must start with a-z. 
+ if ( 'a' > $start_of_namespace || 'z' < $start_of_namespace ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + $namespace_length = 1 + strspn( $text, 'abcdefghijklmnopqrstuvwxyz0123456789-_', $namespace_at + 1 ); + $separator_at = $namespace_at + $namespace_length; + if ( $separator_at >= $end ) { + goto incomplete; + } + + // + $has_separator = '/' === $text[ $separator_at ]; + if ( $has_separator ) { + $name_at = $separator_at + 1; + + if ( $name_at >= $end ) { + goto incomplete; + } + + // + $start_of_name = $text[ $name_at ]; + if ( 'a' > $start_of_name || 'z' < $start_of_name ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + $name_length = 1 + strspn( $text, 'abcdefghijklmnopqrstuvwxyz0123456789-_', $name_at + 1 ); + } else { + $name_at = $namespace_at; + $name_length = $namespace_length; + } + + if ( $name_at + $name_length >= $end ) { + goto incomplete; + } + + /* + * For this next section of the delimiter, it could be the JSON attributes + * or it could be the end of the comment. Assume that the JSON is there and + * update if it’s not. + */ + + // + $after_name_whitespace_at = $name_at + $name_length; + $after_name_whitespace_length = strspn( $text, " \t\f\r\n", $after_name_whitespace_at ); + $json_at = $after_name_whitespace_at + $after_name_whitespace_length; + + if ( $json_at >= $end ) { + goto incomplete; + } + + if ( 0 === $after_name_whitespace_length ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + // + $has_json = '{' === $text[ $json_at ]; + $json_length = 0; + + /* + * For the final span of the delimiter it's most efficient to find the end of the + * HTML comment and work backwards. This prevents complicated parsing inside the + * JSON span, which is not allowed to contain the HTML comment terminator. 
+ * + * This also matches the behavior in the official block parser, + * even though it allows for matching invalid JSON content. + * + * ', $json_at ); + if ( false === $comment_closing_at ) { + goto incomplete; + } + + // + if ( '/' === $text[ $comment_closing_at - 1 ] ) { + $has_void_flag = true; + $void_flag_length = 1; + } else { + $has_void_flag = false; + $void_flag_length = 0; + } + + /* + * If there's no JSON, then the span of text after the name + * until the comment closing must be completely whitespace. + * Otherwise it’s a normal HTML comment. + */ + if ( ! $has_json ) { + if ( $after_name_whitespace_at + $after_name_whitespace_length === $comment_closing_at - $void_flag_length ) { + // This must be a block delimiter! + $this->state = self::MATCHED; + break; + } + + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + /* + * There's JSON, so attempt to find its boundary. + * + * @todo It’s likely faster to scan forward instead of in reverse. + * + * + */ + $after_json_whitespace_length = 0; + for ( $char_at = $comment_closing_at - $void_flag_length - 1; $char_at > $json_at; $char_at-- ) { + $char = $text[ $char_at ]; + + switch ( $char ) { + case ' ': + case "\t": + case "\f": + case "\r": + case "\n": + ++$after_json_whitespace_length; + continue 2; + + case '}': + $json_length = $char_at - $json_at + 1; + break 2; + + default: + ++$at; + continue 3; + } + } + + /* + * This covers cases where there is no terminating “}” or where + * mandatory whitespace is missing. + */ + if ( 0 === $json_length || 0 === $after_json_whitespace_length ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + // This must be a block delimiter! + $this->state = self::MATCHED; + break; + } + + // The end of the document was reached without a match. + if ( self::MATCHED !== $this->state ) { + $this->state = self::COMPLETE; + return false; + } + + /* + * From this point forward, a delimiter has been matched. 
There + * might also be an HTML span that appears before the delimiter. + */ + + $this->after_previous_delimiter = $after_prev_delimiter; + + $this->matched_delimiter_at = $comment_opening_at; + $this->matched_delimiter_length = $comment_closing_at + 3 - $comment_opening_at; + + $this->namespace_at = $namespace_at; + $this->name_at = $name_at; + $this->name_length = $name_length; + + $this->json_at = $json_at; + $this->json_length = $json_length; + + /* + * When delimiters contain both the void flag and the closing flag + * they shall be interpreted as void blocks, per the spec parser. + */ + if ( $has_void_flag ) { + $this->type = self::VOID; + $this->next_stack_op = 'void'; + } elseif ( $has_closer ) { + $this->type = self::CLOSER; + $this->next_stack_op = 'pop'; + + /* + * @todo Check if the name matches and bail according to the spec parser. + * The default parser doesn’t examine the names. + */ + } else { + $this->type = self::OPENER; + $this->next_stack_op = 'push'; + } + + $this->has_closing_flag = $has_closer; + + // HTML spans are visited before the delimiter that follows them. + if ( $comment_opening_at > $after_prev_delimiter ) { + $this->state = self::HTML_SPAN; + $this->open_blocks_at[] = $after_prev_delimiter; + $this->open_blocks_length[] = 0; + $this->was_void = true; + + return true; + } + + // If there were no HTML spans then flush the enqueued stack operations immediately. 
+ switch ( $this->next_stack_op ) { + case 'void': + $this->was_void = true; + $this->open_blocks_at[] = $namespace_at; + $this->open_blocks_length[] = $name_at + $name_length - $namespace_at; + break; + + case 'push': + $this->open_blocks_at[] = $namespace_at; + $this->open_blocks_length[] = $name_at + $name_length - $namespace_at; + break; + + case 'pop': + array_pop( $this->open_blocks_at ); + array_pop( $this->open_blocks_length ); + break; + } + + $this->next_stack_op = null; + + return true; + + incomplete: + $this->state = self::COMPLETE; + $this->last_error = self::INCOMPLETE_INPUT; + return false; + } + + /** + * Returns an array containing the names of the currently-open blocks, in order + * from outermost to innermost, with HTML spans indicated as “#html”. + * + * Example: + * + * // Freeform HTML content is an HTML span. + * $processor = new WP_Block_Processor( 'Just text' ); + * $processor->next_token(); + * array( '#html' ) === $processor->get_breadcrumbs(); + * + * $processor = new WP_Block_Processor( '<!-- wp:a --><!-- wp:b --><!-- wp:c /--><!-- /wp:b --><!-- /wp:a -->' ); + * $processor->next_token(); + * array( 'core/a' ) === $processor->get_breadcrumbs(); + * $processor->next_token(); + * array( 'core/a', 'core/b' ) === $processor->get_breadcrumbs(); + * $processor->next_token(); + * // Void blocks are only open while visiting them. + * array( 'core/a', 'core/b', 'core/c' ) === $processor->get_breadcrumbs(); + * $processor->next_token(); + * // Blocks are closed before visiting their closing delimiter. + * array( 'core/a' ) === $processor->get_breadcrumbs(); + * $processor->next_token(); + * array() === $processor->get_breadcrumbs(); + * + * // Inner HTML is also an HTML span.
+ * $processor = new WP_Block_Processor( '<!-- wp:a -->Inner HTML<!-- /wp:a -->' ); + * $processor->next_token(); + * $processor->next_token(); + * array( 'core/a', '#html' ) === $processor->get_breadcrumbs(); + * + * @since 6.9.0 + * + * @return string[] Normalized names of the open blocks, outermost first; an HTML span, + * if present, is the last entry and is reported as “#html”. + */ + public function get_breadcrumbs(): array { + $breadcrumbs = array_fill( 0, count( $this->open_blocks_at ), null ); + + /* + * Since HTML spans can only be at the very end, set the normalized block name for + * each open element and then work backwards after creating the array. This allows + * for the elimination of a conditional on each iteration of the loop. + */ + foreach ( $this->open_blocks_at as $i => $at ) { + $block_type = substr( $this->source_text, $at, $this->open_blocks_length[ $i ] ); + $breadcrumbs[ $i ] = self::normalize_block_type( $block_type ); + } + + // After the loop `$i` is the index of the innermost entry; a zero-length entry marks an HTML span. + if ( isset( $i ) && 0 === $this->open_blocks_length[ $i ] ) { + $breadcrumbs[ $i ] = '#html'; + } + + return $breadcrumbs; + } + + /** + * Returns the depth of the open blocks where the processor is currently matched. + * + * Depth increases before visiting openers and void blocks and decreases before + * visiting closers. HTML spans behave like void blocks. + * + * @since 6.9.0 + * + * @return int Number of entries currently on the stack of open blocks. + */ + public function get_depth(): int { + return count( $this->open_blocks_at ); + } + + /** + * Extracts a block object, and all inner content, starting at a matched opening + * block delimiter, or at a matched top-level HTML span as freeform HTML content. + * + * Use this function to extract some blocks within a document, but not all. For example, + * one might want to find image galleries, parse them, modify them, and then reserialize + * them in place. + * + * Once this function returns, the parser will be matched on the token following the close + * of the given block. + * + * NOTE(review): where the processor is left appears to depend on the kind of token: + * an HTML span returns without advancing, a void block advances by one token, and an + * opener stops at the first token whose depth is not greater than its own, which looks + * like the block’s own closing delimiter. Confirm against {@see self::next_token()}. + * + * The return type of this method is compatible with the return of {@see \parse_blocks()}. + * + * Example: + * + * $processor = new WP_Block_Processor( $post_content ); + * if ( ! 
$processor->next_block( 'gallery' ) ) { + * return $post_content; + * } + * + * $gallery_at = $processor->get_span()->start; + * $gallery = $processor->extract_block(); + * $ends_before = $processor->get_span(); + * $ends_before = $ends_before->start ?? strlen( $post_content ); + * + * $new_gallery = update_gallery( $gallery ); + * $new_gallery = serialize_block( $new_gallery ); + * + * return ( + * substr( $post_content, 0, $gallery_at ) . + * $new_gallery . + * substr( $post_content, $ends_before ) + * ); + * + * @since 6.9.0 + * + * @return array|null { + * An associative array of a single parsed block object. See WP_Block_Parser_Block. + * + * @type string|null $blockName Name of block, or `null` for freeform HTML content. + * @type array $attrs Attributes from block comment delimiters. + * @type array[] $innerBlocks List of inner blocks. An array of arrays that + * have the same structure as this one. + * @type string $innerHTML HTML from inside block comment delimiters. + * @type array $innerContent List of string fragments and null markers where + * inner blocks were found. + * } + */ + public function extract_block(): ?array { + // An HTML span becomes a freeform block wrapping the raw HTML; the processor is not advanced. + if ( $this->is_html() ) { + $chunk = $this->get_html_content(); + + return array( + 'blockName' => null, + 'attrs' => array(), + 'innerBlocks' => array(), + 'innerHTML' => $chunk, + 'innerContent' => array( $chunk ), + ); + } + + $block = array( + 'blockName' => $this->get_block_type(), + 'attrs' => $this->allocate_and_return_parsed_attributes() ?? array(), + 'innerBlocks' => array(), + 'innerHTML' => '', + 'innerContent' => array(), + ); + + // Visit every following token which is nested deeper than the starting delimiter. + $depth = $this->get_depth(); + while ( $this->next_token() && $this->get_depth() > $depth ) { + if ( $this->is_html() ) { + $chunk = $this->get_html_content(); + $block['innerHTML'] .= $chunk; + $block['innerContent'][] = $chunk; + continue; + } + + /** + * Inner blocks. 
+ * + * @todo This is a decent place to call {@link \render_block()} + * @todo Use iteration instead of recursion, or at least refactor to tail-call form. + */ + if ( $this->opens_block() ) { + $inner_block = $this->extract_block(); + $block['innerBlocks'][] = $inner_block; + $block['innerContent'][] = null; + } + } + + return $block; + } + + /** + * Returns the byte-offset after the ending character of an HTML comment, + * assuming the proper starting byte offset. + * + * Two forms are recognized: comments made only of dashes, such as `<!-->` and + * `<!--->`, and ordinary comments closed by the first `-->` or `--!>`. + * + * @since 6.9.0 + * + * @param int $comment_starting_at Where the HTML comment started, the leading `<`. + * @param int $search_end Last offset in which to search, for limiting search span. + * @return int Offset after the current HTML comment ends, or `$search_end` if no end was found. + */ + private function find_html_comment_end( int $comment_starting_at, int $search_end ): int { + $text = $this->source_text; + + // Find span-of-dashes comments which look like `<!----->`. + $span_of_dashes = strspn( $text, '-', $comment_starting_at + 2 ); + if ( + $comment_starting_at + 2 + $span_of_dashes < $search_end && + '>' === $text[ $comment_starting_at + 2 + $span_of_dashes ] + ) { + // Step past the `<!` (two bytes), the dashes, and the closing `>`. + return $comment_starting_at + 2 + $span_of_dashes + 1; + } + + // Otherwise, there are other characters inside the comment, find the first `-->` or `--!>`. + $now_at = $comment_starting_at + 4; + while ( $now_at < $search_end ) { + $dashes_at = strpos( $text, '--', $now_at ); + if ( false === $dashes_at ) { + return $search_end; + } + + $closer_must_be_at = $dashes_at + 2 + strspn( $text, '-', $dashes_at + 2 ); + if ( $closer_must_be_at < $search_end && '!' === $text[ $closer_must_be_at ] ) { + $closer_must_be_at++; + } + + if ( $closer_must_be_at < $search_end && '>' === $text[ $closer_must_be_at ] ) { + return $closer_must_be_at + 1; + } + + // No `--` exists before `$dashes_at`, so resume just past its first dash instead of rescanning. + $now_at = $dashes_at + 1; + } + + return $search_end; + } + + /** + * Indicates if the last attempt to parse a block comment delimiter + * failed, if set, otherwise `null` if the last attempt succeeded. 
+ * + * @since 6.9.0 + * + * @return string|null Error from last attempt at parsing next block delimiter, + * or `null` if last attempt succeeded. + */ + public function get_last_error(): ?string { + return $this->last_error; + } + + /** + * Indicates if the last attempt to parse a block’s JSON attributes failed. + * + * @see \json_last_error() + * + * @since 6.9.0 + * + * @return int JSON_ERROR_ code from last attempt to parse block JSON attributes. + */ + public function get_last_json_error(): int { + return $this->last_json_error; + } + + /** + * Returns the type of the block comment delimiter. + * + * One of: + * + * - {@see self::OPENER} + * - {@see self::CLOSER} + * - {@see self::VOID} + * - `null` + * + * @since 6.9.0 + * + * @return string|null type of the block comment delimiter, if currently matched. + */ + public function get_delimiter_type(): ?string { + switch ( $this->state ) { + case self::HTML_SPAN: + return self::VOID; + + case self::MATCHED: + return $this->type; + + default: + return null; + } + } + + /** + * Returns whether the delimiter contains the closing flag. + * + * This should be avoided except in cases of custom error-handling + * with block closers containing the void flag. For normative use, + * {@see self::get_delimiter_type()}. + * + * @since 6.9.0 + * + * @return bool Whether the currently-matched block delimiter contains the closing flag. + */ + public function has_closing_flag(): bool { + return $this->has_closing_flag; + } + + /** + * Indicates if the block delimiter represents a block of the given type. + * + * Since the “core” namespace may be implicit, it’s allowable to pass + * either the fully-qualified block type with namespace and block name + * as well as the shorthand version only containing the block name, if + * the desired block is in the “core” namespace. + * + * Since freeform HTML content is non-block content, it has no block type. 
+ * Passing the wildcard “*” will, however, return true for all block types, + * even the implicit freeform content, though not for spans of inner HTML. + * + * NOTE(review): the wildcard check in this method returns true before the current + * token is inspected, so it looks like “*” also matches spans of inner HTML here; + * {@see self::opens_block()} rejects nested HTML spans before calling this method. + * Confirm which behavior is intended. + * + * Example: + * + * $is_core_paragraph = $processor->is_block_type( 'paragraph' ); + * $is_core_paragraph = $processor->is_block_type( 'core/paragraph' ); + * $is_formula = $processor->is_block_type( 'math-blocks/formula' ); + * + * @since 6.9.0 + * + * @param string $block_type Block type name for the desired block. + * E.g. "paragraph", "core/paragraph", "math-blocks/formula". + * @return bool Whether this delimiter represents a block of the given type. + */ + public function is_block_type( string $block_type ): bool { + // The wildcard matches unconditionally. + if ( '*' === $block_type ) { + return true; + } + + // A zero-length entry at the bottom of the stack is a top-level HTML span: implicit freeform content. + if ( $this->is_html() && 0 === ( $this->open_blocks_length[0] ?? null ) ) { + return ( + 'core/freeform' === $block_type || + 'freeform' === $block_type + ); + } + + // Otherwise compare the matched delimiter’s namespace and name against the requested type. + return $this->are_equal_block_types( $this->source_text, $this->namespace_at, $this->name_at - $this->namespace_at + $this->name_length, $block_type, 0, strlen( $block_type ) ); + } + + /** + * Given two spans of text, indicate if they represent identical block types. + * + * This function normalizes block types to account for implicit core namespacing. + * + * Note! This function only returns valid results when the complete block types are + * represented in the span offsets and lengths. This means that the full optional + * namespace and block name must be represented in the input arguments. + * + * Example: + * + * 0 5 10 15 20 25 30 35 40 + * $text = ''; + * + * true === WP_Block_Processor::are_equal_block_types( $text, 9, 5, $text, 27, 10 ); + * false === WP_Block_Processor::are_equal_block_types( $text, 9, 5, 'my/block', 0, 8 ); + * + * @since 6.9.0 + * + * @param string $a_text Text in which first block type appears. + * @param int $a_at Byte offset into text in which first block type starts. 
+ * @param int $a_length Byte length of first block type. + * @param string $b_text Text in which second block type appears (may be the same as the first text). + * @param int $b_at Byte offset into text in which second block type starts. + * @param int $b_length Byte length of second block type. + * @return bool Whether the spans of text represent identical block types, normalized for namespacing. + */ + public static function are_equal_block_types( string $a_text, int $a_at, int $a_length, string $b_text, int $b_at, int $b_length ): bool { + $a_ns_length = strcspn( $a_text, '/', $a_at, $a_length ); + $b_ns_length = strcspn( $b_text, '/', $b_at, $b_length ); + + $a_has_ns = $a_ns_length !== $a_length; + $b_has_ns = $b_ns_length !== $b_length; + + // Both contain namespaces. + if ( $a_has_ns && $b_has_ns ) { + if ( $a_length !== $b_length ) { + return false; + } + + $a_block_type = substr( $a_text, $a_at, $a_length ); + + return 0 === substr_compare( $b_text, $a_block_type, $b_at, $b_length ); + } + + if ( $a_has_ns ) { + $b_block_type = 'core/' . substr( $b_text, $b_at, $b_length ); + + return ( + strlen( $b_block_type ) === $a_length && + 0 === substr_compare( $a_text, $b_block_type, $a_at, $a_length ) + ); + } + + if ( $b_has_ns ) { + $a_block_type = 'core/' . substr( $a_text, $a_at, $a_length ); + + return ( + strlen( $a_block_type ) === $b_length && + 0 === substr_compare( $b_text, $a_block_type, $b_at, $b_length ) + ); + } + + // Neither contains a namespace. + if ( $a_length !== $b_length ) { + return false; + } + + $a_name = substr( $a_text, $a_at, $a_length ); + + return 0 === substr_compare( $b_text, $a_name, $b_at, $b_length ); + } + + /** + * Indicates if the matched delimiter is an opening or void delimiter of the given type, + * if a type is provided, otherwise if it opens any block or implicit freeform HTML content. 
+ * + * This is a helper method to ease handling of code inspecting where blocks start, and for + * checking if the blocks are of a given type. The function is variadic to allow for + * checking if the delimiter opens one of many possible block types. + * + * To advance to the start of a block {@see self::next_block()}. + * + * Example: + * + * $processor = new WP_Block_Processor( $html ); + * while ( $processor->next_delimiter() ) { + * if ( $processor->opens_block( 'core/code', 'syntaxhighlighter/code' ) ) { + * echo "Found code!"; + * continue; + * } + * + * if ( $processor->opens_block( 'core/image' ) ) { + * echo "Found an image!"; + * continue; + * } + * + * if ( $processor->opens_block() ) { + * echo "Found a new block!"; + * } + * } + * + * @since 6.9.0 + * + * @see self::is_block_type() + * + * @param string[] $block_type Optional. Is the matched block type one of these? + * If none are provided, will not test block type. + * @return bool Whether the matched block delimiter opens a block, and whether it + * opens a block of one of the given block types, if provided. + */ + public function opens_block( string ...$block_type ): bool { + // HTML spans only open implicit freeform content at the top level. + if ( self::HTML_SPAN === $this->state && 1 !== count( $this->open_blocks_at ) ) { + return false; + } + + /* + * Because HTML spans are discovered after the next delimiter is found, + * the delimiter type when visiting HTML spans refers to the type of the + * following delimiter. Therefore the HTML case is handled by checking + * the state and depth of the stack of open block. + */ + if ( self::CLOSER === $this->type && ! $this->is_html() ) { + return false; + } + + if ( count( $block_type ) === 0 ) { + return true; + } + + foreach ( $block_type as $block ) { + if ( $this->is_block_type( $block ) ) { + return true; + } + } + + return false; + } + + /** + * Indicates if the matched delimiter is an HTML span. 
+ * + * @since 6.9.0 + * + * @see self::is_non_whitespace_html() + * + * @return bool Whether the processor is matched on an HTML span. + */ + public function is_html(): bool { + return self::HTML_SPAN === $this->state; + } + + /** + * Indicates if the matched delimiter is an HTML span and comprises more + * than whitespace characters, i.e. contains real content. + * + * Many block serializers introduce newlines between block delimiters, + * so the presence of top-level non-block content does not imply that + * there are “real” freeform HTML blocks. Checking if there is content + * beyond whitespace is a more certain check, such as for determining + * whether to load CSS for the freeform or fallback block type. + * + * @since 6.9.0 + * + * @see self::is_html() + * + * @return bool Whether the currently-matched delimiter is an HTML + * span containing non-whitespace text. + */ + public function is_non_whitespace_html(): bool { + if ( ! $this->is_html() ) { + return false; + } + + $length = $this->matched_delimiter_at - $this->after_previous_delimiter; + + $whitespace_length = strspn( + $this->source_text, + " \t\f\r\n", + $this->after_previous_delimiter, + $length + ); + + return $whitespace_length !== $length; + } + + /** + * Returns the string content of a matched HTML span, or `null` otherwise. + * + * @since 6.9.0 + * + * @return string|null Raw HTML content, or `null` if not currently matched on HTML. + */ + public function get_html_content(): ?string { + if ( ! $this->is_html() ) { + return null; + } + + return substr( + $this->source_text, + $this->after_previous_delimiter, + $this->matched_delimiter_at - $this->after_previous_delimiter + ); + } + + /** + * Allocates a substring for the block type and returns the fully-qualified + * name, including the namespace, if matched on a delimiter, otherwise `null`. 
+ * + * This function is like {@see self::get_printable_block_type()} but when + * paused on a freeform HTML block, will return `null` instead of “core/freeform”. + * The `null` behavior matches what {@see \parse_blocks()} returns but may not + * be as useful as having a string value. + * + * This function allocates a substring for the given block type. This + * allocation will be small and likely fine in most cases, but it's + * preferable to call {@see self::is_block_type()} if only needing + * to know whether the delimiter is for a given block type, as that + * function is more efficient for this purpose and avoids the allocation. + * + * Example: + * + * // Avoid. + * 'core/paragraph' === $processor->get_block_type(); + * + * // Prefer. + * $processor->is_block_type( 'core/paragraph' ); + * $processor->is_block_type( 'paragraph' ); + * $processor->is_block_type( 'core/freeform' ); + * + * // Freeform HTML content has no block type. + * $processor = new WP_Block_Processor( 'non-block content' ); + * $processor->next_token(); + * null === $processor->get_block_type(); + * + * @since 6.9.0 + * + * @see self::are_equal_block_types() + * + * @return string|null Fully-qualified block namespace and type, e.g. "core/paragraph", + * if matched on an explicit delimiter, otherwise `null`. + */ + public function get_block_type(): ?string { + // Nothing is matched before parsing begins or once it has finished. + if ( + self::READY === $this->state || + self::COMPLETE === $this->state || + self::INCOMPLETE_INPUT === $this->state + ) { + return null; + } + + // HTML spans have no block type, whether they are top-level freeform content or inner HTML. + if ( $this->is_html() ) { + return null; + } + + $block_type = substr( $this->source_text, $this->namespace_at, $this->name_at - $this->namespace_at + $this->name_length ); + return self::normalize_block_type( $block_type ); + } + + /** + * Allocates a printable substring for the block type and returns the fully-qualified + * name, including the namespace, if matched on a delimiter or freeform block, otherwise `null`. 
+ * + * This function is like {@see self::get_block_type()} but when paused on a freeform + * HTML block, will return “core/freeform” instead of `null`. The `null` behavior matches + * what {@see \parse_blocks()} returns but may not be as useful as having a string value. + * + * This function allocates a substring for the given block type. This + * allocation will be small and likely fine in most cases, but it's + * preferable to call {@see self::is_block_type()} if only needing + * to know whether the delimiter is for a given block type, as that + * function is more efficient for this purpose and avoids the allocation. + * + * Example: + * + * // Avoid. + * 'core/paragraph' === $processor->get_printable_block_type(); + * + * // Prefer. + * $processor->is_block_type( 'core/paragraph' ); + * $processor->is_block_type( 'paragraph' ); + * $processor->is_block_type( 'core/freeform' ); + * + * // Freeform HTML content is given an implicit type. + * $processor = new WP_Block_Processor( 'non-block content' ); + * $processor->next_token(); + * 'core/freeform' === $processor->get_printable_block_type(); + * + * @since 6.9.0 + * + * @see self::are_equal_block_types() + * + * @return string|null Fully-qualified block namespace and type, e.g. "core/paragraph", + * if matched on an explicit delimiter or freeform block, "#innerHTML" + * if paused on a span of inner HTML, otherwise `null`. + */ + public function get_printable_block_type(): ?string { + // Nothing is matched before parsing begins or once it has finished. + if ( + self::READY === $this->state || + self::COMPLETE === $this->state || + self::INCOMPLETE_INPUT === $this->state + ) { + return null; + } + + // An HTML span is freeform content when it is the only open entry, otherwise it is inner HTML. + if ( $this->is_html() ) { + return 1 === count( $this->open_blocks_at ) + ? 
'core/freeform' + : '#innerHTML'; + } + + $block_type = substr( $this->source_text, $this->namespace_at, $this->name_at - $this->namespace_at + $this->name_length ); + return self::normalize_block_type( $block_type ); + } + + /** + * Normalizes a block name to ensure that missing implicit “core” namespaces are present. + * + * Example: + * + * 'core/paragraph' === WP_Block_Processor::normalize_block_type( 'paragraph' ); + * 'core/paragraph' === WP_Block_Processor::normalize_block_type( 'core/paragraph' ); + * 'my/paragraph' === WP_Block_Processor::normalize_block_type( 'my/paragraph' ); + * + * @since 6.9.0 + * + * @param string $block_type Valid block name, potentially without a namespace. + * @return string Fully-qualified block type including namespace. + */ + public static function normalize_block_type( string $block_type ): string { + // A name without a “/” has no namespace and belongs to “core”. + return false === strpos( $block_type, '/' ) + ? "core/{$block_type}" + : $block_type; + } + + /** + * Returns a lazy wrapper around the block attributes, which can be used + * for efficiently interacting with the JSON attributes. + * + * This stub hints that there should be a lazy interface for parsing + * block attributes but doesn’t define it. It serves both as a placeholder + * for one to come as well as a guard against implementing an eager + * function in its place. + * + * @throws Exception This function is a stub for subclasses to implement + * when providing streaming attribute parsing. + * + * @since 6.9.0 + * + * @see self::allocate_and_return_parsed_attributes() + * + * @return never + */ + public function get_attributes() { + throw new Exception( 'Lazy attribute parsing not yet supported' ); + } + + /** + * Attempts to parse and return the entire JSON attributes from the delimiter, + * allocating memory and processing the JSON span in the process. + * + * This does not return any parsed attributes for a closing block delimiter + * even if there is a span of JSON content; this JSON is a parsing error. 
+ * + * Consider calling {@see static::get_attributes()} instead if it's not + * necessary to read all the attributes at the same time, as that provides + * a more efficient mechanism for typical use cases. Note that in this class + * that method is still an unimplemented stub which always throws. + * + * Since the JSON span inside the comment delimiter may not be valid JSON, + * this function will return `null` if it cannot parse the span and set the + * {@see static::get_last_json_error()} to the appropriate JSON_ERROR_ constant. + * + * If the delimiter contains no JSON span, it will also return `null`, + * but the last error will be set to {@see \JSON_ERROR_NONE}. + * + * Example: + * + * $processor = new WP_Block_Processor( '' ); + * $processor->next_delimiter(); + * $memory_hungry_and_slow_attributes = $processor->allocate_and_return_parsed_attributes(); + * $memory_hungry_and_slow_attributes === array( 'url' => 'https://wordpress.org/favicon.ico' ); + * + * $processor = new WP_Block_Processor( '' ); + * $processor->next_delimiter(); + * null === $processor->allocate_and_return_parsed_attributes(); + * JSON_ERROR_NONE === $processor->get_last_json_error(); + * + * $processor = new WP_Block_Processor( '' ); + * $processor->next_delimiter(); + * array() === $processor->allocate_and_return_parsed_attributes(); + * + * $processor = new WP_Block_Processor( '' ); + * $processor->next_delimiter(); + * null === $processor->allocate_and_return_parsed_attributes(); + * + * $processor = new WP_Block_Processor( '' ); + * $processor->next_delimiter(); + * null === $processor->allocate_and_return_parsed_attributes(); + * JSON_ERROR_CTRL_CHAR === $processor->get_last_json_error(); + * + * @since 6.9.0 + * + * @return array|null Parsed JSON attributes, if present and valid, otherwise `null`. 
+ */ + public function allocate_and_return_parsed_attributes(): ?array { + $this->last_json_error = JSON_ERROR_NONE; + + if ( self::CLOSER === $this->type || $this->is_html() || 0 === $this->json_length ) { + return null; + } + + $json_span = substr( $this->source_text, $this->json_at, $this->json_length ); + $parsed = json_decode( $json_span, null, 512, JSON_OBJECT_AS_ARRAY | JSON_INVALID_UTF8_SUBSTITUTE ); + + $last_error = json_last_error(); + $this->last_json_error = $last_error; + + return ( JSON_ERROR_NONE === $last_error && is_array( $parsed ) ) + ? $parsed + : null; + } + + /** + * Returns the span representing the currently-matched delimiter, if matched, otherwise `null`. + * + * Example: + * + * $processor = new WP_Block_Processor( '' ); + * null === $processor->get_span(); + * + * $processor->next_delimiter(); + * WP_HTML_Span( 0, 17 ) === $processor->get_span(); + * + * @since 6.9.0 + * + * @return WP_HTML_Span|null Span of text in source text spanning matched delimiter. + */ + public function get_span(): ?WP_HTML_Span { + switch ( $this->state ) { + case self::HTML_SPAN: + return new WP_HTML_Span( $this->after_previous_delimiter, $this->matched_delimiter_at - $this->after_previous_delimiter ); + + case self::MATCHED: + return new WP_HTML_Span( $this->matched_delimiter_at, $this->matched_delimiter_length ); + + default: + return null; + } + } + + // + // Constant declarations that would otherwise pollute the top of the class. + // + + /** + * Indicates that the block comment delimiter closes an open block. + * + * @see self::$type + * + * @since 6.9.0 + */ + const CLOSER = 'closer'; + + /** + * Indicates that the block comment delimiter opens a block. + * + * @see self::$type + * + * @since 6.9.0 + */ + const OPENER = 'opener'; + + /** + * Indicates that the block comment delimiter represents a void block + * with no inner content of any kind. 
+ * + * @see self::$type + * + * @since 6.9.0 + */ + const VOID = 'void'; + + /** + * Indicates that the processor is ready to start parsing but hasn’t yet begun. + * + * @see self::$state + * + * @since 6.9.0 + */ + const READY = 'processor-ready'; + + /** + * Indicates that the processor is matched on an explicit block delimiter. + * + * @see self::$state + * + * @since 6.9.0 + */ + const MATCHED = 'processor-matched'; + + /** + * Indicates that the processor is matched on the opening of an implicit freeform delimiter. + * + * @see self::$state + * + * @since 6.9.0 + */ + const HTML_SPAN = 'processor-html-span'; + + /** + * Indicates that the parser started parsing a block comment delimiter, but + * the input document ended before it could finish. The document was likely truncated. + * + * @see self::$state + * + * @since 6.9.0 + */ + const INCOMPLETE_INPUT = 'incomplete-input'; + + /** + * Indicates that the processor has finished parsing and has nothing left to scan. + * + * @see self::$state + * + * @since 6.9.0 + */ + const COMPLETE = 'processor-complete'; +} diff --git a/src/wp-settings.php b/src/wp-settings.php index 9a175e71f0fa9..b59991e92b2bb 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -268,6 +268,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-stack-event.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; +require ABSPATH . WPINC . '/class-wp-block-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . 
'/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/block-processor/wpBlockProcessor-BlockProcessing.php b/tests/phpunit/tests/block-processor/wpBlockProcessor-BlockProcessing.php new file mode 100644 index 0000000000000..c9524f9182923 --- /dev/null +++ b/tests/phpunit/tests/block-processor/wpBlockProcessor-BlockProcessing.php @@ -0,0 +1,134 @@ +' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found the opening "top" delimiter but found nothing.' + ); + + $this->assertSame( + array( 'core/top' ), + $processor->get_breadcrumbs(), + 'Should have found only the single opening delimiter.' + ); + + $processor->next_delimiter(); + $this->assertSame( + array( 'core/top', 'core/inside' ), + $processor->get_breadcrumbs(), + 'Should have detected the nesting structure of the blocks.' + ); + } + + public function test_get_depth() { + // Create a deeply-nested stack of blocks. + $html = ''; + $max_depth = 10; + + for ( $i = 0; $i < $max_depth; $i++ ) { + $html .= ""; + } + + for ( $i = 0; $i < $max_depth; $i++ ) { + $html .= ''; + } + + $processor = new WP_Block_Processor( $html ); + $n = new NumberFormatter( 'en-US', NumberFormatter::ORDINAL ); + + for ( $i = 0; $i < $max_depth; $i++ ) { + $this->assertTrue( + $processor->next_delimiter(), + "Should have found {$n->format( $i + 1 )} opening delimiter: check test setup." + ); + + $this->assertSame( + $i + 1, + $processor->get_depth(), + "Should have identified the proper depth of the {$n->format( $i + 1 )} opening delimiter." + ); + } + + for ( $i = 0; $i < $max_depth; $i++ ) { + $this->assertTrue( + $processor->next_delimiter(), + "Should have found {$n->format( $i + 1 )} closing delimiter: check test setup." + ); + + $this->assertSame( + $max_depth - $i - 1, + $processor->get_depth(), + "Should have identified the proper depth of the {$n->format( $i + 1 )} closing delimiter." 
+ ); + } + } + + /** + * @dataProvider data_block_content + */ + public function test_builds_block( $block_content ) { + $processor = new WP_Block_Processor( $block_content ); + + $extracted = array(); + while ( $processor->next_block( '*' ) ) { + $extracted[] = $processor->extract_block(); + } + + $this->assertSame( + parse_blocks( $block_content ), + $extracted, + 'Should have extracted a block matching the input group block.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_block_content() { + $contents = array( + 'no blocks, just freeform HTML', + '', + '

Inner HTML

', + << + + + + + +

Testing works!

+ + + +

Who knew?

+ + +HTML + , + ); + + return array_map( + function ( $content ) { + return array( $content ); + }, + $contents + ); + } +} diff --git a/tests/phpunit/tests/block-processor/wpBlockProcessor.php b/tests/phpunit/tests/block-processor/wpBlockProcessor.php new file mode 100644 index 0000000000000..682307a88f28c --- /dev/null +++ b/tests/phpunit/tests/block-processor/wpBlockProcessor.php @@ -0,0 +1,1238 @@ +assertFalse( + $processor->next_token(), + 'Should not have found any delimiters.' + ); + } + + /** + * Verifies that freeform delimiters are found when requested for + * posts with no block content. + * + * @ticket 61401 + */ + public function test_finds_freeform_delimiters_in_post_without_blocks() { + $processor = new WP_Block_Processor( 'This is non-block content.' ); + + $this->assertTrue( + $processor->next_delimiter( '*' ), + 'Should have found the start of a freeform block but found nothing.' + ); + + $this->assertSame( + WP_Block_Processor::VOID, + $processor->get_delimiter_type(), + 'Should have found an opening block delimiter.' + ); + + $this->assertNull( + $processor->get_block_type(), + 'Should have found the start of a freeform block.' + ); + + $this->assertSame( + 'core/freeform', + $processor->get_printable_block_type(), + 'Should have reported the top-level freeform content as the fallback block type.' + ); + + $this->assertFalse( + $processor->next_delimiter( '*' ), + 'Should not have found any more delimiters since the input is exactly one HTML span.' + ); + } + + /** + * Verifies that a post containing a single void block finds the block and nothing else. + * + * @ticket 61401 + */ + public function test_finds_post_of_void_block() { + $processor = new WP_Block_Processor( '' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found a block delimiter but found nothing.' + ); + + $this->assertSame( + WP_Block_Processor::VOID, + $processor->get_delimiter_type(), + 'Should have found a void block delimiter.' 
+ ); + + $this->assertSame( + 'core/recent-posts', + $processor->get_block_type(), + 'Should have found a core/recent-posts void block.' + ); + } + + /** + * Verifies that a post containing a single basic block finds the block opener and closer. + * + * @ticket 61401 + */ + public function test_finds_open_and_inner_html_and_close_of_post_with_basic_block() { + $processor = new WP_Block_Processor( '

Content

' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found an opening block delimiter but found nothing.' + ); + + $this->assertTrue( + $processor->opens_block( 'core/paragraph' ), + 'Should have found an opening core/paragraph delimiter.' + ); + + $this->assertSame( + WP_Block_Processor::OPENER, + $processor->get_delimiter_type(), + 'Should have found an opening block delimiter.' + ); + + $this->assertTrue( + $processor->next_token(), + 'Should have found the inner HTML inside the paragraph block.' + ); + + $this->assertTrue( + $processor->is_html(), + 'Should have identified the inner HTML as an HTML span.' + ); + + $this->assertNull( + $processor->get_block_type(), + 'Should have found no block type for the inner HTML.' + ); + + $this->assertSame( + '#innerHTML', + $processor->get_printable_block_type(), + 'Should have identified the HTML span as inner HTML.' + ); + + $this->assertSame( + '

Content

', + $processor->get_html_content(), + 'Failed to extract the block’s inner HTML.' + ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found a closing block delimiter but found nothing.' + ); + + $this->assertSame( + 'core/paragraph', + $processor->get_block_type(), + 'Should have found a closing paragraph delimiter.' + ); + + $this->assertSame( + WP_Block_Processor::CLOSER, + $processor->get_delimiter_type(), + 'Should have found a closing block delimiter.' + ); + } + + /** + * Verifies that the parser refuses to parse the end of a document + * which partially contains what could be a block delimiter. + * + * @ticket 61401 + * + * @dataProvider data_partial_delimiter_endings + * + * @param string $html Input ending in a partial block delimiter. + */ + public function test_rejects_on_incomplete_inputs( $html ) { + $processor = new WP_Block_Processor( "{$html}" ); + + $processor->next_delimiter(); + $this->assertTrue( + $processor->opens_block( 'test/canary' ), + 'Should have found the test/canary block: check test code setup.' + ); + + $this->assertFalse( + $processor->next_delimiter(), + 'Should have failed to find any blocks after the test canary.' + ); + + $this->assertSame( + WP_Block_Processor::INCOMPLETE_INPUT, + $processor->get_last_error(), + 'Should have bailed because the input was incomplete.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_partial_delimiter_endings() { + $tests = array(); + + $delimiters = array( + 'opener' => '', + 'void' => '', + 'closer' => '', + ); + + foreach ( $delimiters as $kind => $delimiter ) { + for ( $i = strlen( $delimiter ) - 1; $i > 0; $i-- ) { + $partial = substr( $delimiter, 0, $i ); + $tests[ "{$kind}: {$partial}" ] = array( $partial ); + } + } + + return $tests; + } + + /** + * Verifies that it’s not possible to proceed after reaching an error. 
+ * + * @ticket 61401 + */ + public function test_rejects_once_errored_out() { + $processor = new WP_Block_Processor( '{$html}" ); + + $processor->next_delimiter(); + $this->assertTrue( + $processor->opens_block( 'tests/before' ), + "Should have found the 'tests/before' block before the invalid block delimiter but found a '{$processor->get_block_type()}' instead." + ); + + $processor->next_token(); + $this->assertTrue( + $processor->opens_block( 'freeform' ), + "Should have found the malform block delimiter as an HTML comment, but found a '{$processor->get_block_type()}' instead." + ); + + $this->assertSame( + $html, + $processor->get_html_content(), + 'Failed to extract the proper HTML span.' + ); + + $processor->next_delimiter(); + $this->assertTrue( + $processor->opens_block( 'tests/after' ), + "Should have found the 'tests/after' block after the invalid block delimiter but found a '{$processor->get_block_type()}' instead." + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_invalid_block_delimiters_as_html_comments() { + return array( + 'Shortest HTML comment' => array( '' ), + 'Span-of-dashes' => array( '' ), + 'Empty HTML comment' => array( '' ), + 'HTML comment with exclamation' => array( '' ), + 'No spaces, minimal info' => array( '' ), + 'No spaces, minimal info, void' => array( '' ), + 'No spaces, empty JSON' => array( '' ), + 'No spaces, empty JSON, void' => array( '' ), + 'No space before wp:' => array( '' ), + 'No space after name' => array( '' ), + 'No space before JSON' => array( '' ), + 'No space after JSON' => array( '' ), + 'Missing wp:' => array( '' ), + 'Malformed wp:' => array( '' ), + 'Malformed block namespace' => array( '' ), + 'Malformed block name' => array( '' ), + 'Invalid block name characters' => array( '' ), + ); + } + + /** + * Verifies that incomplete HTML comments which could not produce delimiters + * are not considered incomplete input by the processor. 
+ * + * Note that the block parsing operates first on block comment delimiters and + * then on HTML semantics. It’s technically possible for blocks to delimit + * invalid or non-well-formed HTML, so there’s no need to try and preserve + * other HTML boundaries in the parser the way the HTML API does. + * + * @ticket 61401 + * + * @dataProvider data_incomplete_html_comments_that_are_not_delimiters + * + * @param string $html Input containing an HTML comment that is both incomplete and + * cannot represent an incomplete block comment delimiter. + */ + public function test_unclosed_html_comment_non_delimiter_is_not_incomplete_input( $html ) { + $processor = new WP_Block_Processor( "{$html}" ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found setup group block but found nothing: check test setup.' + ); + + $this->assertSame( + 'core/group', + $processor->get_block_type(), + "Should have found setup 'group' block: check test setup." + ); + + $this->assertFalse( + $processor->next_delimiter(), + "Should have found no other delimiters given the incomplete HTML comment, but found a '{$processor->get_block_type()}' instead." + ); + + $this->assertNull( + $processor->get_last_error(), + 'Should have completed without reporting an error.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_incomplete_html_comments_that_are_not_delimiters() { + return array( + 'Opening and non-whitespace' => array( '" ); + + $processor->next_delimiter(); + $this->assertTrue( + $processor->opens_block( 'core/block' ), + 'Should have found the test block but found nothing instead.' + ); + + $parsed_data = $processor->allocate_and_return_parsed_attributes(); + $exported_parsed_data = var_export( $parsed_data, true ); + $exported_parsed_data = self::unhide_whitespace( $exported_parsed_data ); + $this->assertNull( + $parsed_data, + "Should have failed to parse JSON attributes, but found '{$exported_parsed_data}' instead." 
+ ); + + $this->assertNotNull( + $processor->get_last_json_error(), + 'Should have reported an error when attempting to parse JSON attributes.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_invalid_block_json() { + return array( + 'Extra opening {' => array( '{{}' ), + 'Extra closing }' => array( '{}}' ), + 'Unquoted string' => array( '{"name": block}' ), + 'Invalid number' => array( '{"level": 14e6e7-3}' ), + ); + } + + /** + * Verifies that when there appears to be non-JSON-object content between the block name + * and the end, that the parser treats the span as a normal HTML comment. + * + * @ticket 61401 + * + * @dataProvider data_unrecognizable_block_json + * + * @param string $non_json_content Something which might appear in the place of the JSON + * attributes but which is missing the starting and ending + * curly brackets. + */ + public function test_does_not_match_block_with_unterminated_json( $non_json_content ) { + $processor = new WP_Block_Processor( "" ); + + $this->assertFalse( + $processor->next_delimiter(), + "Should have failed to find block delimiter but found '{$processor->get_block_type()}' instead." + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_unrecognizable_block_json() { + return array( + 'Empty list' => array( '[]' ), + 'Non-empty list' => array( '[1, 2, 3]' ), + 'Nested list' => array( '[{"type": "broken"}]' ), + 'True' => array( 'true' ), + 'False' => array( 'false' ), + 'null' => array( 'null' ), + 'Number (36)' => array( '36' ), + 'Number (3.141e0)' => array( '3.141e0' ), + 'Letters' => array( 'not_even_json' ), + 'Parentheses' => array( '()' ), + 'Emoji' => array( '🥳' ), + 'Canadian Syllabics' => array( 'ᐸint32ᐳ' ), + ); + } + + /** + * Verifies that a delimiter with content after the JSON attributes is not treated as a delimiter. 
+ * + * @ticket 61401 + */ + public function test_does_not_match_block_with_content_after_json() { + $processor = new WP_Block_Processor( '' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found the first valid delimiter but didn’t: check test setup.' + ); + + $this->assertFalse( + $processor->next_delimiter(), + "Should have failed to find block delimiter but found '{$processor->get_block_type()}' instead." + ); + } + + /** + * Verifies that the appropriate block delimiter type is reported for a matched delimiter. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_types + * + * @param string $html Contains the block delimiter under test, preceded by + * any tokens which are to be skipped. + * @param string $delimiter_type Expected type of delimiter. + * @param int|null $skip_delimiter_count Optional. Skip this many tokens (via `next_token()`) before testing. + * Default is to skip none; start at the first one. + */ + public function test_reports_proper_delimiter_type( $html, $delimiter_type, $skip_delimiter_count = 0 ) { + $processor = new WP_Block_Processor( $html ); + + // Advance past the leading tokens so that the next one visited is the delimiter under test. + while ( $skip_delimiter_count-- > 0 ) { + $processor->next_token(); + } + + $this->assertTrue( + $processor->next_token(), + 'Should have found test block delimiter but found nothing instead.' + ); + + $this->assertSame( + $delimiter_type, + $processor->get_delimiter_type(), + 'Failed to match the expected delimiter type (opener/closer/void)' + ); + } + + /** + * Data provider.
+ * + * @return array[] Each dataset holds the input HTML, the expected delimiter type, + * and optionally the number of tokens to skip before testing. + */ + public static function data_delimiters_and_their_types() { + return array( + 'Void' => array( '', WP_Block_Processor::VOID ), + 'Void, full name' => array( '', WP_Block_Processor::VOID ), + 'Opener' => array( '', WP_Block_Processor::OPENER ), + 'Opener, full name' => array( '', WP_Block_Processor::OPENER ), + 'Unexpected Closer' => array( '', WP_Block_Processor::CLOSER ), + 'Unexpected Closer, full name' => array( '', WP_Block_Processor::CLOSER ), + 'Closer' => array( '', WP_Block_Processor::CLOSER, 1 ), + 'Closer, full name' => array( '', WP_Block_Processor::CLOSER, 1 ), + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` before finding any delimiters. + * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_before_scanning() { + $processor = new WP_Block_Processor( '' ); + + $this->assertNull( + $processor->get_delimiter_type(), + 'Should not have returned a delimiter type before matching any delimiters.' + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` after scanning the last delimiter. + * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_after_scanning() { + $processor = new WP_Block_Processor( '' ); + + // Exhaust the document so that the processor is no longer matched on any token. + while ( $processor->next_token() ) { + continue; + } + + $this->assertNull( + $processor->get_delimiter_type(), + 'Should not have returned a delimiter type after scanning past the last delimiter.' + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` after encountering an error.
+ * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_after_an_error() { + $processor = new WP_Block_Processor( '', 'core/group' ), + 'Void, core/group' => array( '', 'core/group' ), + 'Closer, core/group' => array( '', 'core/group' ), + 'Opener, group' => array( '', 'core/group' ), + 'Void, group' => array( '', 'core/group' ), + 'Closer, group' => array( '', 'core/group' ), + 'Opener, my/group' => array( '', 'my/group' ), + 'Void, thy/group' => array( '', 'thy/group' ), + 'Closer, the5/group' => array( '', 'the-5/group' ), + 'Freeform content' => array( 'Not a block, but at the top level.', 'core/freeform' ), + ); + } + + /** + * Verifies that the presence of the void flag is properly reported + * regardless of a conflict with a closing block. + * + * When a block delimiter contains both the void flag and the closing flag, + * it shall be interpreted as a void block to match the behavior of + * the spec parser, but the block processor exposes the closing flag to + * allow for user-space code to make its own determination. + * + * @ticket 61401 + */ + public function test_reports_presence_of_void_flag() { + $html = ''; + $processor = new WP_Block_Processor( $html ); + + // Test the opening delimiter. + + $this->assertTrue( + $processor->next_delimiter(), + "Should have found opening 'void-and-closed' block delimiter but found nothing: check test setup." + ); + + $this->assertSame( + 'core/void-and-closed', + $processor->get_block_type(), + "Should have found opening 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Processor::OPENER, + $processor->get_delimiter_type(), + 'Should have found an opening delimiter: check test setup.' + ); + + $this->assertFalse( + $processor->has_closing_flag(), + 'Should not have indicated the presence of the closing flag on an opening block.' + ); + + // Test the void delimiter.
+ + $this->assertTrue( + $processor->next_delimiter(), + "Should have found the void 'void' block delimiter but found nothing: check test setup." + ); + + $this->assertSame( + 'core/void', + $processor->get_block_type(), + "Should have found opening 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Processor::VOID, + $processor->get_delimiter_type(), + 'Should have found a void delimiter: check test setup.' + ); + + $this->assertFalse( + $processor->has_closing_flag(), + 'Should not have indicated the presence of the closing flag on the pure void block.' + ); + + // Test the void/closing delimiter. + + $this->assertTrue( + $processor->next_delimiter(), + "Should have found closing 'void-and-closed' block delimiter but found nothing: check test setup." + ); + + $this->assertSame( + 'core/void-and-closed', + $processor->get_block_type(), + "Should have found closing 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Processor::VOID, + $processor->get_delimiter_type(), + 'Should have found a closing delimiter: check test setup.' + ); + + $this->assertTrue( + $processor->has_closing_flag(), + 'Should have indicated the presence of the closing flag on a block with both the closing and void flags.' + ); + } + + /** + * Verifies that the processor indicates if the currently-matched delimiter + * is of a given block type. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type. + */ + public function test_reports_if_block_is_of_type( $html, $block_type ) { + $processor = new WP_Block_Processor( $html ); + + $this->assertTrue( + $processor->next_token(), + 'Should have found test block delimiter but found nothing instead: check test setup.' 
+ ); + + $this->assertTrue( + $processor->is_block_type( $block_type ), + "Should have found the block to be of type '{$block_type}', detected type is '{$processor->get_block_type()}'." + ); + + if ( str_starts_with( $block_type, 'core/' ) ) { + // Prune off core namespace and detect implicit namespace. + $block_name = substr( $block_type, strlen( 'core/' ) ); + + $this->assertTrue( + $processor->is_block_type( $block_name ), + "Should have found the block to be of core type '{$block_name}', detected type is '{$processor->get_block_type()}'." + ); + } + } + + /** + * Verifies that the processor indicates if the currently-matched delimiter + * opens a block of a given block type. This is true for openers and void delimiters. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type. + */ + public function test_reports_if_block_opens_type( $html, $block_type ) { + $processor = new WP_Block_Processor( $html ); + + $this->assertTrue( + $processor->next_token(), + 'Should have found test block delimiter but found nothing instead: check test setup.' + ); + + if ( WP_Block_Processor::CLOSER === $processor->get_delimiter_type() ) { + $this->assertFalse( + $processor->opens_block( $block_type ), + 'Should not have indicated that a closing delimiter opens a block.' + ); + return; + } + + $this->assertTrue( + $processor->opens_block( $block_type ), + "Should have indicating opening of type '{$block_type}', detected type is '{$processor->get_block_type()}'." + ); + + if ( str_starts_with( $block_type, 'core/' ) ) { + // Prune off core namespace and detect implicit namespace. + $block_name = substr( $block_type, strlen( 'core/' ) ); + + $this->assertTrue( + $processor->opens_block( $block_name ), + "Should have indicated opening of core type '{$block_name}', detected type is '{$processor->get_block_type()}'." 
+ ); + } + } + + /** + * Verifies that asking if a delimiter opens a block ignores the block type + * if none are provided in the explicit limiting list. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type (ignored but present due to shared data provider). + */ + public function test_opens_block_with_no_explicit_types_ignores_block_type( $html, $block_type ) { + $processor = new WP_Block_Processor( $html ); + + $this->assertTrue( + $processor->next_token(), + 'Should have found test block delimiter but found nothing instead: check test setup.' + ); + + if ( WP_Block_Processor::CLOSER === $processor->get_delimiter_type() ) { + $this->assertFalse( + $processor->opens_block(), + 'Should not have indicated that a closing delimiter opens a block.' + ); + } else { + $this->assertTrue( + $processor->opens_block(), + "Should have indicated that a '{$processor->get_delimiter_type()}' delimiter opens a block." + ); + } + } + + /** + * Verifies that when given multiple potential block types, that `opens_block()` properly + * indicates if the currently-matched block is an opening for at least one of them. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_sets_of_block_types + * + * @param string $html Contains a single block delimiter. + * @param string[] $block_types Contains one or more block types, fully qualified or not. + * @param bool $is_a_match Indicates if the provided HTML contains a block of the type in the given set. + */ + public function test_opens_block_checks_against_multiple_provided_block_types( $html, $block_types, $is_a_match ) { + $processor = new WP_Block_Processor( $html ); + + $this->assertTrue( + $processor->next_token(), + 'Should have found test setup block but found none: check test setup.' 
+ ); + + $joined_types = implode( ', ', $block_types ); + + if ( $is_a_match ) { + $this->assertTrue( + $processor->opens_block( ...$block_types ), + "Should have found that delimiter (type {$processor->get_block_type()}) opens one of the following block types: {$joined_types}." + ); + } else { + $this->assertFalse( + $processor->opens_block( ...$block_types ), + "Should not have found that delimiter (type {$processor->get_block_type()}) opens one of the following block types: {$joined_types}." + ); + } + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_delimiters_and_sets_of_block_types() { + return array( + // Positive matches. + 'type "test", first in set' => array( '', array( 'test', 'tossed', 'tess' ), true ), + 'type "core/test", first in set' => array( '', array( 'test', 'tossed', 'tess' ), true ), + 'type "test", middle of set' => array( '', array( 'tust', 'test', 'tossed', 'tess' ), true ), + 'type "core/test", middle of set' => array( '', array( 'tust', 'test', 'tossed', 'tess' ), true ), + 'type "test", last in set' => array( '', array( 'tust', 'tossed', 'tess', 'test' ), true ), + 'type "core/test", last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'type "test", core/test first in set' => array( '', array( 'core/test', 'tossed', 'tess' ), true ), + 'type "core/test", core/test first in set' => array( '', array( 'core/test', 'tossed', 'tess' ), true ), + 'type "test", core/test in middle of set' => array( '', array( 'tust', 'core/test', 'tossed', 'tess' ), true ), + 'type "core/test", core/test in middle of set' => array( '', array( 'tust', 'core/test', 'tossed', 'tess' ), true ), + 'type "test", core/test last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'type "core/test", core/test last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'non-core, fully-qualified' => array( '', array( 'test/ship', 'test/block', 
'test/wheel' ), true ), + + // Negative matches. + 'type "test", not in set' => array( '', array( 'text', 'core/text', 'my/test' ), false ), + 'type "core/test", not in set' => array( '', array( 'text', 'core/text', 'my/test' ), false ), + 'type "next-dev/code", not in set' => array( '', array( 'code', 'new/code', 'dev/code' ), false ), + + // Top-level freeform HTML content. + 'Freeform content' => array( 'Just some HTML', array( '*', 'core/freeform', 'freeform' ), true ), + ); + } + + /** + * Verifies that when scanning and visiting freeform blocks, that they + * return the appropriate information. + * + * @ticket 61401 + * + * @dataProvider data_freeform_blocks_and_delimiter_indices + * + * @param string $html Contains a freeform block after zero or more delimiters. + * @param int $freeform_at Freeform is located after this many other delimiters. + */ + public function test_indicates_if_matched_delimiter_is_freeform( $html, $freeform_at ) { + $processor = new WP_Block_Processor( $html ); + + for ( $i = 0; $i < $freeform_at; $i++ ) { + $processor->next_token(); + $this->assertFalse( + $processor->is_block_type( 'freeform' ), + "Improperly detected {$processor->get_printable_block_type()} as freeform HTML." + ); + } + + // Opening delimiter. + + $this->assertTrue( + $processor->next_token(), + 'Should have found the freeform content but didn’t: check test setup.' + ); + + $this->assertNull( + $processor->get_block_type(), + 'Should have found a freeform block.' + ); + + $this->assertTrue( + $processor->is_html(), + 'Should have identified the delimiter as freeform.' + ); + + $this->assertSame( + WP_Block_Processor::VOID, + $processor->get_delimiter_type(), + 'Should have stopped first on a freeform block.' + ); + + $this->assertTrue( + $processor->opens_block( 'freeform' ), + 'Should indicate that this (implicit) delimiter opens a freeform block (without the Core namespace).' 
+ ); + + $this->assertTrue( + $processor->is_block_type( 'freeform' ), + 'Should indicate that this (implicit) delimiter is freeform (without the Core namespace).' + ); + + $this->assertTrue( + $processor->opens_block( 'core/freeform' ), + 'Should indicate that this (implicit) delimiter opens a freeform block (fully-qualified).' + ); + + $this->assertTrue( + $processor->is_block_type( 'core/freeform' ), + 'Should indicate that this (implicit) delimiter is freeform (fully-qualified).' + ); + + $this->assertNull( + $processor->allocate_and_return_parsed_attributes(), + 'Should not find any attributes on any freeform content.' + ); + + if ( $processor->next_token() ) { + $this->assertFalse( + $processor->is_html(), + 'Should have found an explicit token delimiter after the freeform content.' + ); + } else { + $this->assertFalse( + $processor->next_token(), + "Should have terminated after finding the trailing freeform content but found a '{$processor->get_printable_block_type()}' instead." + ); + } + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_freeform_blocks_and_delimiter_indices() { + return array( + 'Only non-block content (one freeform)' => array( 'this is not inside a block', 0 ), + 'Freeform before a block' => array( 'before the block', 0 ), + 'Freeform after a block' => array( 'after the block', 1 ), + 'Freeform between blocks' => array( 'after the block', 1 ), + 'Visits only freeform HTML, not inner HTML' => array( 'this is innerHTMLthis is freeform', 3 ), + ); + } + + /** + * Verifies that the freeform functions do not report freeform content + * when explicit delimiters are matched. + * + * @ticket 61401 + */ + public function test_actual_delimiters_are_not_freeform() { + $processor = new WP_Block_Processor( " \f\t\r\n " ); + + // Opening block. + + $this->assertTrue( + $processor->next_token(), + "Should have found opening 'group' test block: check test setup." 
+ ); + + $this->assertFalse( + $processor->is_html(), + "Should not have reported the opening 'group' block as freeform." + ); + + $this->assertFalse( + $processor->is_non_whitespace_html(), + "Should not have reported the opening 'group' block as non-whitespace freeform." + ); + + // Freeform block (implicit) opener. + + $this->assertTrue( + $processor->next_token(), + 'Should have found (implicit) freeform test block: check test setup.' + ); + + $this->assertTrue( + $processor->is_html(), + 'Should have reported the (implicit) opening freeform delimiter.' + ); + + $this->assertFalse( + $processor->is_non_whitespace_html(), + 'Should have reported the (implicit) opening freeform delimiter as whitespace-only.' + ); + + // Closing block. + + $this->assertTrue( + $processor->next_token(), + "Should have found closing 'group' test block: check test setup." + ); + + $this->assertFalse( + $processor->is_html(), + "Should not have reported the closing 'group' block as freeform." + ); + + $this->assertFalse( + $processor->is_non_whitespace_html(), + "Should not have reported the closing 'group' block as non-whitespace freeform." + ); + } + + /** + * Verifies that whitespace-only freeform content is properly indicated. + * + * This is used to skip over whitespace-only freeform content which is + * usually produced by {@see \serialize_blocks()} for clearer formatting. + * + * @ticket 61401 + */ + public function test_indicates_if_freeform_content_is_only_whitespace() { + $processor = new WP_Block_Processor( + << + +HTML + ); + + $this->assertTrue( + $processor->next_delimiter( '*' ), + 'Should have found the first freeform block: check test setup.' + ); + + $this->assertNull( + $processor->get_block_type(), + 'Should have identified the first (implicit) delimiter as freeform.' + ); + + $this->assertTrue( + $processor->is_html(), + 'Should have identified the first delimiter as (implicit) freeform.' 
+ ); + + $this->assertTrue( + $processor->is_non_whitespace_html(), + 'Should have identified that the freeform block contains non-whitespace content.' + ); + + $this->assertTrue( + $processor->next_delimiter(), + "Should have found the first explicit 'separator' delimiter: check test setup." + ); + + $this->assertSame( + 'core/separator', + $processor->get_block_type(), + "Should have found the 'separator' delimiter: check test setup." + ); + + $this->assertTrue( + $processor->next_delimiter( '*' ), + 'Should have found the second implicit freeform delimiter.' + ); + + $this->assertTrue( + $processor->is_html(), + 'Should have identified the second (implicit) freeform opening delimiter.' + ); + + $this->assertFalse( + $processor->is_non_whitespace_html(), + 'Should have identified that the second freeform block contains only whitespace content.' + ); + + $this->assertTrue( + $processor->next_delimiter(), + "Should have found the final 'ladder' delimiter." + ); + + $this->assertSame( + 'core/ladder', + $processor->get_block_type(), + "Should have identified the final delimiter as a 'core/ladder' type: check test setup." + ); + } + + /** + * Verifies that `get_attributes()` throws because it’s unsupported at the moment. + * + * This test should be changed if and when lazy attribute parsing is added. + * + * @ticket 61401 + */ + public function test_get_attributes_currently_throws() { + $processor = new WP_Block_Processor( '' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found the "test" setup delimiter but found nothing: check test setup.' + ); + + $this->assertSame( + array( 'not' => 'yet supported' ), + $processor->allocate_and_return_parsed_attributes(), + 'Should have read eagerly-parsed block attributes: check test setup.' 
+ ); + + $this->expectExceptionMessage( 'Lazy attribute parsing not yet supported' ); + $processor->get_attributes(); + } + + /** + * Verifies that the processor reports the appropriate string indices for each delimiter. + * + * @ticket 61401 + * + * @dataProvider data_content_and_delimiter_spans + * + * @param string $html Contains one or more block delimiters, + * including implicit freeform delimiters. + * @param int[] $spans For each delimiter in `$html`, a [ start, length ] + * pair representing the textual span of the delimiter. + */ + public function test_returns_appropriate_span_for_delimiters( $html, ...$spans ) { + $processor = new WP_Block_Processor( $html ); + + if ( 0 === count( $spans ) ) { + $this->assertNull( + $processor->get_span(), + 'Should not have returned any span when not matched on a delimiter.' + ); + return; + } + + foreach ( $spans as $i => $span ) { + $this->assertTrue( + $processor->next_token(), + "Should have found delimiter in position {$i} but found nothing: check test setup." + ); + + $reported = $processor->get_span(); + $this->assertSame( + $span, + array( $reported->start, $reported->length ), + 'Should have reported the proper span of text covered by the delimiter.' + ); + } + + $this->assertFalse( + $processor->next_token(), + 'Should not have found any additional delimiters: check test setup.' + ); + + $this->assertNull( + $processor->get_span(), + 'Should not have returned any span after finishing the scan of a document.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_content_and_delimiter_spans() { + return array( + 'Before matching' => array( 'Blocks advance yet' ), + 'Only freeform' => array( 'Have a lovely day.', array( 0, 18 ) ), + 'Only void' => array( '', array( 0, 23 ) ), + 'Mixed' => array( '<>', array( 0, 14 ), array( 14, 2 ), array( 16, 15 ) ), + ); + } + + // + // Test helpers. 
+ // + + /** + * Replaces whitespace in a string with visual indicators for easier debugging. + * + * Spaces, tabs, carriage returns, form feeds, and newlines are each replaced + * with a visible placeholder: `␠`, `␉`, `␍`, `␌`, and `␤`, respectively. + * + * The definition of “whitespace” here is loose and intended for debugging tests. + * It’s okay to expand for more complete replacement, for example to replace all + * graphemes considered whitespace by Unicode, but not required unless it’s + * essential for tests. + * + * Concerning HTML and the block parser, only the HTML whitespace is relevant. + * + * @param string $text Any input, potentially containing whitespace characters. + * @return string The input with whitespace replaced by visual placeholders. + */ + private static function unhide_whitespace( $text ) { + return str_replace( + array( ' ', "\t", "\r", "\f", "\n" ), + array( '␠', '␉', '␍', '␌', '␤' ), + $text + ); + } +}