From 0f3d8cfaad016b7c10081c64615ed8b9df7cb47a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 23 Jun 2025 20:30:05 -0500 Subject: [PATCH 1/3] Blocks: Introduce WP_Block_Scanner for efficiently parsing blocks. The Block Scanner follows the HTML API in providing a streaming, near-zero-overhead, lazy, re-entrant parser for traversing block structure. This class provides an alternate interface to `parse_blocks()` which is more amenable to a number of common server-side operations on posts, such as: - Generating an excerpt from only the first N blocks in a post. - Determining which block types are present in a post. - Determining which posts contain a block of a given type. - Generating block supports content for a post. --- src/wp-includes/blocks.php | 11 + src/wp-includes/class-wp-block-scanner.php | 1244 +++++++++++++++++ src/wp-settings.php | 1 + .../tests/block-scanner/wpBlockScanner.php | 1221 ++++++++++++++++ 4 files changed, 2477 insertions(+) create mode 100644 src/wp-includes/class-wp-block-scanner.php create mode 100644 tests/phpunit/tests/block-scanner/wpBlockScanner.php diff --git a/src/wp-includes/blocks.php b/src/wp-includes/blocks.php index d8ea09d09049b..00ba9f6e8212a 100644 --- a/src/wp-includes/blocks.php +++ b/src/wp-includes/blocks.php @@ -2376,6 +2376,17 @@ function render_block( $parsed_block ) { /** * Parses blocks out of a content string. * + * Given an HTML document, this function fully-parses block content, producing + * a tree of blocks and their contents, as well as top-level non-block content, + * which will appear as a block with no `blockName`. + * + * This function can be memory heavy for certain documents, particularly those + * with deeply-nested blocks or blocks with extensive attribute values. Further, + * this function must parse an entire document in one atomic operation. 
+ * + * If the entire parsed document is not necessary, consider using {@see WP_Block_Scanner} + * instead, as it provides a streaming and low-overhead interface for finding blocks. + * * @since 5.0.0 * * @param string $content Post content. diff --git a/src/wp-includes/class-wp-block-scanner.php b/src/wp-includes/class-wp-block-scanner.php new file mode 100644 index 0000000000000..18f3d22dfdda4 --- /dev/null +++ b/src/wp-includes/class-wp-block-scanner.php @@ -0,0 +1,1244 @@ +next_delimiter() ) { + * if ( $scanner->opens_block( 'core/image' ) ) { + * echo "Found an image!\n"; + * } + * } + * + * @see self::next_delimiter + * + * @since 6.9.0 + * + * @param string $source_text Input document potentially containing block content. + * @return ?self Created block scanner, if successfully created. + */ + public static function create( string $source_text ): ?self { + return new self( $source_text ); + } + + /** + * Scan to the next block delimiter in a document, indicating if one was found. + * + * Block comment delimiters must be valid HTML comments and may contain JSON. + * This search does not determine, however, if the JSON is valid. + * + * Example delimiters: + * + * `` + * `` + * `` + * + * In the case that a block comment delimiter contains both the void indicator and + * also the closing indicator, it will be treated as a void block. + * + * Example: + * + * // Find all image block opening delimiters. + * $images = array(); + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_delimiter() ) { + * if ( $scanner->opens_block( 'core/image' ) ) { + * $images[] = $scanner->get_span(); + * } + * } + * + * Not all blocks have explicit delimiters. Non-block content at the top-level of + * a document (so-called “HTML soup”) forms implicit blocks containing neither a + * block name nor block attributes. Because this content often comprises only + * HTML whitespace and adds undue performance burden, it is skipped by default. 
+ * To scan the implicit freeform blocks, pass the `$html_spans` argument. + * + * Example: + * + * $html = '\n'; + * $blocks = [ + * [ 'blockName' => 'core/void' ], + * [ 'blockName' => null ], + * [ 'blockName' => 'core/void' ], + * ]; + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_delimiter( html_spans: 'visit' ) { + * ... + * } + * + * In some cases it may be useful to conditionally visit the implicit freeform + * blocks, such as when determining if a post contains freeform content that + * isn’t purely whitespace. + * + * Example: + * + * $seen_block_types = []; + * $html_spans = 'visit-html'; + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_delimiter( html_spans: $html_spans ) { + * if ( ! $scanner->opens_block() ) { + * continue; + * } + * + * // Stop wasting time visiting freeform blocks after one has been found. + * if ('visit-html' === $html_spans ) { + * if ( $scanner->is_non_whitespace_freeform() ) { + * $html_spans = 'skip-html'; + * $seen_block_types['core/freeform'] = true; + * } + * continue; + * } + * + * $seen_block_types[ $scanner->get_block_type() ] = true; + * } + * + * @since 6.9.0 + * + * @param string|null $block_name Optional. Keep searching until a block of this name is found. + * Defaults to visit every block regardless of type. + * @return bool Whether a block delimiter was matched. + */ + public function next_delimiter( ?string $block_name = null ) { + if ( ! isset( $block_name ) ) { + return $this->next_token(); + } + + while ( $this->next_token() ) { + if ( $this->is_block_type( $block_name ) ) { + return true; + } + } + + return false; + } + + /** + * Scan to the next block delimiter in a document, indicating if one was found. + * + * Block comment delimiters must be valid HTML comments and may contain JSON. + * This search does not determine, however, if the JSON is valid. 
+ * + * Example delimiters: + * + * `<!-- wp:paragraph {"dropCap": true} -->` + * `<!-- wp:separator /-->` + * `<!-- /wp:paragraph -->` + * + * In the case that a block comment delimiter contains both the void indicator and + * also the closing indicator, it will be treated as a void block. + * + * Example: + * + * // Find all image block opening delimiters. + * $images = array(); + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_delimiter() ) { + * if ( $scanner->opens_block( 'core/image' ) ) { + * $images[] = $scanner->get_span(); + * } + * } + * + * Not all blocks have explicit delimiters. Non-block content at the top-level of + * a document (so-called “HTML soup”) forms implicit blocks containing neither a + * block name nor block attributes. Because this content often comprises only + * HTML whitespace and adds undue performance burden, it is skipped by default. + * To scan the implicit freeform blocks, pass the `$html_spans` argument. + * + * Example: + * + * $html = '<!-- wp:void /-->\n<!-- wp:void /-->'; + * $blocks = [ + * [ 'blockName' => 'core/void' ], + * [ 'blockName' => null ], + * [ 'blockName' => 'core/void' ], + * ]; + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_token( 'visit-html' ) ) { + * ... + * } + * + * In some cases it may be useful to conditionally visit the implicit freeform + * blocks, such as when determining if a post contains freeform content that + * isn’t purely whitespace. + * + * Example: + * + * $seen_block_types = []; + * $html_spans = 'visit-html'; + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_token( $html_spans ) ) { + * if ( ! $scanner->opens_block() ) { + * continue; + * } + * + * // Stop wasting time visiting freeform blocks after one has been found. 
+ * if ('visit-html' === $html_spans ) { + * if ( $scanner->is_non_whitespace_freeform() ) { + * $html_spans = 'skip-html'; + * $seen_block_types['core/freeform'] = true; + * } + * continue; + * } + * + * $seen_block_types[ $scanner->get_block_type() ] = true; + * } + * + * @since 6.9.0 + * + * @param 'visit-html'|'skip-html' $html_spans Optional. Pass `visit-html` to match freeform HTML content + * not surrounded by block delimiters. Defaults to `skip-html`. + * @return bool Whether a block delimiter was matched. + */ + public function next_token( $html_spans = 'skip-html' ) { + if ( $this->last_error ) { + return false; + } + + if ( static::HTML_OPEN === $this->state && 'visit-html' === $html_spans ) { + $this->state = static::HTML_CLOSE; + return true; + } + + $text = $this->source_text; + $end = strlen( $text ); + + if ( + static::HTML_CLOSE === $this->state || + ( + static::HTML_OPEN === $this->state && + 'visit-html' !== $html_spans + ) + ) { + if ( $this->delimiter_at < $end ) { + $this->state = static::MATCHED; + return true; + } else { + $this->state = static::COMPLETE; + return false; + } + } + + $this->state = static::READY; + $after_prev_delimiter = $this->delimiter_at + $this->delimiter_length; + $at = $after_prev_delimiter; + + while ( $at < $end ) { + /* + * Find the next possible opening. + * + * This follows the behavior in the official block parser, which treats a post + * as a list of blocks with nested HTML. If HTML comment syntax appears within + * an HTML attribute value, SCRIPT or STYLE element, or in other select places, + * which it can do inside of HTML, then the block parsing may break. + * + * For a more robust parse scan through the document with the HTML API. In + * practice, this has not been a problem in the entire history of blocks. 
+ */ + $comment_opening_at = strpos( $text, '', $json_at ); + if ( false === $comment_closing_at ) { + $this->state = static::COMPLETE; + $this->last_error = static::INCOMPLETE_INPUT; + return false; + } + + /* + * It looks like this logic leaves an error in here, when the position + * overlaps the JSON or block name. However, for neither of those is it + * possible to parse a valid block if that last overlapping character + * is the void flag. This, therefore, will be valid regardless of how + * the rest of the comment delimiter is written. + */ + if ( '/' === $text[ $comment_closing_at - 1 ] ) { + $has_void_flag = true; + $void_flag_length = 1; + } else { + $has_void_flag = false; + $void_flag_length = 0; + } + + /* + * If there's no JSON, then the span of text after the name + * until the comment closing must be completely whitespace. + */ + if ( ! $has_json ) { + // This must be a block delimiter! + $this->state = static::MATCHED; + break; + } + + // There's JSON, so attempt to find its boundary by scanning backwards from the comment closer. + $after_json_whitespace_length = 0; + for ( $char_at = $comment_closing_at - $void_flag_length - 1; $char_at > $json_at; $char_at-- ) { + $char = $text[ $char_at ]; + + switch ( $char ) { + case ' ': + case "\t": + case "\f": + case "\r": + case "\n": + ++$after_json_whitespace_length; + continue 2; + + case '}': + $json_length = $char_at - $json_at + 1; + break 2; + + default: + ++$at; + continue 3; + } + } + + if ( 0 === $json_length || 0 === $after_json_whitespace_length ) { + $at = $this->find_html_comment_end( $comment_opening_at, $end ); + continue; + } + + // This must be a block delimiter! 
+ $this->state = static::MATCHED; + break; + } + + if ( static::MATCHED !== $this->state ) { + return false; + } + + $this->next_token_at = $after_prev_delimiter; + + $this->delimiter_at = $comment_opening_at; + $this->delimiter_length = $comment_closing_at + 3 - $comment_opening_at; + + $this->namespace_at = $namespace_at; + $this->namespace_length = $namespace_length; + + $this->name_at = $name_at; + $this->name_length = $name_length; + + $this->json_at = $json_at; + $this->json_length = $json_length; + + /* + * When delimiters contain both the void flag and the closing flag + * they shall be interpreted as void blocks, per the spec parser. + */ + $this->type = $has_void_flag + ? static::VOID + : ( $has_closer ? static::CLOSER : static::OPENER ); + + $this->has_closing_flag = $has_closer; + + if ( 'visit-html' === $html_spans && $comment_opening_at > $after_prev_delimiter ) { + $this->state = static::HTML_OPEN; + } + + return true; + } + + /** + * Constructor function. + * + * @since 6.9.0 + */ + protected function __construct( string $source_text ) { + $this->source_text = $source_text; + } + + /** + * Returns the byte-offset after the ending character of an HTML comment, + * assuming the proper starting byte offset. + * + * @since 6.9.0 + * + * @param int $comment_starting_at Where the HTML comment started, the leading `<`. + * @param int $search_end Last offset in which to search, for limiting search span. + * @return int Offset after the current HTML comment ends, or `$search_end` if no end was found. + */ + private function find_html_comment_end( int $comment_starting_at, int $search_end ): int { + $text = $this->source_text; + + // Find span-of-dashes comments which look like ``. 
+ $span_of_dashes = strspn( $text, '-', $comment_starting_at + 2 ); + if ( + $comment_starting_at + 2 + $span_of_dashes < $search_end && + '>' === $text[ $comment_starting_at + 2 + $span_of_dashes ] + ) { + // Skip the two-byte `<!`, the run of dashes, and the closing `>` itself. + return $comment_starting_at + 2 + $span_of_dashes + 1; + } + + // Otherwise, there are other characters inside the comment, find the first `-->` or `--!>`. + $now_at = $comment_starting_at + 4; + while ( $now_at < $search_end ) { + $dashes_at = strpos( $text, '--', $now_at ); + if ( false === $dashes_at ) { + return $search_end; + } + + $closer_must_be_at = $dashes_at + 2 + strspn( $text, '-', $dashes_at + 2 ); + if ( $closer_must_be_at < $search_end && '!' === $text[ $closer_must_be_at ] ) { + $closer_must_be_at++; + } + + if ( $closer_must_be_at < $search_end && '>' === $text[ $closer_must_be_at ] ) { + return $closer_must_be_at + 1; + } + + // These dashes do not close the comment: resume just past their first byte instead of re-finding them from every earlier offset. + $now_at = $dashes_at + 1; + } + + return $search_end; + } + + /** + * Indicates if the last attempt to parse a block comment delimiter + * failed, if set, otherwise `null` if the last attempt succeeded. + * + * @since 6.9.0 + * + * @return string|null Error from last attempt at parsing next block delimiter, + * or `NULL` if last attempt succeeded. + */ + public function get_last_error(): ?string { + return $this->last_error; + } + + /** + * Indicates if the last attempt to parse a block’s JSON attributes failed. + * + * @see JSON_ERROR_NONE, JSON_ERROR_DEPTH, etc… + * + * @since 6.9.0 + * + * @return int JSON_ERROR_ code from last attempt to parse block JSON attributes. + */ + public function get_last_json_error(): int { + return $this->last_json_error; + } + + /** + * Returns the type of the block comment delimiter. + * + * One of: + * + * - `static::OPENER` + * - `static::CLOSER` + * - `static::VOID` + * - `null` + * + * @since 6.9.0 + * + * @return string|null type of the block comment delimiter, if currently matched. 
+ */ + public function get_delimiter_type(): ?string { + switch ( $this->state ) { + case static::HTML_OPEN: + return static::OPENER; + + case static::HTML_CLOSE: + return static::CLOSER; + + case static::MATCHED: + return $this->type; + + default: + return null; + } + } + + /** + * Returns whether the delimiter contains the closing flag. + * + * This should be avoided except in cases of handling errors with + * block closers containing the void flag. For normative use, + * {@see self::get_delimiter_type}. + * + * @since 6.9.0 + * + * @return bool Whether the currently-matched block delimiter contains the closing flag. + */ + public function has_closing_flag(): bool { + return $this->has_closing_flag; + } + + /** + * Indicates if the block delimiter represents a block of the given type. + * + * Since the "core" namespace may be implicit, it's allowable to pass + * either the fully-qualified block type with namespace and block name + * as well as the shorthand version only containing the block name, if + * the desired block is in the "core" namespace. + * + * Example: + * + * $is_core_paragraph = $scanner->is_block_type( 'paragraph' ); + * $is_core_paragraph = $scanner->is_block_type( 'core/paragraph' ); + * $is_formula = $scanner->is_block_type( 'math-block/formula' ); + * + * @param string $block_type Block type name for the desired block. + * E.g. "paragraph", "core/paragraph", "math-blocks/formula". + * @return bool Whether this delimiter represents a block of the given type. + */ + public function is_block_type( string $block_type ): bool { + // This is a core/freeform text block, it’s special. + if ( $this->is_html() ) { + return ( + 'core/freeform' === $block_type || + 'freeform' === $block_type + ); + } + + $slash_at = strpos( $block_type, '/' ); + if ( false === $slash_at ) { + $namespace = 'core'; + $block_name = $block_type; + } else { + // @todo Get lengths but avoid the allocation, use substr_compare below. 
+ $namespace = substr( $block_type, 0, $slash_at ); + $block_name = substr( $block_type, $slash_at + 1 ); + } + + // Only the 'core' namespace is allowed to be omitted. + if ( 0 === $this->namespace_length && 'core' !== $namespace ) { + return false; + } + + // If given an explicit namespace, they must match. + if ( + 0 !== $this->namespace_length && ( + strlen( $namespace ) !== $this->namespace_length || + 0 !== substr_compare( $this->source_text, $namespace, $this->namespace_at, $this->namespace_length ) + ) + ) { + return false; + } + + // The block name must match. + return ( + strlen( $block_name ) === $this->name_length && + 0 === substr_compare( $this->source_text, $block_name, $this->name_at, $this->name_length ) + ); + } + + /** + * Indicates if the matched delimiter is an opening or void delimiter + * (i.e. it opens the block) of the given type, if a type is provided. + * + * This is a helper method to ease handling of code inspecting where + * blocks start, and of checking if the blocks are of a given type. + * The function is variadic to allow for checking if the delimiter + * opens one of many possible block types. + * + * Example: + * + * $scanner = WP_Block_Scanner::create( $html ); + * while ( $scanner->next_delimiter() ) { + * if ( $scanner->opens_block( 'core/code', 'syntaxhighlighter/code' ) ) { + * echo "Found code!"; + * continue; + * } + * + * if ( $scanner->opens_block( 'core/image' ) ) { + * echo "Found an image!"; + * continue; + * } + * + * if ( $scanner->opens_block() ) { + * echo "Found a new block!"; + * } + * } + * + * @see self::is_block_type + * + * @since 6.9.0 + * + * @param string[] $block_type Optional. Is the matched block type one of these? + * If none are provided, will not test block type. + * @return bool Whether the matched block delimiter opens a block, and whether it + * opens a block of one of the given block types, if provided. 
+ */ + public function opens_block( string ...$block_type ): bool { + if ( static::HTML_CLOSE === $this->state ) { + return false; + } + + if ( static::CLOSER === $this->type && ! $this->is_html() ) { + return false; + } + + if ( count( $block_type ) === 0 ) { + return true; + } + + foreach ( $block_type as $block ) { + if ( $this->is_block_type( $block ) ) { + return true; + } + } + + return false; + } + + /** + * Indicates if the matched delimiter is implied due to top-level + * non-block content in the post. + * + * @see self::is_non_whitespace_html + * + * @since 6.9.0 + * + * @return bool Whether the scanner is matched on an HTML span. + */ + public function is_html(): bool { + return ( + static::HTML_OPEN === $this->state || + static::HTML_CLOSE === $this->state + ); + } + + /** + * Indicates if the matched delimiter is implicit and surrounds + * top-level non-block content containing non-whitespace text. + * + * Many block serializers introduce newlines between block delimiters, + * so the presence of top-level non-block content does not imply that + * there are “real” freeform HTML blocks. Checking if there is content + * beyond whitespace is a more certain check, such as for determining + * whether to load CSS for the freeform or fallback block type. + * + * @see self::is_html + * + * @return bool Whether the currently-matched delimiter is implicit and surround + * top-level non-block content containing non-whitespace text. + */ + public function is_non_whitespace_html(): bool { + if ( ! $this->is_html() ) { + return false; + } + + $length = $this->delimiter_at - $this->next_token_at; + + $whitespace_length = strspn( + $this->source_text, + " \t\f\r\n", + $this->next_token_at, + $length + ); + + return $whitespace_length !== $length; + } + + /** + * Returns the string content of an HTML span and advances the parser so that + * the next delimiter will be after the HTML span. 
+ * + * @since 6.9.0 + * + * @return string|null HTML content, or `NULL` if not currently matched on HTML. + */ + public function get_html_content_and_advance(): ?string { + if ( ! $this->is_html() ) { + return null; + } + + // Finish on the (implicit) freeform closing delimiter. This must request `visit-html` from next_token() directly: next_delimiter() only accepts a block name and would skip ahead to the following explicit delimiter. + if ( static::HTML_OPEN === $this->state ) { + $this->next_token( 'visit-html' ); + } + + return substr( + $this->source_text, + $this->next_token_at, + $this->delimiter_at - $this->next_token_at + ); + } + + /** + * Allocates a substring for the block type and returns the + * fully-qualified name, including the namespace. + * + * This function allocates a substring for the given block type. This + * allocation will be small and likely fine in most cases, but it's + * preferable to call {@link self::is_block_type} if only needing + * to know whether the delimiter is for a given block type, as that + * function is more efficient for this purpose and avoids the allocation. + * + * Example: + * + * // Avoid. + * 'core/paragraph' = $scanner->get_block_type(); + * + * // Prefer. + * $scanner->is_block_type( 'core/paragraph' ); + * $scanner->is_block_type( 'paragraph' ); + * + * @since 6.9.0 + * @todo What if there’s no matched block? + * + * @return string Fully-qualified block namespace and type, e.g. "core/paragraph". + */ + public function get_block_type(): ?string { + // This is a core/freeform text block, it’s special. + if ( $this->is_html() ) { + return 'core/freeform'; + } + + // This is implicitly in the "core" namespace. + if ( 0 === $this->namespace_length ) { + $block_name = substr( $this->source_text, $this->name_at, $this->name_length ); + return "core/{$block_name}"; + } + + return substr( $this->source_text, $this->namespace_at, $this->namespace_length + $this->name_length + 1 ); + } + + /** + * Returns a lazy wrapper around the block attributes, which can be used + * for efficiently interacting with the JSON attributes. 
+ * + * This stub hints that there should be a lazy interface for parsing + * block attributes but doesn’t define it. It serves both as a placeholder + * for one to come as well as a guard against implementing an eager + * function in its place. + * + * @see self::allocate_and_return_parsed_attributes() + * + * @throws Exception This function is a stub for subclasses to implement + * when providing streaming attribute parsing. + * + * @since 6.9.0 + * + * @return never + */ + public function get_attributes() { + throw new Exception( 'Lazy attribute parsing not yet supported' ); + } + + /** + * Attempts to parse and return the entire JSON attributes from the delimiter, + * allocating memory and processing the JSON span in the process. + * + * This does not return any parsed attributes for a closing block delimiter + * even if there is a span of JSON content; this JSON is a parsing error. + * + * Consider calling {@link self::get_attributes} instead if it's not + * necessary to read all the attributes at the same time, as that provides + * a more efficient mechanism for typical use cases. + * + * Since the JSON span inside the comment delimiter may not be valid JSON, + * this function will return `null` if it cannot parse the span and set the + * {@see self::get_last_json_error} to the appropriate JSON_ERROR_ constant. + * + * If the delimiter contains no JSON span, it will also return `null`, + * but the last error will be set to {@see JSON_ERROR_NONE}. 
+ * + * Example: + * + * $scanner = WP_Block_Scanner::create( '' ); + * $scanner->next_delimiter(); + * $memory_hungry_and_slow_attributes = $scanner->allocate_and_return_parsed_attributes(); + * $memory_hungry_and_slow_attributes === array( 'url' => 'https://wordpress.org/favicon.ico' ); + * + * $scanner = WP_Block_Scanner::create( '' ); + * $scanner->next_delimiter(); + * null === $scanner->allocate_and_return_parsed_attributes(); + * JSON_ERROR_NONE === $scanner->get_last_json_error(); + * + * $scanner = WP_Block_Scanner::create( '' ); + * $scanner->next_delimiter(); + * array() === $scanner->allocate_and_return_parsed_attributes(); + * + * $scanner = WP_Block_Scanner::create( '' ); + * $scanner->next_delimiter(); + * null === $scanner->allocate_and_return_parsed_attributes(); + * + * $scanner = WP_Block_Scanner::create( '' ); + * $scanner->next_delimiter(); + * null === $scanner->allocate_and_return_parsed_attributes(); + * JSON_ERROR_CTRL_CHAR === $scanner->get_last_json_error(); + * + * @since 6.9.0 + * + * @return array|null Parsed JSON attributes, if present and valid, otherwise `null`. + */ + public function allocate_and_return_parsed_attributes(): ?array { + $this->last_json_error = JSON_ERROR_NONE; + + if ( static::CLOSER === $this->type || $this->is_html() || 0 === $this->json_length ) { + return null; + } + + $json_span = substr( $this->source_text, $this->json_at, $this->json_length ); + $parsed = json_decode( $json_span, null, 512, JSON_OBJECT_AS_ARRAY | JSON_INVALID_UTF8_SUBSTITUTE ); + + $last_error = json_last_error(); + $this->last_json_error = $last_error; + + return ( JSON_ERROR_NONE === $last_error && is_array( $parsed ) ) + ? $parsed + : null; + } + + /** + * Returns the span representing the currently-matched delimiter, + * if matched, otherwise `null`. + * + * Note that for freeform blocks this will return a span of length + * zero, since there is no explicit block delimiter. 
+ * + * Example: + * + * $scanner = WP_Block_Scanner::create( '' ); + * null === $scanner->get_span(); + * + * $scanner->next_delimiter(); + * WP_HTML_Span( 0, 17 ) === $scanner->get_span(); + * + * @since 6.9.0 + * + * @return WP_HTML_Span|null Span of text in source text spanning matched delimiter. + */ + public function get_span(): ?WP_HTML_Span { + switch ( $this->state ) { + case static::HTML_OPEN: + return new WP_HTML_Span( $this->next_token_at, 0 ); + + case static::HTML_CLOSE: + return new WP_HTML_Span( $this->delimiter_at, 0 ); + + case static::MATCHED: + return new WP_HTML_Span( $this->delimiter_at, $this->delimiter_length ); + + default: + return null; + } + } + + // + // Constant declarations that would otherwise pollute the top of the class. + // + + /** + * Indicates that the block comment delimiter closes an open block. + * + * @see self::type + * + * @since 6.9.0 + */ + const CLOSER = 'closer'; + + /** + * Indicates that the block comment delimiter opens a block. + * + * @see self::type + * + * @since 6.9.0 + */ + const OPENER = 'opener'; + + /** + * Indicates that the block comment delimiter represents a void block + * with no inner content of any kind. + * + * @see self::type + * + * @since 6.9.0 + */ + const VOID = 'void'; + + /** + * Indicates that the scanner is ready to start parsing but hasn’t yet begun. + * + * @see self::state + * + * @since 6.9.0 + */ + const READY = 'scanner-ready'; + + /** + * Indicates that the scanner is matched on an explicit block delimiter. + * + * @see self::state + * + * @since 6.9.0 + */ + const MATCHED = 'scanner-matched'; + + /** + * Indicates that the scanner is matched on the opening of an implicit freeform delimiter. + * + * @see self::state + * + * @since 6.9.0 + */ + const HTML_OPEN = 'scanner-opening-freeform'; + + /** + * Indicates that the scanner is matched on the closing of an implicit freeform delimiter. 
+ * + * @see self::state + * + * @since 6.9.0 + */ + const HTML_CLOSE = 'scanner-closing-freeform'; + + /** + * Indicates that the parser started parsing a block comment delimiter, but + * the input document ended before it could finish. The document was likely truncated. + * + * @see self::state + * + * @since 6.9.0 + */ + const INCOMPLETE_INPUT = 'incomplete-input'; + + /** + * Indicates that the scanner has finished parsing and has nothing left to scan. + * + * @see self::state + * + * @since 6.9.0 + */ + const COMPLETE = 'scanner-complete'; +} diff --git a/src/wp-settings.php b/src/wp-settings.php index 9a175e71f0fa9..01062771a0c56 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -268,6 +268,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-stack-event.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; +require ABSPATH . WPINC . '/class-wp-block-scanner.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/block-scanner/wpBlockScanner.php b/tests/phpunit/tests/block-scanner/wpBlockScanner.php new file mode 100644 index 0000000000000..d80ce15104d06 --- /dev/null +++ b/tests/phpunit/tests/block-scanner/wpBlockScanner.php @@ -0,0 +1,1221 @@ +assertInstanceOf( + 'WP_Block_Scanner', + $scanner, + 'Failed to create the Block Scanner object for an empty string.' + ); + } + + /** + * Verifies that no block delimiters are found in an empty string. + * + * @ticket 61401 + */ + public function test_finds_no_block_delimiters_for_empty_string() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertFalse( + $scanner->next_delimiter(), + 'Should not have found any delimiters.' + ); + } + + /** + * Verifies that freeform delimiters are found when requested for + * posts with no block content. 
+ * + * @ticket 61401 + */ + public function test_finds_freeform_delimiters_in_post_without_blocks() { + $scanner = WP_Block_Scanner::create( 'This is non-block content.' ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the start of a freeform block but found nothing.' + ); + + $this->assertSame( + WP_Block_Scanner::OPENER, + $scanner->get_delimiter_type(), + 'Should have found an opening block delimiter.' + ); + + $this->assertSame( + 'core/freeform', + $scanner->get_block_type(), + 'Should have found the start of a freeform block.' + ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the end of a freeform block but found nothing.' + ); + + $this->assertSame( + WP_Block_Scanner::CLOSER, + $scanner->get_delimiter_type(), + 'Should have found a block closer.' + ); + + $this->assertSame( + 'core/freeform', + $scanner->get_block_type(), + 'Should have found a freeform block closer.' + ); + } + + /** + * Verifies that a post containing a single void block finds the block and nothing else. + * + * @ticket 61401 + */ + public function test_finds_post_of_void_block() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found a block delimiter but found nothing.' + ); + + $this->assertSame( + WP_Block_Scanner::VOID, + $scanner->get_delimiter_type(), + 'Should have found a void block delimiter.' + ); + + $this->assertSame( + 'core/recent-posts', + $scanner->get_block_type(), + 'Should have found a core/recent-posts void block.' + ); + } + + /** + * Verifies that a post containing a single basic block finds the block opener and closer. + * + * @ticket 61401 + */ + public function test_finds_open_and_close_of_post_with_basic_block() { + $scanner = WP_Block_Scanner::create( '

Content

' ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found an opening block delimiter but found nothing.' + ); + + $this->assertSame( + WP_Block_Scanner::OPENER, + $scanner->get_delimiter_type(), + 'Should have found an opening block delimiter.' + ); + + $this->assertTrue( + $scanner->opens_block( 'core/paragraph' ), + 'Should have found an opening core/paragraph delimiter.' + ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found a closing block delimiter but found nothing.' + ); + + $this->assertSame( + WP_Block_Scanner::CLOSER, + $scanner->get_delimiter_type(), + 'Should have found a closing block delimiter.' + ); + + $this->assertSame( + 'core/paragraph', + $scanner->get_block_type(), + 'Should have found a closing paragraph delimiter.' + ); + } + + /** + * Verifies that the parser refuses to parse the end of a document + * which partially contains what could be a block delimiter. + * + * @ticket 61401 + * + * @dataProvider data_partial_delimiter_endings + * + * @param string $html Input ending in a partial block delimiter. + */ + public function test_rejects_on_incomplete_inputs( $html ) { + $scanner = WP_Block_Scanner::create( "{$html}" ); + + $scanner->next_delimiter(); + $this->assertTrue( + $scanner->opens_block( 'test/canary' ), + 'Should have found the test/canary block: check test code setup.' + ); + + $this->assertFalse( + $scanner->next_delimiter(), + 'Should have failed to find any blocks after the test canary.' + ); + + $this->assertSame( + WP_Block_Scanner::INCOMPLETE_INPUT, + $scanner->get_last_error(), + 'Should have bailed because the input was incomplete.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_partial_delimiter_endings() { + $tests = array(); + + $delimiters = array( + 'opener' => '', + 'void' => '', + 'closer' => '', + ); + + foreach ( $delimiters as $kind => $delimiter ) { + for ( $i = strlen( $delimiter ) - 1; $i > 0; $i-- ) { + $partial = substr( $delimiter, 0, $i ); + $tests[ "{$kind}: {$partial}" ] = array( $partial ); + } + } + + return $tests; + } + + /** + * Verifies that it’s not possible to proceed after reaching an error. + * + * @ticket 61401 + */ + public function test_rejects_once_errored_out() { + $scanner = WP_Block_Scanner::create( '{$html}" ); + + $scanner->next_delimiter(); + $this->assertTrue( + $scanner->opens_block( 'tests/before' ), + "Should have found the 'tests/before' block before the invalid block delimiter but found a '{$scanner->get_block_type()}' instead." + ); + + $scanner->next_token( 'visit-html' ); + $this->assertTrue( + $scanner->opens_block( 'freeform' ), + "Should have found the malform block delimiter as an HTML comment, but found a '{$scanner->get_block_type()}' instead." + ); + + $scanner->next_delimiter(); + $this->assertTrue( + $scanner->opens_block( 'tests/after' ), + "Should have found the 'tests/after' block after the invalid block delimiter but found a '{$scanner->get_block_type()}' instead." + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_invalid_block_delimiters_as_html_comments() { + return array( + 'Shortest HTML comment' => array( '' ), + 'Span-of-dashes' => array( '' ), + 'Empty HTML comment' => array( '' ), + 'HTML comment with exclamation' => array( '' ), + 'No spaces, minimal info' => array( '' ), + 'No spaces, minimal info, void' => array( '' ), + 'No spaces, empty JSON' => array( '' ), + 'No spaces, empty JSON, void' => array( '' ), + 'No space before wp:' => array( '' ), + 'No space after name' => array( '' ), + 'No space before JSON' => array( '' ), + 'No space after JSON' => array( '' ), + 'Missing wp:' => array( '' ), + 'Malformed wp:' => array( '' ), + 'Malformed block namespace' => array( '' ), + 'Malformed block name' => array( '' ), + 'Invalid block name characters' => array( '' ), + ); + } + + /** + * Verifies that incomplete HTML comments which could not produce delimiters + * are not considered incomplete input by the scanner. + * + * Note that the block parsing operates first on block comment delimiters and + * then on HTML semantics. It’s technically possible for blocks to delimit + * invalid or non-well-formed HTML, so there’s no need to try and preserve + * other HTML boundaries in the parser the way the HTML API does. + * + * @ticket 61401 + * + * @dataProvider data_incomplete_html_comments_that_are_not_delimiters + * + * @param string $html Input containing an HTML comment that is both incomplete and + * cannot represent an incomplete block comment delimiter. + */ + public function test_unclosed_html_comment_non_delimiter_is_not_incomplete_input( $html ) { + $scanner = WP_Block_Scanner::create( "{$html}" ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found setup group block but found nothing: check test setup.' + ); + + $this->assertSame( + 'core/group', + $scanner->get_block_type(), + "Should have found setup 'group' block: check test setup." 
+ ); + + $this->assertFalse( + $scanner->next_delimiter(), + "Should have found no other delimiters given the incomplete HTML comment, but found a '{$scanner->get_block_type()}' instead." + ); + + $this->assertNull( + $scanner->get_last_error(), + 'Should have completed without reporting an error.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_incomplete_html_comments_that_are_not_delimiters() { + return array( + 'Opening and non-whitespace' => array( '" ); + + $scanner->next_delimiter(); + $this->assertTrue( + $scanner->opens_block( 'core/block' ), + 'Should have found the test block but found nothing instead.' + ); + + $parsed_data = $scanner->allocate_and_return_parsed_attributes(); + $exported_parsed_data = var_export( $parsed_data, true ); + $exported_parsed_data = self::unhide_whitespace( $exported_parsed_data ); + $this->assertNull( + $parsed_data, + "Should have failed to parse JSON attributes, but found '{$exported_parsed_data}' instead." + ); + + $this->assertNotNull( + $scanner->get_last_json_error(), + 'Should have reported an error when attempting to parse JSON attributes.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_invalid_block_json() { + return array( + 'Extra closing }' => array( '{}}' ), + 'Empty list' => array( '[]' ), + 'Non-empty list' => array( '[1, 2, 3]' ), + 'Nested list' => array( '[{"type": "broken"}]' ), + 'True' => array( 'true' ), + 'False' => array( 'false' ), + 'null' => array( 'null' ), + 'Number (36)' => array( '36' ), + 'Number (3.141e0)' => array( '3.141e0' ), + 'Unquoted string' => array( '{"name": block}' ), + 'Letters' => array( 'not_even_json' ), + ); + } + + /** + * Verifies that a delimiter with unterminated JSON is not treated as a delimiter. 
+ * + * @ticket 61401 + */ + public function test_does_not_match_block_with_unterminated_json() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertFalse( + $scanner->next_delimiter(), + "Should have failed to find block delimiter but found '{$scanner->get_block_type()}' instead." + ); + } + + /** + * Verifies that a delimiter with content after the JSON attributes is not treated as a delimiter. + * + * @ticket 61401 + */ + public function test_does_not_match_block_with_content_after_json() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertFalse( + $scanner->next_delimiter(), + "Should have failed to find block delimiter but found '{$scanner->get_block_type()}' instead." + ); + } + + /** + * Verifies that the appropriate block delimiter type is reported for a matched delimiter. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_types + * + * @param string $html Contains a single block delimiter. + * @param string $delimiter_type Expected type of delimiter. + */ + public function test_reports_proper_delimiter_type( $html, $delimiter_type ) { + $scanner = WP_Block_Scanner::create( $html ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found test block delimiter but found nothing instead.' + ); + + $this->assertSame( + $delimiter_type, + $scanner->get_delimiter_type(), + 'Failed to match the expected delimiter type (opener/closer/void)' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_delimiters_and_their_types() { + return array( + 'Void' => array( '', WP_Block_Scanner::VOID ), + 'Void, full name' => array( '', WP_Block_Scanner::VOID ), + 'Opener' => array( '', WP_Block_Scanner::OPENER ), + 'Opener, full name' => array( '', WP_Block_Scanner::OPENER ), + 'Closer' => array( '', WP_Block_Scanner::CLOSER ), + 'Closer, full name' => array( '', WP_Block_Scanner::CLOSER ), + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` before finding any delimiters. + * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_before_scanning() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertNull( + $scanner->get_delimiter_type(), + 'Should not have returned a delimiter type before matching any delimiters.' + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` after scanning the last delimiter. + * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_after_scanning() { + $scanner = WP_Block_Scanner::create( '' ); + + while ( $scanner->next_delimiter( null, 'visit-html' ) ) { + continue; + } + + $this->assertNull( + $scanner->get_delimiter_type(), + 'Should not have returned a delimiter type before matching any delimiters.' + ); + } + + /** + * Verifies that `get_delimiter_type()` returns `null` after encountering an error. 
+ * + * @ticket 61401 + */ + public function test_reports_no_delimiter_type_after_an_error() { + $scanner = WP_Block_Scanner::create( '', 'core/group' ), + 'Void, core/group' => array( '', 'core/group' ), + 'Closer, core/group' => array( '', 'core/group' ), + 'Opener, group' => array( '', 'core/group' ), + 'Void, group' => array( '', 'core/group' ), + 'Closer, group' => array( '', 'core/group' ), + 'Opener, my/group' => array( '', 'my/group' ), + 'Void, thy/group' => array( '', 'thy/group' ), + 'Closer, the5/group' => array( '', 'the-5/group' ), + ); + } + + /** + * Verifies that the presence of the void flag is properly reported + * regardless of a conflict with a closing block. + * + * When block delimiters contain both the void flag and the closing flag, + * it shall be interpreted as a void block to match the behavior of + * the spec parser, but the block scanner exposes the closing flag to + * allow for user-space code to make its own determination. + * + * @ticket 61401 + */ + public function test_reports_presence_of_void_flag() { + $html = ''; + $scanner = WP_Block_Scanner::create( $html ); + + // Test the opening delimiter. + + $this->assertTrue( + $scanner->next_delimiter(), + "Should have found opening 'void-and-closed' block delimiter but found nothing: check test setup." + ); + + $this->assertSame( + 'core/void-and-closed', + $scanner->get_block_type(), + "Should have found opening 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Scanner::OPENER, + $scanner->get_delimiter_type(), + 'Should have found an opening delimiter: check test setup.' + ); + + $this->assertFalse( + $scanner->has_closing_flag(), + 'Should not have indicated the presence of the closing flag on an opening block.' + ); + + // Test the void delimiter. + + $this->assertTrue( + $scanner->next_delimiter(), + "Should have found the void 'void' block delimiter but found nothing: check test setup." 
+ ); + + $this->assertSame( + 'core/void', + $scanner->get_block_type(), + "Should have found opening 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Scanner::VOID, + $scanner->get_delimiter_type(), + 'Should have found a void delimiter: check test setup.' + ); + + $this->assertFalse( + $scanner->has_closing_flag(), + 'Should not have indicated the presence of the closing flag on the pure void block.' + ); + + // Test the void/closing delimiter. + + $this->assertTrue( + $scanner->next_delimiter(), + "Should have found closing 'void-and-closed' block delimiter but found nothing: check test setup." + ); + + $this->assertSame( + 'core/void-and-closed', + $scanner->get_block_type(), + "Should have found closing 'void-and-closed' block delimiter: check test setup." + ); + + $this->assertSame( + WP_Block_Scanner::VOID, + $scanner->get_delimiter_type(), + 'Should have found a closing delimiter: check test setup.' + ); + + $this->assertTrue( + $scanner->has_closing_flag(), + 'Should have indicated the presence of the closing flag on a block with both the closing and void flags.' + ); + } + + /** + * Verifies that the scanner indicates if the currently-matched delimiter + * is of a given block type. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type. + */ + public function test_reports_if_block_is_of_type( $html, $block_type ) { + $scanner = WP_Block_Scanner::create( $html ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found test block delimiter but found nothing instead: check test setup.' + ); + + $this->assertTrue( + $scanner->is_block_type( $block_type ), + "Should have found the block to be of type '{$block_type}', detected type is '{$scanner->get_block_type()}'." 
+ ); + + if ( str_starts_with( $block_type, 'core/' ) ) { + // Prune off core namespace and detect implicit namespace. + $block_type = substr( $block_type, strlen( 'core/' ) ); + + $this->assertTrue( + $scanner->is_block_type( $block_type ), + "Should have found the block to be of core type '{$block_type}', detected type is '{$scanner->get_block_type()}'." + ); + } + } + + /** + * Verifies that the scanner indicates if the currently-matched delimiter + * opens a block of a given block type. This is true for openers and void delimiters. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type. + */ + public function test_reports_if_block_opens_type( $html, $block_type ) { + $scanner = WP_Block_Scanner::create( $html ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found test block delimiter but found nothing instead: check test setup.' + ); + + if ( WP_Block_Scanner::CLOSER === $scanner->get_delimiter_type() ) { + $this->assertFalse( + $scanner->opens_block( $block_type ), + 'Should not have indicated that a closing delimiter opens a block.' + ); + return; + } + + $this->assertTrue( + $scanner->opens_block( $block_type ), + "Should have indicating opening of type '{$block_type}', detected type is '{$scanner->get_block_type()}'." + ); + + if ( str_starts_with( $block_type, 'core/' ) ) { + // Prune off core namespace and detect implicit namespace. + $block_type = substr( $block_type, strlen( 'core/' ) ); + + $this->assertTrue( + $scanner->opens_block( $block_type ), + "Should have indicated opening of core type '{$block_type}', detected type is '{$scanner->get_block_type()}'." + ); + } + } + + /** + * Verifies that asking if a delimiter opens a block ignores the block type + * if none are provided in the explicit limiting list. 
+ * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_their_block_types + * + * @param string $html Contains a single delimiter. + * @param string $block_type Fully-qualified block type (ignored but present due to shared data provider). + */ + public function test_opens_block_with_no_explicit_types_ignores_block_type( $html, $block_type ) { + $scanner = WP_Block_Scanner::create( $html ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found test block delimiter but found nothing instead: check test setup.' + ); + + if ( WP_Block_Scanner::CLOSER === $scanner->get_delimiter_type() ) { + $this->assertFalse( + $scanner->opens_block(), + 'Should not have indicated that a closing delimiter opens a block.' + ); + } else { + $this->assertTrue( + $scanner->opens_block(), + "Should have indicated that a '{$scanner->get_delimiter_type()}' delimiter opens a block." + ); + } + } + + /** + * Verifies that when given multiple potential block types, that `opens_block()` properly + * indicates if the currently-matched block is an opening for at least one of them. + * + * @ticket 61401 + * + * @dataProvider data_delimiters_and_sets_of_block_types + * + * @param string $html Contains a single block delimiter. + * @param string[] $block_types Contains one or more block types, fully qualified or not. + * @param bool $is_a_match Indicates if the provided HTML contains a block of the type in the given set. + */ + public function test_opens_block_checks_against_multiple_provided_block_types( $html, $block_types, $is_a_match ) { + $scanner = WP_Block_Scanner::create( $html ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found test setup block but found none: check test setup.' 
+ ); + + $joined_types = implode( ', ', $block_types ); + + if ( $is_a_match ) { + $this->assertTrue( + $scanner->opens_block( ...$block_types ), + "Should have found that delimiter (type {$scanner->get_block_type()}) opens one of the following block types: {$joined_types}." + ); + } else { + $this->assertFalse( + $scanner->opens_block( ...$block_types ), + "Should not have found that delimiter (type {$scanner->get_block_type()}) opens one of the following block types: {$joined_types}." + ); + } + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_delimiters_and_sets_of_block_types() { + return array( + // Positive matches. + 'type "test", first in set' => array( '', array( 'test', 'tossed', 'tess' ), true ), + 'type "core/test", first in set' => array( '', array( 'test', 'tossed', 'tess' ), true ), + 'type "test", middle of set' => array( '', array( 'tust', 'test', 'tossed', 'tess' ), true ), + 'type "core/test", middle of set' => array( '', array( 'tust', 'test', 'tossed', 'tess' ), true ), + 'type "test", last in set' => array( '', array( 'tust', 'tossed', 'tess', 'test' ), true ), + 'type "core/test", last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'type "test", core/test first in set' => array( '', array( 'core/test', 'tossed', 'tess' ), true ), + 'type "core/test", core/test first in set' => array( '', array( 'core/test', 'tossed', 'tess' ), true ), + 'type "test", core/test in middle of set' => array( '', array( 'tust', 'core/test', 'tossed', 'tess' ), true ), + 'type "core/test", core/test in middle of set' => array( '', array( 'tust', 'core/test', 'tossed', 'tess' ), true ), + 'type "test", core/test last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'type "core/test", core/test last in set' => array( '', array( 'tust', 'tossed', 'tess', 'core/test' ), true ), + 'non-core, fully-qualified' => array( '', array( 'test/ship', 'test/block', 'test/wheel' 
), true ), + + // Negative matches. + 'type "test", not in set' => array( '', array( 'text', 'core/text', 'my/test' ), false ), + 'type "core/test", not in set' => array( '', array( 'text', 'core/text', 'my/test' ), false ), + 'type "next-dev/code", not in set' => array( '', array( 'code', 'new/code', 'dev/code' ), false ), + ); + } + + /** + * Verifies that when scanning and visiting freeform blocks, that they + * return the appropriate information, an opening, and a closing. + * + * @ticket 61401 + * + * @dataProvider data_freeform_blocks_and_delimiter_indices + * + * @param string $html Contains a freeform block after zero or more delimiters. + * @param int $freeform_at Freeform is located after this many other delimiters. + */ + public function test_indicates_if_matched_delimiter_is_freeform( $html, $freeform_at ) { + $scanner = WP_Block_Scanner::create( $html ); + + for ( $i = 0; $i < $freeform_at; $i++ ) { + $scanner->next_token( 'visit-html' ); + } + + // Opening delimiter. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the freeform content but didn’t: check test setup.' + ); + + $this->assertSame( + 'core/freeform', + $scanner->get_block_type(), + 'Should have found a freeform block.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have identified the delimiter as freeform.' + ); + + $this->assertSame( + WP_Block_Scanner::OPENER, + $scanner->get_delimiter_type(), + 'Should have stopped first on a freeform block opener.' + ); + + $this->assertTrue( + $scanner->opens_block( 'freeform' ), + 'Should indicate that this (implicit) delimiter opens a freeform block (without the Core namespace).' + ); + + $this->assertTrue( + $scanner->is_block_type( 'freeform' ), + 'Should indicate that this (implicit) delimiter is freeform (without the Core namespace).' + ); + + $this->assertTrue( + $scanner->opens_block( 'core/freeform' ), + 'Should indicate that this (implicit) delimiter opens a freeform block (fully-qualified).' 
+ ); + + $this->assertTrue( + $scanner->is_block_type( 'core/freeform' ), + 'Should indicate that this (implicit) delimiter is freeform (fully-qualified).' + ); + + $this->assertNull( + $scanner->allocate_and_return_parsed_attributes(), + 'Should not find any attributes on any freeform content.' + ); + + // Closing delimiter. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the closing (implicit) freeform delimiter but found nothing instead.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have identified the delimiter as freeform.' + ); + + $this->assertSame( + WP_Block_Scanner::CLOSER, + $scanner->get_delimiter_type(), + 'Should have found the closing (implicit) freeform delimiter.' + ); + + $this->assertSame( + 'core/freeform', + $scanner->get_block_type(), + 'Should have found the freeform block type.' + ); + + $this->assertFalse( + $scanner->opens_block( 'core/freeform' ), + 'Should not indicate that the (implicit) freeform closing delimiter opens a block.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_freeform_blocks_and_delimiter_indices() { + return array( + 'Only non-block content (one freeform)' => array( 'this is not inside a block', 0 ), + 'Freeform before a block' => array( 'before the block', 0 ), + 'Freeform after a block' => array( 'after the block', 1 ), + 'Freeform between blocks' => array( 'after the block', 1 ), + 'Visits HTML content inside blocks' => array( 'this is innerHTMLthis is freeform', 1 ), + ); + } + + /** + * Verifies that the freeform functions do not report freeform content + * when explicit delimiters are matched. + * + * @ticket 61401 + */ + public function test_actual_delimiters_are_not_freeform() { + $scanner = WP_Block_Scanner::create( " \f\t\r\n " ); + + // Opening block. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + "Should have found opening 'group' test block: check test setup." 
+ ); + + $this->assertFalse( + $scanner->is_html(), + "Should not have reported the opening 'group' block as freeform." + ); + + $this->assertFalse( + $scanner->is_non_whitespace_html(), + "Should not have reported the opening 'group' block as non-whitespace freeform." + ); + + // Freeform block (implicit) opener. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found (implicit) freeform test block: check test setup.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have reported the (implicit) opening freeform delimiter.' + ); + + $this->assertFalse( + $scanner->is_non_whitespace_html(), + 'Should have reported the (implicit) opening freeform delimiter as whitespace-only.' + ); + + // Freeform block (implicit) closer. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found (implicit) freeform test block closer: check test setup.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have reported the (implicit) closing freeform delimiter.' + ); + + $this->assertFalse( + $scanner->is_non_whitespace_html(), + 'Should have reported the (implicit) closing freeform delimiter as whitespace-only.' + ); + + // Closing block. + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + "Should have found closing 'group' test block: check test setup." + ); + + $this->assertFalse( + $scanner->is_html(), + "Should not have reported the closing 'group' block as freeform." + ); + + $this->assertFalse( + $scanner->is_non_whitespace_html(), + "Should not have reported the closing 'group' block as non-whitespace freeform." + ); + } + + /** + * Verifies that whitespace-only freeform content is properly indicated. + * + * This is used to skip over whitespace-only freeform content which is + * usually produced by {@see \serialize_blocks()} for clearer formatting. 
+ * + * @ticket 61401 + */ + public function test_indicates_if_freeform_content_is_only_whitespace() { + $scanner = WP_Block_Scanner::create( + << + +HTML + ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the first freeform block: check test setup.' + ); + + $this->assertSame( + 'core/freeform', + $scanner->get_block_type(), + 'Should have identified the first (implicit) delimiter as freeform.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have identified the first delimiter as (implicit) freeform.' + ); + + $this->assertTrue( + $scanner->is_non_whitespace_html(), + 'Should have identified that the freeform block contains non-whitespace content.' + ); + + $this->assertTrue( + $scanner->next_token( 'skip-html' ), + "Should have found the first explicit 'separator' delimiter: check test setup." + ); + + $this->assertSame( + 'core/separator', + $scanner->get_block_type(), + "Should have found the 'separator' delimiter: check test setup." + ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the second implicit freeform delimiter.' + ); + + $this->assertTrue( + $scanner->is_html(), + 'Should have identified the second (implicit) freeform opening delimiter.' + ); + + $this->assertFalse( + $scanner->is_non_whitespace_html(), + 'Should have identified that the second freeform block contains only whitespace content.' + ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + 'Should have found the second implicit freeform closing delimiter' + ); + + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + "Should have found the final 'ladder' delimiter." + ); + + $this->assertSame( + 'core/ladder', + $scanner->get_block_type(), + "Should have identified the final delimiter as a 'core/ladder' type: check test setup." + ); + } + + /** + * Verifies that `get_attributes()` throws because it’s unsupported at the moment. 
+ * + * This test should be changed if and when lazy attribute parsing is added. + * + * @ticket 61401 + */ + public function test_get_attributes_currently_throws() { + $scanner = WP_Block_Scanner::create( '' ); + + $this->assertTrue( + $scanner->next_delimiter(), + 'Should have found the "test" setup delimiter but found nothing: check test setup.' + ); + + $this->assertSame( + array( 'not' => 'yet supported' ), + $scanner->allocate_and_return_parsed_attributes(), + 'Should have read eagerly-parsed block attributes: check test setup.' + ); + + $this->expectExceptionMessage( 'Lazy attribute parsing not yet supported' ); + $scanner->get_attributes(); + } + + /** + * Verifies that the scanner reports the appropriate string indices for each delimiter. + * + * @ticket 61401 + * + * @dataProvider data_content_and_delimiter_spans + * + * @param string $html Contains one or more block delimiters, + * including implicit freeform delimiters. + * @param int[] $spans For each delimiter in `$html`, a [ start, length ] + * pair representing the textual span of the delimiter. + */ + public function test_returns_appropriate_span_for_delimiters( $html, ...$spans ) { + $scanner = WP_Block_Scanner::create( $html ); + + if ( 0 === count( $spans ) ) { + $this->assertNull( + $scanner->get_span(), + 'Should not have returned any span when not matched on a delimiter.' + ); + return; + } + + foreach ( $spans as $i => $span ) { + $this->assertTrue( + $scanner->next_token( 'visit-html' ), + "Should have found delimiter in position {$i} but found nothing: check test setup." + ); + + $reported = $scanner->get_span(); + $this->assertSame( + $span, + array( $reported->start, $reported->length ), + 'Should have reported the proper span of text covered by the delimiter.' + ); + } + + $this->assertFalse( + $scanner->next_token( 'visit-html' ), + 'Should not have found any additional delimiters: check test setup.' 
+ ); + + $this->assertNull( + $scanner->get_span(), + 'Should not have returned any span after finishing the scan of a document.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_content_and_delimiter_spans() { + return array( + 'Before matching' => array( 'Blocks advance yet' ), + 'Only freeform' => array( 'Have a lovely day.', array( 0, 0 ), array( 18, 0 ) ), + 'Only void' => array( '', array( 0, 23 ) ), + 'Mixed' => array( '<>', array( 0, 14 ), array( 14, 0 ), array( 16, 0 ), array( 16, 15 ) ), + ); + } + + // + // Test helpers. + // + + /** + * Replaces whitespace in a string with visual indicators for easier debugging. + * + * The definition of “whitespace” here is loose and intended for debugging tests. + * It’s okay to expand for more complete replacement, for example to replace all + * graphemes considered whitespace by Unicode, but not required unless it’s + * essential for tests. + * + * Concerning HTML and the block parser only the HTML whitespace is relevant. + * + * @param string $text Any input, potentially containing whitespace characters. + * @return string The input with whitespace replaced by visual placeholders. 
+ */ + private static function unhide_whitespace( $text ) { + return str_replace( + array( ' ', "\t", "\r", "\f", "\n" ), + array( '␠', '␉', '␍', '␌', '␤' ), + $text + ); + } +} From 85452871c61321a5c48f688fb3a94b16e051ca45 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 2 Oct 2025 16:22:51 -0500 Subject: [PATCH 2/3] No type annotation to protect subclasses --- src/wp-includes/class-wp-block-scanner.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/class-wp-block-scanner.php b/src/wp-includes/class-wp-block-scanner.php index 18f3d22dfdda4..3d2f4067daf09 100644 --- a/src/wp-includes/class-wp-block-scanner.php +++ b/src/wp-includes/class-wp-block-scanner.php @@ -220,7 +220,7 @@ class WP_Block_Scanner { * @param string $source_text Input document potentially containing block content. * @return ?self Created block scanner, if successfully created. */ - public static function create( string $source_text ): mixed { + public static function create( string $source_text ) { return new self( $source_text ); } From b56da74e08178a5adda78419857c1e0fa7653957 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 13 Aug 2025 21:51:02 -0500 Subject: [PATCH 3/3] Introduce WP_Block_Processor class. 
--- src/wp-includes/class-wp-block-processor.php | 122 ++++++++++++++++++ src/wp-settings.php | 1 + .../tests/block-scanner/wpBlockProcessor.php | 108 ++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 src/wp-includes/class-wp-block-processor.php create mode 100644 tests/phpunit/tests/block-scanner/wpBlockProcessor.php diff --git a/src/wp-includes/class-wp-block-processor.php b/src/wp-includes/class-wp-block-processor.php new file mode 100644 index 0000000000000..666c190c327e4 --- /dev/null +++ b/src/wp-includes/class-wp-block-processor.php @@ -0,0 +1,122 @@ +was_void ) { + array_pop( $this->open_blocks ); + $this->was_void = false; + } + + if ( false === parent::next_token( $html_spans ) ) { + return false; + } + + $block_type = $this->get_block_type(); + + switch ( $this->get_delimiter_type() ) { + case parent::OPENER: + $this->open_blocks[] = $block_type; + break; + + case parent::CLOSER: + $closed_block = array_pop( $this->open_blocks ); + if ( $block_type !== $closed_block ) { + throw new Error( 'Tried to close a block of another type.' ); + } + break; + + case parent::VOID: + $this->open_blocks[] = $block_type; + $this->was_void = true; + break; + } + + if ( isset( $block_type ) && ! $this->is_block_type( $block_type ) ) { + return $this->next_delimiter( $block_type, $html_spans ); + } + + return true; + } + + public function get_breadcrumbs() { + return $this->open_blocks; + } + + /** + * Returns the depth of the open blocks where the processor is currently matched. + * + * Depth increases before visiting openers and void blocks, + * and decreases before visiting closers. + * @return int|null + */ + public function get_depth() { + return count( $this->open_blocks ); + } + + /** + * Extracts a block object starting at a matched block delimiter opener. + * + * @todo Use iteration instead of recursion, or at least refactor to tail-call form. 
+ * + * @return array|null + */ + public function extract_block() { + if ( parent::MATCHED !== $this->state || ! $this->opens_block() ) { + return null; + } + + $block = array( + 'blockName' => $this->get_block_type(), + 'attrs' => $this->allocate_and_return_parsed_attributes() ?? array(), + 'innerHTML' => '', + 'innerBlocks' => array(), + 'innerContent' => array(), + ); + + $depth = $this->get_depth(); + $child_depth = $depth + 1; + while ( $this->next_delimiter( null, 'visit-html' ) && $this->get_depth() > $depth ) { + if ( $this->get_depth() === $child_depth && $this->opens_block( 'core/freeform' ) ) { + $chunk = $this->get_html_content_and_advance(); + $block['innerHTML'] .= $chunk; + + $last_chunk = count( $block['innerContent'] ) - 1; + if ( isset( $block['innerContent'][ $last_chunk ] ) ) { + $block['innerContent'][ count( $block['innerContent'] ) - 1 ] = $chunk; + } else { + $block['innerContent'][] = $chunk; + } + + continue; + } + + /** + * Inner blocks. + * + * @todo This is a decent place to call {@link \render_block()} + */ + if ( $this->opens_block() ) { + $inner_block = $this->extract_block(); + $block['innerBlocks'][] = $inner_block; + $block['innerContent'][] = null; + } + } + + if ( empty( $block['innerBlocks'] ) ) { + unset( $block['innerBlocks'] ); + } + + return $block; + } +} diff --git a/src/wp-settings.php b/src/wp-settings.php index 01062771a0c56..39ac7aa260e0e 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -269,6 +269,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; require ABSPATH . WPINC . '/class-wp-block-scanner.php'; +require ABSPATH . WPINC . '/class-wp-block-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . 
'/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/block-scanner/wpBlockProcessor.php b/tests/phpunit/tests/block-scanner/wpBlockProcessor.php new file mode 100644 index 0000000000000..6c1298db0faf5 --- /dev/null +++ b/tests/phpunit/tests/block-scanner/wpBlockProcessor.php @@ -0,0 +1,108 @@ +assertInstanceOf( 'WP_Block_Processor', $processor ); + } + + public function test_get_breadcrumbs() { + $processor = WP_Block_Processor::create( '' ); + + $this->assertTrue( + $processor->next_delimiter(), + 'Should have found the opening "top" delimiter but found nothing.' + ); + + $this->assertSame( + array( 'core/top' ), + $processor->get_breadcrumbs(), + 'Should have found only the single opening delimiter.' + ); + + $processor->next_delimiter(); + $this->assertSame( + array( 'core/top', 'core/inside' ), + $processor->get_breadcrumbs(), + 'Should have detected the nesting structure of the blocks.' + ); + } + + public function test_get_depth() { + // Create a deeply-nested stack of blocks. + $html = ''; + $max_depth = 10; + + for ( $i = 0; $i < $max_depth; $i++ ) { + $html .= "\n"; + } + + for ( $i = 0; $i < $max_depth; $i++ ) { + $html .= "\n"; + } + + $processor = WP_Block_Processor::create( $html ); + $n = new NumberFormatter( 'en-US', NumberFormatter::ORDINAL ); + + for ( $i = 0; $i < $max_depth; $i++ ) { + $this->assertTrue( + $processor->next_delimiter(), + "Should have found {$n->format( $i + 1 )} opening delimiter: check test setup." + ); + + $this->assertSame( + $i + 1, + $processor->get_depth(), + "Should have identified the proper depth of the {$n->format( $i + 1 )} opening delimiter." + ); + } + + for ( $i = 0; $i < $max_depth; $i++ ) { + $this->assertTrue( + $processor->next_delimiter(), + "Should have found {$n->format( $i + 1 )} closing delimiter: check test setup." + ); + + $this->assertSame( + $max_depth - $i - 1, + $processor->get_depth(), + "Should have identified the proper depth of the {$n->format( $i + 1 )} closing delimiter." 
+ ); + } + } + + public function test_builds_block() { + $cover_block = [ 'blockName' => 'core/cover', 'attrs' => [], 'innerHTML' => '', 'innerContent' => [ '' ] ]; + $heading_block = [ 'blockName' => 'core/heading', 'attrs' => [ 'level' => 2 ], 'innerHTML' => '

Testing works!

', 'innerContent' => [ '

Testing works!

' ] ]; + $paragraph_block = [ 'blockName' => 'core/paragraph', 'attrs' => [], 'innerHTML' => '

Who knew?

', 'innerContent' => [ '

Who knew?

' ] ]; + $group_block = [ 'blockName' => 'core/group', 'attrs' => [], 'innerHTML' => '', 'innerBlocks' => [ $heading_block, $paragraph_block ], 'innerContent' => [ null, null ] ]; + $blocks = [ $cover_block, $group_block ]; + $html = serialize_blocks( $blocks ); + + $processor = WP_Block_Processor::create( $html ); + + $extracted = array(); + while ( $processor->next_delimiter() ) { + $extracted[] = $processor->extract_block(); + } + + $this->assertSame( + $blocks, + $extracted, + 'Should have extracted a block matching the input group block.' + ); + } +}