ng elements with attributes, find a way
	 * to indicate if the virtually-reconstructed formatting elements contain the
	 * wanted class name.
	 *
	 * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
	 * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
	 */
	public function has_class( $wanted_class ): ?bool {
		// No class lookup is performed for virtual tokens.
		if ( $this->is_virtual() ) {
			return null;
		}

		return parent::has_class( $wanted_class );
	}

	/**
	 * Generator for a foreach loop to step through each class name for the matched tag.
	 *
	 * This generator function is designed to be used inside a "foreach" loop.
	 *
	 * Note that `null` is returned instead of an iterable when the current
	 * token is virtual, so callers that may be stopped on a virtual token
	 * should check the result before looping over it.
	 *
	 * Example:
	 *
	 *     $p = WP_HTML_Processor::create_fragment( "<div class='free lang-en'>" );
	 *     $p->next_tag();
	 *     foreach ( $p->class_list() as $class_name ) {
	 *         echo "{$class_name} ";
	 *     }
	 *     // Outputs: "free lang-en "
	 *
	 * @since 6.6.0 Subclassed for the HTML Processor.
	 */
	public function class_list() {
		if ( $this->is_virtual() ) {
			return null;
		}

		return parent::class_list();
	}

	/**
	 * Returns the modifiable text for a matched token, or an empty string.
	 *
	 * Modifiable text is text content that may be read and changed without
	 * changing the HTML structure of the document around it. This includes
	 * the contents of `#text` nodes in the HTML as well as the inner
	 * contents of HTML comments, Processing Instructions, and others, even
	 * though these nodes aren't part of a parsed DOM tree. They also contain
	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
	 * other section in an HTML document which cannot contain HTML markup (DATA).
	 *
	 * If a token has no modifiable text then an empty string is returned to
	 * avoid needless crashing or type errors. An empty string does not mean
	 * that a token has modifiable text, and a token with modifiable text may
	 * have an empty string (e.g. a comment with no contents).
	 *
	 * @since 6.6.0 Subclassed for the HTML Processor.
	 *
	 * @return string
	 */
	public function get_modifiable_text(): string {
		// Virtual tokens always report an empty string.
		if ( $this->is_virtual() ) {
			return '';
		}

		return parent::get_modifiable_text();
	}

	/**
	 * Indicates what kind of comment produced the comment node.
	 *
	 * Because there are different kinds of HTML syntax which produce
	 * comments, the Tag Processor tracks and exposes this as a type
	 * for the comment. Nominally only regular HTML comments exist as
	 * they are commonly known, but a number of unrelated syntax errors
	 * also produce comments.
	 *
	 * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
	 * @see self::COMMENT_AS_CDATA_LOOKALIKE
	 * @see self::COMMENT_AS_INVALID_HTML
	 * @see self::COMMENT_AS_HTML_COMMENT
	 * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
	 *
	 * @since 6.6.0 Subclassed for the HTML Processor.
*
	 * @return string|null
	 */
	public function get_comment_type(): ?string {
		return $this->is_virtual() ? null : parent::get_comment_type();
	}

	/**
	 * Removes a bookmark that is no longer needed.
	 *
	 * Releasing a bookmark frees up the small
	 * performance overhead it requires.
	 *
	 * The given name is prefixed with an underscore before being handed to
	 * the parent implementation, mirroring how `set_bookmark()` stores it.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Name of the bookmark to remove.
	 * @return bool Whether the bookmark already existed before removal.
	 */
	public function release_bookmark( $bookmark_name ): bool {
		return parent::release_bookmark( "_{$bookmark_name}" );
	}

	/**
	 * Moves the internal cursor in the HTML Processor to a given bookmark's location.
	 *
	 * Be careful! Seeking backwards to a previous location resets the parser to the
	 * start of the document and reparses the entire contents up until it finds the
	 * sought-after bookmarked location.
	 *
	 * In order to prevent accidental infinite loops, there's a
	 * maximum limit on the number of times seek() can be called.
	 *
	 * Seeking to a bookmark which has not been set returns `false` and leaves
	 * the cursor and the parser state where they were.
	 *
	 * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
	 * @return bool Whether the internal cursor was successfully moved to the bookmark's location.
	 */
	public function seek( $bookmark_name ): bool {
		// Flush any pending updates to the document before beginning.
		$this->get_updated_html();

		$actual_bookmark_name = "_{$bookmark_name}";

		/*
		 * An unknown bookmark cannot be sought. Stop here, before reading an
		 * undefined array index and before tearing down any parser state, so
		 * that a failed seek does not rewind and rescan the whole document.
		 */
		if ( ! isset( $this->bookmarks[ $actual_bookmark_name ] ) ) {
			return false;
		}

		$processor_started_at = $this->state->current_token
			? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
			: 0;
		$bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
		$direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';

		/*
		 * If seeking backwards, it's possible that the sought-after bookmark exists within an element
		 * which has been closed before the current cursor; in other words, it has already been removed
		 * from the stack of open elements. This means that it's insufficient to simply pop off elements
		 * from the stack of open elements which appear after the bookmarked location and then jump to
		 * that location, as the elements which were open before won't be re-opened.
		 *
		 * In order to maintain consistency, the HTML Processor rewinds to the start of the document
		 * and reparses everything until it finds the sought-after bookmark.
		 *
		 * There are potentially better ways to do this: cache the parser state for each bookmark and
		 * restore it when seeking; store an immutable and idempotent register of where elements open
		 * and close.
		 *
		 * If caching the parser state it will be essential to properly maintain the cached stack of
		 * open elements and active formatting elements when modifying the document. This could be a
		 * tedious and time-consuming process as well, and so for now will not be performed.
		 *
		 * It may be possible to track bookmarks for where elements open and close, and in doing so
		 * be able to quickly recalculate breadcrumbs for any element in the document. It may even
		 * be possible to remove the stack of open elements and compute it on the fly this way.
		 * If doing this, the parser would need to track the opening and closing locations for all
		 * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
		 * this list could be automatically maintained while modifying the document. Finding the
		 * breadcrumbs would then amount to traversing that list from the start until the token
		 * being inspected. Once an element closes, if there are no bookmarks pointing to locations
		 * within that element, then all of these locations may be forgotten to save on memory use
		 * and computation time.
		 */
		if ( 'backward' === $direction ) {
			/*
			 * Instead of clearing the parser state and starting fresh, calling the stack methods
			 * maintains the proper flags in the parser.
			 */
			foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
				if ( 'context-node' === $item->bookmark_name ) {
					break;
				}

				$this->state->stack_of_open_elements->remove_node( $item );
			}

			foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
				if ( 'context-node' === $item->bookmark_name ) {
					break;
				}

				$this->state->active_formatting_elements->remove_node( $item );
			}

			parent::seek( 'context-node' );
			$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
			$this->state->frameset_ok    = true;
			$this->element_queue         = array();
			$this->current_element       = null;

			if ( isset( $this->context_node ) ) {
				$this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 );
			} else {
				$this->breadcrumbs = array();
			}
		}

		// When moving forwards, reparse the document until reaching the same location as the original bookmark.
		if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
			return true;
		}

		while ( $this->next_token() ) {
			if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
				// Skip over queued "pop" events so the processor rests on the sought token itself.
				while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
					$this->current_element = array_shift( $this->element_queue );
				}
				return true;
			}
		}

		return false;
	}

	/**
	 * Sets a bookmark in the HTML document.
	 *
	 * Bookmarks represent specific places or tokens in the HTML
	 * document, such as a tag opener or closer. When applying
	 * edits to a document, such as setting an attribute, the
	 * text offsets of that token may shift; the bookmark is
	 * kept updated with those shifts and remains stable unless
	 * the entire span of text in which the token sits is removed.
	 *
	 * Release bookmarks when they are no longer needed.
	 *
	 * Example:
	 *
	 *     <main><h2>Surprising fact you may not know!</h2></main>
	 *           ^  ^
	 *            \-|-- this `H2` opener bookmark tracks the token
	 *
	 *     <main class="clickbait"><h2>Surprising fact you may no…
	 *                             ^  ^
	 *                              \-|-- it shifts with edits
	 *
	 * Bookmarks provide the ability to seek to a previously-scanned
	 * place in the HTML document. This avoids the need to re-scan
	 * the entire document.
	 *
	 * Example:
	 *
	 *     <ul><li>One</li><li>Two</li><li>Three</li></ul>
	 *                                 ^^^^
	 *                                 want to note this last item
	 *
	 *     $p = new WP_HTML_Tag_Processor( $html );
	 *     $in_list = false;
	 *     while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
	 *         if ( 'UL' === $p->get_tag() ) {
	 *             if ( $p->is_tag_closer() ) {
	 *                 $in_list = false;
	 *                 $p->set_bookmark( 'resume' );
	 *                 if ( $p->seek( 'last-li' ) ) {
	 *                     $p->add_class( 'last-li' );
	 *                 }
	 *                 $p->seek( 'resume' );
	 *                 $p->release_bookmark( 'last-li' );
	 *                 $p->release_bookmark( 'resume' );
	 *             } else {
	 *                 $in_list = true;
	 *             }
	 *         }
	 *
	 *         if ( 'LI' === $p->get_tag() ) {
	 *             $p->set_bookmark( 'last-li' );
	 *         }
	 *     }
	 *
	 * Bookmarks intentionally hide the internal string offsets
	 * to which they refer. They are maintained internally as
	 * updates are applied to the HTML document and therefore
	 * retain their "position" - the location to which they
	 * originally pointed. The inability to use bookmarks with
	 * functions like `substr` is therefore intentional to guard
	 * against accidentally breaking the HTML.
	 *
	 * Because bookmarks allocate memory and require processing
	 * for every applied update, they are limited and require
	 * a name. They should not be created with programmatically-made
	 * names, such as "li_{$index}" with some loop. As a general
	 * rule they should only be created with string-literal names
	 * like "start-of-section" or "last-paragraph".
	 *
	 * Bookmarks are a powerful tool to enable complicated behavior.
	 * Consider double-checking that you need this tool if you are
	 * reaching for it, as inappropriate use could lead to broken
	 * HTML structure or unwanted processing overhead.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Identifies this particular bookmark.
	 * @return bool Whether the bookmark was successfully created.
	 */
	public function set_bookmark( $bookmark_name ): bool {
		return parent::set_bookmark( "_{$bookmark_name}" );
	}

	/**
	 * Checks whether a bookmark with the given name exists.
	 *
	 * @since 6.5.0
	 *
	 * @param string $bookmark_name Name to identify a bookmark that potentially exists.
* @return bool Whether that bookmark exists.
	 */
	public function has_bookmark( $bookmark_name ): bool {
		return parent::has_bookmark( "_{$bookmark_name}" );
	}

	/*
	 * HTML Parsing Algorithms
	 */

	/**
	 * Closes a P element.
	 *
	 * Generates implied end tags for everything except P elements and
	 * then pops the stack of open elements until a P has been popped.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#close-a-p-element
	 */
	private function close_a_p_element(): void {
		$this->generate_implied_end_tags( 'P' );
		$this->state->stack_of_open_elements->pop_until( 'P' );
	}

	/**
	 * Closes elements that have implied end tags.
	 *
	 * Pops the current node for as long as it is one of the elements with
	 * an implied end tag and is not the excluded element. Stops as soon as
	 * the stack of open elements has no current node.
	 *
	 * @since 6.4.0
	 * @since 6.7.0 Full spec support.
	 *
	 * @see https://html.spec.whatwg.org/#generate-implied-end-tags
	 *
	 * @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements.
	 */
	private function generate_implied_end_tags( ?string $except_for_this_element = null ): void {
		$elements_with_implied_end_tags = array(
			'DD',
			'DT',
			'LI',
			'OPTGROUP',
			'OPTION',
			'P',
			'RB',
			'RP',
			'RT',
			'RTC',
		);

		$no_exclusions = ! isset( $except_for_this_element );

		// The current node is nullable: never read its name without checking first.
		$current_node = $this->state->stack_of_open_elements->current_node();

		while (
			null !== $current_node &&
			( $no_exclusions || ! $this->state->stack_of_open_elements->current_node_is( $except_for_this_element ) ) &&
			in_array( $current_node->node_name, $elements_with_implied_end_tags, true )
		) {
			$this->state->stack_of_open_elements->pop();
			$current_node = $this->state->stack_of_open_elements->current_node();
		}
	}

	/**
	 * Closes elements that have implied end tags, thoroughly.
	 *
	 * See the HTML specification for an explanation why this is
	 * different from generating end tags in the normal sense.
	 *
	 * @since 6.4.0
	 * @since 6.7.0 Full spec support.
*
	 * @see WP_HTML_Processor::generate_implied_end_tags
	 * @see https://html.spec.whatwg.org/#generate-implied-end-tags
	 */
	private function generate_implied_end_tags_thoroughly(): void {
		$elements_with_implied_end_tags = array(
			'CAPTION',
			'COLGROUP',
			'DD',
			'DT',
			'LI',
			'OPTGROUP',
			'OPTION',
			'P',
			'RB',
			'RP',
			'RT',
			'RTC',
			'TBODY',
			'TD',
			'TFOOT',
			'TH',
			'THEAD',
			'TR',
		);

		// The current node is nullable: never read its name without checking first.
		$current_node = $this->state->stack_of_open_elements->current_node();

		while (
			null !== $current_node &&
			in_array( $current_node->node_name, $elements_with_implied_end_tags, true )
		) {
			$this->state->stack_of_open_elements->pop();
			$current_node = $this->state->stack_of_open_elements->current_node();
		}
	}

	/**
	 * Returns the adjusted current node.
	 *
	 * > The adjusted current node is the context element if the parser was created as
	 * > part of the HTML fragment parsing algorithm and the stack of open elements
	 * > has only one element in it (fragment case); otherwise, the adjusted current
	 * > node is the current node.
	 *
	 * @see https://html.spec.whatwg.org/#adjusted-current-node
	 *
	 * @since 6.7.0
	 *
	 * @return WP_HTML_Token|null The adjusted current node.
	 */
	private function get_adjusted_current_node(): ?WP_HTML_Token {
		if ( isset( $this->context_node ) && 1 === $this->state->stack_of_open_elements->count() ) {
			return $this->context_node;
		}

		return $this->state->stack_of_open_elements->current_node();
	}

	/**
	 * Reconstructs the active formatting elements.
	 *
	 * > This has the effect of reopening all the formatting elements that were opened
	 * > in the current body, cell, or caption (whichever is youngest) that haven't
	 * > been explicitly closed.
	 *
	 * Only the "nothing to reconstruct" cases are handled here; when actual
	 * reconstruction would be required the processor bails.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements
	 *
	 * @return bool Whether any formatting elements needed to be reconstructed.
	 */
	private function reconstruct_active_formatting_elements(): bool {
		/*
		 * > If there are no entries in the list of active formatting elements, then there is nothing
		 * > to reconstruct; stop this algorithm.
		 */
		if ( 0 === $this->state->active_formatting_elements->count() ) {
			return false;
		}

		$last_entry = $this->state->active_formatting_elements->current_node();
		if (

			/*
			 * > If the last (most recently added) entry in the list of active formatting elements is a marker;
			 * > stop this algorithm.
			 */
			'marker' === $last_entry->node_name ||

			/*
			 * > If the last (most recently added) entry in the list of active formatting elements is an
			 * > element that is in the stack of open elements, then there is nothing to reconstruct;
			 * > stop this algorithm.
			 */
			$this->state->stack_of_open_elements->contains_node( $last_entry )
		) {
			return false;
		}

		$this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
	}

	/**
	 * Runs the reset the insertion mode appropriately algorithm.
	 *
	 * Walks the stack of open elements from the current node upwards and
	 * sets the insertion mode according to the first HTML element that
	 * matches one of the rules below.
	 *
	 * @since 6.7.0
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
	 */
	private function reset_insertion_mode_appropriately(): void {
		// Set the first node.
		$first_node = null;
		foreach ( $this->state->stack_of_open_elements->walk_down() as $first_node ) {
			break;
		}

		/*
		 * > 1. Let _last_ be false.
		 */
		$last = false;
		foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
			/*
			 * > 2. Let _node_ be the last node in the stack of open elements.
			 * > 3. _Loop_: If _node_ is the first node in the stack of open elements, then set _last_
			 * >            to true, and, if the parser was created as part of the HTML fragment parsing
			 * >            algorithm (fragment case), set node to the context element passed to
			 * >            that algorithm.
			 * > …
			 */
			if ( $node === $first_node ) {
				$last = true;
				if ( isset( $this->context_node ) ) {
					$node = $this->context_node;
				}
			}

			// All of the following rules are for matching HTML elements.
			if ( 'html' !== $node->namespace ) {
				continue;
			}

			switch ( $node->node_name ) {
				/*
				 * > 4. If node is a `select` element, run these substeps:
				 * >   1. If _last_ is true, jump to the step below labeled done.
				 * >   2. Let _ancestor_ be _node_.
				 * >   3. _Loop_: If _ancestor_ is the first node in the stack of open elements,
				 * >      jump to the step below labeled done.
				 * >   4. Let ancestor be the node before ancestor in the stack of open elements.
				 * >   …
				 * >   7. Jump back to the step labeled _loop_.
				 * >   8. _Done_: Switch the insertion mode to "in select" and return.
				 */
				case 'SELECT':
					if ( ! $last ) {
						foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $ancestor ) {
							if ( 'html' !== $ancestor->namespace ) {
								continue;
							}

							switch ( $ancestor->node_name ) {
								/*
								 * > 5. If _ancestor_ is a `template` node, jump to the step below
								 * >    labeled _done_.
								 */
								case 'TEMPLATE':
									// Leaves both the inner switch and the ancestor loop.
									break 2;

								/*
								 * > 6. If _ancestor_ is a `table` node, switch the insertion mode to
								 * >    "in select in table" and return.
								 */
								case 'TABLE':
									$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE;
									return;
							}
						}
					}
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT;
					return;

				/*
				 * > 5. If _node_ is a `td` or `th` element and _last_ is false, then switch the
				 * >    insertion mode to "in cell" and return.
				 */
				case 'TD':
				case 'TH':
					if ( ! $last ) {
						$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CELL;
						return;
					}
					break;

				/*
				 * > 6. If _node_ is a `tr` element, then switch the insertion mode to "in row"
				 * >    and return.
				 */
				case 'TR':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
					return;

				/*
				 * > 7. If _node_ is a `tbody`, `thead`, or `tfoot` element, then switch the
				 * >    insertion mode to "in table body" and return.
				 */
				case 'TBODY':
				case 'THEAD':
				case 'TFOOT':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
					return;

				/*
				 * > 8. If _node_ is a `caption` element, then switch the insertion mode to
				 * >    "in caption" and return.
				 */
				case 'CAPTION':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION;
					return;

				/*
				 * > 9. If _node_ is a `colgroup` element, then switch the insertion mode to
				 * >    "in column group" and return.
				 */
				case 'COLGROUP':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP;
					return;

				/*
				 * > 10. If _node_ is a `table` element, then switch the insertion mode to
				 * >     "in table" and return.
				 */
				case 'TABLE':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
					return;

				/*
				 * > 11. If _node_ is a `template` element, then switch the insertion mode to the
				 * >     current template insertion mode and return.
				 */
				case 'TEMPLATE':
					$this->state->insertion_mode = end( $this->state->stack_of_template_insertion_modes );
					return;

				/*
				 * > 12. If _node_ is a `head` element and _last_ is false, then switch the
				 * >     insertion mode to "in head" and return.
				 */
				case 'HEAD':
					if ( ! $last ) {
						$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
						return;
					}
					break;

				/*
				 * > 13. If _node_ is a `body` element, then switch the insertion mode to "in body"
				 * >     and return.
				 */
				case 'BODY':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
					return;

				/*
				 * > 14. If _node_ is a `frameset` element, then switch the insertion mode to
				 * >     "in frameset" and return. (fragment case)
				 */
				case 'FRAMESET':
					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET;
					return;

				/*
				 * > 15. If _node_ is an `html` element, run these substeps:
				 * >     1. If the head element pointer is null, switch the insertion mode to
				 * >        "before head" and return. (fragment case)
				 * >     2. Otherwise, the head element pointer is not null, switch the insertion
				 * >        mode to "after head" and return.
				 */
				case 'HTML':
					$this->state->insertion_mode = isset( $this->state->head_element )
						? WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD
						: WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
					return;
			}
		}

		/*
		 * > 16. If _last_ is true, then switch the insertion mode to "in body"
		 * >     and return. (fragment case)
		 *
		 * This is only reachable if `$last` is true, as per the fragment parsing case.
		 */
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
	}

	/**
	 * Runs the adoption agency algorithm.
	 *
	 * Only the cases which do not require extracting a common ancestor or
	 * looping are handled; the processor bails on the others.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#adoption-agency-algorithm
	 */
	private function run_adoption_agency_algorithm(): void {
		$budget       = 1000;
		$subject      = $this->get_tag();
		$current_node = $this->state->stack_of_open_elements->current_node();

		if (
			// > If the current node is an HTML element whose tag name is subject
			$current_node && $subject === $current_node->node_name &&
			// > the current node is not in the list of active formatting elements
			! $this->state->active_formatting_elements->contains_node( $current_node )
		) {
			$this->state->stack_of_open_elements->pop();
			return;
		}

		$outer_loop_counter = 0;
		while ( $budget-- > 0 ) {
			if ( $outer_loop_counter++ >= 8 ) {
				return;
			}

			/*
			 * > Let formatting element be the last element in the list of active formatting elements that:
			 * >   - is between the end of the list and the last marker in the list,
			 * >     if any, or the start of the list otherwise,
			 * >   - and has the tag name subject.
			 */
			$formatting_element = null;
			foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
				if ( 'marker' === $item->node_name ) {
					break;
				}

				if ( $subject === $item->node_name ) {
					$formatting_element = $item;
					break;
				}
			}

			// > If there is no such element, then return and instead act as described in the "any other end tag" entry above.
			if ( null === $formatting_element ) {
				$this->bail( 'Cannot run adoption agency when "any other end tag" is required.' );
			}

			// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
			if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
				$this->state->active_formatting_elements->remove_node( $formatting_element );
				return;
			}

			// > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return.
			if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) {
				return;
			}

			/*
			 * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack
			 * > than formatting element, and is an element in the special category. There might not be one.
			 */
			$is_above_formatting_element = true;
			$furthest_block              = null;
			foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) {
				if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) {
					continue;
				}

				if ( $is_above_formatting_element ) {
					$is_above_formatting_element = false;
					continue;
				}

				if ( self::is_special( $item ) ) {
					$furthest_block = $item;
					break;
				}
			}

			/*
			 * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the
			 * > stack of open elements, from the current node up to and including formatting element, then
			 * > remove formatting element from the list of active formatting elements, and finally return.
			 */
			if ( null === $furthest_block ) {
				foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
					$this->state->stack_of_open_elements->pop();

					if ( $formatting_element->bookmark_name === $item->bookmark_name ) {
						$this->state->active_formatting_elements->remove_node( $formatting_element );
						return;
					}
				}
			}

			$this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' );
		}

		$this->bail( 'Cannot run adoption agency when looping required.' );
	}

	/**
	 * Runs the "close the cell" algorithm.
	 *
	 * > Where the steps above say to close the cell, they mean to run the following algorithm:
	 * >   1. Generate implied end tags.
	 * >   2.
If the current node is not now a td element or a th element, then this is a parse error.
	 * >   3. Pop elements from the stack of open elements stack until a td element or a th element has been popped from the stack.
	 * >   4. Clear the list of active formatting elements up to the last marker.
	 * >   5. Switch the insertion mode to "in row".
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
	 *
	 * @since 6.7.0
	 */
	private function close_cell(): void {
		$this->generate_implied_end_tags();

		// @todo Parse error if the current node is not a "td" or "th" element.
		foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
			$this->state->stack_of_open_elements->pop();

			// Stop once the cell itself has been popped.
			if ( in_array( $element->node_name, array( 'TD', 'TH' ), true ) ) {
				break;
			}
		}

		$this->state->active_formatting_elements->clear_up_to_last_marker();
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
	}

	/**
	 * Inserts an HTML element on the stack of open elements.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#insert-an-html-element
	 *
	 * @param WP_HTML_Token $token Token to push onto the stack of open elements.
	 */
	private function insert_html_element( WP_HTML_Token $token ): void {
		$this->state->stack_of_open_elements->push( $token );
	}

	/**
	 * Inserts a foreign element on to the stack of open elements.
	 *
	 * The token inherits the namespace of the adjusted current node, or
	 * "html" when there is none, and is flagged as an integration point
	 * when the current token is one.
	 *
	 * @since 6.7.0
	 *
	 * @see https://html.spec.whatwg.org/#insert-a-foreign-element
	 *
	 * @param WP_HTML_Token $token                     Insert this token. The token's namespace and
	 *                                                 insertion point will be updated correctly.
	 * @param bool          $only_add_to_element_stack Whether to skip the "insert an element at the adjusted
	 *                                                 insertion location" algorithm when adding this element.
	 */
	private function insert_foreign_element( WP_HTML_Token $token, bool $only_add_to_element_stack ): void {
		$adjusted_current_node = $this->get_adjusted_current_node();

		$inherited_namespace = 'html';
		if ( $adjusted_current_node ) {
			$inherited_namespace = $adjusted_current_node->namespace;
		}
		$token->namespace = $inherited_namespace;

		if ( $this->is_mathml_integration_point() ) {
			$token->integration_node_type = 'math';
		} elseif ( $this->is_html_integration_point() ) {
			$token->integration_node_type = 'html';
		}

		if ( false === $only_add_to_element_stack ) {
			/*
			 * @todo Implement the "appropriate place for inserting a node" and the
			 *       "insert an element at the adjusted insertion location" algorithms.
			 *
			 * These algorithms mostly impacts DOM tree construction and not the HTML API.
			 * Here, there's no DOM node onto which the element will be appended, so the
			 * parser will skip this step.
			 *
			 * @see https://html.spec.whatwg.org/#insert-an-element-at-the-adjusted-insertion-location
			 */
		}

		$this->insert_html_element( $token );
	}

	/**
	 * Inserts a virtual element on the stack of open elements.
	 *
	 * The virtual node is given a zero-length bookmark starting at the
	 * same offset as the current token.
	 *
	 * @since 6.7.0
	 *
	 * @param string      $token_name    Name of token to create and insert into the stack of open elements.
	 * @param string|null $bookmark_name Optional. Name to give bookmark for created virtual node.
	 *                                   Defaults to auto-creating a bookmark name.
	 * @return WP_HTML_Token Newly-created virtual token.
	 */
	private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token {
		$current_span = $this->bookmarks[ $this->state->current_token->bookmark_name ];
		$virtual_name = $bookmark_name ?? $this->bookmark_token();

		$this->bookmarks[ $virtual_name ] = new WP_HTML_Span( $current_span->start, 0 );

		$virtual_token = new WP_HTML_Token( $virtual_name, $token_name, false );
		$this->insert_html_element( $virtual_token );

		return $virtual_token;
	}

	/*
	 * HTML Specification Helpers
	 */

	/**
	 * Indicates if the current token is a MathML integration point.
	 *
	 * @since 6.7.0
	 *
	 * @see https://html.spec.whatwg.org/#mathml-text-integration-point
	 *
	 * @return bool Whether the current token is a MathML integration point.
	 */
	private function is_mathml_integration_point(): bool {
		$current_token = $this->state->current_token;
		if ( ! isset( $current_token ) ) {
			return false;
		}

		// Every matching name starts with "M": rule out everything else early.
		if ( 'math' !== $current_token->namespace || 'M' !== $current_token->node_name[0] ) {
			return false;
		}

		return in_array(
			$current_token->node_name,
			array( 'MI', 'MO', 'MN', 'MS', 'MTEXT' ),
			true
		);
	}

	/**
	 * Indicates if the current token is an HTML integration point.
	 *
	 * Note that this method must be an instance method with access
	 * to the current token, since it needs to examine the attributes
	 * of the currently-matched tag, if it's in the MathML namespace.
	 * Otherwise it would be required to scan the HTML and ensure that
	 * no other accounting is overlooked.
	 *
	 * @since 6.7.0
	 *
	 * @see https://html.spec.whatwg.org/#html-integration-point
	 *
	 * @return bool Whether the current token is an HTML integration point.
	 */
	private function is_html_integration_point(): bool {
		$current_token = $this->state->current_token;
		if ( ! isset( $current_token ) ) {
			return false;
		}

		$token_namespace = $current_token->namespace;
		if ( 'html' === $token_namespace ) {
			return false;
		}

		$tag_name = $current_token->node_name;

		if ( 'svg' === $token_namespace ) {
			return in_array( $tag_name, array( 'DESC', 'FOREIGNOBJECT', 'TITLE' ), true );
		}

		if ( 'math' === $token_namespace ) {
			if ( 'ANNOTATION-XML' !== $tag_name ) {
				return false;
			}

			// Only an ANNOTATION-XML element with an HTML-like encoding qualifies.
			$encoding = $this->get_attribute( 'encoding' );
			if ( ! is_string( $encoding ) ) {
				return false;
			}

			return (
				0 === strcasecmp( $encoding, 'application/xhtml+xml' ) ||
				0 === strcasecmp( $encoding, 'text/html' )
			);
		}

		$this->bail( 'Should not have reached end of HTML Integration Point detection: check HTML API code.' );
		// This unnecessary return prevents tools from inaccurately reporting type errors.
		return false;
	}

	/**
	 * Returns whether an element of a given name is in the HTML special category.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#special
	 *
	 * @param WP_HTML_Token|string $tag_name Node to check, or only its name if in the HTML namespace.
* @return bool Whether the element of the given name is in the special category.
	 */
	public static function is_special( $tag_name ): bool {
		/*
		 * Normalize the input into a single lookup key: the upper-cased tag
		 * name for strings and HTML-namespace tokens, or "namespace NAME"
		 * for tokens in any other namespace.
		 */
		if ( is_string( $tag_name ) ) {
			$lookup_key = strtoupper( $tag_name );
		} elseif ( 'html' === $tag_name->namespace ) {
			$lookup_key = strtoupper( $tag_name->node_name );
		} else {
			$lookup_key = "{$tag_name->namespace} {$tag_name->node_name}";
		}

		$special_elements = array(
			'ADDRESS',
			'APPLET',
			'AREA',
			'ARTICLE',
			'ASIDE',
			'BASE',
			'BASEFONT',
			'BGSOUND',
			'BLOCKQUOTE',
			'BODY',
			'BR',
			'BUTTON',
			'CAPTION',
			'CENTER',
			'COL',
			'COLGROUP',
			'DD',
			'DETAILS',
			'DIR',
			'DIV',
			'DL',
			'DT',
			'EMBED',
			'FIELDSET',
			'FIGCAPTION',
			'FIGURE',
			'FOOTER',
			'FORM',
			'FRAME',
			'FRAMESET',
			'H1',
			'H2',
			'H3',
			'H4',
			'H5',
			'H6',
			'HEAD',
			'HEADER',
			'HGROUP',
			'HR',
			'HTML',
			'IFRAME',
			'IMG',
			'INPUT',
			'KEYGEN',
			'LI',
			'LINK',
			'LISTING',
			'MAIN',
			'MARQUEE',
			'MENU',
			'META',
			'NAV',
			'NOEMBED',
			'NOFRAMES',
			'NOSCRIPT',
			'OBJECT',
			'OL',
			'P',
			'PARAM',
			'PLAINTEXT',
			'PRE',
			'SCRIPT',
			'SEARCH',
			'SECTION',
			'SELECT',
			'SOURCE',
			'STYLE',
			'SUMMARY',
			'TABLE',
			'TBODY',
			'TD',
			'TEMPLATE',
			'TEXTAREA',
			'TFOOT',
			'TH',
			'THEAD',
			'TITLE',
			'TR',
			'TRACK',
			'UL',
			'WBR',
			'XMP',

			// MathML.
			'math MI',
			'math MO',
			'math MN',
			'math MS',
			'math MTEXT',
			'math ANNOTATION-XML',

			// SVG.
			'svg DESC',
			'svg FOREIGNOBJECT',
			'svg TITLE',
		);

		return in_array( $lookup_key, $special_elements, true );
	}

	/**
	 * Returns whether a given element is an HTML Void Element
	 *
	 * > area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
	 *
	 * The name is compared ASCII case-insensitively. A handful of obsolete
	 * elements are treated as void in addition to the list quoted above.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#void-elements
	 *
	 * @param string $tag_name Name of HTML tag to check.
	 * @return bool Whether the given tag is an HTML Void Element.
	 */
	public static function is_void( $tag_name ): bool {
		$void_elements = array(
			'AREA',
			'BASE',
			'BASEFONT', // Obsolete but still treated as void.
			'BGSOUND',  // Obsolete but still treated as void.
			'BR',
			'COL',
			'EMBED',
			'FRAME',
			'HR',
			'IMG',
			'INPUT',
			'KEYGEN',   // Obsolete but still treated as void.
			'LINK',
			'META',
			'PARAM',    // Obsolete but still treated as void.
			'SOURCE',
			'TRACK',
			'WBR',
		);

		return in_array( strtoupper( $tag_name ), $void_elements, true );
	}

	/**
	 * Gets an encoding from a given string.
	 *
	 * This is an algorithm defined in the WHAT-WG specification.
*
	 * Leading and trailing ASCII whitespace is removed from the label and the
	 * remainder is lower-cased before it is compared against the known labels.
	 *
	 * Example:
	 *
	 *     'UTF-8' === self::get_encoding( 'utf8' );
	 *     'UTF-8' === self::get_encoding( " \tUTF-8 " );
	 *     null    === self::get_encoding( 'UTF-7' );
	 *     null    === self::get_encoding( 'utf8; charset=' );
	 *
	 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
	 *
	 * @todo As this parser only supports UTF-8, only the UTF-8
	 *       encodings are detected. Add more as desired, but the
	 *       parser will bail on non-UTF-8 encodings.
	 *
	 * @since 6.7.0
	 *
	 * @param string $label A string which may specify a known encoding.
	 * @return string|null Known encoding if matched, otherwise null.
	 */
	protected static function get_encoding( string $label ): ?string {
		/*
		 * > Remove any leading and trailing ASCII whitespace from label.
		 *
		 * Lower-casing afterwards provides the ASCII case-insensitive match below.
		 */
		$normalized_label = strtolower( trim( $label, " \t\f\r\n" ) );

		/*
		 * > If label is an ASCII case-insensitive match for any of the labels listed in the
		 * > table below, then return the corresponding encoding; otherwise return failure.
		 */
		$utf8_labels = array(
			'unicode-1-1-utf-8',
			'unicode11utf8',
			'unicode20utf8',
			'utf-8',
			'utf8',
			'x-unicode20utf8',
		);

		return in_array( $normalized_label, $utf8_labels, true ) ? 'UTF-8' : null;
	}

	/*
	 * Constants that would pollute the top of the class if they were found there.
	 */

	/**
	 * Signals that the next HTML token should be parsed and processed.
	 *
	 * @since 6.4.0
	 *
	 * @var string
	 */
	const PROCESS_NEXT_NODE = 'process-next-node';

	/**
	 * Signals that the current HTML token should be reprocessed
	 * in the newly-selected insertion mode.
	 *
	 * @since 6.4.0
	 *
	 * @var string
	 */
	const REPROCESS_CURRENT_NODE = 'reprocess-current-node';

	/**
	 * Signals that the current HTML token should be processed
	 * without advancing the parser.
	 *
	 * @since 6.5.0
	 *
	 * @var string
	 */
	const PROCESS_CURRENT_NODE = 'process-current-node';

	/**
	 * Indicates that the parser encountered unsupported markup and has bailed.
*
	 * @since 6.4.0
	 *
	 * @var string
	 */
	const ERROR_UNSUPPORTED = 'unsupported';

	/**
	 * Indicates that the parser encountered more HTML tokens than it
	 * was able to process and has bailed.
	 *
	 * NOTE(review): the constant name and value refer to bookmarks while this
	 * description refers to tokens; presumably the bookmark limit is the
	 * resource that gets exhausted. Confirm against the code that sets it.
	 *
	 * @since 6.4.0
	 *
	 * @var string
	 */
	const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks';

	/**
	 * Unlock code that must be passed into the constructor to create this class.
	 *
	 * This class extends the WP_HTML_Tag_Processor, which has a public class
	 * constructor. Therefore, it's not possible to have a private constructor here.
	 *
	 * This unlock code is used to ensure that anyone calling the constructor is
	 * doing so with a full understanding that it's intended to be a private API.
	 *
	 * The value itself is a human-readable message pointing callers
	 * to `WP_HTML_Processor::create_fragment()`.
	 *
	 * @access private
	 *
	 * @var string
	 */
	const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::create_fragment() instead of calling the class constructor directly.';
}