' );
* false === $processor->next_tag();
* WP_HTML_Processor::ERROR_UNSUPPORTED === $processor->get_last_error();
*
* @since 6.4.0
*
* @see self::ERROR_UNSUPPORTED
* @see self::ERROR_EXCEEDED_MAX_BOOKMARKS
*
* @return string|null The last error, if one exists, otherwise null.
*/
public function get_last_error() {
// Reports the stored error value as-is; nothing is reset or modified by reading it.
return $this->last_error;
}
/**
* Finds the next tag matching the $query.
*
* @todo Support matching the class name and tag name.
*
* @since 6.4.0
* @since 6.6.0 Visits all tokens, including virtual ones.
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @param array|string|null $query {
* Optional. Which tag name to find, having which class, etc. Default is to find any tag.
*
* @type string|null $tag_name Which tag to find, or `null` for "any tag."
* @type string $tag_closers 'visit' to pause at tag closers, 'skip' or unset to only visit openers.
* @type int|null $match_offset Find the Nth tag matching all search criteria.
* 1 for "first" tag, 3 for "third," etc.
* Defaults to first tag.
* @type string|null $class_name Tag must contain this whole class name to match.
* @type string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
* May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
* }
* @return bool Whether a tag was matched.
*/
public function next_tag( $query = null ) {
	// Tag closers are only reported when the query explicitly asks to visit them.
	$visit_closers = isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers'];

	// Without a query every tag matches; only the closer preference applies.
	if ( null === $query ) {
		while ( $this->next_token() ) {
			if ( '#tag' !== $this->get_token_type() ) {
				continue;
			}

			if ( ! $this->is_tag_closer() || $visit_closers ) {
				return true;
			}
		}

		return false;
	}

	// A bare string is shorthand for a single-element breadcrumb query.
	if ( is_string( $query ) ) {
		$query = array( 'breadcrumbs' => array( $query ) );
	}

	if ( ! is_array( $query ) ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Please pass a query array to this function.' ),
			'6.4.0'
		);
		return false;
	}

	/*
	 * The documented `tag_name` constraint. Tag names reported by this class
	 * are upper-case, so the requested name is normalized before comparing.
	 * Non-string values are treated as "any tag."
	 */
	$needs_tag = ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) )
		? strtoupper( $query['tag_name'] )
		: null;

	$needs_class = ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) )
		? $query['class_name']
		: null;

	// Without breadcrumbs: stop at the first tag satisfying the name/class constraints.
	if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
		while ( $this->next_token() ) {
			if ( '#tag' !== $this->get_token_type() ) {
				continue;
			}

			if ( isset( $needs_tag ) && $needs_tag !== $this->get_tag() ) {
				continue;
			}

			if ( isset( $needs_class ) && ! $this->has_class( $needs_class ) ) {
				continue;
			}

			if ( ! $this->is_tag_closer() || $visit_closers ) {
				return true;
			}
		}

		return false;
	}

	$breadcrumbs  = $query['breadcrumbs'];
	$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;

	// With breadcrumbs: only tag openers are considered, counting down to the Nth match.
	while ( $match_offset > 0 && $this->next_token() ) {
		if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
			continue;
		}

		if ( isset( $needs_tag ) && $needs_tag !== $this->get_tag() ) {
			continue;
		}

		if ( isset( $needs_class ) && ! $this->has_class( $needs_class ) ) {
			continue;
		}

		if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
			return true;
		}
	}

	return false;
}
/**
* Ensures internal accounting is maintained for HTML semantic rules while
* the underlying Tag Processor class is seeking to a bookmark.
*
* This doesn't currently have a way to represent non-tags and doesn't process
* semantic rules for text nodes. For access to the raw tokens consider using
* WP_HTML_Tag_Processor instead.
*
* @since 6.5.0 Added for internal support; do not use.
*
* @access private
*
* @return bool
*/
public function next_token() {
// Forget the previously-matched stack event before looking for the next one.
$this->current_element = null;
// Once an error has been recorded the processor refuses to advance.
if ( isset( $this->last_error ) ) {
return false;
}
/*
 * When no queued stack events remain, step() is asked to parse more input.
 * If it cannot, open elements are popped until the node bookmarked as
 * 'context-node' is the current node (or nothing more can be popped), the
 * processor is flagged as 'done', and this function starts over.
 *
 * NOTE(review): popping presumably enqueues POP events through a stack
 * handler which isn't visible in this function; confirm against the
 * constructor before relying on that.
 */
if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! $this->step() ) {
while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) {
continue;
}
$this->has_seen_context_node = 'done';
return $this->next_token();
}
// Take the oldest queued event; this is null when the queue is empty.
$this->current_element = array_shift( $this->element_queue );
/*
 * While a context node is set and hasn't been seen yet, queued events are
 * discarded until the PUSH event for the context node appears. The string
 * 'done' is truthy, so this loop is also skipped once input is exhausted.
 */
while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) {
if ( isset( $this->current_element ) ) {
if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) {
$this->has_seen_context_node = true;
return $this->next_token();
}
}
$this->current_element = array_shift( $this->element_queue );
}
// An empty queue either means the end was reached or that more input must be parsed.
if ( ! isset( $this->current_element ) ) {
if ( 'done' === $this->has_seen_context_node ) {
return false;
} else {
return $this->next_token();
}
}
// The POP event for the context node ends traversal; remaining events are dropped.
if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) {
$this->element_queue = array();
$this->current_element = null;
return false;
}
// Avoid sending close events for elements which don't expect a closing.
if (
WP_HTML_Stack_Event::POP === $this->current_element->operation &&
! static::expects_closer( $this->current_element->token )
) {
return $this->next_token();
}
return true;
}
/**
* Indicates if the current tag token is a tag closer.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( '<div></div>' );
* $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
* $p->is_tag_closer() === false;
*
* $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
* $p->is_tag_closer() === true;
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @return bool Whether the current tag is a tag closer.
*/
public function is_tag_closer() {
	// Tokens found in the HTML text defer to the Tag Processor's own answer.
	if ( ! $this->is_virtual() ) {
		return parent::is_tag_closer();
	}

	// A virtual token is a closer when it's the POP event of a tag node.
	$is_pop_event = WP_HTML_Stack_Event::POP === $this->current_element->operation;

	return $is_pop_event && '#tag' === $this->get_token_type();
}
/**
* Indicates if the currently-matched token is virtual, created by a stack operation
* while processing HTML, rather than a token found in the HTML text itself.
*
* @since 6.6.0
*
* @return bool Whether the current token is virtual.
*/
private function is_virtual() {
	// A missing element or missing provenance coalesces to null, which never equals 'virtual'.
	return 'virtual' === ( $this->current_element->provenance ?? null );
}
/**
* Indicates if the currently-matched tag matches the given breadcrumbs.
*
* A "*" represents a single tag wildcard, where any tag matches, but not no tags.
*
* At some point this function _may_ support a `**` syntax for matching any number
* of unspecified tags in the breadcrumb stack. This has been intentionally left
* out, however, to keep this function simple and to avoid introducing backtracking,
* which could open up surprising performance breakdowns.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<span><figure><img></figure></span>' );
* $processor->next_tag( 'img' );
* true === $processor->matches_breadcrumbs( array( 'figure', 'img' ) );
* true === $processor->matches_breadcrumbs( array( 'span', 'figure', 'img' ) );
* false === $processor->matches_breadcrumbs( array( 'span', 'img' ) );
* true === $processor->matches_breadcrumbs( array( 'span', '*', 'img' ) );
*
* @since 6.4.0
*
* @param string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
* May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
* @return bool Whether the currently-matched tag is found at the given nested structure.
*/
public function matches_breadcrumbs( $breadcrumbs ) {
	// With no crumbs given there is no constraint that could fail.
	if ( 0 === count( $breadcrumbs ) ) {
		return true;
	}

	// The final crumb describes the currently-matched tag itself.
	$last_crumb        = end( $breadcrumbs );
	$names_current_tag = '*' === $last_crumb || strtoupper( $last_crumb ) === $this->get_tag();
	if ( ! $names_current_tag ) {
		return false;
	}

	/*
	 * Walk the open elements upward in lock-step with the crumbs, which are
	 * consumed from last to first via the array's internal pointer (left at
	 * the final crumb by the `end()` call above).
	 */
	foreach ( $this->state->stack_of_open_elements->walk_up() as $open_element ) {
		$expected_name = strtoupper( current( $breadcrumbs ) );
		$is_match      = '*' === $expected_name || $expected_name === $open_element->node_name;
		if ( ! $is_match ) {
			return false;
		}

		// Running out of crumbs means every one of them was satisfied.
		if ( false === prev( $breadcrumbs ) ) {
			return true;
		}
	}

	// The stack ran out before the crumbs did.
	return false;
}
/**
* Indicates if the currently-matched node expects a closing
* token, or if it will self-close on the next step.
*
* Most HTML elements expect a closer, such as a P element or
* a DIV element. Others, like an IMG element are void and don't
* have a closing tag. Special elements, such as SCRIPT and STYLE,
* are treated just like void tags. Text nodes and self-closing
* foreign content will also act just like a void tag, immediately
* closing as soon as the processor advances to the next token.
*
* @since 6.6.0
*
* @todo When adding support for foreign content, ensure that
* this returns false for self-closing elements in the
* SVG and MathML namespace.
*
* @param ?WP_HTML_Token $node Node to examine instead of current node, if provided.
* @return bool Whether to expect a closer for the currently-matched node,
* or `null` if not matched on any token.
*/
public function expects_closer( $node = null ) {
	// Prefer the supplied node's name, falling back to the currently-matched token.
	$name = $node->node_name ?? $this->get_token_name();
	if ( null === $name ) {
		return null;
	}

	// Comments, text nodes, and other atomic tokens.
	if ( '#' === $name[0] ) {
		return false;
	}

	// Doctype declarations.
	if ( 'html' === $name ) {
		return false;
	}

	// Void elements.
	if ( self::is_void( $name ) ) {
		return false;
	}

	// Special atomic elements.
	$atomic_elements = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' );

	return ! in_array( $name, $atomic_elements, true );
}
/**
* Steps through the HTML document and stop at the next tag, if any.
*
* @since 6.4.0
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @see self::PROCESS_NEXT_NODE
* @see self::REPROCESS_CURRENT_NODE
*
* @param string $node_to_process Whether to parse the next node or reprocess the current node.
* @return bool Whether a tag was matched.
*/
public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
	// A recorded error is final: no further stepping is attempted.
	if ( isset( $this->last_error ) ) {
		return false;
	}

	$is_reprocessing = self::REPROCESS_CURRENT_NODE === $node_to_process;
	if ( ! $is_reprocessing ) {
		/*
		 * Nodes which don't expect a closer (void elements, for example) still
		 * sit on the stack of open elements while they are the matched node so
		 * that stack-based operations like breadcrumbs can see them. Before
		 * moving on, such a node at the bottom of the stack must be closed.
		 *
		 * @todo Once self-closing foreign elements and BGSOUND are supported,
		 *       they must also be implicitly closed here too. BGSOUND is
		 *       special since it's only self-closing if the self-closing flag
		 *       is provided in the opening tag, otherwise it expects a tag closer.
		 */
		$bottom_node = $this->state->stack_of_open_elements->current_node();
		if ( null !== $bottom_node && ! static::expects_closer( $bottom_node ) ) {
			$this->state->stack_of_open_elements->pop();
		}
	}

	// Only advance the underlying Tag Processor when a new node was requested.
	if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
		parent::next_token();
	}

	// There's nothing left to step through once the document has ended.
	$finished_states = array(
		WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT,
		WP_HTML_Tag_Processor::STATE_COMPLETE,
	);
	if ( in_array( $this->parser_state, $finished_states, true ) ) {
		return false;
	}

	// Wrap the matched token so that it can travel through the stack of open elements.
	$bookmark_name      = $this->bookmark_token();
	$node_name          = $this->get_token_name();
	$has_self_closer    = $this->has_self_closing_flag();
	$on_destroy         = $this->release_internal_bookmark_on_destruct;
	$this->state->current_token = new WP_HTML_Token( $bookmark_name, $node_name, $has_self_closer, $on_destroy );

	try {
		switch ( $this->state->insertion_mode ) {
			case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
				return $this->step_in_body();

			default:
				$this->last_error = self::ERROR_UNSUPPORTED;
				throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
		}
	} catch ( WP_HTML_Unsupported_Exception $e ) {
		/*
		 * The exception only serves to unwind deep call stacks without
		 * threading a failure value through every return; the error has
		 * already been recorded in `last_error` by the thrower.
		 */
		return false;
	}
}
/**
* Computes the HTML breadcrumbs for the currently-matched node, if matched.
*
* Breadcrumbs start at the outermost parent and descend toward the matched element.
* They always include the entire path from the root HTML node to the matched element.
*
* @todo It could be more efficient to expose a generator-based version of this function
* to avoid creating the array copy on tag iteration. If this is done, it would likely
* be more useful to walk up the stack when yielding instead of starting at the top.
*
* Example
*
* $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
* $processor->next_tag( 'IMG' );
* $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
*
* @since 6.4.0
*
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
*/
public function get_breadcrumbs() {
	// Begin with the names on the stack of open elements, outermost first.
	$path = array();
	foreach ( $this->state->stack_of_open_elements->walk_down() as $open_element ) {
		$path[] = $open_element->node_name;
	}

	// For tokens found in the HTML text the stack is the whole answer.
	if ( ! $this->is_virtual() ) {
		return $path;
	}

	/*
	 * For a virtual token, replay the still-queued stack events onto the path,
	 * stopping at the current element's own token, at the context node, or at
	 * the first event which came from a real token.
	 */
	foreach ( $this->element_queue as $pending_event ) {
		$pending_bookmark = $pending_event->token->bookmark_name;

		$reached_boundary = (
			$this->current_element->token->bookmark_name === $pending_bookmark ||
			'context-node' === $pending_bookmark ||
			'real' === $pending_event->provenance
		);
		if ( $reached_boundary ) {
			break;
		}

		if ( WP_HTML_Stack_Event::PUSH === $pending_event->operation ) {
			$path[] = $pending_event->token->node_name;
		} else {
			array_pop( $path );
		}
	}

	// Drop the last entry when the Tag Processor is matched on a token which isn't a tag closer.
	$has_real_token = null !== parent::get_token_name();
	if ( $has_real_token && ! parent::is_tag_closer() ) {
		array_pop( $path );
	}

	// Add the virtual node we're at, but only when it's being opened.
	if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) {
		$path[] = $this->current_element->token->node_name;
	}

	return $path;
}
/**
* Returns the nesting depth of the current location in the document.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<div><p></div>' );
* // The processor starts in the BODY context, meaning it has depth from the start: HTML > BODY.
* 2 === $processor->get_current_depth();
*
* // Opening the DIV element increases the depth.
* $processor->next_token();
* 3 === $processor->get_current_depth();
*
* // Opening the P element increases the depth.
* $processor->next_token();
* 4 === $processor->get_current_depth();
*
* // The P element is closed during `next_token()` so the depth is decreased to reflect that.
* $processor->next_token();
* 3 === $processor->get_current_depth();
*
* @since 6.6.0
*
* @return int Nesting-depth of current location in the document.
*/
public function get_current_depth() {
	// Virtual tokens may not be reflected by the stack, so their breadcrumbs are counted instead.
	if ( $this->is_virtual() ) {
		return count( $this->get_breadcrumbs() );
	}

	return $this->state->stack_of_open_elements->count();
}
/**
* Parses next element in the 'in body' insertion mode.
*
* This internal function performs the 'in body' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-inbody
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_in_body() {
	$token_name = $this->get_token_name();
	$token_type = $this->get_token_type();

	/*
	 * Each token is reduced to a short operation string for dispatching:
	 * tags are prefixed with `+` (opener) or `-` (closer) while all other
	 * tokens use their bare token name, e.g. `+DIV`, `-P`, `#text`, `html`.
	 */
	$op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
	$op       = "{$op_sigil}{$token_name}";

	switch ( $op ) {
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->insert_html_element( $this->state->current_token );
			return true;

		case '#text':
			$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];

			/*
			 * > A character token that is U+0000 NULL
			 *
			 * Any successive sequence of NULL bytes is ignored and won't
			 * trigger active format reconstruction. Therefore, if the text
			 * only comprises NULL bytes then the token should be ignored
			 * here, but if there are any other characters in the stream
			 * the active formats should be reconstructed.
			 *
			 * This check must therefore run before the active formatting
			 * elements are reconstructed.
			 */
			if (
				1 <= $current_token->length &&
				"\x00" === $this->html[ $current_token->start ] &&
				strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
			) {
				// Parse error: ignore the token.
				return $this->step();
			}

			$this->reconstruct_active_formatting_elements();

			/*
			 * Whitespace-only text does not affect the frameset-ok flag.
			 * It is probably inter-element whitespace, but it may also
			 * contain character references which decode only to whitespace.
			 */
			$text = $this->get_modifiable_text();
			if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
				$this->state->frameset_ok = false;
			}

			$this->insert_html_element( $this->state->current_token );
			return true;

		case 'html':
			/*
			 * > A DOCTYPE token
			 * > Parse error. Ignore the token.
			 */
			return $this->step();

		/*
		 * > A start tag whose tag name is "button"
		 */
		case '+BUTTON':
			if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
				// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
				$this->generate_implied_end_tags();
				$this->state->stack_of_open_elements->pop_until( 'BUTTON' );
			}

			$this->reconstruct_active_formatting_elements();
			$this->insert_html_element( $this->state->current_token );
			$this->state->frameset_ok = false;

			return true;

		/*
		 * > A start tag whose tag name is one of: "address", "article", "aside",
		 * > "blockquote", "center", "details", "dialog", "dir", "div", "dl",
		 * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup",
		 * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul"
		 */
		case '+ADDRESS':
		case '+ARTICLE':
		case '+ASIDE':
		case '+BLOCKQUOTE':
		case '+CENTER':
		case '+DETAILS':
		case '+DIALOG':
		case '+DIR':
		case '+DIV':
		case '+DL':
		case '+FIELDSET':
		case '+FIGCAPTION':
		case '+FIGURE':
		case '+FOOTER':
		case '+HEADER':
		case '+HGROUP':
		case '+MAIN':
		case '+MENU':
		case '+NAV':
		case '+OL':
		case '+P':
		case '+SEARCH':
		case '+SECTION':
		case '+SUMMARY':
		case '+UL':
			if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->close_a_p_element();
			}

			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote",
		 * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset",
		 * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
		 * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
		 */
		case '-ADDRESS':
		case '-ARTICLE':
		case '-ASIDE':
		case '-BLOCKQUOTE':
		case '-BUTTON':
		case '-CENTER':
		case '-DETAILS':
		case '-DIALOG':
		case '-DIR':
		case '-DIV':
		case '-DL':
		case '-FIELDSET':
		case '-FIGCAPTION':
		case '-FIGURE':
		case '-FOOTER':
		case '-HEADER':
		case '-HGROUP':
		case '-LISTING':
		case '-MAIN':
		case '-MENU':
		case '-NAV':
		case '-OL':
		case '-PRE':
		case '-SEARCH':
		case '-SECTION':
		case '-SUMMARY':
		case '-UL':
			if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
				// @todo Report parse error.
				// Ignore the token.
				return $this->step();
			}

			$this->generate_implied_end_tags();
			if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
				// @todo Record parse error: this error doesn't impact parsing.
			}
			$this->state->stack_of_open_elements->pop_until( $token_name );
			return true;

		/*
		 * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
		 */
		case '+H1':
		case '+H2':
		case '+H3':
		case '+H4':
		case '+H5':
		case '+H6':
			if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->close_a_p_element();
			}

			if (
				in_array(
					$this->state->stack_of_open_elements->current_node()->node_name,
					array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
					true
				)
			) {
				// @todo Indicate a parse error once it's possible.
				$this->state->stack_of_open_elements->pop();
			}

			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > A start tag whose tag name is one of: "pre", "listing"
		 */
		case '+PRE':
		case '+LISTING':
			if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->close_a_p_element();
			}
			$this->insert_html_element( $this->state->current_token );
			$this->state->frameset_ok = false;
			return true;

		/*
		 * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
		 */
		case '-H1':
		case '-H2':
		case '-H3':
		case '-H4':
		case '-H5':
		case '-H6':
			if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
				/*
				 * This is a parse error; ignore the token.
				 *
				 * @todo Indicate a parse error once it's possible.
				 */
				return $this->step();
			}

			$this->generate_implied_end_tags();

			if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
				// @todo Record parse error: this error doesn't impact parsing.
			}

			$this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
			return true;

		/*
		 * > A start tag whose tag name is "li"
		 * > A start tag whose tag name is one of: "dd", "dt"
		 */
		case '+DD':
		case '+DT':
		case '+LI':
			$this->state->frameset_ok = false;
			$node                     = $this->state->stack_of_open_elements->current_node();
			$is_li                    = 'LI' === $token_name;

			in_body_list_loop:
			/*
			 * The logic for LI and DT/DD is the same except for one point: LI elements _only_
			 * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
			 */
			if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
				$node_name = $is_li ? 'LI' : $node->node_name;
				$this->generate_implied_end_tags( $node_name );
				if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
					// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
				}

				$this->state->stack_of_open_elements->pop_until( $node_name );
				goto in_body_list_done;
			}

			if (
				'ADDRESS' !== $node->node_name &&
				'DIV' !== $node->node_name &&
				'P' !== $node->node_name &&
				$this->is_special( $node->node_name )
			) {
				/*
				 * > If node is in the special category, but is not an address, div,
				 * > or p element, then jump to the step labeled done below.
				 */
				goto in_body_list_done;
			} else {
				/*
				 * > Otherwise, set node to the previous entry in the stack of open elements
				 * > and return to the step labeled loop.
				 */
				foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
					$node = $item;
					break;
				}
				goto in_body_list_loop;
			}

			in_body_list_done:
			if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->close_a_p_element();
			}

			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > An end tag whose tag name is "li"
		 * > An end tag whose tag name is one of: "dd", "dt"
		 */
		case '-DD':
		case '-DT':
		case '-LI':
			if (
				/*
				 * An end tag whose tag name is "li":
				 * If the stack of open elements does not have an li element in list item scope,
				 * then this is a parse error; ignore the token.
				 */
				(
					'LI' === $token_name &&
					! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
				) ||
				/*
				 * An end tag whose tag name is one of: "dd", "dt":
				 * If the stack of open elements does not have an element in scope that is an
				 * HTML element with the same tag name as that of the token, then this is a
				 * parse error; ignore the token.
				 */
				(
					'LI' !== $token_name &&
					! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
				)
			) {
				/*
				 * This is a parse error, ignore the token.
				 *
				 * @todo Indicate a parse error once it's possible.
				 */
				return $this->step();
			}

			$this->generate_implied_end_tags( $token_name );

			if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
				// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
			}

			$this->state->stack_of_open_elements->pop_until( $token_name );
			return true;

		/*
		 * > An end tag whose tag name is "p"
		 */
		case '-P':
			if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->insert_html_element( $this->state->current_token );
			}

			$this->close_a_p_element();
			return true;

		// > A start tag whose tag name is "a"
		case '+A':
			/*
			 * > If the list of active formatting elements contains an a element
			 * > between the end of the list and the last marker on the list (or
			 * > the start of the list if there is no marker on the list)…
			 *
			 * The search stops at the first marker or the first A element found
			 * while walking up. `break 2` is required to leave the `foreach`;
			 * a plain `break` would only exit the inner `switch` and the walk
			 * would continue over a list that has just been modified.
			 */
			foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
				switch ( $item->node_name ) {
					case 'marker':
						break 2;

					case 'A':
						$this->run_adoption_agency_algorithm();
						$this->state->active_formatting_elements->remove_node( $item );
						$this->state->stack_of_open_elements->remove_node( $item );
						break 2;
				}
			}

			$this->reconstruct_active_formatting_elements();
			$this->insert_html_element( $this->state->current_token );
			$this->state->active_formatting_elements->push( $this->state->current_token );
			return true;

		/*
		 * > A start tag whose tag name is one of: "b", "big", "code", "em", "font", "i",
		 * > "s", "small", "strike", "strong", "tt", "u"
		 */
		case '+B':
		case '+BIG':
		case '+CODE':
		case '+EM':
		case '+FONT':
		case '+I':
		case '+S':
		case '+SMALL':
		case '+STRIKE':
		case '+STRONG':
		case '+TT':
		case '+U':
			$this->reconstruct_active_formatting_elements();
			$this->insert_html_element( $this->state->current_token );
			$this->state->active_formatting_elements->push( $this->state->current_token );
			return true;

		/*
		 * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i",
		 * > "nobr", "s", "small", "strike", "strong", "tt", "u"
		 */
		case '-A':
		case '-B':
		case '-BIG':
		case '-CODE':
		case '-EM':
		case '-FONT':
		case '-I':
		case '-S':
		case '-SMALL':
		case '-STRIKE':
		case '-STRONG':
		case '-TT':
		case '-U':
			$this->run_adoption_agency_algorithm();
			return true;

		/*
		 * > An end tag whose tag name is "br"
		 * >   Parse error. Drop the attributes from the token, and act as described in the next
		 * >   entry; i.e. act as if this was a "br" start tag token with no attributes, rather
		 * >   than the end tag token that it actually is.
		 */
		case '-BR':
			$this->last_error = self::ERROR_UNSUPPORTED;
			throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );

		/*
		 * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
		 */
		case '+AREA':
		case '+BR':
		case '+EMBED':
		case '+IMG':
		case '+KEYGEN':
		case '+WBR':
			$this->reconstruct_active_formatting_elements();
			$this->insert_html_element( $this->state->current_token );
			$this->state->frameset_ok = false;
			return true;

		/*
		 * > A start tag whose tag name is "input"
		 */
		case '+INPUT':
			$this->reconstruct_active_formatting_elements();
			$this->insert_html_element( $this->state->current_token );
			$type_attribute = $this->get_attribute( 'type' );
			/*
			 * > If the token does not have an attribute with the name "type", or if it does,
			 * > but that attribute's value is not an ASCII case-insensitive match for the
			 * > string "hidden", then: set the frameset-ok flag to "not ok".
			 */
			if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
				$this->state->frameset_ok = false;
			}
			return true;

		/*
		 * > A start tag whose tag name is "hr"
		 */
		case '+HR':
			if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
				$this->close_a_p_element();
			}
			$this->insert_html_element( $this->state->current_token );
			$this->state->frameset_ok = false;
			return true;

		/*
		 * > A start tag whose tag name is one of: "param", "source", "track"
		 */
		case '+PARAM':
		case '+SOURCE':
		case '+TRACK':
			$this->insert_html_element( $this->state->current_token );
			return true;
	}

	/*
	 * These tags require special handling in the 'in body' insertion mode
	 * but that handling hasn't yet been implemented.
	 *
	 * As the rules for each tag are implemented, the corresponding tag
	 * name should be removed from this list. An accompanying test should
	 * help ensure this list is maintained.
	 *
	 * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
	 *
	 * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
	 * possible to handle "any other start tag" and "any other end tag" below,
	 * as that guarantees execution doesn't proceed for the unimplemented tags.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
	 */
	switch ( $token_name ) {
		case 'APPLET':
		case 'BASE':
		case 'BASEFONT':
		case 'BGSOUND':
		case 'BODY':
		case 'CAPTION':
		case 'COL':
		case 'COLGROUP':
		case 'FORM':
		case 'FRAME':
		case 'FRAMESET':
		case 'HEAD':
		case 'HTML':
		case 'IFRAME':
		case 'LINK':
		case 'MARQUEE':
		case 'MATH':
		case 'META':
		case 'NOBR':
		case 'NOEMBED':
		case 'NOFRAMES':
		case 'NOSCRIPT':
		case 'OBJECT':
		case 'OPTGROUP':
		case 'OPTION':
		case 'PLAINTEXT':
		case 'RB':
		case 'RP':
		case 'RT':
		case 'RTC':
		case 'SARCASM':
		case 'SCRIPT':
		case 'SELECT':
		case 'STYLE':
		case 'SVG':
		case 'TABLE':
		case 'TBODY':
		case 'TD':
		case 'TEMPLATE':
		case 'TEXTAREA':
		case 'TFOOT':
		case 'TH':
		case 'THEAD':
		case 'TITLE':
		case 'TR':
		case 'XMP':
			$this->last_error = self::ERROR_UNSUPPORTED;
			throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
	}

	if ( ! parent::is_tag_closer() ) {
		/*
		 * > Any other start tag
		 */
		$this->reconstruct_active_formatting_elements();
		$this->insert_html_element( $this->state->current_token );
		return true;
	} else {
		/*
		 * > Any other end tag
		 */

		/*
		 * Find the corresponding tag opener in the stack of open elements, if
		 * it exists before reaching a special element, which provides a kind
		 * of boundary in the stack. For example, a `</custom-tag>` should not
		 * close anything beyond its containing `P` or `DIV` element.
		 */
		foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
			if ( $token_name === $node->node_name ) {
				break;
			}

			if ( self::is_special( $node->node_name ) ) {
				// This is a parse error, ignore the token.
				return $this->step();
			}
		}

		$this->generate_implied_end_tags( $token_name );
		if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
			// @todo Record parse error: this error doesn't impact parsing.
		}

		// Pop everything up to and including the matching opener found above.
		foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
			$this->state->stack_of_open_elements->pop();
			if ( $node === $item ) {
				return true;
			}
		}
	}
}
/*
* Internal helpers
*/
/**
* Creates a new bookmark for the currently-matched token and returns the generated name.
*
* @since 6.4.0
* @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
*
* @throws Exception When unable to allocate requested bookmark.
*
* @return string|false Name of created bookmark, or false if unable to create.
*/
private function bookmark_token() {
	// Every internal bookmark gets the next value of a running counter as its name.
	$bookmark_id = ++$this->bookmark_counter;

	if ( parent::set_bookmark( $bookmark_id ) ) {
		return (string) $this->bookmark_counter;
	}

	// Record the failure before throwing so that later calls can see the error.
	$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
	throw new Exception( 'could not allocate bookmark' );
}
/*
* HTML semantic overrides for Tag Processor
*/
/**
* Returns the uppercase name of the matched tag.
*
* The semantic rules for HTML specify that certain tags be reprocessed
* with a different tag name. Because of this, the tag name presented
* by the HTML Processor may differ from the one reported by the HTML
* Tag Processor, which doesn't apply these semantic rules.
*
* Example:
*
* $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
* $processor->next_tag() === true;
* $processor->get_tag() === 'DIV';
*
* $processor->next_tag() === false;
* $processor->get_tag() === null;
*
* @since 6.4.0
*
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
public function get_tag() {
	// No tag is reported once the processor has recorded an error.
	if ( isset( $this->last_error ) ) {
		return null;
	}

	// Virtual tokens carry their name on the stack event's token.
	if ( $this->is_virtual() ) {
		return $this->current_element->token->node_name;
	}

	$raw_name = parent::get_tag();

	/*
	 * > A start tag whose tag name is "image"
	 * > Change the token's tag name to "img" and reprocess it. (Don't ask.)
	 */
	return 'IMAGE' === $raw_name ? 'IMG' : $raw_name;
}
/**
* Indicates if the currently matched tag contains the self-closing flag.
*
* No HTML elements ought to have the self-closing flag and for those, the self-closing
* flag will be ignored. For void elements this is benign because they "self close"
* automatically. For non-void HTML elements though problems will appear if someone
* intends to use a self-closing element in place of that element with an empty body.
* For HTML foreign elements and custom elements the self-closing flag determines if
* they self-close or not.
*
* This function does not determine if a tag is self-closing,
* but only if the self-closing flag is present in the syntax.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag() {
	// Virtual tokens have no syntax of their own, so they never carry the flag.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::has_self_closing_flag();
}
/**
* Returns the node name represented by the token.
*
* This matches the DOM API value `nodeName`. Some values
* are static, such as `#text` for a text node, while others
* are dynamically generated from the token itself.
*
* Dynamic names:
* - Uppercase tag name for tag matches.
* - `html` for DOCTYPE declarations.
*
* Note that if the Tag Processor is not matched on a token
* then this function will return `null`, either because it
* hasn't yet found a token or because it reached the end
* of the document without matching a token.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null Name of the matched token.
*/
public function get_token_name() {
	// Non-virtual tokens are named by the parent class.
	if ( ! $this->is_virtual() ) {
		return parent::get_token_name();
	}

	// A virtual token is named after the token held by the current element.
	return $this->current_element->token->node_name;
}
/**
* Indicates the kind of matched token, if any.
*
* This differs from `get_token_name()` in that it always
* returns a static string indicating the type, whereas
* `get_token_name()` may return values derived from the
* token itself, such as a tag name or processing
* instruction tag.
*
* Possible values:
* - `#tag` when matched on a tag.
* - `#text` when matched on a text node.
* - `#cdata-section` when matched on a CDATA node.
* - `#comment` when matched on a comment.
* - `#doctype` when matched on a DOCTYPE declaration.
* - `#presumptuous-tag` when matched on an empty tag closer.
* - `#funky-comment` when matched on a funky comment.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null What kind of token is matched, or null.
*/
public function get_token_type() {
	// Non-virtual tokens are classified by the parent class.
	if ( ! $this->is_virtual() ) {
		return parent::get_token_type();
	}

	/*
	 * This logic comes from the Tag Processor.
	 *
	 * @todo It would be ideal not to repeat this here, but it's not clearly
	 *       better to allow passing a token name to `get_token_type()`.
	 */
	$virtual_name = $this->current_element->token->node_name;
	$first_byte   = $virtual_name[0];

	// Names starting with an uppercase ASCII letter are tag names.
	if ( 'A' <= $first_byte && 'Z' >= $first_byte ) {
		return '#tag';
	}

	// The lowercase name `html` identifies a DOCTYPE; anything else is already a type string.
	return 'html' === $virtual_name ? '#doctype' : $virtual_name;
}
/**
* Returns the value of a requested attribute from a matched tag opener if that attribute exists.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( '<div enabled class="test" data-test-id="14">Test</div>' );
* $p->next_token() === true;
* $p->get_attribute( 'data-test-id' ) === '14';
* $p->get_attribute( 'enabled' ) === true;
* $p->get_attribute( 'aria-label' ) === null;
*
* $p->next_tag() === false;
* $p->get_attribute( 'class' ) === null;
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @param string $name Name of attribute whose value is requested.
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/
public function get_attribute( $name ) {
	// A virtual token reports no attributes.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_attribute( $name );
}
/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
* For boolean attributes special handling is provided:
* - When `true` is passed as the value, then only the attribute name is added to the tag.
* - When `false` is passed, the attribute gets removed if it existed before.
*
* For string attributes, the value is escaped using the `esc_attr` function.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $name The attribute name to target.
* @param string|bool $value The new attribute value.
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ) {
	// Attribute writes are refused while matched on a virtual token.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::set_attribute( $name, $value );
}
/**
* Remove an attribute from the currently-matched tag.
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @param string $name The attribute name to remove.
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ) {
	// Attribute removal is refused while matched on a virtual token.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::remove_attribute( $name );
}
/**
* Gets lowercase names of all attributes matching a given prefix in the current tag.
*
* Note that matching is case-insensitive. This is in accordance with the spec:
*
* > There must never be two or more attributes on
* > the same start tag whose names are an ASCII
* > case-insensitive match for each other.
* - HTML 5 spec
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' );
* $p->next_tag( array( 'class_name' => 'test' ) ) === true;
* $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' );
*
* $p->next_tag() === false;
* $p->get_attribute_names_with_prefix( 'data-' ) === null;
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
* @param string $prefix Prefix of requested attribute names.
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ) {
	// A virtual token reports no attribute names.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_attribute_names_with_prefix( $prefix );
}
/**
* Adds a new class name to the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to add.
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ) {
	// Class changes are refused while matched on a virtual token.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::add_class( $class_name );
}
/**
* Removes a class name from the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to remove.
* @return bool Whether the class was set to be removed.
*/
public function remove_class( $class_name ) {
	// Class changes are refused while matched on a virtual token.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::remove_class( $class_name );
}
/**
* Returns if a matched tag contains the given ASCII case-insensitive class name.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/
public function has_class( $wanted_class ) {
	// A virtual token answers `null` rather than `false`.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::has_class( $wanted_class );
}
/**
* Generator for a foreach loop to step through each class name for the matched tag.
*
* This generator function is designed to be used inside a "foreach" loop.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( "<div class='free &lt;egg&gt;\tlang-en'>" );
* $p->next_tag();
* foreach ( $p->class_list() as $class_name ) {
* echo "{$class_name} ";
* }
* // Outputs: "free <egg> lang-en "
*
* @since 6.6.0 Subclassed for the HTML Processor.
*/
public function class_list() {
	/*
	 * NOTE(review): a virtual token yields `null`, which is not iterable, so a
	 * bare `foreach` over this return value would warn in that case. Confirm
	 * whether callers are expected to guard against it.
	 */
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::class_list();
}
/**
* Returns the modifiable text for a matched token, or an empty string.
*
* Modifiable text is text content that may be read and changed without
* changing the HTML structure of the document around it. This includes
* the contents of `#text` nodes in the HTML as well as the inner
* contents of HTML comments, Processing Instructions, and others, even
* though these nodes aren't part of a parsed DOM tree. They also contain
* the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
* other section in an HTML document which cannot contain HTML markup (DATA).
*
* If a token has no modifiable text then an empty string is returned to
* avoid needless crashing or type errors. An empty string does not mean
* that a token has modifiable text, and a token with modifiable text may
* have an empty string (e.g. a comment with no contents).
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string
*/
public function get_modifiable_text() {
	// A virtual token has no modifiable text, so an empty string is reported.
	if ( $this->is_virtual() ) {
		return '';
	}

	return parent::get_modifiable_text();
}
/**
* Indicates what kind of comment produced the comment node.
*
* Because there are different kinds of HTML syntax which produce
* comments, the Tag Processor tracks and exposes this as a type
* for the comment. Nominally only regular HTML comments exist as
* they are commonly known, but a number of unrelated syntax errors
* also produce comments.
*
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
* @see self::COMMENT_AS_CDATA_LOOKALIKE
* @see self::COMMENT_AS_INVALID_HTML
* @see self::COMMENT_AS_HTML_COMMENT
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null
*/
public function get_comment_type() {
	// A virtual token reports no comment type.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_comment_type();
}
/**
* Removes a bookmark that is no longer needed.
*
* Releasing a bookmark frees up the small
* performance overhead it requires.
*
* @since 6.4.0
*
* @param string $bookmark_name Name of the bookmark to remove.
* @return bool Whether the bookmark already existed before removal.
*/
public function release_bookmark( $bookmark_name ) {
	// Bookmarks set through this class are stored under an underscore-prefixed name.
	$prefixed_name = "_{$bookmark_name}";

	return parent::release_bookmark( $prefixed_name );
}
/**
* Moves the internal cursor in the HTML Processor to a given bookmark's location.
*
* Be careful! Seeking backwards to a previous location resets the parser to the
* start of the document and reparses the entire contents up until it finds the
* sought-after bookmarked location.
*
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @since 6.4.0
*
* @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
* @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
public function seek( $bookmark_name ) {
	// Flush any pending updates to the document before beginning.
	$this->get_updated_html();

	// Bookmarks set through this class are stored under an underscore-prefixed name.
	$actual_bookmark_name = "_{$bookmark_name}";

	/*
	 * An unknown bookmark has no location to compare against. Bail before touching
	 * any parser state: otherwise the lookups below would dereference a missing
	 * entry, the seek would be classified as "backward", and the processor would
	 * rewind and reparse the document only to report failure.
	 */
	if ( ! isset( $this->bookmarks[ $actual_bookmark_name ] ) ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Unknown bookmark name.' ),
			'6.4.0'
		);
		return false;
	}

	// Before any token has been matched the processor is treated as sitting at offset zero.
	$processor_started_at = $this->state->current_token
		? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
		: 0;
	$bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
	$direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';

	/*
	 * If seeking backwards, it's possible that the sought-after bookmark exists within an element
	 * which has been closed before the current cursor; in other words, it has already been removed
	 * from the stack of open elements. This means that it's insufficient to simply pop off elements
	 * from the stack of open elements which appear after the bookmarked location and then jump to
	 * that location, as the elements which were open before won't be re-opened.
	 *
	 * In order to maintain consistency, the HTML Processor rewinds to the start of the document
	 * and reparses everything until it finds the sought-after bookmark.
	 *
	 * There are potentially better ways to do this: cache the parser state for each bookmark and
	 * restore it when seeking; store an immutable and idempotent register of where elements open
	 * and close.
	 *
	 * If caching the parser state it will be essential to properly maintain the cached stack of
	 * open elements and active formatting elements when modifying the document. This could be a
	 * tedious and time-consuming process as well, and so for now will not be performed.
	 *
	 * It may be possible to track bookmarks for where elements open and close, and in doing so
	 * be able to quickly recalculate breadcrumbs for any element in the document. It may even
	 * be possible to remove the stack of open elements and compute it on the fly this way.
	 * If doing this, the parser would need to track the opening and closing locations for all
	 * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
	 * this list could be automatically maintained while modifying the document. Finding the
	 * breadcrumbs would then amount to traversing that list from the start until the token
	 * being inspected. Once an element closes, if there are no bookmarks pointing to locations
	 * within that element, then all of these locations may be forgotten to save on memory use
	 * and computation time.
	 */
	if ( 'backward' === $direction ) {
		/*
		 * Instead of clearing the parser state and starting fresh, calling the stack methods
		 * maintains the proper flags in the parser.
		 */
		foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
			if ( 'context-node' === $item->bookmark_name ) {
				break;
			}

			$this->state->stack_of_open_elements->remove_node( $item );
		}

		foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
			if ( 'context-node' === $item->bookmark_name ) {
				break;
			}

			$this->state->active_formatting_elements->remove_node( $item );
		}

		parent::seek( 'context-node' );
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
		$this->state->frameset_ok    = true;
		$this->element_queue         = array();
		$this->current_element       = null;
	}

	/*
	 * When moving forwards, reparse the document until reaching the same location as the
	 * original bookmark. The current token may not exist yet (nothing matched so far), in
	 * which case there is nothing to compare and scanning starts immediately.
	 */
	if (
		$this->state->current_token &&
		$bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
	) {
		return true;
	}

	while ( $this->next_token() ) {
		if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
			// Skip queued POP events so the processor does not stop on one.
			while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
				$this->current_element = array_shift( $this->element_queue );
			}
			return true;
		}
	}

	return false;
}
/**
* Sets a bookmark in the HTML document.
*
* Bookmarks represent specific places or tokens in the HTML
* document, such as a tag opener or closer. When applying
* edits to a document, such as setting an attribute, the
* text offsets of that token may shift; the bookmark is
* kept updated with those shifts and remains stable unless
* the entire span of text in which the token sits is removed.
*
* Release bookmarks when they are no longer needed.
*
* Example:
*
*     <main><h2>Surprising fact you may not know!</h2></main>
*           ^  ^
*            \-|-- this `H2` opener bookmark tracks the token
*
*     <main class="clickbait"><h2>Surprising fact you may no…
*                             ^  ^
*                              \-|-- it shifts with edits
*
* Bookmarks provide the ability to seek to a previously-scanned
* place in the HTML document. This avoids the need to re-scan
* the entire document.
*
* Example:
*
*     <ul><li>One</li><li>Two</li><li>Three</li></ul>
*                                 ^^^^
*                                 want to note this last item
*
* $p = new WP_HTML_Tag_Processor( $html );
* $in_list = false;
* while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
* if ( 'UL' === $p->get_tag() ) {
* if ( $p->is_tag_closer() ) {
* $in_list = false;
* $p->set_bookmark( 'resume' );
* if ( $p->seek( 'last-li' ) ) {
* $p->add_class( 'last-li' );
* }
* $p->seek( 'resume' );
* $p->release_bookmark( 'last-li' );
* $p->release_bookmark( 'resume' );
* } else {
* $in_list = true;
* }
* }
*
* if ( 'LI' === $p->get_tag() ) {
* $p->set_bookmark( 'last-li' );
* }
* }
*
* Bookmarks intentionally hide the internal string offsets
* to which they refer. They are maintained internally as
* updates are applied to the HTML document and therefore
* retain their "position" - the location to which they
* originally pointed. The inability to use bookmarks with
* functions like `substr` is therefore intentional to guard
* against accidentally breaking the HTML.
*
* Because bookmarks allocate memory and require processing
* for every applied update, they are limited and require
* a name. They should not be created with programmatically-made
* names, such as "li_{$index}" with some loop. As a general
* rule they should only be created with string-literal names
* like "start-of-section" or "last-paragraph".
*
* Bookmarks are a powerful tool to enable complicated behavior.
* Consider double-checking that you need this tool if you are
* reaching for it, as inappropriate use could lead to broken
* HTML structure or unwanted processing overhead.
*
* @since 6.4.0
*
* @param string $bookmark_name Identifies this particular bookmark.
* @return bool Whether the bookmark was successfully created.
*/
public function set_bookmark( $bookmark_name ) {
	// Store the bookmark under an underscore-prefixed name.
	$prefixed_name = "_{$bookmark_name}";

	return parent::set_bookmark( $prefixed_name );
}
/**
* Checks whether a bookmark with the given name exists.
*
* @since 6.5.0
*
* @param string $bookmark_name Name to identify a bookmark that potentially exists.
* @return bool Whether that bookmark exists.
*/
public function has_bookmark( $bookmark_name ) {
	// Look the bookmark up under its underscore-prefixed name.
	$prefixed_name = "_{$bookmark_name}";

	return parent::has_bookmark( $prefixed_name );
}
/*
* HTML Parsing Algorithms
*/
/**
* Closes a P element.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#close-a-p-element
*/
private function close_a_p_element() {
	$paragraph = 'P';

	// Generate implied end tags first, treating P itself as the exception.
	$this->generate_implied_end_tags( $paragraph );

	// Then pop the stack of open elements until the P.
	$this->state->stack_of_open_elements->pop_until( $paragraph );
}
/**
* Closes elements that have implied end tags.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#generate-implied-end-tags
*
* @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements.
*/
private function generate_implied_end_tags( $except_for_this_element = null ) {
	$elements_with_implied_end_tags = array(
		'DD',
		'DT',
		'LI',
		'P',
	);

	$current_node = $this->state->stack_of_open_elements->current_node();

	/*
	 * Pop while the current node is one of the listed elements and is not the
	 * excepted element. The list holds tag names, so the comparison must use the
	 * node's name: a strict `in_array()` against the token object itself can never
	 * match. The current node is re-read after every pop so that each iteration
	 * examines the new top of the stack; an empty stack ends the loop.
	 */
	while (
		$current_node &&
		$current_node->node_name !== $except_for_this_element &&
		in_array( $current_node->node_name, $elements_with_implied_end_tags, true )
	) {
		$this->state->stack_of_open_elements->pop();
		$current_node = $this->state->stack_of_open_elements->current_node();
	}
}
/**
* Closes elements that have implied end tags, thoroughly.
*
* See the HTML specification for an explanation why this is
* different from generating end tags in the normal sense.
*
* @since 6.4.0
*
* @see WP_HTML_Processor::generate_implied_end_tags
* @see https://html.spec.whatwg.org/#generate-implied-end-tags
*/
private function generate_implied_end_tags_thoroughly() {
	$elements_with_implied_end_tags = array(
		'DD',
		'DT',
		'LI',
		'P',
	);

	$current_node = $this->state->stack_of_open_elements->current_node();

	/*
	 * Pop while the current node is one of the listed elements. The list holds
	 * tag names, so the comparison must use the node's name: a strict `in_array()`
	 * against the token object itself can never match. The current node is re-read
	 * after every pop, and an empty stack ends the loop.
	 */
	while (
		$current_node &&
		in_array( $current_node->node_name, $elements_with_implied_end_tags, true )
	) {
		$this->state->stack_of_open_elements->pop();
		$current_node = $this->state->stack_of_open_elements->current_node();
	}
}
/**
* Reconstructs the active formatting elements.
*
* > This has the effect of reopening all the formatting elements that were opened
* > in the current body, cell, or caption (whichever is youngest) that haven't
* > been explicitly closed.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements
*
* @return bool Whether any formatting elements needed to be reconstructed.
*/
private function reconstruct_active_formatting_elements() {
	$formatting_elements = $this->state->active_formatting_elements;

	/*
	 * > If there are no entries in the list of active formatting elements, then there is nothing
	 * > to reconstruct; stop this algorithm.
	 */
	if ( 0 === $formatting_elements->count() ) {
		return false;
	}

	$most_recent_entry = $formatting_elements->current_node();

	/*
	 * > If the last (most recently added) entry in the list of active formatting elements is a marker;
	 * > stop this algorithm.
	 */
	if ( 'marker' === $most_recent_entry->node_name ) {
		return false;
	}

	/*
	 * > If the last (most recently added) entry in the list of active formatting elements is an
	 * > element that is in the stack of open elements, then there is nothing to reconstruct;
	 * > stop this algorithm.
	 */
	if ( $this->state->stack_of_open_elements->contains_node( $most_recent_entry ) ) {
		return false;
	}

	// Anything beyond this point is not handled: record the error and bail.
	$this->last_error = self::ERROR_UNSUPPORTED;
	throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
}
/**
* Runs the adoption agency algorithm.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#adoption-agency-algorithm
*/
private function run_adoption_agency_algorithm() {
/*
 * Handles the subset of the adoption agency algorithm that needs no restructuring
 * of the document. The subject is the name of the currently matched tag. Cases that
 * cannot be handled record ERROR_UNSUPPORTED and throw.
 */
// Upper bound on passes through the loop below.
$budget = 1000;
$subject = $this->get_tag();
$current_node = $this->state->stack_of_open_elements->current_node();
if (
// > If the current node is an HTML element whose tag name is subject
$current_node && $subject === $current_node->node_name &&
// > the current node is not in the list of active formatting elements
! $this->state->active_formatting_elements->contains_node( $current_node )
) {
$this->state->stack_of_open_elements->pop();
return;
}
$outer_loop_counter = 0;
/*
 * NOTE(review): every path through the loop body below returns or throws, so only
 * a single pass is ever made; the counters and the statements after the loop are
 * never exercised as written.
 */
while ( $budget-- > 0 ) {
// Stop once the outer loop has already run eight times.
if ( $outer_loop_counter++ >= 8 ) {
return;
}
/*
* > Let formatting element be the last element in the list of active formatting elements that:
* > - is between the end of the list and the last marker in the list,
* > if any, or the start of the list otherwise,
* > - and has the tag name subject.
*/
$formatting_element = null;
foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
if ( 'marker' === $item->node_name ) {
break;
}
if ( $subject === $item->node_name ) {
$formatting_element = $item;
break;
}
}
// > If there is no such element, then return and instead act as described in the "any other end tag" entry above.
// That fallback is not performed here: the case is reported as unsupported instead.
if ( null === $formatting_element ) {
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' );
}
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
$this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
// > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return.
// Note that the scope check is made by node name rather than with the node itself.
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) {
return;
}
/*
* > Let furthest block be the topmost node in the stack of open elements that is lower in the stack
* > than formatting element, and is an element in the special category. There might not be one.
*/
$is_above_formatting_element = true;
$furthest_block = null;
foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) {
// Skip every item visited before the formatting element, matched by bookmark name.
if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) {
continue;
}
// This item is the formatting element itself; only items after it are candidates.
if ( $is_above_formatting_element ) {
$is_above_formatting_element = false;
continue;
}
if ( self::is_special( $item->node_name ) ) {
$furthest_block = $item;
break;
}
}
/*
* > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the
* > stack of open elements, from the current node up to and including formatting element, then
* > remove formatting element from the list of active formatting elements, and finally return.
*/
if ( null === $furthest_block ) {
// The stack is popped while it is being walked; each visited item is popped before it is compared.
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
$this->state->stack_of_open_elements->pop();
if ( $formatting_element->bookmark_name === $item->bookmark_name ) {
$this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
}
}
// A furthest block exists (or the walk above ended without a match): not supported.
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' );
}
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' );
}
/**
* Inserts an HTML element on the stack of open elements.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#insert-a-foreign-element
*
* @param WP_HTML_Token $token Token for the element to push onto the stack of open elements.
*/
private function insert_html_element( $token ) {
	// Inserting an element amounts to pushing its token onto the stack of open elements.
	$open_elements = $this->state->stack_of_open_elements;
	$open_elements->push( $token );
}
/*
* HTML Specification Helpers
*/
/**
* Returns whether an element of a given name is in the HTML special category.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#special
*
* @param string $tag_name Name of element to check.
* @return bool Whether the element of the given name is in the special category.
*/
public static function is_special( $tag_name ) {
	/*
	 * Names in the special category, stored as array keys so that membership
	 * is decided by one lookup. Only the keys matter; the values are unused.
	 */
	static $special_names = array(
		'ADDRESS'        => true,
		'APPLET'         => true,
		'AREA'           => true,
		'ARTICLE'        => true,
		'ASIDE'          => true,
		'BASE'           => true,
		'BASEFONT'       => true,
		'BGSOUND'        => true,
		'BLOCKQUOTE'     => true,
		'BODY'           => true,
		'BR'             => true,
		'BUTTON'         => true,
		'CAPTION'        => true,
		'CENTER'         => true,
		'COL'            => true,
		'COLGROUP'       => true,
		'DD'             => true,
		'DETAILS'        => true,
		'DIR'            => true,
		'DIV'            => true,
		'DL'             => true,
		'DT'             => true,
		'EMBED'          => true,
		'FIELDSET'       => true,
		'FIGCAPTION'     => true,
		'FIGURE'         => true,
		'FOOTER'         => true,
		'FORM'           => true,
		'FRAME'          => true,
		'FRAMESET'       => true,
		'H1'             => true,
		'H2'             => true,
		'H3'             => true,
		'H4'             => true,
		'H5'             => true,
		'H6'             => true,
		'HEAD'           => true,
		'HEADER'         => true,
		'HGROUP'         => true,
		'HR'             => true,
		'HTML'           => true,
		'IFRAME'         => true,
		'IMG'            => true,
		'INPUT'          => true,
		'KEYGEN'         => true,
		'LI'             => true,
		'LINK'           => true,
		'LISTING'        => true,
		'MAIN'           => true,
		'MARQUEE'        => true,
		'MENU'           => true,
		'META'           => true,
		'NAV'            => true,
		'NOEMBED'        => true,
		'NOFRAMES'       => true,
		'NOSCRIPT'       => true,
		'OBJECT'         => true,
		'OL'             => true,
		'P'              => true,
		'PARAM'          => true,
		'PLAINTEXT'      => true,
		'PRE'            => true,
		'SCRIPT'         => true,
		'SEARCH'         => true,
		'SECTION'        => true,
		'SELECT'         => true,
		'SOURCE'         => true,
		'STYLE'          => true,
		'SUMMARY'        => true,
		'TABLE'          => true,
		'TBODY'          => true,
		'TD'             => true,
		'TEMPLATE'       => true,
		'TEXTAREA'       => true,
		'TFOOT'          => true,
		'TH'             => true,
		'THEAD'          => true,
		'TITLE'          => true, // Listed for HTML and again for SVG; one key covers both.
		'TR'             => true,
		'TRACK'          => true,
		'UL'             => true,
		'WBR'            => true,
		'XMP'            => true,

		// MathML.
		'MI'             => true,
		'MO'             => true,
		'MN'             => true,
		'MS'             => true,
		'MTEXT'          => true,
		'ANNOTATION-XML' => true,

		// SVG.
		'FOREIGNOBJECT'  => true,
		'DESC'           => true,
	);

	// The lookup is made on the upper-cased name, so input casing does not matter.
	return isset( $special_names[ strtoupper( $tag_name ) ] );
}
/**
* Returns whether a given element is an HTML Void Element
*
* > area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#void-elements
*
* @param string $tag_name Name of HTML tag to check.
* @return bool Whether the given tag is an HTML Void Element.
*/
public static function is_void( $tag_name ) {
	// Names treated as void; several are obsolete but are still treated as void.
	$void_names = array(
		'AREA',
		'BASE',
		'BASEFONT', // Obsolete but still treated as void.
		'BGSOUND',  // Obsolete but still treated as void.
		'BR',
		'COL',
		'EMBED',
		'FRAME',
		'HR',
		'IMG',
		'INPUT',
		'KEYGEN',   // Obsolete but still treated as void.
		'LINK',
		'META',
		'PARAM',    // Obsolete but still treated as void.
		'SOURCE',
		'TRACK',
		'WBR',
	);

	// The comparison is made on the upper-cased name, so input casing does not matter.
	return in_array( strtoupper( $tag_name ), $void_names, true );
}
/*
* Constants that would pollute the top of the class if they were found there.
*/
/**
* Indicates that the next HTML token should be parsed and processed.
*
* @since 6.4.0
*
* @var string
*/
const PROCESS_NEXT_NODE = 'process-next-node';
/**
* Indicates that the current HTML token should be reprocessed in the newly-selected insertion mode.
*
* @since 6.4.0
*
* @var string
*/
const REPROCESS_CURRENT_NODE = 'reprocess-current-node';
/**
* Indicates that the current HTML token should be processed without advancing the parser.
*
* @since 6.5.0
*
* @var string
*/
const PROCESS_CURRENT_NODE = 'process-current-node';
/**
* Indicates that the parser encountered unsupported markup and has bailed.
*
* @since 6.4.0
*
* @var string
*/
const ERROR_UNSUPPORTED = 'unsupported';
/**
* Indicates that the parser encountered more HTML tokens than it
* was able to process and has bailed.
*
* @since 6.4.0
*
* @var string
*/
const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks';
/**
* Unlock code that must be passed into the constructor to create this class.
*
* This class extends the WP_HTML_Tag_Processor, which has a public class
* constructor. Therefore, it's not possible to have a private constructor here.
*
* This unlock code is used to ensure that anyone calling the constructor is
* doing so with a full understanding that it's intended to be a private API.
*
* @access private
*/
const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::create_fragment() instead of calling the class constructor directly.';
}
class-wp-html-span.php 0000644 00000002103 14717700467 0010720 0 ustar 00 start = $start;
$this->length = $length;
}
}
class-wp-html-decoder.php 0000644 00000040252 14717700467 0011373 0 ustar 00 = $end ) {
break;
}
$character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $token_length );
if ( isset( $character_reference ) ) {
$at = $next_character_reference_at;
$decoded .= substr( $text, $was_at, $at - $was_at );
$decoded .= $character_reference;
$at += $token_length;
$was_at = $at;
continue;
}
++$at;
}
if ( 0 === $was_at ) {
return $text;
}
if ( $was_at < $end ) {
$decoded .= substr( $text, $was_at, $end - $was_at );
}
return $decoded;
}
/**
* Attempt to read a character reference at the given location in a given string,
* depending on the context in which it's found.
*
* If a character reference is found, this function will return the translated value
* that the reference maps to. It will then set `$match_byte_length` to the
* number of bytes of input it read while consuming the character reference. This
* gives calling code the opportunity to advance its cursor when traversing a string
* and decoding.
*
* Example:
*
* null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 0 );
* '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 5, $token_length );
* 8 === $token_length; // `&hellip;`
*
* null === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin', 0 );
* '∉' === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin;', 0, $token_length );
* 7 === $token_length; // `&notin;`
*
* '¬' === WP_HTML_Decoder::read_character_reference( 'data', '&notin', 0, $token_length );
* 4 === $token_length; // `&not`
* '∉' === WP_HTML_Decoder::read_character_reference( 'data', '&notin;', 0, $token_length );
* 7 === $token_length; // `&notin;`
*
* @since 6.6.0
*
* @param string $context `attribute` for decoding attribute values, `data` otherwise.
* @param string $text Text document containing span of text to decode.
* @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0).
* @param int &$match_byte_length Optional. Set to byte-length of character reference if provided and if a match
* is found, otherwise not set. Default null.
* @return string|null Decoded character reference in UTF-8 if found, otherwise `null`.
*/
public static function read_character_reference( $context, $text, $at = 0, &$match_byte_length = null ) {
	/**
	 * Mappings for HTML5 named character references.
	 *
	 * @var WP_Token_Map $html5_named_character_references
	 */
	global $html5_named_character_references;

	/*
	 * Every "no character reference here" outcome below returns `null`
	 * and leaves `$match_byte_length` untouched. Only a successful decode
	 * writes the matched span length and returns a string.
	 */
	$length = strlen( $text );

	// A character reference needs at least `&` plus one more byte.
	if ( $at + 1 >= $length ) {
		return null;
	}

	if ( '&' !== $text[ $at ] ) {
		return null;
	}

	/*
	 * Numeric character references.
	 *
	 * When truncated, these will encode the code point found by parsing the
	 * digits that are available. For example, when `&#x1f170;` is truncated
	 * to `&#x1f1` it will encode `Ǳ`. It does not:
	 *  - know how to parse the original `🅰`.
	 *  - fail to parse and return plaintext `&#x1f1`.
	 *  - fail to parse and return the replacement character `�`
	 */
	if ( '#' === $text[ $at + 1 ] ) {
		// `&#` at the very end of the text has no room for digits.
		if ( $at + 2 >= $length ) {
			return null;
		}

		/** Tracks inner parsing within the numeric character reference. */
		$digits_at = $at + 2;

		// Cannot encode invalid Unicode code points. Max is to U+10FFFF.
		if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) {
			$numeric_base   = 16;
			$numeric_digits = '0123456789abcdefABCDEF';
			$max_digits     = 6; // &#x10FFFF;
			++$digits_at;
		} else {
			$numeric_base   = 10;
			$numeric_digits = '0123456789';
			$max_digits     = 7; // &#1114111;
		}

		// Leading zeros are counted separately so they don't count against the digit limit.
		$zero_count    = strspn( $text, '0', $digits_at );
		$digit_count   = strspn( $text, $numeric_digits, $digits_at + $zero_count );
		$after_digits  = $digits_at + $zero_count + $digit_count;
		$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
		$end_of_span   = $has_semicolon ? $after_digits + 1 : $after_digits;

		// `&#` or `&#x` without digits returns into plaintext.
		if ( 0 === $digit_count && 0 === $zero_count ) {
			return null;
		}

		// Whereas `&#` and only zeros is invalid.
		if ( 0 === $digit_count ) {
			$match_byte_length = $end_of_span - $at;
			return '�';
		}

		// If there are too many digits then it's not worth parsing. It's invalid.
		if ( $digit_count > $max_digits ) {
			$match_byte_length = $end_of_span - $at;
			return '�';
		}

		$digits     = substr( $text, $digits_at + $zero_count, $digit_count );
		$code_point = intval( $digits, $numeric_base );

		/*
		 * Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
		 *
		 * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
		 * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
		 * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
		 * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
		 * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
		 * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
		 *
		 * A C0 control is a code point that is in the range of U+00 to U+1F,
		 * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D.
		 *
		 * These characters are invalid but still decode as any valid character.
		 * This comment is here to note and explain why there's no check to
		 * remove these characters or replace them.
		 *
		 * @see https://infra.spec.whatwg.org/#noncharacter
		 */

		/*
		 * Code points in the C1 controls area need to be remapped as if they
		 * were stored in Windows-1252. Note! This transformation only happens
		 * for numeric character references. The raw code points in the byte
		 * stream are not translated.
		 *
		 * > If the number is one of the numbers in the first column of
		 * > the following table, then find the row with that number in
		 * > the first column, and set the character reference code to
		 * > the number in the second column of that row.
		 */
		if ( $code_point >= 0x80 && $code_point <= 0x9F ) {
			$windows_1252_mapping = array(
				0x20AC, // 0x80 -> EURO SIGN (€).
				0x81,   // 0x81 -> (no change).
				0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
				0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
				0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
				0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
				0x2020, // 0x86 -> DAGGER (†).
				0x2021, // 0x87 -> DOUBLE DAGGER (‡).
				0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
				0x2030, // 0x89 -> PER MILLE SIGN (‰).
				0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
				0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
				0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
				0x8D,   // 0x8D -> (no change).
				0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
				0x8F,   // 0x8F -> (no change).
				0x90,   // 0x90 -> (no change).
				0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
				0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
				0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
				0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
				0x2022, // 0x95 -> BULLET (•).
				0x2013, // 0x96 -> EN DASH (–).
				0x2014, // 0x97 -> EM DASH (—).
				0x02DC, // 0x98 -> SMALL TILDE (˜).
				0x2122, // 0x99 -> TRADE MARK SIGN (™).
				0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
				0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
				0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
				0x9D,   // 0x9D -> (no change).
				0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
				0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
			);

			$code_point = $windows_1252_mapping[ $code_point - 0x80 ];
		}

		$match_byte_length = $end_of_span - $at;
		return self::code_point_to_utf8_bytes( $code_point );
	}

	/** Tracks inner parsing within the named character reference. */
	$name_at = $at + 1;

	// Minimum named character reference is two characters. E.g. `GT`.
	if ( $name_at + 2 > $length ) {
		return null;
	}

	$name_length = 0;
	$replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length );

	/*
	 * NOTE(review): the token map appears to report "no match" with `null`
	 * rather than `false` - confirm against WP_Token_Map::read_token().
	 * Both are treated as a miss here. Checking only for `false` would let
	 * a `null` miss fall through with a zero-length name and overwrite
	 * `$match_byte_length` even though nothing was matched.
	 */
	if ( null === $replacement || false === $replacement ) {
		return null;
	}

	$after_name = $name_at + $name_length;

	// If the match ended with a semicolon then it should always be decoded.
	if ( ';' === $text[ $after_name - 1 ] ) {
		$match_byte_length = $after_name - $at;
		return $replacement;
	}

	/*
	 * At this point though there's a match for an entry in the named
	 * character reference table but the match doesn't end in `;`.
	 * It may be allowed if it's followed by something unambiguous.
	 */
	$ambiguous_follower = (
		$after_name < $length &&
		(
			ctype_alnum( $text[ $after_name ] ) ||
			'=' === $text[ $after_name ]
		)
	);

	// An ambiguous follower isn't allowed inside attributes; leave the text as-is.
	if ( $ambiguous_follower && 'attribute' === $context ) {
		return null;
	}

	// Either it's non-ambiguous, or it's ambiguous but outside of an attribute: decode it.
	$match_byte_length = $after_name - $at;
	return $replacement;
}
/**
* Encode a code point number into the UTF-8 encoding.
*
* This encoder implements the UTF-8 encoding algorithm for converting
* a code point into a byte sequence. If it receives an invalid code
* point it will return the Unicode Replacement Character U+FFFD `�`.
*
* Example:
*
* '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
*
* // Half of a surrogate pair is an invalid code point.
* '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
*
* @since 6.6.0
*
* @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
*
* @param int $code_point Which code point to convert.
* @return string Converted code point, or `�` if invalid.
*/
public static function code_point_to_utf8_bytes( $code_point ) {
	/*
	 * Only Unicode scalar values can be encoded. Zero and negative numbers,
	 * the surrogate range U+D800-U+DFFF, and anything past U+10FFFF all
	 * come back as the replacement character.
	 */
	$is_surrogate = ( $code_point >= 0xD800 && $code_point <= 0xDFFF );
	if ( $code_point <= 0 || $is_surrogate || $code_point > 0x10FFFF ) {
		return '�';
	}

	// U+0001 to U+007F: a single byte equal to the code point itself.
	if ( $code_point <= 0x7F ) {
		return chr( $code_point );
	}

	// Every multi-byte form ends with the lowest six bits in a `10xxxxxx` byte.
	$last_byte = ( $code_point & 0x3F ) | 0x80;

	// U+0080 to U+07FF: `110xxxxx 10xxxxxx`.
	if ( $code_point <= 0x7FF ) {
		$lead_byte = 0xC0 | ( $code_point >> 6 );

		return pack( 'C*', $lead_byte, $last_byte );
	}

	// Bits 6-11 fill the continuation byte just before the last one.
	$third_from_low = ( ( $code_point >> 6 ) & 0x3F ) | 0x80;

	// U+0800 to U+FFFF: `1110xxxx 10xxxxxx 10xxxxxx`.
	if ( $code_point <= 0xFFFF ) {
		$lead_byte = 0xE0 | ( $code_point >> 12 );

		return pack( 'C*', $lead_byte, $third_from_low, $last_byte );
	}

	/*
	 * U+10000 to U+10FFFF: `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`.
	 * Larger values were already rejected by the validity check above.
	 */
	$lead_byte   = 0xF0 | ( $code_point >> 18 );
	$second_byte = ( ( $code_point >> 12 ) & 0x3F ) | 0x80;

	return pack( 'C*', $lead_byte, $second_byte, $third_from_low, $last_byte );
}
}
class-wp-html-processor-state.php 0000644 00000006402 14717700467 0013122 0 ustar 00 The frameset-ok flag is set to "ok" when the parser is created. It is set to "not ok" after certain tokens are seen.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#frameset-ok-flag
*
* @var bool
*/
public $frameset_ok = true;
/**
* Constructor - creates a new and empty state value.
*
* @since 6.4.0
*
* @see WP_HTML_Processor
*/
public function __construct() {
	// A fresh state starts with a newly created stack of open elements...
	$open_elements                = new WP_HTML_Open_Elements();
	$this->stack_of_open_elements = $open_elements;

	// ...and a newly created list of active formatting elements.
	$formatting_elements              = new WP_HTML_Active_Formatting_Elements();
	$this->active_formatting_elements = $formatting_elements;
}
}
class-wp-html-token.php 0000644 00000005360 14717700467 0011107 0 ustar 00 bookmark_name = $bookmark_name;
$this->node_name = $node_name;
$this->has_self_closing_flag = $has_self_closing_flag;
$this->on_destroy = $on_destroy;
}
/**
* Destructor.
*
* @since 6.4.0
*/
public function __destruct() {
	$callback = $this->on_destroy;

	// Nothing to notify unless a callable was registered for destruction.
	if ( ! is_callable( $callback ) ) {
		return;
	}

	// Hand the bookmark name to the callback as its only argument.
	call_user_func( $callback, $this->bookmark_name );
}
/**
* Wakeup magic method.
*
* @since 6.4.2
*/
public function __wakeup() {
	/*
	 * Unserialization is refused unconditionally.
	 *
	 * NOTE(review): presumably this guards the destructor callback against
	 * attacker-controlled serialized data - confirm the original intent.
	 */
	$message = __CLASS__ . ' should never be unserialized';

	throw new \LogicException( $message );
}
}
html5-named-character-references.php 0000644 00000234443 14717700467 0013470 0 ustar 00 "6.6.0-trunk",
"key_length" => 2,
"groups" => "AE\x00AM\x00Aa\x00Ab\x00Ac\x00Af\x00Ag\x00Al\x00Am\x00An\x00Ao\x00Ap\x00Ar\x00As\x00At\x00Au\x00Ba\x00Bc\x00Be\x00Bf\x00Bo\x00Br\x00Bs\x00Bu\x00CH\x00CO\x00Ca\x00Cc\x00Cd\x00Ce\x00Cf\x00Ch\x00Ci\x00Cl\x00Co\x00Cr\x00Cs\x00Cu\x00DD\x00DJ\x00DS\x00DZ\x00Da\x00Dc\x00De\x00Df\x00Di\x00Do\x00Ds\x00EN\x00ET\x00Ea\x00Ec\x00Ed\x00Ef\x00Eg\x00El\x00Em\x00Eo\x00Ep\x00Eq\x00Es\x00Et\x00Eu\x00Ex\x00Fc\x00Ff\x00Fi\x00Fo\x00Fs\x00GJ\x00GT\x00Ga\x00Gb\x00Gc\x00Gd\x00Gf\x00Gg\x00Go\x00Gr\x00Gs\x00Gt\x00HA\x00Ha\x00Hc\x00Hf\x00Hi\x00Ho\x00Hs\x00Hu\x00IE\x00IJ\x00IO\x00Ia\x00Ic\x00Id\x00If\x00Ig\x00Im\x00In\x00Io\x00Is\x00It\x00Iu\x00Jc\x00Jf\x00Jo\x00Js\x00Ju\x00KH\x00KJ\x00Ka\x00Kc\x00Kf\x00Ko\x00Ks\x00LJ\x00LT\x00La\x00Lc\x00Le\x00Lf\x00Ll\x00Lm\x00Lo\x00Ls\x00Lt\x00Ma\x00Mc\x00Me\x00Mf\x00Mi\x00Mo\x00Ms\x00Mu\x00NJ\x00Na\x00Nc\x00Ne\x00Nf\x00No\x00Ns\x00Nt\x00Nu\x00OE\x00Oa\x00Oc\x00Od\x00Of\x00Og\x00Om\x00Oo\x00Op\x00Or\x00Os\x00Ot\x00Ou\x00Ov\x00Pa\x00Pc\x00Pf\x00Ph\x00Pi\x00Pl\x00Po\x00Pr\x00Ps\x00QU\x00Qf\x00Qo\x00Qs\x00RB\x00RE\x00Ra\x00Rc\x00Re\x00Rf\x00Rh\x00Ri\x00Ro\x00Rr\x00Rs\x00Ru\x00SH\x00SO\x00Sa\x00Sc\x00Sf\x00Sh\x00Si\x00Sm\x00So\x00Sq\x00Ss\x00St\x00Su\x00TH\x00TR\x00TS\x00Ta\x00Tc\x00Tf\x00Th\x00Ti\x00To\x00Tr\x00Ts\x00Ua\x00Ub\x00Uc\x00Ud\x00Uf\x00Ug\x00Um\x00Un\x00Uo\x00Up\x00Ur\x00Us\x00Ut\x00Uu\x00VD\x00Vb\x00Vc\x00Vd\x00Ve\x00Vf\x00Vo\x00Vs\x00Vv\x00Wc\x00We\x00Wf\x00Wo\x00Ws\x00Xf\x00Xi\x00Xo\x00Xs\x00YA\x00YI\x00YU\x00Ya\x00Yc\x00Yf\x00Yo\x00Ys\x00Yu\x00ZH\x00Za\x00Zc\x00Zd\x00Ze\x00Zf\x00Zo\x00Zs\x00aa\x00ab\x00ac\x00ae\x00af\x00ag\x00al\x00am\x00an\x00ao\x00ap\x00ar\x00as\x00at\x00au\x00aw\x00bN\x00ba\x00bb\x00bc\x00bd\x00be\x00bf\x00bi\x00bk\x00bl\x00bn\x00bo\x00bp\x00br\x00bs\x00bu\x00ca\x00cc\x00cd\x00ce\x00cf\x00ch\x00ci\x00cl\x00co\x00cr\x00cs\x00ct\x00cu\x00cw\x00cy\x00dA\x00dH\x00da\x00db\x00dc\x00dd\x00de\x00df\x00dh\x00di\x00dj\x00dl\x00do\x00dr\x00ds\x00dt\x00du\x00dw\x00dz\x00eD\x00ea\x00ec\x00ed\x00ee\x00ef\x00eg\x00el\x00em\x00e
n\x00eo\x00ep\x00eq\x00er\x00es\x00et\x00eu\x00ex\x00fa\x00fc\x00fe\x00ff\x00fi\x00fj\x00fl\x00fn\x00fo\x00fp\x00fr\x00fs\x00gE\x00ga\x00gb\x00gc\x00gd\x00ge\x00gf\x00gg\x00gi\x00gj\x00gl\x00gn\x00go\x00gr\x00gs\x00gt\x00gv\x00hA\x00ha\x00hb\x00hc\x00he\x00hf\x00hk\x00ho\x00hs\x00hy\x00ia\x00ic\x00ie\x00if\x00ig\x00ii\x00ij\x00im\x00in\x00io\x00ip\x00iq\x00is\x00it\x00iu\x00jc\x00jf\x00jm\x00jo\x00js\x00ju\x00ka\x00kc\x00kf\x00kg\x00kh\x00kj\x00ko\x00ks\x00lA\x00lB\x00lE\x00lH\x00la\x00lb\x00lc\x00ld\x00le\x00lf\x00lg\x00lh\x00lj\x00ll\x00lm\x00ln\x00lo\x00lp\x00lr\x00ls\x00lt\x00lu\x00lv\x00mD\x00ma\x00mc\x00md\x00me\x00mf\x00mh\x00mi\x00ml\x00mn\x00mo\x00mp\x00ms\x00mu\x00nG\x00nL\x00nR\x00nV\x00na\x00nb\x00nc\x00nd\x00ne\x00nf\x00ng\x00nh\x00ni\x00nj\x00nl\x00nm\x00no\x00np\x00nr\x00ns\x00nt\x00nu\x00nv\x00nw\x00oS\x00oa\x00oc\x00od\x00oe\x00of\x00og\x00oh\x00oi\x00ol\x00om\x00oo\x00op\x00or\x00os\x00ot\x00ou\x00ov\x00pa\x00pc\x00pe\x00pf\x00ph\x00pi\x00pl\x00pm\x00po\x00pr\x00ps\x00pu\x00qf\x00qi\x00qo\x00qp\x00qs\x00qu\x00rA\x00rB\x00rH\x00ra\x00rb\x00rc\x00rd\x00re\x00rf\x00rh\x00ri\x00rl\x00rm\x00rn\x00ro\x00rp\x00rr\x00rs\x00rt\x00ru\x00rx\x00sa\x00sb\x00sc\x00sd\x00se\x00sf\x00sh\x00si\x00sl\x00sm\x00so\x00sp\x00sq\x00sr\x00ss\x00st\x00su\x00sw\x00sz\x00ta\x00tb\x00tc\x00td\x00te\x00tf\x00th\x00ti\x00to\x00tp\x00tr\x00ts\x00tw\x00uA\x00uH\x00ua\x00ub\x00uc\x00ud\x00uf\x00ug\x00uh\x00ul\x00um\x00uo\x00up\x00ur\x00us\x00ut\x00uu\x00uw\x00vA\x00vB\x00vD\x00va\x00vc\x00vd\x00ve\x00vf\x00vl\x00vn\x00vo\x00vp\x00vr\x00vs\x00vz\x00wc\x00we\x00wf\x00wo\x00wp\x00wr\x00ws\x00xc\x00xd\x00xf\x00xh\x00xi\x00xl\x00xm\x00xn\x00xo\x00xr\x00xs\x00xu\x00xv\x00xw\x00ya\x00yc\x00ye\x00yf\x00yi\x00yo\x00ys\x00yu\x00za\x00zc\x00zd\x00ze\x00zf\x00zh\x00zi\x00zo\x00zs\x00zw\x00",
"large_words" => array(
// AElig;[Æ] AElig[Æ].
"\x04lig;\x02Æ\x03lig\x02Æ",
// AMP;[&] AMP[&].
"\x02P;\x01&\x01P\x01&",
// Aacute;[Á] Aacute[Á].
"\x05cute;\x02Á\x04cute\x02Á",
// Abreve;[Ă].
"\x05reve;\x02Ă",
// Acirc;[Â] Acirc[Â] Acy;[А].
"\x04irc;\x02Â\x03irc\x02Â\x02y;\x02А",
// Afr;[𝔄].
"\x02r;\x04𝔄",
// Agrave;[À] Agrave[À].
"\x05rave;\x02À\x04rave\x02À",
// Alpha;[Α].
"\x04pha;\x02Α",
// Amacr;[Ā].
"\x04acr;\x02Ā",
// And;[⩓].
"\x02d;\x03⩓",
// Aogon;[Ą] Aopf;[𝔸].
"\x04gon;\x02Ą\x03pf;\x04𝔸",
// ApplyFunction;[].
"\x0cplyFunction;\x03",
// Aring;[Å] Aring[Å].
"\x04ing;\x02Å\x03ing\x02Å",
// Assign;[≔] Ascr;[𝒜].
"\x05sign;\x03≔\x03cr;\x04𝒜",
// Atilde;[Ã] Atilde[Ã].
"\x05ilde;\x02Ã\x04ilde\x02Ã",
// Auml;[Ä] Auml[Ä].
"\x03ml;\x02Ä\x02ml\x02Ä",
// Backslash;[∖] Barwed;[⌆] Barv;[⫧].
"\x08ckslash;\x03∖\x05rwed;\x03⌆\x03rv;\x03⫧",
// Bcy;[Б].
"\x02y;\x02Б",
// Bernoullis;[ℬ] Because;[∵] Beta;[Β].
"\x09rnoullis;\x03ℬ\x06cause;\x03∵\x03ta;\x02Β",
// Bfr;[𝔅].
"\x02r;\x04𝔅",
// Bopf;[𝔹].
"\x03pf;\x04𝔹",
// Breve;[˘].
"\x04eve;\x02˘",
// Bscr;[ℬ].
"\x03cr;\x03ℬ",
// Bumpeq;[≎].
"\x05mpeq;\x03≎",
// CHcy;[Ч].
"\x03cy;\x02Ч",
// COPY;[©] COPY[©].
"\x03PY;\x02©\x02PY\x02©",
// CapitalDifferentialD;[ⅅ] Cayleys;[ℭ] Cacute;[Ć] Cap;[⋒].
"\x13pitalDifferentialD;\x03ⅅ\x06yleys;\x03ℭ\x05cute;\x02Ć\x02p;\x03⋒",
// Cconint;[∰] Ccaron;[Č] Ccedil;[Ç] Ccedil[Ç] Ccirc;[Ĉ].
"\x06onint;\x03∰\x05aron;\x02Č\x05edil;\x02Ç\x04edil\x02Ç\x04irc;\x02Ĉ",
// Cdot;[Ċ].
"\x03ot;\x02Ċ",
// CenterDot;[·] Cedilla;[¸].
"\x08nterDot;\x02·\x06dilla;\x02¸",
// Cfr;[ℭ].
"\x02r;\x03ℭ",
// Chi;[Χ].
"\x02i;\x02Χ",
// CircleMinus;[⊖] CircleTimes;[⊗] CirclePlus;[⊕] CircleDot;[⊙].
"\x0arcleMinus;\x03⊖\x0arcleTimes;\x03⊗\x09rclePlus;\x03⊕\x08rcleDot;\x03⊙",
// ClockwiseContourIntegral;[∲] CloseCurlyDoubleQuote;[”] CloseCurlyQuote;[’].
"\x17ockwiseContourIntegral;\x03∲\x14oseCurlyDoubleQuote;\x03”\x0eoseCurlyQuote;\x03’",
// CounterClockwiseContourIntegral;[∳] ContourIntegral;[∮] Congruent;[≡] Coproduct;[∐] Colone;[⩴] Conint;[∯] Colon;[∷] Copf;[ℂ].
"\x1eunterClockwiseContourIntegral;\x03∳\x0entourIntegral;\x03∮\x08ngruent;\x03≡\x08product;\x03∐\x05lone;\x03⩴\x05nint;\x03∯\x04lon;\x03∷\x03pf;\x03ℂ",
// Cross;[⨯].
"\x04oss;\x03⨯",
// Cscr;[𝒞].
"\x03cr;\x04𝒞",
// CupCap;[≍] Cup;[⋓].
"\x05pCap;\x03≍\x02p;\x03⋓",
// DDotrahd;[⤑] DD;[ⅅ].
"\x07otrahd;\x03⤑\x01;\x03ⅅ",
// DJcy;[Ђ].
"\x03cy;\x02Ђ",
// DScy;[Ѕ].
"\x03cy;\x02Ѕ",
// DZcy;[Џ].
"\x03cy;\x02Џ",
// Dagger;[‡] Dashv;[⫤] Darr;[↡].
"\x05gger;\x03‡\x04shv;\x03⫤\x03rr;\x03↡",
// Dcaron;[Ď] Dcy;[Д].
"\x05aron;\x02Ď\x02y;\x02Д",
// Delta;[Δ] Del;[∇].
"\x04lta;\x02Δ\x02l;\x03∇",
// Dfr;[𝔇].
"\x02r;\x04𝔇",
// DiacriticalDoubleAcute;[˝] DiacriticalAcute;[´] DiacriticalGrave;[`] DiacriticalTilde;[˜] DiacriticalDot;[˙] DifferentialD;[ⅆ] Diamond;[⋄].
"\x15acriticalDoubleAcute;\x02˝\x0facriticalAcute;\x02´\x0facriticalGrave;\x01`\x0facriticalTilde;\x02˜\x0dacriticalDot;\x02˙\x0cfferentialD;\x03ⅆ\x06amond;\x03⋄",
// DoubleLongLeftRightArrow;[⟺] DoubleContourIntegral;[∯] DoubleLeftRightArrow;[⇔] DoubleLongRightArrow;[⟹] DoubleLongLeftArrow;[⟸] DownLeftRightVector;[⥐] DownRightTeeVector;[⥟] DownRightVectorBar;[⥗] DoubleUpDownArrow;[⇕] DoubleVerticalBar;[∥] DownLeftTeeVector;[⥞] DownLeftVectorBar;[⥖] DoubleRightArrow;[⇒] DownArrowUpArrow;[⇵] DoubleDownArrow;[⇓] DoubleLeftArrow;[⇐] DownRightVector;[⇁] DoubleRightTee;[⊨] DownLeftVector;[↽] DoubleLeftTee;[⫤] DoubleUpArrow;[⇑] DownArrowBar;[⤓] DownTeeArrow;[↧] DoubleDot;[¨] DownArrow;[↓] DownBreve;[̑] Downarrow;[⇓] DotEqual;[≐] DownTee;[⊤] DotDot;[⃜] Dopf;[𝔻] Dot;[¨].
"\x17ubleLongLeftRightArrow;\x03⟺\x14ubleContourIntegral;\x03∯\x13ubleLeftRightArrow;\x03⇔\x13ubleLongRightArrow;\x03⟹\x12ubleLongLeftArrow;\x03⟸\x12wnLeftRightVector;\x03⥐\x11wnRightTeeVector;\x03⥟\x11wnRightVectorBar;\x03⥗\x10ubleUpDownArrow;\x03⇕\x10ubleVerticalBar;\x03∥\x10wnLeftTeeVector;\x03⥞\x10wnLeftVectorBar;\x03⥖\x0fubleRightArrow;\x03⇒\x0fwnArrowUpArrow;\x03⇵\x0eubleDownArrow;\x03⇓\x0eubleLeftArrow;\x03⇐\x0ewnRightVector;\x03⇁\x0dubleRightTee;\x03⊨\x0dwnLeftVector;\x03↽\x0cubleLeftTee;\x03⫤\x0cubleUpArrow;\x03⇑\x0bwnArrowBar;\x03⤓\x0bwnTeeArrow;\x03↧\x08ubleDot;\x02¨\x08wnArrow;\x03↓\x08wnBreve;\x02̑\x08wnarrow;\x03⇓\x07tEqual;\x03≐\x06wnTee;\x03⊤\x05tDot;\x03⃜\x03pf;\x04𝔻\x02t;\x02¨",
// Dstrok;[Đ] Dscr;[𝒟].
"\x05trok;\x02Đ\x03cr;\x04𝒟",
// ENG;[Ŋ].
"\x02G;\x02Ŋ",
// ETH;[Ð] ETH[Ð].
"\x02H;\x02Ð\x01H\x02Ð",
// Eacute;[É] Eacute[É].
"\x05cute;\x02É\x04cute\x02É",
// Ecaron;[Ě] Ecirc;[Ê] Ecirc[Ê] Ecy;[Э].
"\x05aron;\x02Ě\x04irc;\x02Ê\x03irc\x02Ê\x02y;\x02Э",
// Edot;[Ė].
"\x03ot;\x02Ė",
// Efr;[𝔈].
"\x02r;\x04𝔈",
// Egrave;[È] Egrave[È].
"\x05rave;\x02È\x04rave\x02È",
// Element;[∈].
"\x06ement;\x03∈",
// EmptyVerySmallSquare;[▫] EmptySmallSquare;[◻] Emacr;[Ē].
"\x13ptyVerySmallSquare;\x03▫\x0fptySmallSquare;\x03◻\x04acr;\x02Ē",
// Eogon;[Ę] Eopf;[𝔼].
"\x04gon;\x02Ę\x03pf;\x04𝔼",
// Epsilon;[Ε].
"\x06silon;\x02Ε",
// Equilibrium;[⇌] EqualTilde;[≂] Equal;[⩵].
"\x0auilibrium;\x03⇌\x09ualTilde;\x03≂\x04ual;\x03⩵",
// Escr;[ℰ] Esim;[⩳].
"\x03cr;\x03ℰ\x03im;\x03⩳",
// Eta;[Η].
"\x02a;\x02Η",
// Euml;[Ë] Euml[Ë].
"\x03ml;\x02Ë\x02ml\x02Ë",
// ExponentialE;[ⅇ] Exists;[∃].
"\x0bponentialE;\x03ⅇ\x05ists;\x03∃",
// Fcy;[Ф].
"\x02y;\x02Ф",
// Ffr;[𝔉].
"\x02r;\x04𝔉",
// FilledVerySmallSquare;[▪] FilledSmallSquare;[◼].
"\x14lledVerySmallSquare;\x03▪\x10lledSmallSquare;\x03◼",
// Fouriertrf;[ℱ] ForAll;[∀] Fopf;[𝔽].
"\x09uriertrf;\x03ℱ\x05rAll;\x03∀\x03pf;\x04𝔽",
// Fscr;[ℱ].
"\x03cr;\x03ℱ",
// GJcy;[Ѓ].
"\x03cy;\x02Ѓ",
// GT;[>].
"\x01;\x01>",
// Gammad;[Ϝ] Gamma;[Γ].
"\x05mmad;\x02Ϝ\x04mma;\x02Γ",
// Gbreve;[Ğ].
"\x05reve;\x02Ğ",
// Gcedil;[Ģ] Gcirc;[Ĝ] Gcy;[Г].
"\x05edil;\x02Ģ\x04irc;\x02Ĝ\x02y;\x02Г",
// Gdot;[Ġ].
"\x03ot;\x02Ġ",
// Gfr;[𝔊].
"\x02r;\x04𝔊",
// Gg;[⋙].
"\x01;\x03⋙",
// Gopf;[𝔾].
"\x03pf;\x04𝔾",
// GreaterSlantEqual;[⩾] GreaterEqualLess;[⋛] GreaterFullEqual;[≧] GreaterGreater;[⪢] GreaterEqual;[≥] GreaterTilde;[≳] GreaterLess;[≷].
"\x10eaterSlantEqual;\x03⩾\x0featerEqualLess;\x03⋛\x0featerFullEqual;\x03≧\x0deaterGreater;\x03⪢\x0beaterEqual;\x03≥\x0beaterTilde;\x03≳\x0aeaterLess;\x03≷",
// Gscr;[𝒢].
"\x03cr;\x04𝒢",
// Gt;[≫].
"\x01;\x03≫",
// HARDcy;[Ъ].
"\x05RDcy;\x02Ъ",
// Hacek;[ˇ] Hat;[^].
"\x04cek;\x02ˇ\x02t;\x01^",
// Hcirc;[Ĥ].
"\x04irc;\x02Ĥ",
// Hfr;[ℌ].
"\x02r;\x03ℌ",
// HilbertSpace;[ℋ].
"\x0blbertSpace;\x03ℋ",
// HorizontalLine;[─] Hopf;[ℍ].
"\x0drizontalLine;\x03─\x03pf;\x03ℍ",
// Hstrok;[Ħ] Hscr;[ℋ].
"\x05trok;\x02Ħ\x03cr;\x03ℋ",
// HumpDownHump;[≎] HumpEqual;[≏].
"\x0bmpDownHump;\x03≎\x08mpEqual;\x03≏",
// IEcy;[Е].
"\x03cy;\x02Е",
// IJlig;[IJ].
"\x04lig;\x02IJ",
// IOcy;[Ё].
"\x03cy;\x02Ё",
// Iacute;[Í] Iacute[Í].
"\x05cute;\x02Í\x04cute\x02Í",
// Icirc;[Î] Icirc[Î] Icy;[И].
"\x04irc;\x02Î\x03irc\x02Î\x02y;\x02И",
// Idot;[İ].
"\x03ot;\x02İ",
// Ifr;[ℑ].
"\x02r;\x03ℑ",
// Igrave;[Ì] Igrave[Ì].
"\x05rave;\x02Ì\x04rave\x02Ì",
// ImaginaryI;[ⅈ] Implies;[⇒] Imacr;[Ī] Im;[ℑ].
"\x09aginaryI;\x03ⅈ\x06plies;\x03⇒\x04acr;\x02Ī\x01;\x03ℑ",
// InvisibleComma;[] InvisibleTimes;[] Intersection;[⋂] Integral;[∫] Int;[∬].
"\x0dvisibleComma;\x03\x0dvisibleTimes;\x03\x0btersection;\x03⋂\x07tegral;\x03∫\x02t;\x03∬",
// Iogon;[Į] Iopf;[𝕀] Iota;[Ι].
"\x04gon;\x02Į\x03pf;\x04𝕀\x03ta;\x02Ι",
// Iscr;[ℐ].
"\x03cr;\x03ℐ",
// Itilde;[Ĩ].
"\x05ilde;\x02Ĩ",
// Iukcy;[І] Iuml;[Ï] Iuml[Ï].
"\x04kcy;\x02І\x03ml;\x02Ï\x02ml\x02Ï",
// Jcirc;[Ĵ] Jcy;[Й].
"\x04irc;\x02Ĵ\x02y;\x02Й",
// Jfr;[𝔍].
"\x02r;\x04𝔍",
// Jopf;[𝕁].
"\x03pf;\x04𝕁",
// Jsercy;[Ј] Jscr;[𝒥].
"\x05ercy;\x02Ј\x03cr;\x04𝒥",
// Jukcy;[Є].
"\x04kcy;\x02Є",
// KHcy;[Х].
"\x03cy;\x02Х",
// KJcy;[Ќ].
"\x03cy;\x02Ќ",
// Kappa;[Κ].
"\x04ppa;\x02Κ",
// Kcedil;[Ķ] Kcy;[К].
"\x05edil;\x02Ķ\x02y;\x02К",
// Kfr;[𝔎].
"\x02r;\x04𝔎",
// Kopf;[𝕂].
"\x03pf;\x04𝕂",
// Kscr;[𝒦].
"\x03cr;\x04𝒦",
// LJcy;[Љ].
"\x03cy;\x02Љ",
// LT;[<].
"\x01;\x01<",
// Laplacetrf;[ℒ] Lacute;[Ĺ] Lambda;[Λ] Lang;[⟪] Larr;[↞].
"\x09placetrf;\x03ℒ\x05cute;\x02Ĺ\x05mbda;\x02Λ\x03ng;\x03⟪\x03rr;\x03↞",
// Lcaron;[Ľ] Lcedil;[Ļ] Lcy;[Л].
"\x05aron;\x02Ľ\x05edil;\x02Ļ\x02y;\x02Л",
// LeftArrowRightArrow;[⇆] LeftDoubleBracket;[⟦] LeftDownTeeVector;[⥡] LeftDownVectorBar;[⥙] LeftTriangleEqual;[⊴] LeftAngleBracket;[⟨] LeftUpDownVector;[⥑] LessEqualGreater;[⋚] LeftRightVector;[⥎] LeftTriangleBar;[⧏] LeftUpTeeVector;[⥠] LeftUpVectorBar;[⥘] LeftDownVector;[⇃] LeftRightArrow;[↔] Leftrightarrow;[⇔] LessSlantEqual;[⩽] LeftTeeVector;[⥚] LeftVectorBar;[⥒] LessFullEqual;[≦] LeftArrowBar;[⇤] LeftTeeArrow;[↤] LeftTriangle;[⊲] LeftUpVector;[↿] LeftCeiling;[⌈] LessGreater;[≶] LeftVector;[↼] LeftArrow;[←] LeftFloor;[⌊] Leftarrow;[⇐] LessTilde;[≲] LessLess;[⪡] LeftTee;[⊣].
"\x12ftArrowRightArrow;\x03⇆\x10ftDoubleBracket;\x03⟦\x10ftDownTeeVector;\x03⥡\x10ftDownVectorBar;\x03⥙\x10ftTriangleEqual;\x03⊴\x0fftAngleBracket;\x03⟨\x0fftUpDownVector;\x03⥑\x0fssEqualGreater;\x03⋚\x0eftRightVector;\x03⥎\x0eftTriangleBar;\x03⧏\x0eftUpTeeVector;\x03⥠\x0eftUpVectorBar;\x03⥘\x0dftDownVector;\x03⇃\x0dftRightArrow;\x03↔\x0dftrightarrow;\x03⇔\x0dssSlantEqual;\x03⩽\x0cftTeeVector;\x03⥚\x0cftVectorBar;\x03⥒\x0cssFullEqual;\x03≦\x0bftArrowBar;\x03⇤\x0bftTeeArrow;\x03↤\x0bftTriangle;\x03⊲\x0bftUpVector;\x03↿\x0aftCeiling;\x03⌈\x0assGreater;\x03≶\x09ftVector;\x03↼\x08ftArrow;\x03←\x08ftFloor;\x03⌊\x08ftarrow;\x03⇐\x08ssTilde;\x03≲\x07ssLess;\x03⪡\x06ftTee;\x03⊣",
// Lfr;[𝔏].
"\x02r;\x04𝔏",
// Lleftarrow;[⇚] Ll;[⋘].
"\x09eftarrow;\x03⇚\x01;\x03⋘",
// Lmidot;[Ŀ].
"\x05idot;\x02Ŀ",
// LongLeftRightArrow;[⟷] Longleftrightarrow;[⟺] LowerRightArrow;[↘] LongRightArrow;[⟶] Longrightarrow;[⟹] LowerLeftArrow;[↙] LongLeftArrow;[⟵] Longleftarrow;[⟸] Lopf;[𝕃].
"\x11ngLeftRightArrow;\x03⟷\x11ngleftrightarrow;\x03⟺\x0ewerRightArrow;\x03↘\x0dngRightArrow;\x03⟶\x0dngrightarrow;\x03⟹\x0dwerLeftArrow;\x03↙\x0cngLeftArrow;\x03⟵\x0cngleftarrow;\x03⟸\x03pf;\x04𝕃",
// Lstrok;[Ł] Lscr;[ℒ] Lsh;[↰].
"\x05trok;\x02Ł\x03cr;\x03ℒ\x02h;\x03↰",
// Lt;[≪].
"\x01;\x03≪",
// Map;[⤅].
"\x02p;\x03⤅",
// Mcy;[М].
"\x02y;\x02М",
// MediumSpace;[ ] Mellintrf;[ℳ].
"\x0adiumSpace;\x03 \x08llintrf;\x03ℳ",
// Mfr;[𝔐].
"\x02r;\x04𝔐",
// MinusPlus;[∓].
"\x08nusPlus;\x03∓",
// Mopf;[𝕄].
"\x03pf;\x04𝕄",
// Mscr;[ℳ].
"\x03cr;\x03ℳ",
// Mu;[Μ].
"\x01;\x02Μ",
// NJcy;[Њ].
"\x03cy;\x02Њ",
// Nacute;[Ń].
"\x05cute;\x02Ń",
// Ncaron;[Ň] Ncedil;[Ņ] Ncy;[Н].
"\x05aron;\x02Ň\x05edil;\x02Ņ\x02y;\x02Н",
// NegativeVeryThinSpace;[] NestedGreaterGreater;[≫] NegativeMediumSpace;[] NegativeThickSpace;[] NegativeThinSpace;[] NestedLessLess;[≪] NewLine;[\xa].
"\x14gativeVeryThinSpace;\x03\x13stedGreaterGreater;\x03≫\x12gativeMediumSpace;\x03\x11gativeThickSpace;\x03\x10gativeThinSpace;\x03\x0dstedLessLess;\x03≪\x06wLine;\x01\xa",
// Nfr;[𝔑].
"\x02r;\x04𝔑",
// NotNestedGreaterGreater;[⪢̸] NotSquareSupersetEqual;[⋣] NotPrecedesSlantEqual;[⋠] NotRightTriangleEqual;[⋭] NotSucceedsSlantEqual;[⋡] NotDoubleVerticalBar;[∦] NotGreaterSlantEqual;[⩾̸] NotLeftTriangleEqual;[⋬] NotSquareSubsetEqual;[⋢] NotGreaterFullEqual;[≧̸] NotRightTriangleBar;[⧐̸] NotLeftTriangleBar;[⧏̸] NotGreaterGreater;[≫̸] NotLessSlantEqual;[⩽̸] NotNestedLessLess;[⪡̸] NotReverseElement;[∌] NotSquareSuperset;[⊐̸] NotTildeFullEqual;[≇] NonBreakingSpace;[ ] NotPrecedesEqual;[⪯̸] NotRightTriangle;[⋫] NotSucceedsEqual;[⪰̸] NotSucceedsTilde;[≿̸] NotSupersetEqual;[⊉] NotGreaterEqual;[≱] NotGreaterTilde;[≵] NotHumpDownHump;[≎̸] NotLeftTriangle;[⋪] NotSquareSubset;[⊏̸] NotGreaterLess;[≹] NotLessGreater;[≸] NotSubsetEqual;[⊈] NotVerticalBar;[∤] NotEqualTilde;[≂̸] NotTildeEqual;[≄] NotTildeTilde;[≉] NotCongruent;[≢] NotHumpEqual;[≏̸] NotLessEqual;[≰] NotLessTilde;[≴] NotLessLess;[≪̸] NotPrecedes;[⊀] NotSucceeds;[⊁] NotSuperset;[⊃⃒] NotElement;[∉] NotGreater;[≯] NotCupCap;[≭] NotExists;[∄] NotSubset;[⊂⃒] NotEqual;[≠] NotTilde;[≁] NoBreak;[] NotLess;[≮] Nopf;[ℕ] Not;[⫬].
"\x16tNestedGreaterGreater;\x05⪢̸\x15tSquareSupersetEqual;\x03⋣\x14tPrecedesSlantEqual;\x03⋠\x14tRightTriangleEqual;\x03⋭\x14tSucceedsSlantEqual;\x03⋡\x13tDoubleVerticalBar;\x03∦\x13tGreaterSlantEqual;\x05⩾̸\x13tLeftTriangleEqual;\x03⋬\x13tSquareSubsetEqual;\x03⋢\x12tGreaterFullEqual;\x05≧̸\x12tRightTriangleBar;\x05⧐̸\x11tLeftTriangleBar;\x05⧏̸\x10tGreaterGreater;\x05≫̸\x10tLessSlantEqual;\x05⩽̸\x10tNestedLessLess;\x05⪡̸\x10tReverseElement;\x03∌\x10tSquareSuperset;\x05⊐̸\x10tTildeFullEqual;\x03≇\x0fnBreakingSpace;\x02 \x0ftPrecedesEqual;\x05⪯̸\x0ftRightTriangle;\x03⋫\x0ftSucceedsEqual;\x05⪰̸\x0ftSucceedsTilde;\x05≿̸\x0ftSupersetEqual;\x03⊉\x0etGreaterEqual;\x03≱\x0etGreaterTilde;\x03≵\x0etHumpDownHump;\x05≎̸\x0etLeftTriangle;\x03⋪\x0etSquareSubset;\x05⊏̸\x0dtGreaterLess;\x03≹\x0dtLessGreater;\x03≸\x0dtSubsetEqual;\x03⊈\x0dtVerticalBar;\x03∤\x0ctEqualTilde;\x05≂̸\x0ctTildeEqual;\x03≄\x0ctTildeTilde;\x03≉\x0btCongruent;\x03≢\x0btHumpEqual;\x05≏̸\x0btLessEqual;\x03≰\x0btLessTilde;\x03≴\x0atLessLess;\x05≪̸\x0atPrecedes;\x03⊀\x0atSucceeds;\x03⊁\x0atSuperset;\x06⊃⃒\x09tElement;\x03∉\x09tGreater;\x03≯\x08tCupCap;\x03≭\x08tExists;\x03∄\x08tSubset;\x06⊂⃒\x07tEqual;\x03≠\x07tTilde;\x03≁\x06Break;\x03\x06tLess;\x03≮\x03pf;\x03ℕ\x02t;\x03⫬",
// Nscr;[𝒩].
"\x03cr;\x04𝒩",
// Ntilde;[Ñ] Ntilde[Ñ].
"\x05ilde;\x02Ñ\x04ilde\x02Ñ",
// Nu;[Ν].
"\x01;\x02Ν",
// OElig;[Œ].
"\x04lig;\x02Œ",
// Oacute;[Ó] Oacute[Ó].
"\x05cute;\x02Ó\x04cute\x02Ó",
// Ocirc;[Ô] Ocirc[Ô] Ocy;[О].
"\x04irc;\x02Ô\x03irc\x02Ô\x02y;\x02О",
// Odblac;[Ő].
"\x05blac;\x02Ő",
// Ofr;[𝔒].
"\x02r;\x04𝔒",
// Ograve;[Ò] Ograve[Ò].
"\x05rave;\x02Ò\x04rave\x02Ò",
// Omicron;[Ο] Omacr;[Ō] Omega;[Ω].
"\x06icron;\x02Ο\x04acr;\x02Ō\x04ega;\x02Ω",
// Oopf;[𝕆].
"\x03pf;\x04𝕆",
// OpenCurlyDoubleQuote;[“] OpenCurlyQuote;[‘].
"\x13enCurlyDoubleQuote;\x03“\x0denCurlyQuote;\x03‘",
// Or;[⩔].
"\x01;\x03⩔",
// Oslash;[Ø] Oslash[Ø] Oscr;[𝒪].
"\x05lash;\x02Ø\x04lash\x02Ø\x03cr;\x04𝒪",
// Otilde;[Õ] Otimes;[⨷] Otilde[Õ].
"\x05ilde;\x02Õ\x05imes;\x03⨷\x04ilde\x02Õ",
// Ouml;[Ö] Ouml[Ö].
"\x03ml;\x02Ö\x02ml\x02Ö",
// OverParenthesis;[⏜] OverBracket;[⎴] OverBrace;[⏞] OverBar;[‾].
"\x0eerParenthesis;\x03⏜\x0aerBracket;\x03⎴\x08erBrace;\x03⏞\x06erBar;\x03‾",
// PartialD;[∂].
"\x07rtialD;\x03∂",
// Pcy;[П].
"\x02y;\x02П",
// Pfr;[𝔓].
"\x02r;\x04𝔓",
// Phi;[Φ].
"\x02i;\x02Φ",
// Pi;[Π].
"\x01;\x02Π",
// PlusMinus;[±].
"\x08usMinus;\x02±",
// Poincareplane;[ℌ] Popf;[ℙ].
"\x0cincareplane;\x03ℌ\x03pf;\x03ℙ",
// PrecedesSlantEqual;[≼] PrecedesEqual;[⪯] PrecedesTilde;[≾] Proportional;[∝] Proportion;[∷] Precedes;[≺] Product;[∏] Prime;[″] Pr;[⪻].
"\x11ecedesSlantEqual;\x03≼\x0cecedesEqual;\x03⪯\x0cecedesTilde;\x03≾\x0boportional;\x03∝\x09oportion;\x03∷\x07ecedes;\x03≺\x06oduct;\x03∏\x04ime;\x03″\x01;\x03⪻",
// Pscr;[𝒫] Psi;[Ψ].
"\x03cr;\x04𝒫\x02i;\x02Ψ",
// QUOT;[\"] QUOT[\"].
"\x03OT;\x01\"\x02OT\x01\"",
// Qfr;[𝔔].
"\x02r;\x04𝔔",
// Qopf;[ℚ].
"\x03pf;\x03ℚ",
// Qscr;[𝒬].
"\x03cr;\x04𝒬",
// RBarr;[⤐].
"\x04arr;\x03⤐",
// REG;[®] REG[®].
"\x02G;\x02®\x01G\x02®",
// Racute;[Ŕ] Rarrtl;[⤖] Rang;[⟫] Rarr;[↠].
"\x05cute;\x02Ŕ\x05rrtl;\x03⤖\x03ng;\x03⟫\x03rr;\x03↠",
// Rcaron;[Ř] Rcedil;[Ŗ] Rcy;[Р].
"\x05aron;\x02Ř\x05edil;\x02Ŗ\x02y;\x02Р",
// ReverseUpEquilibrium;[⥯] ReverseEquilibrium;[⇋] ReverseElement;[∋] Re;[ℜ].
"\x13verseUpEquilibrium;\x03⥯\x11verseEquilibrium;\x03⇋\x0dverseElement;\x03∋\x01;\x03ℜ",
// Rfr;[ℜ].
"\x02r;\x03ℜ",
// Rho;[Ρ].
"\x02o;\x02Ρ",
// RightArrowLeftArrow;[⇄] RightDoubleBracket;[⟧] RightDownTeeVector;[⥝] RightDownVectorBar;[⥕] RightTriangleEqual;[⊵] RightAngleBracket;[⟩] RightUpDownVector;[⥏] RightTriangleBar;[⧐] RightUpTeeVector;[⥜] RightUpVectorBar;[⥔] RightDownVector;[⇂] RightTeeVector;[⥛] RightVectorBar;[⥓] RightArrowBar;[⇥] RightTeeArrow;[↦] RightTriangle;[⊳] RightUpVector;[↾] RightCeiling;[⌉] RightVector;[⇀] RightArrow;[→] RightFloor;[⌋] Rightarrow;[⇒] RightTee;[⊢].
"\x12ghtArrowLeftArrow;\x03⇄\x11ghtDoubleBracket;\x03⟧\x11ghtDownTeeVector;\x03⥝\x11ghtDownVectorBar;\x03⥕\x11ghtTriangleEqual;\x03⊵\x10ghtAngleBracket;\x03⟩\x10ghtUpDownVector;\x03⥏\x0fghtTriangleBar;\x03⧐\x0fghtUpTeeVector;\x03⥜\x0fghtUpVectorBar;\x03⥔\x0eghtDownVector;\x03⇂\x0dghtTeeVector;\x03⥛\x0dghtVectorBar;\x03⥓\x0cghtArrowBar;\x03⇥\x0cghtTeeArrow;\x03↦\x0cghtTriangle;\x03⊳\x0cghtUpVector;\x03↾\x0bghtCeiling;\x03⌉\x0aghtVector;\x03⇀\x09ghtArrow;\x03→\x09ghtFloor;\x03⌋\x09ghtarrow;\x03⇒\x07ghtTee;\x03⊢",
// RoundImplies;[⥰] Ropf;[ℝ].
"\x0bundImplies;\x03⥰\x03pf;\x03ℝ",
// Rrightarrow;[⇛].
"\x0aightarrow;\x03⇛",
// Rscr;[ℛ] Rsh;[↱].
"\x03cr;\x03ℛ\x02h;\x03↱",
// RuleDelayed;[⧴].
"\x0aleDelayed;\x03⧴",
// SHCHcy;[Щ] SHcy;[Ш].
"\x05CHcy;\x02Щ\x03cy;\x02Ш",
// SOFTcy;[Ь].
"\x05FTcy;\x02Ь",
// Sacute;[Ś].
"\x05cute;\x02Ś",
// Scaron;[Š] Scedil;[Ş] Scirc;[Ŝ] Scy;[С] Sc;[⪼].
"\x05aron;\x02Š\x05edil;\x02Ş\x04irc;\x02Ŝ\x02y;\x02С\x01;\x03⪼",
// Sfr;[𝔖].
"\x02r;\x04𝔖",
// ShortRightArrow;[→] ShortDownArrow;[↓] ShortLeftArrow;[←] ShortUpArrow;[↑].
"\x0eortRightArrow;\x03→\x0dortDownArrow;\x03↓\x0dortLeftArrow;\x03←\x0bortUpArrow;\x03↑",
// Sigma;[Σ].
"\x04gma;\x02Σ",
// SmallCircle;[∘].
"\x0aallCircle;\x03∘",
// Sopf;[𝕊].
"\x03pf;\x04𝕊",
// SquareSupersetEqual;[⊒] SquareIntersection;[⊓] SquareSubsetEqual;[⊑] SquareSuperset;[⊐] SquareSubset;[⊏] SquareUnion;[⊔] Square;[□] Sqrt;[√].
"\x12uareSupersetEqual;\x03⊒\x11uareIntersection;\x03⊓\x10uareSubsetEqual;\x03⊑\x0duareSuperset;\x03⊐\x0buareSubset;\x03⊏\x0auareUnion;\x03⊔\x05uare;\x03□\x03rt;\x03√",
// Sscr;[𝒮].
"\x03cr;\x04𝒮",
// Star;[⋆].
"\x03ar;\x03⋆",
// SucceedsSlantEqual;[≽] SucceedsEqual;[⪰] SucceedsTilde;[≿] SupersetEqual;[⊇] SubsetEqual;[⊆] Succeeds;[≻] SuchThat;[∋] Superset;[⊃] Subset;[⋐] Supset;[⋑] Sub;[⋐] Sum;[∑] Sup;[⋑].
"\x11cceedsSlantEqual;\x03≽\x0ccceedsEqual;\x03⪰\x0ccceedsTilde;\x03≿\x0cpersetEqual;\x03⊇\x0absetEqual;\x03⊆\x07cceeds;\x03≻\x07chThat;\x03∋\x07perset;\x03⊃\x05bset;\x03⋐\x05pset;\x03⋑\x02b;\x03⋐\x02m;\x03∑\x02p;\x03⋑",
// THORN;[Þ] THORN[Þ].
"\x04ORN;\x02Þ\x03ORN\x02Þ",
// TRADE;[™].
"\x04ADE;\x03™",
// TSHcy;[Ћ] TScy;[Ц].
"\x04Hcy;\x02Ћ\x03cy;\x02Ц",
// Tab;[\x9] Tau;[Τ].
"\x02b;\x01\x9\x02u;\x02Τ",
// Tcaron;[Ť] Tcedil;[Ţ] Tcy;[Т].
"\x05aron;\x02Ť\x05edil;\x02Ţ\x02y;\x02Т",
// Tfr;[𝔗].
"\x02r;\x04𝔗",
// ThickSpace;[ ] Therefore;[∴] ThinSpace;[ ] Theta;[Θ].
"\x09ickSpace;\x06 \x08erefore;\x03∴\x08inSpace;\x03 \x04eta;\x02Θ",
// TildeFullEqual;[≅] TildeEqual;[≃] TildeTilde;[≈] Tilde;[∼].
"\x0dldeFullEqual;\x03≅\x09ldeEqual;\x03≃\x09ldeTilde;\x03≈\x04lde;\x03∼",
// Topf;[𝕋].
"\x03pf;\x04𝕋",
// TripleDot;[⃛].
"\x08ipleDot;\x03⃛",
// Tstrok;[Ŧ] Tscr;[𝒯].
"\x05trok;\x02Ŧ\x03cr;\x04𝒯",
// Uarrocir;[⥉] Uacute;[Ú] Uacute[Ú] Uarr;[↟].
"\x07rrocir;\x03⥉\x05cute;\x02Ú\x04cute\x02Ú\x03rr;\x03↟",
// Ubreve;[Ŭ] Ubrcy;[Ў].
"\x05reve;\x02Ŭ\x04rcy;\x02Ў",
// Ucirc;[Û] Ucirc[Û] Ucy;[У].
"\x04irc;\x02Û\x03irc\x02Û\x02y;\x02У",
// Udblac;[Ű].
"\x05blac;\x02Ű",
// Ufr;[𝔘].
"\x02r;\x04𝔘",
// Ugrave;[Ù] Ugrave[Ù].
"\x05rave;\x02Ù\x04rave\x02Ù",
// Umacr;[Ū].
"\x04acr;\x02Ū",
// UnderParenthesis;[⏝] UnderBracket;[⎵] UnderBrace;[⏟] UnionPlus;[⊎] UnderBar;[_] Union;[⋃].
"\x0fderParenthesis;\x03⏝\x0bderBracket;\x03⎵\x09derBrace;\x03⏟\x08ionPlus;\x03⊎\x07derBar;\x01_\x04ion;\x03⋃",
// Uogon;[Ų] Uopf;[𝕌].
"\x04gon;\x02Ų\x03pf;\x04𝕌",
// UpArrowDownArrow;[⇅] UpperRightArrow;[↗] UpperLeftArrow;[↖] UpEquilibrium;[⥮] UpDownArrow;[↕] Updownarrow;[⇕] UpArrowBar;[⤒] UpTeeArrow;[↥] UpArrow;[↑] Uparrow;[⇑] Upsilon;[Υ] UpTee;[⊥] Upsi;[ϒ].
"\x0fArrowDownArrow;\x03⇅\x0eperRightArrow;\x03↗\x0dperLeftArrow;\x03↖\x0cEquilibrium;\x03⥮\x0aDownArrow;\x03↕\x0adownarrow;\x03⇕\x09ArrowBar;\x03⤒\x09TeeArrow;\x03↥\x06Arrow;\x03↑\x06arrow;\x03⇑\x06silon;\x02Υ\x04Tee;\x03⊥\x03si;\x02ϒ",
// Uring;[Ů].
"\x04ing;\x02Ů",
// Uscr;[𝒰].
"\x03cr;\x04𝒰",
// Utilde;[Ũ].
"\x05ilde;\x02Ũ",
// Uuml;[Ü] Uuml[Ü].
"\x03ml;\x02Ü\x02ml\x02Ü",
// VDash;[⊫].
"\x04ash;\x03⊫",
// Vbar;[⫫].
"\x03ar;\x03⫫",
// Vcy;[В].
"\x02y;\x02В",
// Vdashl;[⫦] Vdash;[⊩].
"\x05ashl;\x03⫦\x04ash;\x03⊩",
// VerticalSeparator;[❘] VerticalTilde;[≀] VeryThinSpace;[ ] VerticalLine;[|] VerticalBar;[∣] Verbar;[‖] Vert;[‖] Vee;[⋁].
"\x10rticalSeparator;\x03❘\x0crticalTilde;\x03≀\x0cryThinSpace;\x03 \x0brticalLine;\x01|\x0articalBar;\x03∣\x05rbar;\x03‖\x03rt;\x03‖\x02e;\x03⋁",
// Vfr;[𝔙].
"\x02r;\x04𝔙",
// Vopf;[𝕍].
"\x03pf;\x04𝕍",
// Vscr;[𝒱].
"\x03cr;\x04𝒱",
// Vvdash;[⊪].
"\x05dash;\x03⊪",
// Wcirc;[Ŵ].
"\x04irc;\x02Ŵ",
// Wedge;[⋀].
"\x04dge;\x03⋀",
// Wfr;[𝔚].
"\x02r;\x04𝔚",
// Wopf;[𝕎].
"\x03pf;\x04𝕎",
// Wscr;[𝒲].
"\x03cr;\x04𝒲",
// Xfr;[𝔛].
"\x02r;\x04𝔛",
// Xi;[Ξ].
"\x01;\x02Ξ",
// Xopf;[𝕏].
"\x03pf;\x04𝕏",
// Xscr;[𝒳].
"\x03cr;\x04𝒳",
// YAcy;[Я].
"\x03cy;\x02Я",
// YIcy;[Ї].
"\x03cy;\x02Ї",
// YUcy;[Ю].
"\x03cy;\x02Ю",
// Yacute;[Ý] Yacute[Ý].
"\x05cute;\x02Ý\x04cute\x02Ý",
// Ycirc;[Ŷ] Ycy;[Ы].
"\x04irc;\x02Ŷ\x02y;\x02Ы",
// Yfr;[𝔜].
"\x02r;\x04𝔜",
// Yopf;[𝕐].
"\x03pf;\x04𝕐",
// Yscr;[𝒴].
"\x03cr;\x04𝒴",
// Yuml;[Ÿ].
"\x03ml;\x02Ÿ",
// ZHcy;[Ж].
"\x03cy;\x02Ж",
// Zacute;[Ź].
"\x05cute;\x02Ź",
// Zcaron;[Ž] Zcy;[З].
"\x05aron;\x02Ž\x02y;\x02З",
// Zdot;[Ż].
"\x03ot;\x02Ż",
// ZeroWidthSpace;[] Zeta;[Ζ].
"\x0droWidthSpace;\x03\x03ta;\x02Ζ",
// Zfr;[ℨ].
"\x02r;\x03ℨ",
// Zopf;[ℤ].
"\x03pf;\x03ℤ",
// Zscr;[𝒵].
"\x03cr;\x04𝒵",
// aacute;[á] aacute[á].
"\x05cute;\x02á\x04cute\x02á",
// abreve;[ă].
"\x05reve;\x02ă",
// acirc;[â] acute;[´] acirc[â] acute[´] acE;[∾̳] acd;[∿] acy;[а] ac;[∾].
"\x04irc;\x02â\x04ute;\x02´\x03irc\x02â\x03ute\x02´\x02E;\x05∾̳\x02d;\x03∿\x02y;\x02а\x01;\x03∾",
// aelig;[æ] aelig[æ].
"\x04lig;\x02æ\x03lig\x02æ",
// afr;[𝔞] af;[].
"\x02r;\x04𝔞\x01;\x03",
// agrave;[à] agrave[à].
"\x05rave;\x02à\x04rave\x02à",
// alefsym;[ℵ] aleph;[ℵ] alpha;[α].
"\x06efsym;\x03ℵ\x04eph;\x03ℵ\x04pha;\x02α",
// amacr;[ā] amalg;[⨿] amp;[&] amp[&].
"\x04acr;\x02ā\x04alg;\x03⨿\x02p;\x01&\x01p\x01&",
// andslope;[⩘] angmsdaa;[⦨] angmsdab;[⦩] angmsdac;[⦪] angmsdad;[⦫] angmsdae;[⦬] angmsdaf;[⦭] angmsdag;[⦮] angmsdah;[⦯] angrtvbd;[⦝] angrtvb;[⊾] angzarr;[⍼] andand;[⩕] angmsd;[∡] angsph;[∢] angle;[∠] angrt;[∟] angst;[Å] andd;[⩜] andv;[⩚] ange;[⦤] and;[∧] ang;[∠].
"\x07dslope;\x03⩘\x07gmsdaa;\x03⦨\x07gmsdab;\x03⦩\x07gmsdac;\x03⦪\x07gmsdad;\x03⦫\x07gmsdae;\x03⦬\x07gmsdaf;\x03⦭\x07gmsdag;\x03⦮\x07gmsdah;\x03⦯\x07grtvbd;\x03⦝\x06grtvb;\x03⊾\x06gzarr;\x03⍼\x05dand;\x03⩕\x05gmsd;\x03∡\x05gsph;\x03∢\x04gle;\x03∠\x04grt;\x03∟\x04gst;\x02Å\x03dd;\x03⩜\x03dv;\x03⩚\x03ge;\x03⦤\x02d;\x03∧\x02g;\x03∠",
// aogon;[ą] aopf;[𝕒].
"\x04gon;\x02ą\x03pf;\x04𝕒",
// approxeq;[≊] apacir;[⩯] approx;[≈] apid;[≋] apos;['] apE;[⩰] ape;[≊] ap;[≈].
"\x07proxeq;\x03≊\x05acir;\x03⩯\x05prox;\x03≈\x03id;\x03≋\x03os;\x01'\x02E;\x03⩰\x02e;\x03≊\x01;\x03≈",
// aring;[å] aring[å].
"\x04ing;\x02å\x03ing\x02å",
// asympeq;[≍] asymp;[≈] ascr;[𝒶] ast;[*].
"\x06ympeq;\x03≍\x04ymp;\x03≈\x03cr;\x04𝒶\x02t;\x01*",
// atilde;[ã] atilde[ã].
"\x05ilde;\x02ã\x04ilde\x02ã",
// auml;[ä] auml[ä].
"\x03ml;\x02ä\x02ml\x02ä",
// awconint;[∳] awint;[⨑].
"\x07conint;\x03∳\x04int;\x03⨑",
// bNot;[⫭].
"\x03ot;\x03⫭",
// backepsilon;[϶] backprime;[‵] backsimeq;[⋍] backcong;[≌] barwedge;[⌅] backsim;[∽] barvee;[⊽] barwed;[⌅].
"\x0ackepsilon;\x02϶\x08ckprime;\x03‵\x08cksimeq;\x03⋍\x07ckcong;\x03≌\x07rwedge;\x03⌅\x06cksim;\x03∽\x05rvee;\x03⊽\x05rwed;\x03⌅",
// bbrktbrk;[⎶] bbrk;[⎵].
"\x07rktbrk;\x03⎶\x03rk;\x03⎵",
// bcong;[≌] bcy;[б].
"\x04ong;\x03≌\x02y;\x02б",
// bdquo;[„].
"\x04quo;\x03„",
// because;[∵] bemptyv;[⦰] between;[≬] becaus;[∵] bernou;[ℬ] bepsi;[϶] beta;[β] beth;[ℶ].
"\x06cause;\x03∵\x06mptyv;\x03⦰\x06tween;\x03≬\x05caus;\x03∵\x05rnou;\x03ℬ\x04psi;\x02϶\x03ta;\x02β\x03th;\x03ℶ",
// bfr;[𝔟].
"\x02r;\x04𝔟",
// bigtriangledown;[▽] bigtriangleup;[△] bigotimes;[⨂] bigoplus;[⨁] bigsqcup;[⨆] biguplus;[⨄] bigwedge;[⋀] bigcirc;[◯] bigodot;[⨀] bigstar;[★] bigcap;[⋂] bigcup;[⋃] bigvee;[⋁].
"\x0egtriangledown;\x03▽\x0cgtriangleup;\x03△\x08gotimes;\x03⨂\x07goplus;\x03⨁\x07gsqcup;\x03⨆\x07guplus;\x03⨄\x07gwedge;\x03⋀\x06gcirc;\x03◯\x06godot;\x03⨀\x06gstar;\x03★\x05gcap;\x03⋂\x05gcup;\x03⋃\x05gvee;\x03⋁",
// bkarow;[⤍].
"\x05arow;\x03⤍",
// blacktriangleright;[▸] blacktriangledown;[▾] blacktriangleleft;[◂] blacktriangle;[▴] blacklozenge;[⧫] blacksquare;[▪] blank;[␣] blk12;[▒] blk14;[░] blk34;[▓] block;[█].
"\x11acktriangleright;\x03▸\x10acktriangledown;\x03▾\x10acktriangleleft;\x03◂\x0cacktriangle;\x03▴\x0backlozenge;\x03⧫\x0aacksquare;\x03▪\x04ank;\x03␣\x04k12;\x03▒\x04k14;\x03░\x04k34;\x03▓\x04ock;\x03█",
// bnequiv;[≡⃥] bnot;[⌐] bne;[=⃥].
"\x06equiv;\x06≡⃥\x03ot;\x03⌐\x02e;\x04=⃥",
// boxminus;[⊟] boxtimes;[⊠] boxplus;[⊞] bottom;[⊥] bowtie;[⋈] boxbox;[⧉] boxDL;[╗] boxDR;[╔] boxDl;[╖] boxDr;[╓] boxHD;[╦] boxHU;[╩] boxHd;[╤] boxHu;[╧] boxUL;[╝] boxUR;[╚] boxUl;[╜] boxUr;[╙] boxVH;[╬] boxVL;[╣] boxVR;[╠] boxVh;[╫] boxVl;[╢] boxVr;[╟] boxdL;[╕] boxdR;[╒] boxdl;[┐] boxdr;[┌] boxhD;[╥] boxhU;[╨] boxhd;[┬] boxhu;[┴] boxuL;[╛] boxuR;[╘] boxul;[┘] boxur;[└] boxvH;[╪] boxvL;[╡] boxvR;[╞] boxvh;[┼] boxvl;[┤] boxvr;[├] bopf;[𝕓] boxH;[═] boxV;[║] boxh;[─] boxv;[│] bot;[⊥].
"\x07xminus;\x03⊟\x07xtimes;\x03⊠\x06xplus;\x03⊞\x05ttom;\x03⊥\x05wtie;\x03⋈\x05xbox;\x03⧉\x04xDL;\x03╗\x04xDR;\x03╔\x04xDl;\x03╖\x04xDr;\x03╓\x04xHD;\x03╦\x04xHU;\x03╩\x04xHd;\x03╤\x04xHu;\x03╧\x04xUL;\x03╝\x04xUR;\x03╚\x04xUl;\x03╜\x04xUr;\x03╙\x04xVH;\x03╬\x04xVL;\x03╣\x04xVR;\x03╠\x04xVh;\x03╫\x04xVl;\x03╢\x04xVr;\x03╟\x04xdL;\x03╕\x04xdR;\x03╒\x04xdl;\x03┐\x04xdr;\x03┌\x04xhD;\x03╥\x04xhU;\x03╨\x04xhd;\x03┬\x04xhu;\x03┴\x04xuL;\x03╛\x04xuR;\x03╘\x04xul;\x03┘\x04xur;\x03└\x04xvH;\x03╪\x04xvL;\x03╡\x04xvR;\x03╞\x04xvh;\x03┼\x04xvl;\x03┤\x04xvr;\x03├\x03pf;\x04𝕓\x03xH;\x03═\x03xV;\x03║\x03xh;\x03─\x03xv;\x03│\x02t;\x03⊥",
// bprime;[‵].
"\x05rime;\x03‵",
// brvbar;[¦] breve;[˘] brvbar[¦].
"\x05vbar;\x02¦\x04eve;\x02˘\x04vbar\x02¦",
// bsolhsub;[⟈] bsemi;[⁏] bsime;[⋍] bsolb;[⧅] bscr;[𝒷] bsim;[∽] bsol;[\\].
"\x07olhsub;\x03⟈\x04emi;\x03⁏\x04ime;\x03⋍\x04olb;\x03⧅\x03cr;\x04𝒷\x03im;\x03∽\x03ol;\x01\\",
// bullet;[•] bumpeq;[≏] bumpE;[⪮] bumpe;[≏] bull;[•] bump;[≎].
"\x05llet;\x03•\x05mpeq;\x03≏\x04mpE;\x03⪮\x04mpe;\x03≏\x03ll;\x03•\x03mp;\x03≎",
// capbrcup;[⩉] cacute;[ć] capand;[⩄] capcap;[⩋] capcup;[⩇] capdot;[⩀] caret;[⁁] caron;[ˇ] caps;[∩︀] cap;[∩].
"\x07pbrcup;\x03⩉\x05cute;\x02ć\x05pand;\x03⩄\x05pcap;\x03⩋\x05pcup;\x03⩇\x05pdot;\x03⩀\x04ret;\x03⁁\x04ron;\x02ˇ\x03ps;\x06∩︀\x02p;\x03∩",
// ccupssm;[⩐] ccaron;[č] ccedil;[ç] ccaps;[⩍] ccedil[ç] ccirc;[ĉ] ccups;[⩌].
"\x06upssm;\x03⩐\x05aron;\x02č\x05edil;\x02ç\x04aps;\x03⩍\x04edil\x02ç\x04irc;\x02ĉ\x04ups;\x03⩌",
// cdot;[ċ].
"\x03ot;\x02ċ",
// centerdot;[·] cemptyv;[⦲] cedil;[¸] cedil[¸] cent;[¢] cent[¢].
"\x08nterdot;\x02·\x06mptyv;\x03⦲\x04dil;\x02¸\x03dil\x02¸\x03nt;\x02¢\x02nt\x02¢",
// cfr;[𝔠].
"\x02r;\x04𝔠",
// checkmark;[✓] check;[✓] chcy;[ч] chi;[χ].
"\x08eckmark;\x03✓\x04eck;\x03✓\x03cy;\x02ч\x02i;\x02χ",
// circlearrowright;[↻] circlearrowleft;[↺] circledcirc;[⊚] circleddash;[⊝] circledast;[⊛] circledR;[®] circledS;[Ⓢ] cirfnint;[⨐] cirscir;[⧂] circeq;[≗] cirmid;[⫯] cirE;[⧃] circ;[ˆ] cire;[≗] cir;[○].
"\x0frclearrowright;\x03↻\x0erclearrowleft;\x03↺\x0arcledcirc;\x03⊚\x0arcleddash;\x03⊝\x09rcledast;\x03⊛\x07rcledR;\x02®\x07rcledS;\x03Ⓢ\x07rfnint;\x03⨐\x06rscir;\x03⧂\x05rceq;\x03≗\x05rmid;\x03⫯\x03rE;\x03⧃\x03rc;\x02ˆ\x03re;\x03≗\x02r;\x03○",
// clubsuit;[♣] clubs;[♣].
"\x07ubsuit;\x03♣\x04ubs;\x03♣",
// complement;[∁] complexes;[ℂ] coloneq;[≔] congdot;[⩭] colone;[≔] commat;[@] compfn;[∘] conint;[∮] coprod;[∐] copysr;[℗] colon;[:] comma;[,] comp;[∁] cong;[≅] copf;[𝕔] copy;[©] copy[©].
"\x09mplement;\x03∁\x08mplexes;\x03ℂ\x06loneq;\x03≔\x06ngdot;\x03⩭\x05lone;\x03≔\x05mmat;\x01@\x05mpfn;\x03∘\x05nint;\x03∮\x05prod;\x03∐\x05pysr;\x03℗\x04lon;\x01:\x04mma;\x01,\x03mp;\x03∁\x03ng;\x03≅\x03pf;\x04𝕔\x03py;\x02©\x02py\x02©",
// crarr;[↵] cross;[✗].
"\x04arr;\x03↵\x04oss;\x03✗",
// csube;[⫑] csupe;[⫒] cscr;[𝒸] csub;[⫏] csup;[⫐].
"\x04ube;\x03⫑\x04upe;\x03⫒\x03cr;\x04𝒸\x03ub;\x03⫏\x03up;\x03⫐",
// ctdot;[⋯].
"\x04dot;\x03⋯",
// curvearrowright;[↷] curvearrowleft;[↶] curlyeqprec;[⋞] curlyeqsucc;[⋟] curlywedge;[⋏] cupbrcap;[⩈] curlyvee;[⋎] cudarrl;[⤸] cudarrr;[⤵] cularrp;[⤽] curarrm;[⤼] cularr;[↶] cupcap;[⩆] cupcup;[⩊] cupdot;[⊍] curarr;[↷] curren;[¤] cuepr;[⋞] cuesc;[⋟] cupor;[⩅] curren[¤] cuvee;[⋎] cuwed;[⋏] cups;[∪︀] cup;[∪].
"\x0ervearrowright;\x03↷\x0drvearrowleft;\x03↶\x0arlyeqprec;\x03⋞\x0arlyeqsucc;\x03⋟\x09rlywedge;\x03⋏\x07pbrcap;\x03⩈\x07rlyvee;\x03⋎\x06darrl;\x03⤸\x06darrr;\x03⤵\x06larrp;\x03⤽\x06rarrm;\x03⤼\x05larr;\x03↶\x05pcap;\x03⩆\x05pcup;\x03⩊\x05pdot;\x03⊍\x05rarr;\x03↷\x05rren;\x02¤\x04epr;\x03⋞\x04esc;\x03⋟\x04por;\x03⩅\x04rren\x02¤\x04vee;\x03⋎\x04wed;\x03⋏\x03ps;\x06∪︀\x02p;\x03∪",
// cwconint;[∲] cwint;[∱].
"\x07conint;\x03∲\x04int;\x03∱",
// cylcty;[⌭].
"\x05lcty;\x03⌭",
// dArr;[⇓].
"\x03rr;\x03⇓",
// dHar;[⥥].
"\x03ar;\x03⥥",
// dagger;[†] daleth;[ℸ] dashv;[⊣] darr;[↓] dash;[‐].
"\x05gger;\x03†\x05leth;\x03ℸ\x04shv;\x03⊣\x03rr;\x03↓\x03sh;\x03‐",
// dbkarow;[⤏] dblac;[˝].
"\x06karow;\x03⤏\x04lac;\x02˝",
// dcaron;[ď] dcy;[д].
"\x05aron;\x02ď\x02y;\x02д",
// ddagger;[‡] ddotseq;[⩷] ddarr;[⇊] dd;[ⅆ].
"\x06agger;\x03‡\x06otseq;\x03⩷\x04arr;\x03⇊\x01;\x03ⅆ",
// demptyv;[⦱] delta;[δ] deg;[°] deg[°].
"\x06mptyv;\x03⦱\x04lta;\x02δ\x02g;\x02°\x01g\x02°",
// dfisht;[⥿] dfr;[𝔡].
"\x05isht;\x03⥿\x02r;\x04𝔡",
// dharl;[⇃] dharr;[⇂].
"\x04arl;\x03⇃\x04arr;\x03⇂",
// divideontimes;[⋇] diamondsuit;[♦] diamond;[⋄] digamma;[ϝ] divide;[÷] divonx;[⋇] diams;[♦] disin;[⋲] divide[÷] diam;[⋄] die;[¨] div;[÷].
"\x0cvideontimes;\x03⋇\x0aamondsuit;\x03♦\x06amond;\x03⋄\x06gamma;\x02ϝ\x05vide;\x02÷\x05vonx;\x03⋇\x04ams;\x03♦\x04sin;\x03⋲\x04vide\x02÷\x03am;\x03⋄\x02e;\x02¨\x02v;\x02÷",
// djcy;[ђ].
"\x03cy;\x02ђ",
// dlcorn;[⌞] dlcrop;[⌍].
"\x05corn;\x03⌞\x05crop;\x03⌍",
// downharpoonright;[⇂] downharpoonleft;[⇃] doublebarwedge;[⌆] downdownarrows;[⇊] dotsquare;[⊡] downarrow;[↓] doteqdot;[≑] dotminus;[∸] dotplus;[∔] dollar;[$] doteq;[≐] dopf;[𝕕] dot;[˙].
"\x0fwnharpoonright;\x03⇂\x0ewnharpoonleft;\x03⇃\x0dublebarwedge;\x03⌆\x0dwndownarrows;\x03⇊\x08tsquare;\x03⊡\x08wnarrow;\x03↓\x07teqdot;\x03≑\x07tminus;\x03∸\x06tplus;\x03∔\x05llar;\x01$\x04teq;\x03≐\x03pf;\x04𝕕\x02t;\x02˙",
// drbkarow;[⤐] drcorn;[⌟] drcrop;[⌌].
"\x07bkarow;\x03⤐\x05corn;\x03⌟\x05crop;\x03⌌",
// dstrok;[đ] dscr;[𝒹] dscy;[ѕ] dsol;[⧶].
"\x05trok;\x02đ\x03cr;\x04𝒹\x03cy;\x02ѕ\x03ol;\x03⧶",
// dtdot;[⋱] dtrif;[▾] dtri;[▿].
"\x04dot;\x03⋱\x04rif;\x03▾\x03ri;\x03▿",
// duarr;[⇵] duhar;[⥯].
"\x04arr;\x03⇵\x04har;\x03⥯",
// dwangle;[⦦].
"\x06angle;\x03⦦",
// dzigrarr;[⟿] dzcy;[џ].
"\x07igrarr;\x03⟿\x03cy;\x02џ",
// eDDot;[⩷] eDot;[≑].
"\x04Dot;\x03⩷\x03ot;\x03≑",
// eacute;[é] easter;[⩮] eacute[é].
"\x05cute;\x02é\x05ster;\x03⩮\x04cute\x02é",
// ecaron;[ě] ecolon;[≕] ecirc;[ê] ecir;[≖] ecirc[ê] ecy;[э].
"\x05aron;\x02ě\x05olon;\x03≕\x04irc;\x02ê\x03ir;\x03≖\x03irc\x02ê\x02y;\x02э",
// edot;[ė].
"\x03ot;\x02ė",
// ee;[ⅇ].
"\x01;\x03ⅇ",
// efDot;[≒] efr;[𝔢].
"\x04Dot;\x03≒\x02r;\x04𝔢",
// egrave;[è] egsdot;[⪘] egrave[è] egs;[⪖] eg;[⪚].
"\x05rave;\x02è\x05sdot;\x03⪘\x04rave\x02è\x02s;\x03⪖\x01;\x03⪚",
// elinters;[⏧] elsdot;[⪗] ell;[ℓ] els;[⪕] el;[⪙].
"\x07inters;\x03⏧\x05sdot;\x03⪗\x02l;\x03ℓ\x02s;\x03⪕\x01;\x03⪙",
// emptyset;[∅] emptyv;[∅] emsp13;[ ] emsp14;[ ] emacr;[ē] empty;[∅] emsp;[ ].
"\x07ptyset;\x03∅\x05ptyv;\x03∅\x05sp13;\x03 \x05sp14;\x03 \x04acr;\x02ē\x04pty;\x03∅\x03sp;\x03 ",
// ensp;[ ] eng;[ŋ].
"\x03sp;\x03 \x02g;\x02ŋ",
// eogon;[ę] eopf;[𝕖].
"\x04gon;\x02ę\x03pf;\x04𝕖",
// epsilon;[ε] eparsl;[⧣] eplus;[⩱] epsiv;[ϵ] epar;[⋕] epsi;[ε].
"\x06silon;\x02ε\x05arsl;\x03⧣\x04lus;\x03⩱\x04siv;\x02ϵ\x03ar;\x03⋕\x03si;\x02ε",
// eqslantless;[⪕] eqslantgtr;[⪖] eqvparsl;[⧥] eqcolon;[≕] equivDD;[⩸] eqcirc;[≖] equals;[=] equest;[≟] eqsim;[≂] equiv;[≡].
"\x0aslantless;\x03⪕\x09slantgtr;\x03⪖\x07vparsl;\x03⧥\x06colon;\x03≕\x06uivDD;\x03⩸\x05circ;\x03≖\x05uals;\x01=\x05uest;\x03≟\x04sim;\x03≂\x04uiv;\x03≡",
// erDot;[≓] erarr;[⥱].
"\x04Dot;\x03≓\x04arr;\x03⥱",
// esdot;[≐] escr;[ℯ] esim;[≂].
"\x04dot;\x03≐\x03cr;\x03ℯ\x03im;\x03≂",
// eta;[η] eth;[ð] eth[ð].
"\x02a;\x02η\x02h;\x02ð\x01h\x02ð",
// euml;[ë] euro;[€] euml[ë].
"\x03ml;\x02ë\x03ro;\x03€\x02ml\x02ë",
// exponentiale;[ⅇ] expectation;[ℰ] exist;[∃] excl;[!].
"\x0bponentiale;\x03ⅇ\x0apectation;\x03ℰ\x04ist;\x03∃\x03cl;\x01!",
// fallingdotseq;[≒].
"\x0cllingdotseq;\x03≒",
// fcy;[ф].
"\x02y;\x02ф",
// female;[♀].
"\x05male;\x03♀",
// ffilig;[ffi] ffllig;[ffl] fflig;[ff] ffr;[𝔣].
"\x05ilig;\x03ffi\x05llig;\x03ffl\x04lig;\x03ff\x02r;\x04𝔣",
// filig;[fi].
"\x04lig;\x03fi",
// fjlig;[fj].
"\x04lig;\x02fj",
// fllig;[fl] fltns;[▱] flat;[♭].
"\x04lig;\x03fl\x04tns;\x03▱\x03at;\x03♭",
// fnof;[ƒ].
"\x03of;\x02ƒ",
// forall;[∀] forkv;[⫙] fopf;[𝕗] fork;[⋔].
"\x05rall;\x03∀\x04rkv;\x03⫙\x03pf;\x04𝕗\x03rk;\x03⋔",
// fpartint;[⨍].
"\x07artint;\x03⨍",
// frac12;[½] frac13;[⅓] frac14;[¼] frac15;[⅕] frac16;[⅙] frac18;[⅛] frac23;[⅔] frac25;[⅖] frac34;[¾] frac35;[⅗] frac38;[⅜] frac45;[⅘] frac56;[⅚] frac58;[⅝] frac78;[⅞] frac12[½] frac14[¼] frac34[¾] frasl;[⁄] frown;[⌢].
"\x05ac12;\x02½\x05ac13;\x03⅓\x05ac14;\x02¼\x05ac15;\x03⅕\x05ac16;\x03⅙\x05ac18;\x03⅛\x05ac23;\x03⅔\x05ac25;\x03⅖\x05ac34;\x02¾\x05ac35;\x03⅗\x05ac38;\x03⅜\x05ac45;\x03⅘\x05ac56;\x03⅚\x05ac58;\x03⅝\x05ac78;\x03⅞\x04ac12\x02½\x04ac14\x02¼\x04ac34\x02¾\x04asl;\x03⁄\x04own;\x03⌢",
// fscr;[𝒻].
"\x03cr;\x04𝒻",
// gEl;[⪌] gE;[≧].
"\x02l;\x03⪌\x01;\x03≧",
// gacute;[ǵ] gammad;[ϝ] gamma;[γ] gap;[⪆].
"\x05cute;\x02ǵ\x05mmad;\x02ϝ\x04mma;\x02γ\x02p;\x03⪆",
// gbreve;[ğ].
"\x05reve;\x02ğ",
// gcirc;[ĝ] gcy;[г].
"\x04irc;\x02ĝ\x02y;\x02г",
// gdot;[ġ].
"\x03ot;\x02ġ",
// geqslant;[⩾] gesdotol;[⪄] gesdoto;[⪂] gesdot;[⪀] gesles;[⪔] gescc;[⪩] geqq;[≧] gesl;[⋛︀] gel;[⋛] geq;[≥] ges;[⩾] ge;[≥].
"\x07qslant;\x03⩾\x07sdotol;\x03⪄\x06sdoto;\x03⪂\x05sdot;\x03⪀\x05sles;\x03⪔\x04scc;\x03⪩\x03qq;\x03≧\x03sl;\x06⋛︀\x02l;\x03⋛\x02q;\x03≥\x02s;\x03⩾\x01;\x03≥",
// gfr;[𝔤].
"\x02r;\x04𝔤",
// ggg;[⋙] gg;[≫].
"\x02g;\x03⋙\x01;\x03≫",
// gimel;[ℷ].
"\x04mel;\x03ℷ",
// gjcy;[ѓ].
"\x03cy;\x02ѓ",
// glE;[⪒] gla;[⪥] glj;[⪤] gl;[≷].
"\x02E;\x03⪒\x02a;\x03⪥\x02j;\x03⪤\x01;\x03≷",
// gnapprox;[⪊] gneqq;[≩] gnsim;[⋧] gnap;[⪊] gneq;[⪈] gnE;[≩] gne;[⪈].
"\x07approx;\x03⪊\x04eqq;\x03≩\x04sim;\x03⋧\x03ap;\x03⪊\x03eq;\x03⪈\x02E;\x03≩\x02e;\x03⪈",
// gopf;[𝕘].
"\x03pf;\x04𝕘",
// grave;[`].
"\x04ave;\x01`",
// gsime;[⪎] gsiml;[⪐] gscr;[ℊ] gsim;[≳].
"\x04ime;\x03⪎\x04iml;\x03⪐\x03cr;\x03ℊ\x03im;\x03≳",
// gtreqqless;[⪌] gtrapprox;[⪆] gtreqless;[⋛] gtquest;[⩼] gtrless;[≷] gtlPar;[⦕] gtrarr;[⥸] gtrdot;[⋗] gtrsim;[≳] gtcir;[⩺] gtdot;[⋗] gtcc;[⪧] gt;[>].
"\x09reqqless;\x03⪌\x08rapprox;\x03⪆\x08reqless;\x03⋛\x06quest;\x03⩼\x06rless;\x03≷\x05lPar;\x03⦕\x05rarr;\x03⥸\x05rdot;\x03⋗\x05rsim;\x03≳\x04cir;\x03⩺\x04dot;\x03⋗\x03cc;\x03⪧\x01;\x01>",
// gvertneqq;[≩︀] gvnE;[≩︀].
"\x08ertneqq;\x06≩︀\x03nE;\x06≩︀",
// hArr;[⇔].
"\x03rr;\x03⇔",
// harrcir;[⥈] hairsp;[ ] hamilt;[ℋ] hardcy;[ъ] harrw;[↭] half;[½] harr;[↔].
"\x06rrcir;\x03⥈\x05irsp;\x03 \x05milt;\x03ℋ\x05rdcy;\x02ъ\x04rrw;\x03↭\x03lf;\x02½\x03rr;\x03↔",
// hbar;[ℏ].
"\x03ar;\x03ℏ",
// hcirc;[ĥ].
"\x04irc;\x02ĥ",
// heartsuit;[♥] hearts;[♥] hellip;[…] hercon;[⊹].
"\x08artsuit;\x03♥\x05arts;\x03♥\x05llip;\x03…\x05rcon;\x03⊹",
// hfr;[𝔥].
"\x02r;\x04𝔥",
// hksearow;[⤥] hkswarow;[⤦].
"\x07searow;\x03⤥\x07swarow;\x03⤦",
// hookrightarrow;[↪] hookleftarrow;[↩] homtht;[∻] horbar;[―] hoarr;[⇿] hopf;[𝕙].
"\x0dokrightarrow;\x03↪\x0cokleftarrow;\x03↩\x05mtht;\x03∻\x05rbar;\x03―\x04arr;\x03⇿\x03pf;\x04𝕙",
// hslash;[ℏ] hstrok;[ħ] hscr;[𝒽].
"\x05lash;\x03ℏ\x05trok;\x02ħ\x03cr;\x04𝒽",
// hybull;[⁃] hyphen;[‐].
"\x05bull;\x03⁃\x05phen;\x03‐",
// iacute;[í] iacute[í].
"\x05cute;\x02í\x04cute\x02í",
// icirc;[î] icirc[î] icy;[и] ic;[].
"\x04irc;\x02î\x03irc\x02î\x02y;\x02и\x01;\x03",
// iexcl;[¡] iecy;[е] iexcl[¡].
"\x04xcl;\x02¡\x03cy;\x02е\x03xcl\x02¡",
// iff;[⇔] ifr;[𝔦].
"\x02f;\x03⇔\x02r;\x04𝔦",
// igrave;[ì] igrave[ì].
"\x05rave;\x02ì\x04rave\x02ì",
// iiiint;[⨌] iinfin;[⧜] iiint;[∭] iiota;[℩] ii;[ⅈ].
"\x05iint;\x03⨌\x05nfin;\x03⧜\x04int;\x03∭\x04ota;\x03℩\x01;\x03ⅈ",
// ijlig;[ij].
"\x04lig;\x02ij",
// imagline;[ℐ] imagpart;[ℑ] imacr;[ī] image;[ℑ] imath;[ı] imped;[Ƶ] imof;[⊷].
"\x07agline;\x03ℐ\x07agpart;\x03ℑ\x04acr;\x02ī\x04age;\x03ℑ\x04ath;\x02ı\x04ped;\x02Ƶ\x03of;\x03⊷",
// infintie;[⧝] integers;[ℤ] intercal;[⊺] intlarhk;[⨗] intprod;[⨼] incare;[℅] inodot;[ı] intcal;[⊺] infin;[∞] int;[∫] in;[∈].
"\x07fintie;\x03⧝\x07tegers;\x03ℤ\x07tercal;\x03⊺\x07tlarhk;\x03⨗\x06tprod;\x03⨼\x05care;\x03℅\x05odot;\x02ı\x05tcal;\x03⊺\x04fin;\x03∞\x02t;\x03∫\x01;\x03∈",
// iogon;[į] iocy;[ё] iopf;[𝕚] iota;[ι].
"\x04gon;\x02į\x03cy;\x02ё\x03pf;\x04𝕚\x03ta;\x02ι",
// iprod;[⨼].
"\x04rod;\x03⨼",
// iquest;[¿] iquest[¿].
"\x05uest;\x02¿\x04uest\x02¿",
// isindot;[⋵] isinsv;[⋳] isinE;[⋹] isins;[⋴] isinv;[∈] iscr;[𝒾] isin;[∈].
"\x06indot;\x03⋵\x05insv;\x03⋳\x04inE;\x03⋹\x04ins;\x03⋴\x04inv;\x03∈\x03cr;\x04𝒾\x03in;\x03∈",
// itilde;[ĩ] it;[].
"\x05ilde;\x02ĩ\x01;\x03",
// iukcy;[і] iuml;[ï] iuml[ï].
"\x04kcy;\x02і\x03ml;\x02ï\x02ml\x02ï",
// jcirc;[ĵ] jcy;[й].
"\x04irc;\x02ĵ\x02y;\x02й",
// jfr;[𝔧].
"\x02r;\x04𝔧",
// jmath;[ȷ].
"\x04ath;\x02ȷ",
// jopf;[𝕛].
"\x03pf;\x04𝕛",
// jsercy;[ј] jscr;[𝒿].
"\x05ercy;\x02ј\x03cr;\x04𝒿",
// jukcy;[є].
"\x04kcy;\x02є",
// kappav;[ϰ] kappa;[κ].
"\x05ppav;\x02ϰ\x04ppa;\x02κ",
// kcedil;[ķ] kcy;[к].
"\x05edil;\x02ķ\x02y;\x02к",
// kfr;[𝔨].
"\x02r;\x04𝔨",
// kgreen;[ĸ].
"\x05reen;\x02ĸ",
// khcy;[х].
"\x03cy;\x02х",
// kjcy;[ќ].
"\x03cy;\x02ќ",
// kopf;[𝕜].
"\x03pf;\x04𝕜",
// kscr;[𝓀].
"\x03cr;\x04𝓀",
// lAtail;[⤛] lAarr;[⇚] lArr;[⇐].
"\x05tail;\x03⤛\x04arr;\x03⇚\x03rr;\x03⇐",
// lBarr;[⤎].
"\x04arr;\x03⤎",
// lEg;[⪋] lE;[≦].
"\x02g;\x03⪋\x01;\x03≦",
// lHar;[⥢].
"\x03ar;\x03⥢",
// laemptyv;[⦴] larrbfs;[⤟] larrsim;[⥳] lacute;[ĺ] lagran;[ℒ] lambda;[λ] langle;[⟨] larrfs;[⤝] larrhk;[↩] larrlp;[↫] larrpl;[⤹] larrtl;[↢] latail;[⤙] langd;[⦑] laquo;[«] larrb;[⇤] lates;[⪭︀] lang;[⟨] laquo[«] larr;[←] late;[⪭] lap;[⪅] lat;[⪫].
"\x07emptyv;\x03⦴\x06rrbfs;\x03⤟\x06rrsim;\x03⥳\x05cute;\x02ĺ\x05gran;\x03ℒ\x05mbda;\x02λ\x05ngle;\x03⟨\x05rrfs;\x03⤝\x05rrhk;\x03↩\x05rrlp;\x03↫\x05rrpl;\x03⤹\x05rrtl;\x03↢\x05tail;\x03⤙\x04ngd;\x03⦑\x04quo;\x02«\x04rrb;\x03⇤\x04tes;\x06⪭︀\x03ng;\x03⟨\x03quo\x02«\x03rr;\x03←\x03te;\x03⪭\x02p;\x03⪅\x02t;\x03⪫",
// lbrksld;[⦏] lbrkslu;[⦍] lbrace;[{] lbrack;[[] lbarr;[⤌] lbbrk;[❲] lbrke;[⦋].
"\x06rksld;\x03⦏\x06rkslu;\x03⦍\x05race;\x01{\x05rack;\x01[\x04arr;\x03⤌\x04brk;\x03❲\x04rke;\x03⦋",
// lcaron;[ľ] lcedil;[ļ] lceil;[⌈] lcub;[{] lcy;[л].
"\x05aron;\x02ľ\x05edil;\x02ļ\x04eil;\x03⌈\x03ub;\x01{\x02y;\x02л",
// ldrushar;[⥋] ldrdhar;[⥧] ldquor;[„] ldquo;[“] ldca;[⤶] ldsh;[↲].
"\x07rushar;\x03⥋\x06rdhar;\x03⥧\x05quor;\x03„\x04quo;\x03“\x03ca;\x03⤶\x03sh;\x03↲",
// leftrightsquigarrow;[↭] leftrightharpoons;[⇋] leftharpoondown;[↽] leftrightarrows;[⇆] leftleftarrows;[⇇] leftrightarrow;[↔] leftthreetimes;[⋋] leftarrowtail;[↢] leftharpoonup;[↼] lessapprox;[⪅] lesseqqgtr;[⪋] leftarrow;[←] lesseqgtr;[⋚] leqslant;[⩽] lesdotor;[⪃] lesdoto;[⪁] lessdot;[⋖] lessgtr;[≶] lesssim;[≲] lesdot;[⩿] lesges;[⪓] lescc;[⪨] leqq;[≦] lesg;[⋚︀] leg;[⋚] leq;[≤] les;[⩽] le;[≤].
"\x12ftrightsquigarrow;\x03↭\x10ftrightharpoons;\x03⇋\x0eftharpoondown;\x03↽\x0eftrightarrows;\x03⇆\x0dftleftarrows;\x03⇇\x0dftrightarrow;\x03↔\x0dftthreetimes;\x03⋋\x0cftarrowtail;\x03↢\x0cftharpoonup;\x03↼\x09ssapprox;\x03⪅\x09sseqqgtr;\x03⪋\x08ftarrow;\x03←\x08sseqgtr;\x03⋚\x07qslant;\x03⩽\x07sdotor;\x03⪃\x06sdoto;\x03⪁\x06ssdot;\x03⋖\x06ssgtr;\x03≶\x06sssim;\x03≲\x05sdot;\x03⩿\x05sges;\x03⪓\x04scc;\x03⪨\x03qq;\x03≦\x03sg;\x06⋚︀\x02g;\x03⋚\x02q;\x03≤\x02s;\x03⩽\x01;\x03≤",
// lfisht;[⥼] lfloor;[⌊] lfr;[𝔩].
"\x05isht;\x03⥼\x05loor;\x03⌊\x02r;\x04𝔩",
// lgE;[⪑] lg;[≶].
"\x02E;\x03⪑\x01;\x03≶",
// lharul;[⥪] lhard;[↽] lharu;[↼] lhblk;[▄].
"\x05arul;\x03⥪\x04ard;\x03↽\x04aru;\x03↼\x04blk;\x03▄",
// ljcy;[љ].
"\x03cy;\x02љ",
// llcorner;[⌞] llhard;[⥫] llarr;[⇇] lltri;[◺] ll;[≪].
"\x07corner;\x03⌞\x05hard;\x03⥫\x04arr;\x03⇇\x04tri;\x03◺\x01;\x03≪",
// lmoustache;[⎰] lmidot;[ŀ] lmoust;[⎰].
"\x09oustache;\x03⎰\x05idot;\x02ŀ\x05oust;\x03⎰",
// lnapprox;[⪉] lneqq;[≨] lnsim;[⋦] lnap;[⪉] lneq;[⪇] lnE;[≨] lne;[⪇].
"\x07approx;\x03⪉\x04eqq;\x03≨\x04sim;\x03⋦\x03ap;\x03⪉\x03eq;\x03⪇\x02E;\x03≨\x02e;\x03⪇",
// longleftrightarrow;[⟷] longrightarrow;[⟶] looparrowright;[↬] longleftarrow;[⟵] looparrowleft;[↫] longmapsto;[⟼] lotimes;[⨴] lozenge;[◊] loplus;[⨭] lowast;[∗] lowbar;[_] loang;[⟬] loarr;[⇽] lobrk;[⟦] lopar;[⦅] lopf;[𝕝] lozf;[⧫] loz;[◊].
"\x11ngleftrightarrow;\x03⟷\x0dngrightarrow;\x03⟶\x0doparrowright;\x03↬\x0cngleftarrow;\x03⟵\x0coparrowleft;\x03↫\x09ngmapsto;\x03⟼\x06times;\x03⨴\x06zenge;\x03◊\x05plus;\x03⨭\x05wast;\x03∗\x05wbar;\x01_\x04ang;\x03⟬\x04arr;\x03⇽\x04brk;\x03⟦\x04par;\x03⦅\x03pf;\x04𝕝\x03zf;\x03⧫\x02z;\x03◊",
// lparlt;[⦓] lpar;[(].
"\x05arlt;\x03⦓\x03ar;\x01(",
// lrcorner;[⌟] lrhard;[⥭] lrarr;[⇆] lrhar;[⇋] lrtri;[⊿] lrm;[].
"\x07corner;\x03⌟\x05hard;\x03⥭\x04arr;\x03⇆\x04har;\x03⇋\x04tri;\x03⊿\x02m;\x03",
// lsaquo;[‹] lsquor;[‚] lstrok;[ł] lsime;[⪍] lsimg;[⪏] lsquo;[‘] lscr;[𝓁] lsim;[≲] lsqb;[[] lsh;[↰].
"\x05aquo;\x03‹\x05quor;\x03‚\x05trok;\x02ł\x04ime;\x03⪍\x04img;\x03⪏\x04quo;\x03‘\x03cr;\x04𝓁\x03im;\x03≲\x03qb;\x01[\x02h;\x03↰",
// ltquest;[⩻] lthree;[⋋] ltimes;[⋉] ltlarr;[⥶] ltrPar;[⦖] ltcir;[⩹] ltdot;[⋖] ltrie;[⊴] ltrif;[◂] ltcc;[⪦] ltri;[◃] lt;[<].
"\x06quest;\x03⩻\x05hree;\x03⋋\x05imes;\x03⋉\x05larr;\x03⥶\x05rPar;\x03⦖\x04cir;\x03⩹\x04dot;\x03⋖\x04rie;\x03⊴\x04rif;\x03◂\x03cc;\x03⪦\x03ri;\x03◃\x01;\x01<",
// lurdshar;[⥊] luruhar;[⥦].
"\x07rdshar;\x03⥊\x06ruhar;\x03⥦",
// lvertneqq;[≨︀] lvnE;[≨︀].
"\x08ertneqq;\x06≨︀\x03nE;\x06≨︀",
// mDDot;[∺].
"\x04Dot;\x03∺",
// mapstodown;[↧] mapstoleft;[↤] mapstoup;[↥] maltese;[✠] mapsto;[↦] marker;[▮] macr;[¯] male;[♂] malt;[✠] macr[¯] map;[↦].
"\x09pstodown;\x03↧\x09pstoleft;\x03↤\x07pstoup;\x03↥\x06ltese;\x03✠\x05psto;\x03↦\x05rker;\x03▮\x03cr;\x02¯\x03le;\x03♂\x03lt;\x03✠\x02cr\x02¯\x02p;\x03↦",
// mcomma;[⨩] mcy;[м].
"\x05omma;\x03⨩\x02y;\x02м",
// mdash;[—].
"\x04ash;\x03—",
// measuredangle;[∡].
"\x0casuredangle;\x03∡",
// mfr;[𝔪].
"\x02r;\x04𝔪",
// mho;[℧].
"\x02o;\x03℧",
// minusdu;[⨪] midast;[*] midcir;[⫰] middot;[·] minusb;[⊟] minusd;[∸] micro;[µ] middot[·] minus;[−] micro[µ] mid;[∣].
"\x06nusdu;\x03⨪\x05dast;\x01*\x05dcir;\x03⫰\x05ddot;\x02·\x05nusb;\x03⊟\x05nusd;\x03∸\x04cro;\x02µ\x04ddot\x02·\x04nus;\x03−\x03cro\x02µ\x02d;\x03∣",
// mlcp;[⫛] mldr;[…].
"\x03cp;\x03⫛\x03dr;\x03…",
// mnplus;[∓].
"\x05plus;\x03∓",
// models;[⊧] mopf;[𝕞].
"\x05dels;\x03⊧\x03pf;\x04𝕞",
// mp;[∓].
"\x01;\x03∓",
// mstpos;[∾] mscr;[𝓂].
"\x05tpos;\x03∾\x03cr;\x04𝓂",
// multimap;[⊸] mumap;[⊸] mu;[μ].
"\x07ltimap;\x03⊸\x04map;\x03⊸\x01;\x02μ",
// nGtv;[≫̸] nGg;[⋙̸] nGt;[≫⃒].
"\x03tv;\x05≫̸\x02g;\x05⋙̸\x02t;\x06≫⃒",
// nLeftrightarrow;[⇎] nLeftarrow;[⇍] nLtv;[≪̸] nLl;[⋘̸] nLt;[≪⃒].
"\x0eeftrightarrow;\x03⇎\x09eftarrow;\x03⇍\x03tv;\x05≪̸\x02l;\x05⋘̸\x02t;\x06≪⃒",
// nRightarrow;[⇏].
"\x0aightarrow;\x03⇏",
// nVDash;[⊯] nVdash;[⊮].
"\x05Dash;\x03⊯\x05dash;\x03⊮",
// naturals;[ℕ] napprox;[≉] natural;[♮] nacute;[ń] nabla;[∇] napid;[≋̸] napos;[ʼn] natur;[♮] nang;[∠⃒] napE;[⩰̸] nap;[≉].
"\x07turals;\x03ℕ\x06pprox;\x03≉\x06tural;\x03♮\x05cute;\x02ń\x04bla;\x03∇\x04pid;\x05≋̸\x04pos;\x02ʼn\x04tur;\x03♮\x03ng;\x06∠⃒\x03pE;\x05⩰̸\x02p;\x03≉",
// nbumpe;[≏̸] nbump;[≎̸] nbsp;[ ] nbsp[ ].
"\x05umpe;\x05≏̸\x04ump;\x05≎̸\x03sp;\x02 \x02sp\x02 ",
// ncongdot;[⩭̸] ncaron;[ň] ncedil;[ņ] ncong;[≇] ncap;[⩃] ncup;[⩂] ncy;[н].
"\x07ongdot;\x05⩭̸\x05aron;\x02ň\x05edil;\x02ņ\x04ong;\x03≇\x03ap;\x03⩃\x03up;\x03⩂\x02y;\x02н",
// ndash;[–].
"\x04ash;\x03–",
// nearrow;[↗] nexists;[∄] nearhk;[⤤] nequiv;[≢] nesear;[⤨] nexist;[∄] neArr;[⇗] nearr;[↗] nedot;[≐̸] nesim;[≂̸] ne;[≠].
"\x06arrow;\x03↗\x06xists;\x03∄\x05arhk;\x03⤤\x05quiv;\x03≢\x05sear;\x03⤨\x05xist;\x03∄\x04Arr;\x03⇗\x04arr;\x03↗\x04dot;\x05≐̸\x04sim;\x05≂̸\x01;\x03≠",
// nfr;[𝔫].
"\x02r;\x04𝔫",
// ngeqslant;[⩾̸] ngeqq;[≧̸] ngsim;[≵] ngeq;[≱] nges;[⩾̸] ngtr;[≯] ngE;[≧̸] nge;[≱] ngt;[≯].
"\x08eqslant;\x05⩾̸\x04eqq;\x05≧̸\x04sim;\x03≵\x03eq;\x03≱\x03es;\x05⩾̸\x03tr;\x03≯\x02E;\x05≧̸\x02e;\x03≱\x02t;\x03≯",
// nhArr;[⇎] nharr;[↮] nhpar;[⫲].
"\x04Arr;\x03⇎\x04arr;\x03↮\x04par;\x03⫲",
// nisd;[⋺] nis;[⋼] niv;[∋] ni;[∋].
"\x03sd;\x03⋺\x02s;\x03⋼\x02v;\x03∋\x01;\x03∋",
// njcy;[њ].
"\x03cy;\x02њ",
// nleftrightarrow;[↮] nleftarrow;[↚] nleqslant;[⩽̸] nltrie;[⋬] nlArr;[⇍] nlarr;[↚] nleqq;[≦̸] nless;[≮] nlsim;[≴] nltri;[⋪] nldr;[‥] nleq;[≰] nles;[⩽̸] nlE;[≦̸] nle;[≰] nlt;[≮].
"\x0eeftrightarrow;\x03↮\x09eftarrow;\x03↚\x08eqslant;\x05⩽̸\x05trie;\x03⋬\x04Arr;\x03⇍\x04arr;\x03↚\x04eqq;\x05≦̸\x04ess;\x03≮\x04sim;\x03≴\x04tri;\x03⋪\x03dr;\x03‥\x03eq;\x03≰\x03es;\x05⩽̸\x02E;\x05≦̸\x02e;\x03≰\x02t;\x03≮",
// nmid;[∤].
"\x03id;\x03∤",
// notindot;[⋵̸] notinva;[∉] notinvb;[⋷] notinvc;[⋶] notniva;[∌] notnivb;[⋾] notnivc;[⋽] notinE;[⋹̸] notin;[∉] notni;[∌] nopf;[𝕟] not;[¬] not[¬].
"\x07tindot;\x05⋵̸\x06tinva;\x03∉\x06tinvb;\x03⋷\x06tinvc;\x03⋶\x06tniva;\x03∌\x06tnivb;\x03⋾\x06tnivc;\x03⋽\x05tinE;\x05⋹̸\x04tin;\x03∉\x04tni;\x03∌\x03pf;\x04𝕟\x02t;\x02¬\x01t\x02¬",
// nparallel;[∦] npolint;[⨔] npreceq;[⪯̸] nparsl;[⫽⃥] nprcue;[⋠] npart;[∂̸] nprec;[⊀] npar;[∦] npre;[⪯̸] npr;[⊀].
"\x08arallel;\x03∦\x06olint;\x03⨔\x06receq;\x05⪯̸\x05arsl;\x06⫽⃥\x05rcue;\x03⋠\x04art;\x05∂̸\x04rec;\x03⊀\x03ar;\x03∦\x03re;\x05⪯̸\x02r;\x03⊀",
// nrightarrow;[↛] nrarrc;[⤳̸] nrarrw;[↝̸] nrtrie;[⋭] nrArr;[⇏] nrarr;[↛] nrtri;[⋫].
"\x0aightarrow;\x03↛\x05arrc;\x05⤳̸\x05arrw;\x05↝̸\x05trie;\x03⋭\x04Arr;\x03⇏\x04arr;\x03↛\x04tri;\x03⋫",
// nshortparallel;[∦] nsubseteqq;[⫅̸] nsupseteqq;[⫆̸] nshortmid;[∤] nsubseteq;[⊈] nsupseteq;[⊉] nsqsube;[⋢] nsqsupe;[⋣] nsubset;[⊂⃒] nsucceq;[⪰̸] nsupset;[⊃⃒] nsccue;[⋡] nsimeq;[≄] nsime;[≄] nsmid;[∤] nspar;[∦] nsubE;[⫅̸] nsube;[⊈] nsucc;[⊁] nsupE;[⫆̸] nsupe;[⊉] nsce;[⪰̸] nscr;[𝓃] nsim;[≁] nsub;[⊄] nsup;[⊅] nsc;[⊁].
"\x0dhortparallel;\x03∦\x09ubseteqq;\x05⫅̸\x09upseteqq;\x05⫆̸\x08hortmid;\x03∤\x08ubseteq;\x03⊈\x08upseteq;\x03⊉\x06qsube;\x03⋢\x06qsupe;\x03⋣\x06ubset;\x06⊂⃒\x06ucceq;\x05⪰̸\x06upset;\x06⊃⃒\x05ccue;\x03⋡\x05imeq;\x03≄\x04ime;\x03≄\x04mid;\x03∤\x04par;\x03∦\x04ubE;\x05⫅̸\x04ube;\x03⊈\x04ucc;\x03⊁\x04upE;\x05⫆̸\x04upe;\x03⊉\x03ce;\x05⪰̸\x03cr;\x04𝓃\x03im;\x03≁\x03ub;\x03⊄\x03up;\x03⊅\x02c;\x03⊁",
// ntrianglerighteq;[⋭] ntrianglelefteq;[⋬] ntriangleright;[⋫] ntriangleleft;[⋪] ntilde;[ñ] ntilde[ñ] ntgl;[≹] ntlg;[≸].
"\x0frianglerighteq;\x03⋭\x0erianglelefteq;\x03⋬\x0driangleright;\x03⋫\x0criangleleft;\x03⋪\x05ilde;\x02ñ\x04ilde\x02ñ\x03gl;\x03≹\x03lg;\x03≸",
// numero;[№] numsp;[ ] num;[#] nu;[ν].
"\x05mero;\x03№\x04msp;\x03 \x02m;\x01#\x01;\x02ν",
// nvinfin;[⧞] nvltrie;[⊴⃒] nvrtrie;[⊵⃒] nvDash;[⊭] nvHarr;[⤄] nvdash;[⊬] nvlArr;[⤂] nvrArr;[⤃] nvsim;[∼⃒] nvap;[≍⃒] nvge;[≥⃒] nvgt;[>⃒] nvle;[≤⃒] nvlt;[<⃒].
"\x06infin;\x03⧞\x06ltrie;\x06⊴⃒\x06rtrie;\x06⊵⃒\x05Dash;\x03⊭\x05Harr;\x03⤄\x05dash;\x03⊬\x05lArr;\x03⤂\x05rArr;\x03⤃\x04sim;\x06∼⃒\x03ap;\x06≍⃒\x03ge;\x06≥⃒\x03gt;\x04>⃒\x03le;\x06≤⃒\x03lt;\x04<⃒",
// nwarrow;[↖] nwarhk;[⤣] nwnear;[⤧] nwArr;[⇖] nwarr;[↖].
"\x06arrow;\x03↖\x05arhk;\x03⤣\x05near;\x03⤧\x04Arr;\x03⇖\x04arr;\x03↖",
// oS;[Ⓢ].
"\x01;\x03Ⓢ",
// oacute;[ó] oacute[ó] oast;[⊛].
"\x05cute;\x02ó\x04cute\x02ó\x03st;\x03⊛",
// ocirc;[ô] ocir;[⊚] ocirc[ô] ocy;[о].
"\x04irc;\x02ô\x03ir;\x03⊚\x03irc\x02ô\x02y;\x02о",
// odblac;[ő] odsold;[⦼] odash;[⊝] odiv;[⨸] odot;[⊙].
"\x05blac;\x02ő\x05sold;\x03⦼\x04ash;\x03⊝\x03iv;\x03⨸\x03ot;\x03⊙",
// oelig;[œ].
"\x04lig;\x02œ",
// ofcir;[⦿] ofr;[𝔬].
"\x04cir;\x03⦿\x02r;\x04𝔬",
// ograve;[ò] ograve[ò] ogon;[˛] ogt;[⧁].
"\x05rave;\x02ò\x04rave\x02ò\x03on;\x02˛\x02t;\x03⧁",
// ohbar;[⦵] ohm;[Ω].
"\x04bar;\x03⦵\x02m;\x02Ω",
// oint;[∮].
"\x03nt;\x03∮",
// olcross;[⦻] olarr;[↺] olcir;[⦾] oline;[‾] olt;[⧀].
"\x06cross;\x03⦻\x04arr;\x03↺\x04cir;\x03⦾\x04ine;\x03‾\x02t;\x03⧀",
// omicron;[ο] ominus;[⊖] omacr;[ō] omega;[ω] omid;[⦶].
"\x06icron;\x02ο\x05inus;\x03⊖\x04acr;\x02ō\x04ega;\x02ω\x03id;\x03⦶",
// oopf;[𝕠].
"\x03pf;\x04𝕠",
// operp;[⦹] oplus;[⊕] opar;[⦷].
"\x04erp;\x03⦹\x04lus;\x03⊕\x03ar;\x03⦷",
// orderof;[ℴ] orslope;[⩗] origof;[⊶] orarr;[↻] order;[ℴ] ordf;[ª] ordm;[º] oror;[⩖] ord;[⩝] ordf[ª] ordm[º] orv;[⩛] or;[∨].
"\x06derof;\x03ℴ\x06slope;\x03⩗\x05igof;\x03⊶\x04arr;\x03↻\x04der;\x03ℴ\x03df;\x02ª\x03dm;\x02º\x03or;\x03⩖\x02d;\x03⩝\x02df\x02ª\x02dm\x02º\x02v;\x03⩛\x01;\x03∨",
// oslash;[ø] oslash[ø] oscr;[ℴ] osol;[⊘].
"\x05lash;\x02ø\x04lash\x02ø\x03cr;\x03ℴ\x03ol;\x03⊘",
// otimesas;[⨶] otilde;[õ] otimes;[⊗] otilde[õ].
"\x07imesas;\x03⨶\x05ilde;\x02õ\x05imes;\x03⊗\x04ilde\x02õ",
// ouml;[ö] ouml[ö].
"\x03ml;\x02ö\x02ml\x02ö",
// ovbar;[⌽].
"\x04bar;\x03⌽",
// parallel;[∥] parsim;[⫳] parsl;[⫽] para;[¶] part;[∂] par;[∥] para[¶].
"\x07rallel;\x03∥\x05rsim;\x03⫳\x04rsl;\x03⫽\x03ra;\x02¶\x03rt;\x03∂\x02r;\x03∥\x02ra\x02¶",
// pcy;[п].
"\x02y;\x02п",
// pertenk;[‱] percnt;[%] period;[.] permil;[‰] perp;[⊥].
"\x06rtenk;\x03‱\x05rcnt;\x01%\x05riod;\x01.\x05rmil;\x03‰\x03rp;\x03⊥",
// pfr;[𝔭].
"\x02r;\x04𝔭",
// phmmat;[ℳ] phone;[☎] phiv;[ϕ] phi;[φ].
"\x05mmat;\x03ℳ\x04one;\x03☎\x03iv;\x02ϕ\x02i;\x02φ",
// pitchfork;[⋔] piv;[ϖ] pi;[π].
"\x08tchfork;\x03⋔\x02v;\x02ϖ\x01;\x02π",
// plusacir;[⨣] planckh;[ℎ] pluscir;[⨢] plussim;[⨦] plustwo;[⨧] planck;[ℏ] plankv;[ℏ] plusdo;[∔] plusdu;[⨥] plusmn;[±] plusb;[⊞] pluse;[⩲] plusmn[±] plus;[+].
"\x07usacir;\x03⨣\x06anckh;\x03ℎ\x06uscir;\x03⨢\x06ussim;\x03⨦\x06ustwo;\x03⨧\x05anck;\x03ℏ\x05ankv;\x03ℏ\x05usdo;\x03∔\x05usdu;\x03⨥\x05usmn;\x02±\x04usb;\x03⊞\x04use;\x03⩲\x04usmn\x02±\x03us;\x01+",
// pm;[±].
"\x01;\x02±",
// pointint;[⨕] pound;[£] popf;[𝕡] pound[£].
"\x07intint;\x03⨕\x04und;\x02£\x03pf;\x04𝕡\x03und\x02£",
// preccurlyeq;[≼] precnapprox;[⪹] precapprox;[⪷] precneqq;[⪵] precnsim;[⋨] profalar;[⌮] profline;[⌒] profsurf;[⌓] precsim;[≾] preceq;[⪯] primes;[ℙ] prnsim;[⋨] propto;[∝] prurel;[⊰] prcue;[≼] prime;[′] prnap;[⪹] prsim;[≾] prap;[⪷] prec;[≺] prnE;[⪵] prod;[∏] prop;[∝] prE;[⪳] pre;[⪯] pr;[≺].
"\x0aeccurlyeq;\x03≼\x0aecnapprox;\x03⪹\x09ecapprox;\x03⪷\x07ecneqq;\x03⪵\x07ecnsim;\x03⋨\x07ofalar;\x03⌮\x07ofline;\x03⌒\x07ofsurf;\x03⌓\x06ecsim;\x03≾\x05eceq;\x03⪯\x05imes;\x03ℙ\x05nsim;\x03⋨\x05opto;\x03∝\x05urel;\x03⊰\x04cue;\x03≼\x04ime;\x03′\x04nap;\x03⪹\x04sim;\x03≾\x03ap;\x03⪷\x03ec;\x03≺\x03nE;\x03⪵\x03od;\x03∏\x03op;\x03∝\x02E;\x03⪳\x02e;\x03⪯\x01;\x03≺",
// pscr;[𝓅] psi;[ψ].
"\x03cr;\x04𝓅\x02i;\x02ψ",
// puncsp;[ ].
"\x05ncsp;\x03 ",
// qfr;[𝔮].
"\x02r;\x04𝔮",
// qint;[⨌].
"\x03nt;\x03⨌",
// qopf;[𝕢].
"\x03pf;\x04𝕢",
// qprime;[⁗].
"\x05rime;\x03⁗",
// qscr;[𝓆].
"\x03cr;\x04𝓆",
// quaternions;[ℍ] quatint;[⨖] questeq;[≟] quest;[?] quot;[\"] quot[\"].
"\x0aaternions;\x03ℍ\x06atint;\x03⨖\x06esteq;\x03≟\x04est;\x01?\x03ot;\x01\"\x02ot\x01\"",
// rAtail;[⤜] rAarr;[⇛] rArr;[⇒].
"\x05tail;\x03⤜\x04arr;\x03⇛\x03rr;\x03⇒",
// rBarr;[⤏].
"\x04arr;\x03⤏",
// rHar;[⥤].
"\x03ar;\x03⥤",
// rationals;[ℚ] raemptyv;[⦳] rarrbfs;[⤠] rarrsim;[⥴] racute;[ŕ] rangle;[⟩] rarrap;[⥵] rarrfs;[⤞] rarrhk;[↪] rarrlp;[↬] rarrpl;[⥅] rarrtl;[↣] ratail;[⤚] radic;[√] rangd;[⦒] range;[⦥] raquo;[»] rarrb;[⇥] rarrc;[⤳] rarrw;[↝] ratio;[∶] race;[∽̱] rang;[⟩] raquo[»] rarr;[→].
"\x08tionals;\x03ℚ\x07emptyv;\x03⦳\x06rrbfs;\x03⤠\x06rrsim;\x03⥴\x05cute;\x02ŕ\x05ngle;\x03⟩\x05rrap;\x03⥵\x05rrfs;\x03⤞\x05rrhk;\x03↪\x05rrlp;\x03↬\x05rrpl;\x03⥅\x05rrtl;\x03↣\x05tail;\x03⤚\x04dic;\x03√\x04ngd;\x03⦒\x04nge;\x03⦥\x04quo;\x02»\x04rrb;\x03⇥\x04rrc;\x03⤳\x04rrw;\x03↝\x04tio;\x03∶\x03ce;\x05∽̱\x03ng;\x03⟩\x03quo\x02»\x03rr;\x03→",
// rbrksld;[⦎] rbrkslu;[⦐] rbrace;[}] rbrack;[]] rbarr;[⤍] rbbrk;[❳] rbrke;[⦌].
"\x06rksld;\x03⦎\x06rkslu;\x03⦐\x05race;\x01}\x05rack;\x01]\x04arr;\x03⤍\x04brk;\x03❳\x04rke;\x03⦌",
// rcaron;[ř] rcedil;[ŗ] rceil;[⌉] rcub;[}] rcy;[р].
"\x05aron;\x02ř\x05edil;\x02ŗ\x04eil;\x03⌉\x03ub;\x01}\x02y;\x02р",
// rdldhar;[⥩] rdquor;[”] rdquo;[”] rdca;[⤷] rdsh;[↳].
"\x06ldhar;\x03⥩\x05quor;\x03”\x04quo;\x03”\x03ca;\x03⤷\x03sh;\x03↳",
// realpart;[ℜ] realine;[ℛ] reals;[ℝ] real;[ℜ] rect;[▭] reg;[®] reg[®].
"\x07alpart;\x03ℜ\x06aline;\x03ℛ\x04als;\x03ℝ\x03al;\x03ℜ\x03ct;\x03▭\x02g;\x02®\x01g\x02®",
// rfisht;[⥽] rfloor;[⌋] rfr;[𝔯].
"\x05isht;\x03⥽\x05loor;\x03⌋\x02r;\x04𝔯",
// rharul;[⥬] rhard;[⇁] rharu;[⇀] rhov;[ϱ] rho;[ρ].
"\x05arul;\x03⥬\x04ard;\x03⇁\x04aru;\x03⇀\x03ov;\x02ϱ\x02o;\x02ρ",
// rightleftharpoons;[⇌] rightharpoondown;[⇁] rightrightarrows;[⇉] rightleftarrows;[⇄] rightsquigarrow;[↝] rightthreetimes;[⋌] rightarrowtail;[↣] rightharpoonup;[⇀] risingdotseq;[≓] rightarrow;[→] ring;[˚].
"\x10ghtleftharpoons;\x03⇌\x0fghtharpoondown;\x03⇁\x0fghtrightarrows;\x03⇉\x0eghtleftarrows;\x03⇄\x0eghtsquigarrow;\x03↝\x0eghtthreetimes;\x03⋌\x0dghtarrowtail;\x03↣\x0dghtharpoonup;\x03⇀\x0bsingdotseq;\x03≓\x09ghtarrow;\x03→\x03ng;\x02˚",
// rlarr;[⇄] rlhar;[⇌] rlm;[].
"\x04arr;\x03⇄\x04har;\x03⇌\x02m;\x03",
// rmoustache;[⎱] rmoust;[⎱].
"\x09oustache;\x03⎱\x05oust;\x03⎱",
// rnmid;[⫮].
"\x04mid;\x03⫮",
// rotimes;[⨵] roplus;[⨮] roang;[⟭] roarr;[⇾] robrk;[⟧] ropar;[⦆] ropf;[𝕣].
"\x06times;\x03⨵\x05plus;\x03⨮\x04ang;\x03⟭\x04arr;\x03⇾\x04brk;\x03⟧\x04par;\x03⦆\x03pf;\x04𝕣",
// rppolint;[⨒] rpargt;[⦔] rpar;[)].
"\x07polint;\x03⨒\x05argt;\x03⦔\x03ar;\x01)",
// rrarr;[⇉].
"\x04arr;\x03⇉",
// rsaquo;[›] rsquor;[’] rsquo;[’] rscr;[𝓇] rsqb;[]] rsh;[↱].
"\x05aquo;\x03›\x05quor;\x03’\x04quo;\x03’\x03cr;\x04𝓇\x03qb;\x01]\x02h;\x03↱",
// rtriltri;[⧎] rthree;[⋌] rtimes;[⋊] rtrie;[⊵] rtrif;[▸] rtri;[▹].
"\x07riltri;\x03⧎\x05hree;\x03⋌\x05imes;\x03⋊\x04rie;\x03⊵\x04rif;\x03▸\x03ri;\x03▹",
// ruluhar;[⥨].
"\x06luhar;\x03⥨",
// rx;[℞].
"\x01;\x03℞",
// sacute;[ś].
"\x05cute;\x02ś",
// sbquo;[‚].
"\x04quo;\x03‚",
// scpolint;[⨓] scaron;[š] scedil;[ş] scnsim;[⋩] sccue;[≽] scirc;[ŝ] scnap;[⪺] scsim;[≿] scap;[⪸] scnE;[⪶] scE;[⪴] sce;[⪰] scy;[с] sc;[≻].
"\x07polint;\x03⨓\x05aron;\x02š\x05edil;\x02ş\x05nsim;\x03⋩\x04cue;\x03≽\x04irc;\x02ŝ\x04nap;\x03⪺\x04sim;\x03≿\x03ap;\x03⪸\x03nE;\x03⪶\x02E;\x03⪴\x02e;\x03⪰\x02y;\x02с\x01;\x03≻",
// sdotb;[⊡] sdote;[⩦] sdot;[⋅].
"\x04otb;\x03⊡\x04ote;\x03⩦\x03ot;\x03⋅",
// setminus;[∖] searrow;[↘] searhk;[⤥] seswar;[⤩] seArr;[⇘] searr;[↘] setmn;[∖] sect;[§] semi;[;] sext;[✶] sect[§].
"\x07tminus;\x03∖\x06arrow;\x03↘\x05arhk;\x03⤥\x05swar;\x03⤩\x04Arr;\x03⇘\x04arr;\x03↘\x04tmn;\x03∖\x03ct;\x02§\x03mi;\x01;\x03xt;\x03✶\x02ct\x02§",
// sfrown;[⌢] sfr;[𝔰].
"\x05rown;\x03⌢\x02r;\x04𝔰",
// shortparallel;[∥] shortmid;[∣] shchcy;[щ] sharp;[♯] shcy;[ш] shy;[] shy[].
"\x0cortparallel;\x03∥\x07ortmid;\x03∣\x05chcy;\x02щ\x04arp;\x03♯\x03cy;\x02ш\x02y;\x02\x01y\x02",
// simplus;[⨤] simrarr;[⥲] sigmaf;[ς] sigmav;[ς] simdot;[⩪] sigma;[σ] simeq;[≃] simgE;[⪠] simlE;[⪟] simne;[≆] sime;[≃] simg;[⪞] siml;[⪝] sim;[∼].
"\x06mplus;\x03⨤\x06mrarr;\x03⥲\x05gmaf;\x02ς\x05gmav;\x02ς\x05mdot;\x03⩪\x04gma;\x02σ\x04meq;\x03≃\x04mgE;\x03⪠\x04mlE;\x03⪟\x04mne;\x03≆\x03me;\x03≃\x03mg;\x03⪞\x03ml;\x03⪝\x02m;\x03∼",
// slarr;[←].
"\x04arr;\x03←",
// smallsetminus;[∖] smeparsl;[⧤] smashp;[⨳] smile;[⌣] smtes;[⪬︀] smid;[∣] smte;[⪬] smt;[⪪].
"\x0callsetminus;\x03∖\x07eparsl;\x03⧤\x05ashp;\x03⨳\x04ile;\x03⌣\x04tes;\x06⪬︀\x03id;\x03∣\x03te;\x03⪬\x02t;\x03⪪",
// softcy;[ь] solbar;[⌿] solb;[⧄] sopf;[𝕤] sol;[/].
"\x05ftcy;\x02ь\x05lbar;\x03⌿\x03lb;\x03⧄\x03pf;\x04𝕤\x02l;\x01/",
// spadesuit;[♠] spades;[♠] spar;[∥].
"\x08adesuit;\x03♠\x05ades;\x03♠\x03ar;\x03∥",
// sqsubseteq;[⊑] sqsupseteq;[⊒] sqsubset;[⊏] sqsupset;[⊐] sqcaps;[⊓︀] sqcups;[⊔︀] sqsube;[⊑] sqsupe;[⊒] square;[□] squarf;[▪] sqcap;[⊓] sqcup;[⊔] sqsub;[⊏] sqsup;[⊐] squf;[▪] squ;[□].
"\x09subseteq;\x03⊑\x09supseteq;\x03⊒\x07subset;\x03⊏\x07supset;\x03⊐\x05caps;\x06⊓︀\x05cups;\x06⊔︀\x05sube;\x03⊑\x05supe;\x03⊒\x05uare;\x03□\x05uarf;\x03▪\x04cap;\x03⊓\x04cup;\x03⊔\x04sub;\x03⊏\x04sup;\x03⊐\x03uf;\x03▪\x02u;\x03□",
// srarr;[→].
"\x04arr;\x03→",
// ssetmn;[∖] ssmile;[⌣] sstarf;[⋆] sscr;[𝓈].
"\x05etmn;\x03∖\x05mile;\x03⌣\x05tarf;\x03⋆\x03cr;\x04𝓈",
// straightepsilon;[ϵ] straightphi;[ϕ] starf;[★] strns;[¯] star;[☆].
"\x0eraightepsilon;\x02ϵ\x0araightphi;\x02ϕ\x04arf;\x03★\x04rns;\x02¯\x03ar;\x03☆",
// succcurlyeq;[≽] succnapprox;[⪺] subsetneqq;[⫋] succapprox;[⪸] supsetneqq;[⫌] subseteqq;[⫅] subsetneq;[⊊] supseteqq;[⫆] supsetneq;[⊋] subseteq;[⊆] succneqq;[⪶] succnsim;[⋩] supseteq;[⊇] subedot;[⫃] submult;[⫁] subplus;[⪿] subrarr;[⥹] succsim;[≿] supdsub;[⫘] supedot;[⫄] suphsol;[⟉] suphsub;[⫗] suplarr;[⥻] supmult;[⫂] supplus;[⫀] subdot;[⪽] subset;[⊂] subsim;[⫇] subsub;[⫕] subsup;[⫓] succeq;[⪰] supdot;[⪾] supset;[⊃] supsim;[⫈] supsub;[⫔] supsup;[⫖] subnE;[⫋] subne;[⊊] supnE;[⫌] supne;[⊋] subE;[⫅] sube;[⊆] succ;[≻] sung;[♪] sup1;[¹] sup2;[²] sup3;[³] supE;[⫆] supe;[⊇] sub;[⊂] sum;[∑] sup1[¹] sup2[²] sup3[³] sup;[⊃].
"\x0acccurlyeq;\x03≽\x0accnapprox;\x03⪺\x09bsetneqq;\x03⫋\x09ccapprox;\x03⪸\x09psetneqq;\x03⫌\x08bseteqq;\x03⫅\x08bsetneq;\x03⊊\x08pseteqq;\x03⫆\x08psetneq;\x03⊋\x07bseteq;\x03⊆\x07ccneqq;\x03⪶\x07ccnsim;\x03⋩\x07pseteq;\x03⊇\x06bedot;\x03⫃\x06bmult;\x03⫁\x06bplus;\x03⪿\x06brarr;\x03⥹\x06ccsim;\x03≿\x06pdsub;\x03⫘\x06pedot;\x03⫄\x06phsol;\x03⟉\x06phsub;\x03⫗\x06plarr;\x03⥻\x06pmult;\x03⫂\x06pplus;\x03⫀\x05bdot;\x03⪽\x05bset;\x03⊂\x05bsim;\x03⫇\x05bsub;\x03⫕\x05bsup;\x03⫓\x05cceq;\x03⪰\x05pdot;\x03⪾\x05pset;\x03⊃\x05psim;\x03⫈\x05psub;\x03⫔\x05psup;\x03⫖\x04bnE;\x03⫋\x04bne;\x03⊊\x04pnE;\x03⫌\x04pne;\x03⊋\x03bE;\x03⫅\x03be;\x03⊆\x03cc;\x03≻\x03ng;\x03♪\x03p1;\x02¹\x03p2;\x02²\x03p3;\x02³\x03pE;\x03⫆\x03pe;\x03⊇\x02b;\x03⊂\x02m;\x03∑\x02p1\x02¹\x02p2\x02²\x02p3\x02³\x02p;\x03⊃",
// swarrow;[↙] swarhk;[⤦] swnwar;[⤪] swArr;[⇙] swarr;[↙].
"\x06arrow;\x03↙\x05arhk;\x03⤦\x05nwar;\x03⤪\x04Arr;\x03⇙\x04arr;\x03↙",
// szlig;[ß] szlig[ß].
"\x04lig;\x02ß\x03lig\x02ß",
// target;[⌖] tau;[τ].
"\x05rget;\x03⌖\x02u;\x02τ",
// tbrk;[⎴].
"\x03rk;\x03⎴",
// tcaron;[ť] tcedil;[ţ] tcy;[т].
"\x05aron;\x02ť\x05edil;\x02ţ\x02y;\x02т",
// tdot;[⃛].
"\x03ot;\x03⃛",
// telrec;[⌕].
"\x05lrec;\x03⌕",
// tfr;[𝔱].
"\x02r;\x04𝔱",
// thickapprox;[≈] therefore;[∴] thetasym;[ϑ] thicksim;[∼] there4;[∴] thetav;[ϑ] thinsp;[ ] thksim;[∼] theta;[θ] thkap;[≈] thorn;[þ] thorn[þ].
"\x0aickapprox;\x03≈\x08erefore;\x03∴\x07etasym;\x02ϑ\x07icksim;\x03∼\x05ere4;\x03∴\x05etav;\x02ϑ\x05insp;\x03 \x05ksim;\x03∼\x04eta;\x02θ\x04kap;\x03≈\x04orn;\x02þ\x03orn\x02þ",
// timesbar;[⨱] timesb;[⊠] timesd;[⨰] tilde;[˜] times;[×] times[×] tint;[∭].
"\x07mesbar;\x03⨱\x05mesb;\x03⊠\x05mesd;\x03⨰\x04lde;\x02˜\x04mes;\x02×\x03mes\x02×\x03nt;\x03∭",
// topfork;[⫚] topbot;[⌶] topcir;[⫱] toea;[⤨] topf;[𝕥] tosa;[⤩] top;[⊤].
"\x06pfork;\x03⫚\x05pbot;\x03⌶\x05pcir;\x03⫱\x03ea;\x03⤨\x03pf;\x04𝕥\x03sa;\x03⤩\x02p;\x03⊤",
// tprime;[‴].
"\x05rime;\x03‴",
// trianglerighteq;[⊵] trianglelefteq;[⊴] triangleright;[▹] triangledown;[▿] triangleleft;[◃] triangleq;[≜] triangle;[▵] triminus;[⨺] trpezium;[⏢] triplus;[⨹] tritime;[⨻] tridot;[◬] trade;[™] trisb;[⧍] trie;[≜].
"\x0eianglerighteq;\x03⊵\x0dianglelefteq;\x03⊴\x0ciangleright;\x03▹\x0biangledown;\x03▿\x0biangleleft;\x03◃\x08iangleq;\x03≜\x07iangle;\x03▵\x07iminus;\x03⨺\x07pezium;\x03⏢\x06iplus;\x03⨹\x06itime;\x03⨻\x05idot;\x03◬\x04ade;\x03™\x04isb;\x03⧍\x03ie;\x03≜",
// tstrok;[ŧ] tshcy;[ћ] tscr;[𝓉] tscy;[ц].
"\x05trok;\x02ŧ\x04hcy;\x02ћ\x03cr;\x04𝓉\x03cy;\x02ц",
// twoheadrightarrow;[↠] twoheadleftarrow;[↞] twixt;[≬].
"\x10oheadrightarrow;\x03↠\x0foheadleftarrow;\x03↞\x04ixt;\x03≬",
// uArr;[⇑].
"\x03rr;\x03⇑",
// uHar;[⥣].
"\x03ar;\x03⥣",
// uacute;[ú] uacute[ú] uarr;[↑].
"\x05cute;\x02ú\x04cute\x02ú\x03rr;\x03↑",
// ubreve;[ŭ] ubrcy;[ў].
"\x05reve;\x02ŭ\x04rcy;\x02ў",
// ucirc;[û] ucirc[û] ucy;[у].
"\x04irc;\x02û\x03irc\x02û\x02y;\x02у",
// udblac;[ű] udarr;[⇅] udhar;[⥮].
"\x05blac;\x02ű\x04arr;\x03⇅\x04har;\x03⥮",
// ufisht;[⥾] ufr;[𝔲].
"\x05isht;\x03⥾\x02r;\x04𝔲",
// ugrave;[ù] ugrave[ù].
"\x05rave;\x02ù\x04rave\x02ù",
// uharl;[↿] uharr;[↾] uhblk;[▀].
"\x04arl;\x03↿\x04arr;\x03↾\x04blk;\x03▀",
// ulcorner;[⌜] ulcorn;[⌜] ulcrop;[⌏] ultri;[◸].
"\x07corner;\x03⌜\x05corn;\x03⌜\x05crop;\x03⌏\x04tri;\x03◸",
// umacr;[ū] uml;[¨] uml[¨].
"\x04acr;\x02ū\x02l;\x02¨\x01l\x02¨",
// uogon;[ų] uopf;[𝕦].
"\x04gon;\x02ų\x03pf;\x04𝕦",
// upharpoonright;[↾] upharpoonleft;[↿] updownarrow;[↕] upuparrows;[⇈] uparrow;[↑] upsilon;[υ] uplus;[⊎] upsih;[ϒ] upsi;[υ].
"\x0dharpoonright;\x03↾\x0charpoonleft;\x03↿\x0adownarrow;\x03↕\x09uparrows;\x03⇈\x06arrow;\x03↑\x06silon;\x02υ\x04lus;\x03⊎\x04sih;\x02ϒ\x03si;\x02υ",
// urcorner;[⌝] urcorn;[⌝] urcrop;[⌎] uring;[ů] urtri;[◹].
"\x07corner;\x03⌝\x05corn;\x03⌝\x05crop;\x03⌎\x04ing;\x02ů\x04tri;\x03◹",
// uscr;[𝓊].
"\x03cr;\x04𝓊",
// utilde;[ũ] utdot;[⋰] utrif;[▴] utri;[▵].
"\x05ilde;\x02ũ\x04dot;\x03⋰\x04rif;\x03▴\x03ri;\x03▵",
// uuarr;[⇈] uuml;[ü] uuml[ü].
"\x04arr;\x03⇈\x03ml;\x02ü\x02ml\x02ü",
// uwangle;[⦧].
"\x06angle;\x03⦧",
// vArr;[⇕].
"\x03rr;\x03⇕",
// vBarv;[⫩] vBar;[⫨].
"\x04arv;\x03⫩\x03ar;\x03⫨",
// vDash;[⊨].
"\x04ash;\x03⊨",
// vartriangleright;[⊳] vartriangleleft;[⊲] varsubsetneqq;[⫋︀] varsupsetneqq;[⫌︀] varsubsetneq;[⊊︀] varsupsetneq;[⊋︀] varepsilon;[ϵ] varnothing;[∅] varpropto;[∝] varkappa;[ϰ] varsigma;[ς] vartheta;[ϑ] vangrt;[⦜] varphi;[ϕ] varrho;[ϱ] varpi;[ϖ] varr;[↕].
"\x0frtriangleright;\x03⊳\x0ertriangleleft;\x03⊲\x0crsubsetneqq;\x06⫋︀\x0crsupsetneqq;\x06⫌︀\x0brsubsetneq;\x06⊊︀\x0brsupsetneq;\x06⊋︀\x09repsilon;\x02ϵ\x09rnothing;\x03∅\x08rpropto;\x03∝\x07rkappa;\x02ϰ\x07rsigma;\x02ς\x07rtheta;\x02ϑ\x05ngrt;\x03⦜\x05rphi;\x02ϕ\x05rrho;\x02ϱ\x04rpi;\x02ϖ\x03rr;\x03↕",
// vcy;[в].
"\x02y;\x02в",
// vdash;[⊢].
"\x04ash;\x03⊢",
// veebar;[⊻] vellip;[⋮] verbar;[|] veeeq;[≚] vert;[|] vee;[∨].
"\x05ebar;\x03⊻\x05llip;\x03⋮\x05rbar;\x01|\x04eeq;\x03≚\x03rt;\x01|\x02e;\x03∨",
// vfr;[𝔳].
"\x02r;\x04𝔳",
// vltri;[⊲].
"\x04tri;\x03⊲",
// vnsub;[⊂⃒] vnsup;[⊃⃒].
"\x04sub;\x06⊂⃒\x04sup;\x06⊃⃒",
// vopf;[𝕧].
"\x03pf;\x04𝕧",
// vprop;[∝].
"\x04rop;\x03∝",
// vrtri;[⊳].
"\x04tri;\x03⊳",
// vsubnE;[⫋︀] vsubne;[⊊︀] vsupnE;[⫌︀] vsupne;[⊋︀] vscr;[𝓋].
"\x05ubnE;\x06⫋︀\x05ubne;\x06⊊︀\x05upnE;\x06⫌︀\x05upne;\x06⊋︀\x03cr;\x04𝓋",
// vzigzag;[⦚].
"\x06igzag;\x03⦚",
// wcirc;[ŵ].
"\x04irc;\x02ŵ",
// wedbar;[⩟] wedgeq;[≙] weierp;[℘] wedge;[∧].
"\x05dbar;\x03⩟\x05dgeq;\x03≙\x05ierp;\x03℘\x04dge;\x03∧",
// wfr;[𝔴].
"\x02r;\x04𝔴",
// wopf;[𝕨].
"\x03pf;\x04𝕨",
// wp;[℘].
"\x01;\x03℘",
// wreath;[≀] wr;[≀].
"\x05eath;\x03≀\x01;\x03≀",
// wscr;[𝓌].
"\x03cr;\x04𝓌",
// xcirc;[◯] xcap;[⋂] xcup;[⋃].
"\x04irc;\x03◯\x03ap;\x03⋂\x03up;\x03⋃",
// xdtri;[▽].
"\x04tri;\x03▽",
// xfr;[𝔵].
"\x02r;\x04𝔵",
// xhArr;[⟺] xharr;[⟷].
"\x04Arr;\x03⟺\x04arr;\x03⟷",
// xi;[ξ].
"\x01;\x02ξ",
// xlArr;[⟸] xlarr;[⟵].
"\x04Arr;\x03⟸\x04arr;\x03⟵",
// xmap;[⟼].
"\x03ap;\x03⟼",
// xnis;[⋻].
"\x03is;\x03⋻",
// xoplus;[⨁] xotime;[⨂] xodot;[⨀] xopf;[𝕩].
"\x05plus;\x03⨁\x05time;\x03⨂\x04dot;\x03⨀\x03pf;\x04𝕩",
// xrArr;[⟹] xrarr;[⟶].
"\x04Arr;\x03⟹\x04arr;\x03⟶",
// xsqcup;[⨆] xscr;[𝓍].
"\x05qcup;\x03⨆\x03cr;\x04𝓍",
// xuplus;[⨄] xutri;[△].
"\x05plus;\x03⨄\x04tri;\x03△",
// xvee;[⋁].
"\x03ee;\x03⋁",
// xwedge;[⋀].
"\x05edge;\x03⋀",
// yacute;[ý] yacute[ý] yacy;[я].
"\x05cute;\x02ý\x04cute\x02ý\x03cy;\x02я",
// ycirc;[ŷ] ycy;[ы].
"\x04irc;\x02ŷ\x02y;\x02ы",
// yen;[¥] yen[¥].
"\x02n;\x02¥\x01n\x02¥",
// yfr;[𝔶].
"\x02r;\x04𝔶",
// yicy;[ї].
"\x03cy;\x02ї",
// yopf;[𝕪].
"\x03pf;\x04𝕪",
// yscr;[𝓎].
"\x03cr;\x04𝓎",
// yucy;[ю] yuml;[ÿ] yuml[ÿ].
"\x03cy;\x02ю\x03ml;\x02ÿ\x02ml\x02ÿ",
// zacute;[ź].
"\x05cute;\x02ź",
// zcaron;[ž] zcy;[з].
"\x05aron;\x02ž\x02y;\x02з",
// zdot;[ż].
"\x03ot;\x02ż",
// zeetrf;[ℨ] zeta;[ζ].
"\x05etrf;\x03ℨ\x03ta;\x02ζ",
// zfr;[𝔷].
"\x02r;\x04𝔷",
// zhcy;[ж].
"\x03cy;\x02ж",
// zigrarr;[⇝].
"\x06grarr;\x03⇝",
// zopf;[𝕫].
"\x03pf;\x04𝕫",
// zscr;[𝓏].
"\x03cr;\x04𝓏",
// zwnj;[] zwj;[].
"\x03nj;\x03\x02j;\x03",
),
"small_words" => "GT\x00LT\x00gt\x00lt\x00",
"small_mappings" => array(
">",
"<",
">",
"<",
)
)
);
class-wp-html-tag-processor.php 0000644 00000346566 14717700467 0012577 0 ustar 00 "c" not " c".
* This would increase the size of the changes for some operations but leave more
* natural-looking output HTML.
*
* @package WordPress
* @subpackage HTML-API
* @since 6.2.0
*/
/**
* Core class used to modify attributes in an HTML document for tags matching a query.
*
* ## Usage
*
* Use of this class requires three steps:
*
* 1. Create a new class instance with your input HTML document.
* 2. Find the tag(s) you are looking for.
* 3. Request changes to the attributes in those tag(s).
*
* Example:
*
* $tags = new WP_HTML_Tag_Processor( $html );
* if ( $tags->next_tag( 'option' ) ) {
* $tags->set_attribute( 'selected', true );
* }
*
* ### Finding tags
*
* The `next_tag()` function moves the internal cursor through
* your input HTML document until it finds a tag meeting any of
* the supplied restrictions in the optional query argument. If
* no argument is provided then it will find the next HTML tag,
* regardless of what kind it is.
*
* If you want to _find whatever the next tag is_:
*
* $tags->next_tag();
*
* | Goal | Query |
* |-----------------------------------------------------------|---------------------------------------------------------------------------------|
* | Find any tag. | `$tags->next_tag();` |
* | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'img' ) );` |
* | Find next image tag (without passing the array). | `$tags->next_tag( 'img' );` |
* | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'class_name' => 'fullwidth' ) );` |
* | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'tag_name' => 'img', 'class_name' => 'fullwidth' ) );` |
*
* If a tag was found meeting your criteria then `next_tag()`
* will return `true` and you can proceed to modify it. If it
* returns `false`, however, it failed to find the tag and
* moved the cursor to the end of the file.
*
* Once the cursor reaches the end of the file the processor
* is done and if you want to reach an earlier tag you will
* need to recreate the processor and start over, as it's
* unable to back up or move in reverse.
*
* See the section on bookmarks for an exception to this
* no-backing-up rule.
*
* #### Custom queries
*
* Sometimes it's necessary to further inspect an HTML tag than
* the query syntax here permits. In these cases one may further
* inspect the search results using the read-only functions
* provided by the processor or external state or variables.
*
* Example:
*
* // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style.
* $remaining_count = 5;
* while ( $remaining_count > 0 && $tags->next_tag() ) {
* if (
* ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) &&
* 'jazzy' === $tags->get_attribute( 'data-style' )
* ) {
* $tags->add_class( 'theme-style-everest-jazz' );
* $remaining_count--;
* }
* }
*
* `get_attribute()` will return `null` if the attribute wasn't present
* on the tag when it was called. It may return `""` (the empty string)
* in cases where the attribute was present but its value was empty.
* For boolean attributes, those whose name is present but no value is
* given, it will return `true` (the only way to set `false` for an
* attribute is to remove it).
*
* #### When matching fails
*
* When `next_tag()` returns `false` it could mean different things:
*
* - The requested tag wasn't found in the input document.
* - The input document ended in the middle of an HTML syntax element.
*
* When a document ends in the middle of a syntax element it will pause
* the processor. This is to make it possible in the future to extend the
* input document and proceed - an important requirement for chunked
* streaming parsing of a document.
*
* Example:
*
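* $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' );
* false === $processor->next_tag();
*
* #### Special self-contained elements
*
* Some HTML elements are handled in a special way: their contents can't contain
* HTML markup, so content that _appears_ like HTML tags inside of them isn't,
* and there can be no nested elements inside them.
*
* In the following list, "raw text" means that all of the content up to the
* matching closing tag is treated verbatim, without any replacements or parsing.
*
* - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment.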
* - STYLE content is raw text.
* - TITLE content is plain text but character references are decoded.
* - TEXTAREA content is plain text but character references are decoded.
* - XMP (deprecated) content is raw text.
*
* ### Modifying HTML attributes for a found tag
*
* Once you've found the start of an opening tag you can modify
* any number of the attributes on that tag. You can set a new
* value for an attribute, remove the entire attribute, or do
* nothing and move on to the next opening tag.
*
* Example:
*
* if ( $tags->next_tag( array( 'class_name' => 'wp-group-block' ) ) ) {
* $tags->set_attribute( 'title', 'This groups the contained content.' );
* $tags->remove_attribute( 'data-test-id' );
* }
*
* If `set_attribute()` is called for an existing attribute it will
* overwrite the existing value. Similarly, calling `remove_attribute()`
* for a non-existing attribute has no effect on the document. Both
* of these methods are safe to call without knowing if a given attribute
* exists beforehand.
*
* ### Modifying CSS classes for a found tag
*
* The tag processor treats the `class` attribute as a special case.
* Because it's a common operation to add or remove CSS classes, this
* interface adds helper methods to make that easier.
*
* As with attribute values, adding or removing CSS classes is a safe
* operation that doesn't require checking if the attribute or class
* exists before making changes. If removing the only class then the
* entire `class` attribute will be removed.
*
* Example:
*
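* // from `<span>Yippee!</span>`
* //   to `<span class="is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="excited">Yippee!</span>`
* //   to `<span class="excited is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="is-active heavy-accent">Yippee!</span>`
* //   to `<span class="is-active heavy-accent">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<input type="text" class="is-active rugby not-disabled" length="24">`
* //   to `<input type="text" class="is-active not-disabled" length="24">`
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" class="rugby" length="24">`
* //   to `<input type="text" length="24">`
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" length="24">`
* //   to `<input type="text" length="24">`
* $tags->remove_class( 'rugby' );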
*
* When class changes are enqueued but a direct change to `class` is made via
* `set_attribute` then the changes to `set_attribute` (or `remove_attribute`)
* will take precedence over those made through `add_class` and `remove_class`.
*
* ### Bookmarks
*
* While scanning through the input HTML document it's possible to set
* a named bookmark when a particular tag is found. Later on, after
* continuing to scan other tags, it's possible to `seek` to one of
* the set bookmarks and then proceed again from that point forward.
*
* Because bookmarks create processing overhead one should avoid
* creating too many of them. As a rule, create only bookmarks
* of known string literal names; avoid creating "mark_{$index}"
* and so on. It's fine from a performance standpoint to create a
* bookmark and update it frequently, such as within a loop.
*
* $total_todos = 0;
* while ( $p->next_tag( array( 'tag_name' => 'UL', 'class_name' => 'todo' ) ) ) {
* $p->set_bookmark( 'list-start' );
* while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
* if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) {
* $p->set_bookmark( 'list-end' );
* $p->seek( 'list-start' );
* $p->set_attribute( 'data-contained-todos', (string) $total_todos );
* $total_todos = 0;
* $p->seek( 'list-end' );
* break;
* }
*
* if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) {
* $total_todos++;
* }
* }
* }
*
* ## Tokens and finer-grained processing.
*
* It's possible to scan through every lexical token in the
* HTML document using the `next_token()` function. This
* alternative form takes no argument and provides no built-in
* query syntax.
*
* Example:
*
* $title = '(untitled)';
* $text = '';
* while ( $processor->next_token() ) {
* switch ( $processor->get_token_name() ) {
* case '#text':
* $text .= $processor->get_modifiable_text();
* break;
*
* case 'BR':
* $text .= "\n";
* break;
*
* case 'TITLE':
* $title = $processor->get_modifiable_text();
* break;
* }
* }
* return trim( "# {$title}\n\n{$text}" );
*
* ### Tokens and _modifiable text_.
*
* #### Special "atomic" HTML elements.
*
* Not all HTML elements are able to contain other elements inside of them.
* For instance, the contents inside a TITLE element are plaintext (except
* that character references like `&amp;` will be decoded). This means that
* if the string `<img>` appears inside a TITLE element, then it's not an
* image tag, but rather it's text describing an image tag. Likewise, the
* contents of a SCRIPT or STYLE element are handled entirely separately in
* a browser than the contents of other elements because they represent a
* different language than HTML.
*
* For these elements the Tag Processor treats the entire sequence as one,
* from the opening tag, including its contents, through its closing tag.
* This means that it's not possible to match the closing tag for a
* SCRIPT element unless it's unexpected; the Tag Processor already matched
* it when it found the opening tag.
*
* The inner contents of these elements are that element's _modifiable text_.
*
* The special elements are:
* - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
* style of including JavaScript inside of HTML comments to avoid accidentally
* closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '<!--</script>-->' )`.
* - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
* character references are decoded. E.g. `1 &lt; 2 &lt; 3` becomes `1 < 2 < 3`.
* - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAMES`, `STYLE` whose contents are treated as
* raw plaintext and left as-is. E.g. `1 &lt; 2 &lt; 3` remains `1 &lt; 2 &lt; 3`.
*
* #### Other tokens with modifiable text.
*
* There are also non-elements which are void/self-closing in nature and contain
* modifiable text that is part of that individual syntax token itself.
*
* - `#text` nodes, whose entire token _is_ the modifiable text.
* - HTML comments and tokens that become comments due to some syntax error. The
* text for these tokens is the portion of the comment inside of the syntax.
* E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
* - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
* `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]).
* - "Funky comments," which are a special case of invalid closing tags whose name is
* invalid. The text for these nodes is the text that a browser would transform into
* an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
* - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag.
* - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]).
* - The empty end tag `</>` which is ignored in the browser and DOM.
*
* [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything
* until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
* section in an HTML document containing `>`. The Tag Processor will first find
* all valid and bogus HTML comments, and then if the comment _would_ have been a
* CDATA section _were they to exist_, it will indicate this as the type of comment.
*
* [2]: XML allows a broader range of characters in a processing instruction's target name
* and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
* target names with an ASCII-representable subset of characters. It also exhibits the
* same constraint as with CDATA sections, in that `>` cannot exist within the token
* since Processing Instructions do not exist within HTML and their syntax transforms
* into a bogus comment in the DOM.
*
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan HTML documents and tokenize
* HTML tags and their attributes. It's designed to do this as efficiently as
* possible without compromising parsing integrity. Therefore it will be
* slower than some methods of modifying HTML, such as those incorporating
* over-simplified PCRE patterns, but will not introduce the defects and
* failures that those methods bring in, which lead to broken page renders
* and often to security vulnerabilities. On the other hand, it will be faster
* than full-blown HTML parsers such as DOMDocument and use considerably
* less memory. It requires a negligible memory overhead, enough to consider
* it a zero-overhead system.
*
* The performance characteristics are maintained by avoiding tree construction
* and semantic cleanups which are specified in HTML5. Because of this, for
* example, it's not possible for the Tag Processor to associate any given
* opening tag with its corresponding closing tag, or to return the inner markup
* inside an element. Systems may be built on top of the Tag Processor to do
* this, but the Tag Processor is and should be constrained so it can remain an
* efficient, low-level, and reliable HTML scanner.
*
* The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy.
* HTML5 specifies that certain invalid content be transformed into different forms
* for display, such as removing null bytes from an input document and replacing
* invalid characters with the Unicode replacement character `U+FFFD` (visually "�").
* Where errors or transformations exist within the HTML5 specification, the Tag Processor
* leaves those invalid inputs untouched, passing them through to the final browser
* to handle. While this implies that certain operations will be non-spec-compliant,
* such as reading the value of an attribute with invalid content, it also preserves a
* simplicity and efficiency for handling those error cases.
*
* Most operations within the Tag Processor are designed to minimize the difference
* between an input and output document for any given change. For example, the
* `add_class` and `remove_class` methods preserve whitespace and the class ordering
* within the `class` attribute; and when encountering tags with duplicated attributes,
* the Tag Processor will leave those invalid duplicate attributes where they are but
* update the proper attribute which the browser will read for parsing its value. An
* exception to this rule is that all attribute updates store their values as
* double-quoted strings, meaning that attributes on input with single-quoted or
* unquoted values will appear in the output with double-quotes.
*
* ### Scripting Flag
*
* The Tag Processor parses HTML with the "scripting flag" disabled. This means
* that it doesn't run any scripts while parsing the page. In a browser with
* JavaScript enabled, for example, the script can change the parse of the
* document as it loads. On the server, however, evaluating JavaScript is not
* only impractical, but also unwanted.
*
* Practically this means that the Tag Processor will descend into NOSCRIPT
* elements and process its child tags. Were the scripting flag enabled, such
* as in a typical browser, the contents of NOSCRIPT are skipped entirely.
*
* This allows the HTML API to process the content that will be presented in
* a browser when scripting is disabled, but it offers a different view of a
* page than most browser sessions will experience. E.g. the tags inside the
* NOSCRIPT disappear.
*
* ### Text Encoding
*
* The Tag Processor assumes that the input HTML document is encoded with a
* text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=',
* "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab,
* carriage-return, newline, and form-feed.
*
* In practice, this includes almost every single-byte encoding as well as
* UTF-8. Notably, however, it does not include UTF-16. If providing input
* that's incompatible, then convert the encoding beforehand.
*
* @since 6.2.0
* @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
* @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
* @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
* Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
* Allows scanning through all tokens and processing modifiable text, where applicable.
*/
class WP_HTML_Tag_Processor {
/**
* The maximum number of bookmarks allowed to exist at
* any given time.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::set_bookmark()
*/
const MAX_BOOKMARKS = 10;
/**
* Maximum number of times seek() can be called.
* Prevents accidental infinite loops.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::seek()
*/
const MAX_SEEK_OPS = 1000;
/**
* The HTML document to parse.
*
* @since 6.2.0
* @var string
*/
protected $html;
/**
* The last query passed to next_tag().
*
* @since 6.2.0
* @var array|null
*/
private $last_query;
/**
* The tag name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_tag_name;
/**
* The CSS class name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_class_name;
/**
* The match offset this processor currently scans for.
*
* @since 6.2.0
* @var int|null
*/
private $sought_match_offset;
/**
* Whether to visit tag closers, e.g. </div>, when walking an input document.
*
* @since 6.2.0
* @var bool
*/
private $stop_on_tag_closers;
/**
* Specifies mode of operation of the parser at any given time.
*
* | State | Meaning |
* | ----------------|----------------------------------------------------------------------|
* | *Ready* | The parser is ready to run. |
* | *Complete* | There is nothing left to parse. |
* | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
* | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
* | *Text node* | Found a #text node; this is plaintext and modifiable. |
* | *CDATA node* | Found a CDATA section; this is modifiable. |
* | *Comment* | Found a comment or bogus comment; this is modifiable. |
* | *Presumptuous*  | Found an empty tag closer: `</>`.                                    |
* | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
*
* @since 6.5.0
*
* @see WP_HTML_Tag_Processor::STATE_READY
* @see WP_HTML_Tag_Processor::STATE_COMPLETE
* @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
* @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
* @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
* @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
* @see WP_HTML_Tag_Processor::STATE_COMMENT
* @see WP_HTML_Tag_Processor::STATE_DOCTYPE
* @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
* @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
*
* @var string
*/
protected $parser_state = self::STATE_READY;
/**
* What kind of syntax token became an HTML comment.
*
* Since there are many ways in which HTML syntax can create an HTML comment,
* this indicates which of those caused it. This allows the Tag Processor to
* represent more from the original input document than would appear in the DOM.
*
* @since 6.5.0
*
* @var string|null
*/
protected $comment_type = null;
/**
* How many bytes from the original HTML document have been read and parsed.
*
* This value points to the latest byte offset in the input document which
* has been already parsed. It is the internal cursor for the Tag Processor
* and updates while scanning through the HTML tokens.
*
* @since 6.2.0
* @var int
*/
private $bytes_already_parsed = 0;
/**
* Byte offset in input document where current token starts.
*
* Example:
*
*     <div id="test">...
*     01234
*     - token starts at 0
*
* @since 6.5.0
*
* @var int|null
*/
private $token_starts_at;
/**
* Byte length of current token.
*
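* Example:
*
*     <div id="test">...
*     012345678901234
*     - the token spans offsets 0 through 14, so its length is 15
*
*     a <!-- comment --> is a token.
*     0123456789 123456789 123456789
*     - the comment token spans offsets 2 through 17, so its length is 16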
*
* @since 6.5.0
*
* @var int|null
*/
private $token_length;
/**
* Byte offset in input document where current tag name starts.
*
* Example:
*
*     <div id="test">...
*     01234
*     - tag name starts at 1
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_starts_at;
/**
* Byte length of current tag name.
*
* Example:
*
*     <div id="test">...
*     01234
*      --- tag name length is 3
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_length;
/**
* Byte offset into input document where current modifiable text starts.
*
* @since 6.5.0
*
* @var int
*/
private $text_starts_at;
/**
* Byte length of modifiable text.
*
* @since 6.5.0
*
* @var int
*/
private $text_length;
/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
*
* @var bool
*/
private $is_closing_tag;
/**
* Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
*
* Example:
*
* // Supposing the parser is working through this content
* // and stops after recognizing the `id` attribute.
* // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* //                 ^ parsing will continue from this point.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
* );
*
* // When picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
* 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
* );
*
* // Note that only the `class` attribute value is stored in the index.
* // That's because it is the only value used by this class at the moment.
*
* @since 6.2.0
* @var WP_HTML_Attribute_Token[]
*/
private $attributes = array();
/**
* Tracks spans of duplicate attributes on a given tag, used for removing
* all copies of an attribute when calling `remove_attribute()`.
*
* Keyed by the lowercased attribute name, each entry listing the spans of
* the second and later occurrences. Stays `null` until a duplicate is
* actually encountered so that the common case allocates nothing.
*
* @since 6.3.2
*
* @var (WP_HTML_Span[])[]|null
*/
private $duplicate_attributes = null;
/**
* Which class names to add or remove from a tag.
*
* These are tracked separately from attribute updates because they are
* semantically distinct: this interface exists for the common case of
* adding and removing individual class names, while other attributes are
* generally modified as a whole, as with DOM `setAttribute` calls.
*
* When modifying an HTML document these will eventually be collapsed
* into a single `set_attribute( 'class', $changes )` call.
*
* Example:
*
*     // Add the `wp-block-group` class, remove the `wp-group` class.
*     $classname_updates = array(
*         // Indexed by a comparable class name.
*         'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS,
*         'wp-group'       => WP_HTML_Tag_Processor::REMOVE_CLASS
*     );
*
* @since 6.2.0
* @var bool[]
*/
private $classname_updates = array();
/**
* Named bookmarks, keyed by bookmark name.
*
* Each bookmark tracks a semantic location in the original HTML which
* shifts with updates as they are applied to the document. A bookmark is
* stored as the span (start offset and length) of the token it was set on.
*
* @since 6.2.0
* @var WP_HTML_Span[]
*/
protected $bookmarks = array();
/*
* Markers describing a pending change to a single class name.
*
* ADD_CLASS and REMOVE_CLASS are the values stored in `$classname_updates`.
* NOTE(review): SKIP_CLASS is not used anywhere in the code visible here;
* presumably it marks a class name that should be left untouched - confirm.
*/
const ADD_CLASS = true;
const REMOVE_CLASS = false;
const SKIP_CLASS = null;
/**
* Lexical replacements to apply to input HTML document.
*
* "Lexical" in this class refers to the part of this class which
* operates on pure text _as text_ and not as HTML. There's a line
* between the public interface, with HTML-semantic methods like
* `set_attribute` and `add_class`, and an internal state that tracks
* text offsets in the input document.
*
* When higher-level HTML methods are called, those have to transform their
* operations (such as setting an attribute's value) into text diffing
* operations (such as replacing the sub-string from indices A to B with
* some given new string). These text-diffing operations are the lexical
* updates.
*
* As new higher-level methods are added they need to collapse their
* operations into these lower-level lexical updates since that's the
* Tag Processor's internal language of change. Any code which creates
* these lexical updates must ensure that they do not cross HTML syntax
* boundaries, however, so these should never be exposed outside of this
* class or any classes which intentionally expand its functionality.
*
* These are enqueued while editing the document instead of being immediately
* applied to avoid processing overhead, string allocations, and string
* copies when applying many updates to a single document.
*
* Example:
*
*     // Replace an attribute stored with a new value, indices
*     // sourced from the lazily-parsed HTML recognizer.
*     $start  = $attributes['src']->start;
*     $length = $attributes['src']->length;
*     $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
*
*     // Correspondingly, something like this will appear in this array.
*     $lexical_updates = array(
*         new WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
*     );
*
* @since 6.2.0
* @var WP_HTML_Text_Replacement[]
*/
protected $lexical_updates = array();
/**
* Tracks and limits `seek()` calls to prevent accidental infinite loops.
*
* Starts at zero for every new processor instance.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::seek()
*/
protected $seek_count = 0;
/**
* Constructor.
*
* Only stores the document; nothing is parsed until a scanning method
* such as `next_tag()` or `next_token()` is called.
*
* @since 6.2.0
*
* @param string $html HTML to process.
*/
public function __construct( $html ) {
$this->html = $html;
}
/**
 * Advances to the next tag which satisfies the given query.
 *
 * Tokens are scanned one at a time. Anything that is not a tag is passed
 * over, and matching tags are counted until the requested match offset
 * has been reached.
 *
 * @since 6.2.0
 * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token.
 *
 * @param array|string|null $query {
 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
 *
 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
 *                                     1 for "first" tag, 3 for "third," etc.
 *                                     Defaults to first tag.
 *     @type string|null $class_name   Tag must contain this whole class name to match.
 *     @type string|null $tag_closers  "visit" or "skip": whether to stop on tag closers, e.g. </div>.
 * }
 * @return bool Whether a tag was matched.
 */
public function next_tag( $query = null ) {
	$this->parse_query( $query );

	$match_count = 0;

	while ( true ) {
		// Running out of complete tokens ends the search unsuccessfully.
		if ( false === $this->next_token() ) {
			return false;
		}

		// Only tags can match; every other token type is passed over.
		if ( self::STATE_MATCHED_TAG === $this->parser_state && $this->matches() ) {
			++$match_count;
		}

		if ( $match_count >= $this->sought_match_offset ) {
			return true;
		}
	}
}
/**
 * Finds the next token in the HTML document.
 *
 * An HTML document can be viewed as a stream of tokens,
 * where tokens are things like HTML tags, HTML comments,
 * text nodes, etc. This method finds the next token in
 * the HTML document and returns whether it found one.
 *
 * If it starts parsing a token and reaches the end of the
 * document then it will seek to the start of the last
 * token and pause, returning `false` to indicate that it
 * failed to find a complete token.
 *
 * Possible token types, based on the HTML specification:
 *
 *  - an HTML tag, whether opening, closing, or void.
 *  - a text node - the plaintext inside tags.
 *  - an HTML comment.
 *  - a DOCTYPE declaration.
 *  - a processing instruction, e.g. `<?xml version="1.0" ?>`.
 *
 * This public entry point only delegates to the private scanning routine,
 * which lets that routine be invoked internally without going through any
 * subclass override of this method.
 *
 * @since 6.5.0
 *
 * @return bool Whether a token was parsed.
 */
public function next_token() {
	$found_token = $this->base_class_next_token();

	return $found_token;
}
/**
* Internal method which finds the next token in the HTML document.
*
* This private method implements the logic for finding the next token in a
* document. It exists so that the parser can update its state without
* triggering subclass methods for things like `next_token()`, e.g. when
* applying patches before searching for the next token.
*
* When the document ends in the middle of a token the parser state is set to
* `STATE_INCOMPLETE_INPUT` and the cursor is rewound to the value it held when
* this call started, so that nothing past the incomplete token is consumed.
*
* Openers for SCRIPT, TEXTAREA, TITLE, IFRAME, NOEMBED, NOFRAMES, STYLE and XMP
* are matched together with their contents and closing tag as a single token;
* the contents are exposed through the modifiable text offsets.
*
* @since 6.5.0
*
* @access private
*
* @return bool Whether a token was parsed.
*/
private function base_class_next_token() {
// Remember the starting cursor so an incomplete token can be rewound to it.
$was_at = $this->bytes_already_parsed;
$this->after_tag();
// Don't proceed if there's nothing more to scan.
if (
self::STATE_COMPLETE === $this->parser_state ||
self::STATE_INCOMPLETE_INPUT === $this->parser_state
) {
return false;
}
/*
* The next step in the parsing loop determines the parsing state;
* clear it so that state doesn't linger from the previous step.
*/
$this->parser_state = self::STATE_READY;
// The cursor sitting at or past the end of the document means the scan is finished.
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
$this->parser_state = self::STATE_COMPLETE;
return false;
}
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
// Only a truncated token rewinds the cursor; other failures leave it where it stopped.
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
}
return false;
}
/*
* For legacy reasons the rest of this function handles tags and their
* attributes. If the processor has reached the end of the document
* or if it matched any other token then it should return here to avoid
* attempting to process tag-specific syntax.
*/
if (
self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
self::STATE_COMPLETE !== $this->parser_state &&
self::STATE_MATCHED_TAG !== $this->parser_state
) {
return true;
}
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
// Ensure that the tag closes before the end of the document.
if (
self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
$this->bytes_already_parsed >= strlen( $this->html )
) {
// Does this appropriately clear state (parsed attributes)?
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
// Attribute parsing stopped short of the end; the tag is complete only if a `>` follows.
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
// The tag is complete: move the cursor just past its `>` and record the token's byte length.
$this->parser_state = self::STATE_MATCHED_TAG;
$this->bytes_already_parsed = $tag_ends_at + 1;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
/*
* For non-DATA sections which might contain text that looks like HTML tags but
* isn't, scan with the appropriate alternative mode. Looking at the first letter
* of the tag name as a pre-check avoids a string allocation when it's not needed.
*/
$t = $this->html[ $this->tag_name_starts_at ];
// Closers, and openers whose name cannot start any of the special element names, are done.
if (
$this->is_closing_tag ||
! (
'i' === $t || 'I' === $t ||
'n' === $t || 'N' === $t ||
's' === $t || 'S' === $t ||
't' === $t || 'T' === $t ||
'x' === $t || 'X' === $t
)
) {
return true;
}
$tag_name = $this->get_tag();
/*
* Preserve the opening tag pointers, as these will be overwritten
* when finding the closing tag. They will be reset after finding
* the closing tag to point to the opening of the special atomic
* tag sequence.
*/
$tag_name_starts_at = $this->tag_name_starts_at;
$tag_name_length = $this->tag_name_length;
// Byte offset just past the opening tag's `>`, which is where the element's text begins.
$tag_ends_at = $this->token_starts_at + $this->token_length;
$attributes = $this->attributes;
$duplicate_attributes = $this->duplicate_attributes;
// Find the closing tag if necessary.
$found_closer = false;
switch ( $tag_name ) {
case 'SCRIPT':
$found_closer = $this->skip_script_data();
break;
case 'TEXTAREA':
case 'TITLE':
$found_closer = $this->skip_rcdata( $tag_name );
break;
/*
* In the browser this list would include the NOSCRIPT element,
* but the Tag Processor is an environment with the scripting
* flag disabled, meaning that it needs to descend into the
* NOSCRIPT element to be able to properly process what will be
* sent to a browser.
*
* Note that this rule makes HTML5 syntax incompatible with XML,
* because the parsing of this token depends on client application.
* The NOSCRIPT element cannot be represented in the XHTML syntax.
*/
case 'IFRAME':
case 'NOEMBED':
case 'NOFRAMES':
case 'STYLE':
case 'XMP':
$found_closer = $this->skip_rawtext( $tag_name );
break;
// No other tags should be treated in their entirety here.
default:
return true;
}
// A special element without its closing tag is treated as truncated input.
if ( ! $found_closer ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
/*
* The values here look like they reference the opening tag but they reference
* the closing tag instead. This is why the opening tag values were stored
* above in a variable. It reads confusingly here, but that's because the
* functions that skip the contents have moved all the internal cursors past
* the inner content of the tag.
*/
// The token now spans from where this call started through the end of the closing tag.
$this->token_starts_at = $was_at;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
/*
* The modifiable text runs from the end of the opening tag up to the point
* recorded in `tag_name_starts_at` by the skip function.
* NOTE(review): `skip_rcdata()` leaves that pointer on the `</` of the closer;
* `skip_script_data()` presumably does the same - confirm.
*/
$this->text_starts_at = $tag_ends_at;
$this->text_length = $this->tag_name_starts_at - $this->text_starts_at;
// Restore the opening tag's name pointers and attribute indexes saved above.
$this->tag_name_starts_at = $tag_name_starts_at;
$this->tag_name_length = $tag_name_length;
$this->attributes = $attributes;
$this->duplicate_attributes = $duplicate_attributes;
return true;
}
/*
 * NOTE(review): the docblock found above this method began with the summary of
 * a different method ("Whether the processor paused because the input HTML
 * document ended in the middle of a syntax element, such as in the middle of
 * a tag."). No definition matching that description is present at this spot;
 * confirm against the upstream file whether one has been lost.
 */
/**
 * Generator for a foreach loop to step through each class name for the matched tag.
 *
 * Class names are read from the decoded `class` attribute, split on HTML
 * whitespace, lowercased, and yielded once each in order of first appearance.
 * Nothing is yielded when the processor is not stopped on a tag or when the
 * tag has no string-valued `class` attribute.
 *
 * Example:
 *
 *     $p = new WP_HTML_Tag_Processor( "<div class='free &lt;egg&gt;\tlang-en'>" );
 *     $p->next_tag();
 *     foreach ( $p->class_list() as $class_name ) {
 *         echo "{$class_name} ";
 *     }
 *     // Outputs: "free <egg> lang-en "
 *
 * @since 6.4.0
 */
public function class_list() {
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return;
	}

	/** @var string $class contains the string value of the class attribute, with character references decoded. */
	$class = $this->get_attribute( 'class' );
	if ( ! is_string( $class ) ) {
		return;
	}

	$boundaries = " \t\f\r\n";
	$total      = strlen( $class );
	$emitted    = array();
	$cursor     = 0;

	while ( $cursor < $total ) {
		// Step over the separators in front of the next name.
		$cursor += strspn( $class, $boundaries, $cursor );
		if ( $cursor >= $total ) {
			break;
		}

		// Measure the run of bytes up to the next separator.
		$name_length = strcspn( $class, $boundaries, $cursor );
		if ( 0 === $name_length ) {
			break;
		}

		/*
		 * CSS class names are case-insensitive in the ASCII range.
		 *
		 * @see https://www.w3.org/TR/CSS2/syndata.html#x1
		 */
		$candidate = strtolower( substr( $class, $cursor, $name_length ) );
		$cursor   += $name_length;

		/*
		 * Tags are expected to carry only a handful of class names, so a
		 * linear scan of what has been emitted so far is likely cheaper
		 * overall than maintaining a keyed lookup.
		 */
		if ( ! in_array( $candidate, $emitted, true ) ) {
			$emitted[] = $candidate;
			yield $candidate;
		}
	}
}
/**
 * Reports whether the matched tag carries the given class name.
 *
 * The comparison is ASCII case-insensitive: the wanted name is lowercased
 * and compared against the lowercased names produced by `class_list()`.
 *
 * @since 6.4.0
 *
 * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
 * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
 */
public function has_class( $wanted_class ) {
	$is_on_tag = self::STATE_MATCHED_TAG === $this->parser_state;
	if ( ! $is_on_tag ) {
		return null;
	}

	$needle = strtolower( $wanted_class );

	foreach ( $this->class_list() as $candidate ) {
		if ( $needle === $candidate ) {
			return true;
		}
	}

	return false;
}
/**
 * Sets a bookmark in the HTML document.
 *
 * Bookmarks represent specific places or tokens in the HTML
 * document, such as a tag opener or closer. When applying
 * edits to a document, such as setting an attribute, the
 * text offsets of that token may shift; the bookmark is
 * kept updated with those shifts and remains stable unless
 * the entire span of text in which the token sits is removed.
 *
 * Release bookmarks when they are no longer needed.
 *
 * Example:
 *
 *     <main><h2>Surprising fact you may not know!</h2></main>
 *           ^  ^
 *            \-|-- this `H2` opener bookmark tracks the token
 *
 *     <main class="clickbait"><h2>Surprising fact you may no…
 *                             ^  ^
 *                              \-|-- it shifts with edits
 *
 * Bookmarks provide the ability to seek to a previously-scanned
 * place in the HTML document. This avoids the need to re-scan
 * the entire document.
 *
 * Example:
 *
 *     <ul><li>One</li><li>Two</li><li>Three</li></ul>
 *                                 ^^^^
 *                                 want to note this last item
 *
 *     $p = new WP_HTML_Tag_Processor( $html );
 *     $in_list = false;
 *     while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
 *         if ( 'UL' === $p->get_tag() ) {
 *             if ( $p->is_tag_closer() ) {
 *                 $in_list = false;
 *                 $p->set_bookmark( 'resume' );
 *                 if ( $p->seek( 'last-li' ) ) {
 *                     $p->add_class( 'last-li' );
 *                 }
 *                 $p->seek( 'resume' );
 *                 $p->release_bookmark( 'last-li' );
 *                 $p->release_bookmark( 'resume' );
 *             } else {
 *                 $in_list = true;
 *             }
 *         }
 *
 *         if ( 'LI' === $p->get_tag() ) {
 *             $p->set_bookmark( 'last-li' );
 *         }
 *     }
 *
 * Bookmarks intentionally hide the internal string offsets
 * to which they refer. They are maintained internally as
 * updates are applied to the HTML document and therefore
 * retain their "position" - the location to which they
 * originally pointed. The inability to use bookmarks with
 * functions like `substr` is therefore intentional to guard
 * against accidentally breaking the HTML.
 *
 * Because bookmarks allocate memory and require processing
 * for every applied update, they are limited and require
 * a name. They should not be created with programmatically-made
 * names, such as "li_{$index}" with some loop. As a general
 * rule they should only be created with string-literal names
 * like "start-of-section" or "last-paragraph".
 *
 * Bookmarks are a powerful tool to enable complicated behavior.
 * Consider double-checking that you need this tool if you are
 * reaching for it, as inappropriate use could lead to broken
 * HTML structure or unwanted processing overhead.
 *
 * Setting a bookmark under a name that is already in use replaces
 * the existing bookmark and does not count against the limit.
 *
 * @since 6.2.0
 *
 * @param string $name Identifies this particular bookmark.
 * @return bool Whether the bookmark was successfully created.
 */
public function set_bookmark( $name ) {
	// A bookmark needs a concrete token to point at; a finished or truncated parse has none.
	$state = $this->parser_state;
	if ( self::STATE_COMPLETE === $state || self::STATE_INCOMPLETE_INPUT === $state ) {
		return false;
	}

	// Only brand-new names are subject to the cap; re-using a name overwrites in place.
	$is_new_name = ! array_key_exists( $name, $this->bookmarks );
	if ( $is_new_name && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Too many bookmarks: cannot create any more.' ),
			'6.2.0'
		);

		return false;
	}

	$span                     = new WP_HTML_Span( $this->token_starts_at, $this->token_length );
	$this->bookmarks[ $name ] = $span;

	return true;
}
/**
 * Removes a bookmark that is no longer needed.
 *
 * Releasing a bookmark frees up the small
 * performance overhead it requires.
 *
 * @param string $name Name of the bookmark to remove.
 * @return bool Whether the bookmark already existed before removal.
 */
public function release_bookmark( $name ) {
	$existed = array_key_exists( $name, $this->bookmarks );

	if ( $existed ) {
		unset( $this->bookmarks[ $name ] );
	}

	return $existed;
}
/**
 * Skips contents of generic rawtext elements.
 *
 * RAWTEXT and RCDATA regions differ only in whether character references
 * are decoded. Since reading the inner markup isn't supported here, the
 * RCDATA scanner is reused as-is rather than duplicated.
 *
 * @since 6.3.2
 *
 * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
 *
 * @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
 * @return bool Whether an end to the RAWTEXT region was found before the end of the document.
 */
private function skip_rawtext( $tag_name ) {
	$found_end = $this->skip_rcdata( $tag_name );

	return $found_end;
}
/**
 * Skips contents of RCDATA elements, namely title and textarea tags.
 *
 * Starting at the current cursor, this looks for a `</` followed by an ASCII
 * case-insensitive match of the given tag name, a byte that terminates a tag
 * name, any attributes, and finally a closing `>` or `/>`.
 *
 * While scanning, `tag_name_starts_at` is set to the offset of the most recent
 * `</` candidate (or `false` when none is left) and `bytes_already_parsed` is
 * advanced; on success it rests just past the closing tag.
 *
 * @since 6.2.0
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
 *
 * @param string $tag_name The uppercase tag name which will close the RCDATA region.
 * @return bool Whether an end to the RCDATA region was found before the end of the document.
 */
private function skip_rcdata( $tag_name ) {
	$html       = $this->html;
	$doc_length = strlen( $html );
	$tag_length = strlen( $tag_name );

	$at = $this->bytes_already_parsed;

	while ( false !== $at && $at < $doc_length ) {
		$at                       = strpos( $html, '</', $at );
		$this->tag_name_starts_at = $at;

		/*
		 * Fail if there is no possible tag closer. Room is needed for the
		 * two bytes of "</", the tag name itself, and at least one more
		 * byte to terminate the name. Any later "</" would have even less
		 * room, so there is nothing further to look for.
		 */
		if ( false === $at || ( $at + 2 + $tag_length ) >= $doc_length ) {
			return false;
		}

		$at += 2;

		/*
		 * Find a case-insensitive match to the tag name.
		 *
		 * Because tag names are limited to US-ASCII there is no
		 * need to perform any kind of Unicode normalization when
		 * comparing; any character which could be impacted by such
		 * normalization could not be part of a tag name.
		 */
		for ( $i = 0; $i < $tag_length; $i++ ) {
			$tag_char  = $tag_name[ $i ];
			$html_char = $html[ $at + $i ];

			if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
				$at += $i;
				continue 2;
			}
		}

		$at                        += $tag_length;
		$this->bytes_already_parsed = $at;

		/*
		 * Ensure that the tag name terminates to avoid matching on
		 * substrings of a longer tag name. For example, the sequence
		 * "</textarearug" should not match for "</textarea" even
		 * though "textarea" is found within the text.
		 *
		 * The bound checked at the top of the loop guarantees that
		 * this offset is inside the document.
		 */
		if ( false === strpos( " \t\f\r\n/>", $html[ $at ] ) ) {
			continue;
		}

		// A closing tag may carry attributes; step over them to reach its end.
		while ( $this->parse_next_attribute() ) {
			continue;
		}

		$at = $this->bytes_already_parsed;
		if ( $at >= $doc_length ) {
			return false;
		}

		if ( '>' === $html[ $at ] ) {
			$this->bytes_already_parsed = $at + 1;
			return true;
		}

		if ( $at + 1 >= $doc_length ) {
			return false;
		}

		if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) {
			$this->bytes_already_parsed = $at + 2;
			return true;
		}
	}

	return false;
}
/**
* Skips contents of script tags.
*
* @since 6.2.0
*
* @return bool Whether the script tag was closed before the end of the document.
*/
private function skip_script_data() {
$state = 'unescaped';
$html = $this->html;
$doc_length = strlen( $html );
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
$at += strcspn( $html, '-<', $at );
/*
* For all script states a "-->" transitions
* back into the normal unescaped script mode,
* even if that's the current state.
*/
if (
$at + 2 < $doc_length &&
'-' === $html[ $at ] &&
'-' === $html[ $at + 1 ] &&
'>' === $html[ $at + 2 ]
) {
$at += 3;
$state = 'unescaped';
continue;
}
// Everything of interest past here starts with "<".
if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) {
continue;
}
/*
* Unlike with "-->", the "`. Unlike other comment
* and bogus comment syntax, these leave no clear insertion point for text and
* they need to be modified specially in order to contain text. E.g. to store
* `?` as the modifiable text, the `` needs to become ``, which
* involves inserting an additional `-` into the token after the modifiable text.
*/
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
// Only provide modifiable text if the token is long enough to contain it.
if ( $span_of_dashes >= 2 ) {
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $span_of_dashes - 2;
}
$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
return true;
}
/*
* Comments may be closed by either a --> or an invalid --!>.
* The first occurrence closes the comment.
*
* See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
*/
--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
while ( ++$closer_at < $doc_length ) {
$closer_at = strpos( $html, '--', $closer_at );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->token_length = $closer_at + 3 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 3;
return true;
}
if (
$closer_at + 3 < $doc_length &&
'!' === $html[ $closer_at + 2 ] &&
'>' === $html[ $closer_at + 3 ]
) {
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->token_length = $closer_at + 4 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 4;
return true;
}
}
}
/*
* `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
* These are ASCII-case-insensitive.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
$doc_length > $at + 8 &&
( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
) {
$closer_at = strpos( $html, '>', $at + 9 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_DOCTYPE;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 9;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}
/*
* Anything else here is an incorrectly-opened comment and transitions
* to the bogus comment state - skip to the nearest >. If no closer is
* found then the HTML was truncated inside the markup declaration.
*/
$closer_at = strpos( $html, '>', $at + 1 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_INVALID_HTML;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
/*
* Identify nodes that would be CDATA if HTML had CDATA sections.
*
* This section must occur after identifying the bogus comment end
* because in an HTML parser it will span to the nearest `>`, even
* if there's no `]]>` as would be required in an XML document. It
* is therefore not possible to parse a CDATA section containing
* a `>` in the HTML syntax.
*
* Inside foreign elements there is a discrepancy between browsers
* and the specification on this.
*
* @todo Track whether the Tag Processor is inside a foreign element
* and require the proper closing `]]>` in those cases.
*/
if (
$this->token_length >= 10 &&
'[' === $html[ $this->token_starts_at + 2 ] &&
'C' === $html[ $this->token_starts_at + 3 ] &&
'D' === $html[ $this->token_starts_at + 4 ] &&
'A' === $html[ $this->token_starts_at + 5 ] &&
'T' === $html[ $this->token_starts_at + 6 ] &&
'A' === $html[ $this->token_starts_at + 7 ] &&
'[' === $html[ $this->token_starts_at + 8 ] &&
']' === $html[ $closer_at - 1 ] &&
']' === $html[ $closer_at - 2 ]
) {
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE;
$this->text_starts_at += 7;
$this->text_length -= 9;
}
return true;
}
/*
* `</>` is a missing end tag name, which is ignored.
*
* This was also known as the "presumptuous empty tag"
* in early discussions as it was proposed to close
* the nearest previous opening tag.
*
* See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
*/
if ( '>' === $html[ $at + 1 ] ) {
// `<>` is interpreted as plaintext.
if ( ! $this->is_closing_tag ) {
++$at;
continue;
}
$this->parser_state = self::STATE_PRESUMPTUOUS_TAG;
$this->token_length = $at + 2 - $this->token_starts_at;
$this->bytes_already_parsed = $at + 2;
return true;
}
/*
* `<?` transitions to a bogus comment state – skip to the nearest >
* See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if ( ! $this->is_closing_tag && '?' === $html[ $at + 1 ] ) {
$closer_at = strpos( $html, '>', $at + 2 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_INVALID_HTML;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
/*
* Identify a Processing Instruction node were HTML to have them.
*
* This section must occur after identifying the bogus comment end
* because in an HTML parser it will span to the nearest `>`, even
* if there's no `?>` as would be required in an XML document. It
* is therefore not possible to parse a Processing Instruction node
* containing a `>` in the HTML syntax.
*
* XML allows for more target names, but this code only identifies
* those with ASCII-representable target names. This means that it
* may identify some Processing Instruction nodes as bogus comments,
* but it will not misinterpret the HTML structure. By limiting the
* identification to these target names the Tag Processor can avoid
* the need to start parsing UTF-8 sequences.
*
* > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
* [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
* [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
* [#x10000-#xEFFFF]
* > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
*
* @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
*/
if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
$comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
if ( 0 < $pi_target_length ) {
$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
$this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE;
$this->tag_name_starts_at = $this->token_starts_at + 2;
$this->tag_name_length = $pi_target_length;
$this->text_starts_at += $pi_target_length;
$this->text_length -= $pi_target_length + 1;
}
}
return true;
}
/*
* If a non-alpha starts the tag name in a tag closer it's a comment.
* Find the first `>`, which closes the comment.
*
* This parser classifies these particular comments as special "funky comments"
* which are made available for further processing.
*
* See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
*/
if ( $this->is_closing_tag ) {
// No chance of finding a closer.
if ( $at + 3 > $doc_length ) {
return false;
}
$closer_at = strpos( $html, '>', $at + 2 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_FUNKY_COMMENT;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}
++$at;
}
return false;
}
/**
 * Parses the next attribute.
 *
 * Starting at the current cursor this reads one attribute name and, when an
 * `=` follows, its quoted or unquoted value, leaving the cursor just past
 * what was read. On an opening tag the first occurrence of each name is
 * recorded in the attribute index and later occurrences are recorded as
 * duplicate spans; on a closing tag nothing is recorded.
 *
 * Whenever the document ends before the attribute is complete the parser
 * state is set to `STATE_INCOMPLETE_INPUT`.
 *
 * @since 6.2.0
 *
 * @return bool Whether an attribute was found before the end of the document.
 */
private function parse_next_attribute() {
	$html       = $this->html;
	$doc_length = strlen( $html );

	// Whitespace and slashes between attributes carry no meaning; step over them.
	$this->bytes_already_parsed += strspn( $html, " \t\f\r\n/", $this->bytes_already_parsed );
	if ( $this->bytes_already_parsed >= $doc_length ) {
		$this->parser_state = self::STATE_INCOMPLETE_INPUT;

		return false;
	}

	$attribute_start = $this->bytes_already_parsed;

	/*
	 * Treat the equal sign as a part of the attribute
	 * name if it is the first encountered byte.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
	 */
	if ( '=' === $html[ $attribute_start ] ) {
		$name_length = 1 + strcspn( $html, "=/> \t\f\r\n", $attribute_start + 1 );
	} else {
		$name_length = strcspn( $html, "=/> \t\f\r\n", $attribute_start );
	}

	// An empty name means the cursor sits on the tag closer rather than on an attribute.
	if ( 0 === $name_length ) {
		return false;
	}

	// A name running up to the end of the document cannot belong to a finished tag.
	if ( $attribute_start + $name_length >= $doc_length ) {
		return false;
	}

	$attribute_name             = substr( $html, $attribute_start, $name_length );
	$this->bytes_already_parsed = $attribute_start + $name_length;
	if ( $this->bytes_already_parsed >= $doc_length ) {
		$this->parser_state = self::STATE_INCOMPLETE_INPUT;

		return false;
	}

	$this->skip_whitespace();
	if ( $this->bytes_already_parsed >= $doc_length ) {
		$this->parser_state = self::STATE_INCOMPLETE_INPUT;

		return false;
	}

	$has_value = '=' === $html[ $this->bytes_already_parsed ];

	if ( ! $has_value ) {
		// A bare name: the attribute's span covers the name only.
		$value_start   = $this->bytes_already_parsed;
		$value_length  = 0;
		$attribute_end = $attribute_start + $name_length;
	} else {
		++$this->bytes_already_parsed;
		$this->skip_whitespace();
		if ( $this->bytes_already_parsed >= $doc_length ) {
			$this->parser_state = self::STATE_INCOMPLETE_INPUT;

			return false;
		}

		$opener = $html[ $this->bytes_already_parsed ];

		if ( "'" === $opener || '"' === $opener ) {
			// A quoted value runs to the matching quote, which is part of the attribute's span.
			$value_start   = $this->bytes_already_parsed + 1;
			$value_length  = strcspn( $html, $opener, $value_start );
			$attribute_end = $value_start + $value_length + 1;
		} else {
			// An unquoted value runs to the next whitespace or the end of the tag.
			$value_start   = $this->bytes_already_parsed;
			$value_length  = strcspn( $html, "> \t\f\r\n", $value_start );
			$attribute_end = $value_start + $value_length;
		}

		$this->bytes_already_parsed = $attribute_end;
	}

	if ( $attribute_end >= $doc_length ) {
		$this->parser_state = self::STATE_INCOMPLETE_INPUT;

		return false;
	}

	// Attributes on a closing tag are scanned past but never recorded.
	if ( $this->is_closing_tag ) {
		return true;
	}

	/*
	 * > There must never be two or more attributes on
	 * > the same start tag whose names are an ASCII
	 * > case-insensitive match for each other.
	 *     - HTML 5 spec
	 *
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
	 */
	$comparable_name  = strtolower( $attribute_name );
	$attribute_length = $attribute_end - $attribute_start;

	// If an attribute is listed many times, only use the first declaration and ignore the rest.
	if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
		$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
			$attribute_name,
			$value_start,
			$value_length,
			$attribute_start,
			$attribute_length,
			! $has_value
		);

		return true;
	}

	/*
	 * Track the duplicate attributes so if we remove it, all disappear together.
	 *
	 * The duplicate index stays `null` until the first duplicate shows up,
	 * which avoids an allocation in the normative case of parsing tags with
	 * no duplicate attributes.
	 */
	$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_length );
	if ( null === $this->duplicate_attributes ) {
		$this->duplicate_attributes = array();
	}
	$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;

	return true;
}
/**
* Move the internal cursor past any immediate successive whitespace.
*
* @since 6.2.0
*/
private function skip_whitespace() {
	// Measure the run of space, tab, form feed, CR, and LF bytes sitting at the cursor.
	$whitespace_length = strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed );

	// Step the cursor past that run; a zero-length run leaves it untouched.
	$this->bytes_already_parsed += $whitespace_length;
}
/**
* Applies attribute updates and cleans up once a tag is fully parsed.
*
* @since 6.2.0
*/
private function after_tag() {
	/*
	 * Class changes are tracked by name and an attribute with the same
	 * name may exist on the next tag as well. Convert them into plain
	 * attribute updates now so they stay attached to the tag being left.
	 */
	$this->class_name_updates_to_attributes_updates();

	/*
	 * Purge updates if there are too many. The actual count isn't
	 * scientific, but a few values from 100 to a few thousand were
	 * tested to find a practically-useful limit.
	 *
	 * If the update queue grows too big, then the Tag Processor
	 * will spend more time iterating through them and lose the
	 * efficiency gains of deferring applying them.
	 */
	if ( count( $this->lexical_updates ) > 1000 ) {
		$this->get_updated_html();
	}

	foreach ( $this->lexical_updates as $update_key => $pending_update ) {
		/*
		 * An update located at or beyond the cursor must be applied
		 * before moving on, otherwise it may be overlooked.
		 */
		if ( $pending_update->start >= $this->bytes_already_parsed ) {
			$this->get_updated_html();
			break;
		}

		/*
		 * Updates keyed by attribute name are re-enqueued under an integer
		 * key so they cannot be confused with an update for a same-named
		 * attribute on a later tag. Integer-keyed updates are left alone.
		 */
		if ( ! is_int( $update_key ) ) {
			$this->lexical_updates[] = $pending_update;
			unset( $this->lexical_updates[ $update_key ] );
		}
	}

	// Forget where the previous token and its tag name were located.
	$this->token_starts_at    = null;
	$this->token_length       = null;
	$this->tag_name_starts_at = null;
	$this->tag_name_length    = null;

	// Reset the modifiable text span.
	$this->text_starts_at = 0;
	$this->text_length    = 0;

	// Clear everything else that described the previous token.
	$this->is_closing_tag       = null;
	$this->attributes           = array();
	$this->comment_type         = null;
	$this->duplicate_attributes = null;
}
/**
* Converts class name updates into tag attributes updates
* (they are accumulated in different data formats for performance).
*
* @since 6.2.0
*
* @see WP_HTML_Tag_Processor::$lexical_updates
* @see WP_HTML_Tag_Processor::$classname_updates
*/
private function class_name_updates_to_attributes_updates() {
	if ( 0 === count( $this->classname_updates ) ) {
		return;
	}

	/*
	 * Pick the class value to start from:
	 *  - a string value already enqueued for `class` wins;
	 *  - with nothing enqueued (`false`), fall back to the value in the document;
	 *  - an enqueued removal (`null`) or boolean update (`true`) starts empty.
	 */
	$enqueued_class = $this->get_enqueued_attribute_value( 'class' );
	if ( is_string( $enqueued_class ) ) {
		$existing_class = $enqueued_class;
	} elseif ( false === $enqueued_class && isset( $this->attributes['class'] ) ) {
		$existing_class = substr(
			$this->html,
			$this->attributes['class']->value_starts_at,
			$this->attributes['class']->value_length
		);

		if ( false === $existing_class ) {
			$existing_class = '';
		}
	} else {
		$existing_class = '';
	}

	// The new `class` value, assembled piece by piece; only complete once both passes below finish.
	$rebuilt_class = '';

	// Byte offset of the scan within the existing class value.
	$cursor = 0;

	/*
	 * Whether the rebuilt value differs from the existing one. Adding a class
	 * that is already present, or removing one that is absent, changes nothing,
	 * and in that case no attribute update is enqueued at all.
	 */
	$has_changes = false;

	// First pass: copy across every existing class that is not marked for removal.
	$class_end = strlen( $existing_class );
	while ( $cursor < $class_end ) {
		// Measure the whitespace ahead of the next class name.
		$gap_at     = $cursor;
		$gap_length = strspn( $existing_class, " \t\f\r\n", $gap_at );
		$cursor    += $gap_length;

		// The class name is everything up to the next whitespace byte.
		$token_length = strcspn( $existing_class, " \t\f\r\n", $cursor );
		if ( 0 === $token_length ) {
			// Only trailing whitespace was left.
			break;
		}

		$token   = substr( $existing_class, $cursor, $token_length );
		$cursor += $token_length;

		$is_removed = (
			isset( $this->classname_updates[ $token ] ) &&
			self::REMOVE_CLASS === $this->classname_updates[ $token ]
		);

		if ( $is_removed ) {
			$has_changes = true;
			continue;
		}

		// Mark the class as seen so that a pending addition of it is not appended again.
		$this->classname_updates[ $token ] = self::SKIP_CLASS;

		/*
		 * Keep the original whitespace in front of the class rather than
		 * normalizing it to a single space; this produces fewer changes
		 * to the HTML and a clearer before/after when debugging.
		 */
		$rebuilt_class .= substr( $existing_class, $gap_at, $gap_length ) . $token;
	}

	// Second pass: append every class still marked for addition.
	foreach ( $this->classname_updates as $added_name => $operation ) {
		if ( self::ADD_CLASS !== $operation ) {
			continue;
		}

		$has_changes = true;

		if ( '' !== $rebuilt_class ) {
			$rebuilt_class .= ' ';
		}

		$rebuilt_class .= $added_name;
	}

	$this->classname_updates = array();

	if ( ! $has_changes ) {
		return;
	}

	if ( '' !== $rebuilt_class ) {
		$this->set_attribute( 'class', $rebuilt_class );
	} else {
		$this->remove_attribute( 'class' );
	}
}
/**
* Applies attribute updates to HTML document.
*
* @since 6.2.0
* @since 6.2.1 Accumulates shift for internal cursor and passed pointer.
* @since 6.3.0 Invalidate any bookmarks whose targets are overwritten.
*
* @param int $shift_this_point Accumulate and return shift for this position.
* @return int How many bytes the given pointer moved in response to the updates.
*/
private function apply_attributes_updates( $shift_this_point ) {
	if ( 0 === count( $this->lexical_updates ) ) {
		return 0;
	}

	$accumulated_shift_for_given_point = 0;

	/*
	 * Attribute updates can be enqueued in any order but updates
	 * to the document must occur in lexical order; that is, each
	 * replacement must be made before all others which follow it
	 * at later string indices in the input document.
	 *
	 * Sorting avoids making out-of-order replacements which
	 * can lead to mangled output, partially-duplicated
	 * attributes, and overwritten attributes.
	 */
	usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) );

	$bytes_already_copied = 0;
	$output_buffer        = '';
	foreach ( $this->lexical_updates as $diff ) {
		$shift = strlen( $diff->text ) - $diff->length;

		// Adjust the cursor position by however much an update affects it.
		if ( $diff->start < $this->bytes_already_parsed ) {
			$this->bytes_already_parsed += $shift;
		}

		/*
		 * Accumulate the shift of the given pointer within this function call.
		 *
		 * Only replacements which start strictly before the pointer move it.
		 * A replacement starting exactly at the pointer rewrites what follows
		 * the pointer and leaves the pointer itself where it was, which is the
		 * same rule applied to the cursor above.
		 */
		if ( $diff->start < $shift_this_point ) {
			$accumulated_shift_for_given_point += $shift;
		}

		// Copy the untouched bytes before this replacement, then the replacement itself.
		$output_buffer       .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied );
		$output_buffer       .= $diff->text;
		$bytes_already_copied = $diff->start + $diff->length;
	}

	// Everything after the final replacement is copied verbatim.
	$this->html = $output_buffer . substr( $this->html, $bytes_already_copied );

	/*
	 * Adjust bookmark locations to account for how the text
	 * replacements adjust offsets in the input document.
	 */
	foreach ( $this->bookmarks as $bookmark_name => $bookmark ) {
		$bookmark_end = $bookmark->start + $bookmark->length;

		/*
		 * Each lexical update which appears before the bookmark's endpoints
		 * might shift the offsets for those endpoints. Loop through each change
		 * and accumulate the total shift for each bookmark, then apply that
		 * shift after tallying the full delta.
		 */
		$head_delta = 0;
		$tail_delta = 0;

		foreach ( $this->lexical_updates as $diff ) {
			$diff_end = $diff->start + $diff->length;

			// Updates are sorted, so once one starts past the bookmark none of the rest can affect it.
			if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) {
				break;
			}

			// A bookmark lying inside a replaced span has lost its target; release it.
			if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) {
				$this->release_bookmark( $bookmark_name );
				continue 2;
			}

			$delta = strlen( $diff->text ) - $diff->length;

			if ( $bookmark->start >= $diff->start ) {
				$head_delta += $delta;
			}

			if ( $bookmark_end >= $diff_end ) {
				$tail_delta += $delta;
			}
		}

		$bookmark->start  += $head_delta;
		$bookmark->length += $tail_delta - $head_delta;
	}

	$this->lexical_updates = array();

	return $accumulated_shift_for_given_point;
}
/**
* Checks whether a bookmark with the given name exists.
*
* @since 6.3.0
*
* @param string $bookmark_name Name to identify a bookmark that potentially exists.
* @return bool Whether that bookmark exists.
*/
public function has_bookmark( $bookmark_name ) {
	// Look the name up among the keys of the bookmark collection.
	$bookmark_exists = array_key_exists( $bookmark_name, $this->bookmarks );

	return $bookmark_exists;
}
/**
* Move the internal cursor in the Tag Processor to a given bookmark's location.
*
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
* @since 6.2.0
*
* @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
* @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
public function seek( $bookmark_name ) {
	$is_known_bookmark = array_key_exists( $bookmark_name, $this->bookmarks );
	if ( ! $is_known_bookmark ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Unknown bookmark name.' ),
			'6.2.0'
		);
		return false;
	}

	// Every attempt counts toward the limit, including the one that exceeds it.
	++$this->seek_count;
	if ( $this->seek_count > static::MAX_SEEK_OPS ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Too many calls to seek() - this can lead to performance issues.' ),
			'6.2.0'
		);
		return false;
	}

	// Pending updates are flushed first; the bookmark's offset is read only afterwards.
	$this->get_updated_html();

	// Place the cursor at the start of the bookmarked token and parse it again.
	$sought_bookmark            = $this->bookmarks[ $bookmark_name ];
	$this->bytes_already_parsed = $sought_bookmark->start;
	$this->parser_state         = self::STATE_READY;

	return $this->next_token();
}
/**
* Compare two WP_HTML_Text_Replacement objects.
*
* @since 6.2.0
*
* @param WP_HTML_Text_Replacement $a First attribute update.
* @param WP_HTML_Text_Replacement $b Second attribute update.
* @return int Comparison value for string order.
*/
private static function sort_start_ascending( $a, $b ) {
	// Primary ordering: position in the document.
	$start_difference = $a->start - $b->start;
	if ( 0 !== $start_difference ) {
		return $start_difference;
	}

	// Secondary ordering: replacement text, compared only when both replacements carry text.
	if ( isset( $a->text, $b->text ) ) {
		$text_order = strcmp( $a->text, $b->text );
		if ( 0 !== $text_order ) {
			return $text_order;
		}
	}

	/*
	 * Reaching this point implies the two replacements start at the
	 * same location and contain the same text, which should not happen.
	 * Fall back to comparing the replaced lengths.
	 */
	return $a->length - $b->length;
}
/**
* Return the enqueued value for a given attribute, if one exists.
*
* Enqueued updates can take different data types:
* - If an update is enqueued and is boolean, the return will be `true`
* - If an update is otherwise enqueued, the return will be the string value of that update.
* - If an attribute is enqueued to be removed, the return will be `null` to indicate that.
* - If no updates are enqueued, the return will be `false` to differentiate from "removed."
*
* @since 6.2.0
*
* @param string $comparable_name The attribute name in its comparable form.
* @return string|boolean|null Value of enqueued update if present, otherwise false.
*/
private function get_enqueued_attribute_value( $comparable_name ) {
	$has_enqueued_update = (
		self::STATE_MATCHED_TAG === $this->parser_state &&
		isset( $this->lexical_updates[ $comparable_name ] )
	);

	if ( ! $has_enqueued_update ) {
		return false;
	}

	$replacement_text = $this->lexical_updates[ $comparable_name ]->text;

	// A removal replaces the attribute's whole span with nothing.
	if ( '' === $replacement_text ) {
		return null;
	}

	/*
	 * A boolean update is only the attribute name, without a value.
	 *
	 * That text can differ from the comparable name: it may carry leading
	 * whitespace, and its casing is whatever was passed to `set_attribute`.
	 *
	 * Example:
	 *
	 *     $p->set_attribute( 'data-TEST-id', 'update' );
	 *     'update' === $p->get_enqueued_attribute_value( 'data-test-id' );
	 *
	 * The two forms are told apart by the `=`, which _must_ exist in any
	 * attribute containing a value, e.g. `<input type="text" enabled />`.
	 *                                            ¹           ²
	 *     1. Attribute with a string value.
	 *     2. Boolean attribute whose value is `true`.
	 */
	$equals_at = strpos( $replacement_text, '=' );
	if ( false === $equals_at ) {
		return true;
	}

	/*
	 * Otherwise the value follows the `=` and is double-quoted,
	 * as performed incidentally by `set_attribute`.
	 *
	 * e.g. `type="text"`
	 *           ¹²    ³
	 *     1. Equals is here.
	 *     2. Double-quoting starts one after the equals sign.
	 *     3. Double-quoting ends at the last character in the update.
	 */
	$quoted_value_starts_at = $equals_at + 2;
	$raw_value              = substr( $replacement_text, $quoted_value_starts_at, -1 );

	return WP_HTML_Decoder::decode_attribute( $raw_value );
}
/**
* Returns the value of a requested attribute from a matched tag opener if that attribute exists.
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div enabled class="test" data-test-id="14">Test</div>' );
* $p->next_tag( array( 'class_name' => 'test' ) ) === true;
* $p->get_attribute( 'data-test-id' ) === '14';
* $p->get_attribute( 'enabled' ) === true;
* $p->get_attribute( 'aria-label' ) === null;
*
* $p->next_tag() === false;
* $p->get_attribute( 'class' ) === null;
*
* @since 6.2.0
*
* @param string $name Name of attribute whose value is requested.
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/
public function get_attribute( $name ) {
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return null;
	}

	// Attribute names are matched ASCII case-insensitively, so work with the lowercase form.
	$comparable = strtolower( $name );

	/*
	 * For every attribute other than `class` it's possible to perform a quick check if
	 * there's an enqueued lexical update whose value takes priority over what's found in
	 * the input document.
	 *
	 * The `class` attribute is special though because of the exposed helpers `add_class`
	 * and `remove_class`. These form a builder for the `class` attribute, so an additional
	 * check for enqueued class changes is required in addition to the check for any enqueued
	 * attribute values. If any exist, those enqueued class changes must first be flushed out
	 * into an attribute value update.
	 *
	 * The comparison uses the lowercase name so that `CLASS`, `Class`, etc.
	 * see the pending class changes just as `class` does.
	 */
	if ( 'class' === $comparable ) {
		$this->class_name_updates_to_attributes_updates();
	}

	// Return any enqueued attribute value updates if they exist.
	$enqueued_value = $this->get_enqueued_attribute_value( $comparable );
	if ( false !== $enqueued_value ) {
		return $enqueued_value;
	}

	if ( ! isset( $this->attributes[ $comparable ] ) ) {
		return null;
	}

	$attribute = $this->attributes[ $comparable ];

	/*
	 * This flag distinguishes an attribute with no value
	 * from an attribute with an empty string value. For
	 * unquoted attributes this could look very similar.
	 * It refers to whether an `=` follows the name.
	 *
	 * e.g. <div boolean-attribute empty-attribute=></div>
	 *           ¹                 ²
	 *     1. Attribute `boolean-attribute` is `true`.
	 *     2. Attribute `empty-attribute` is `""`.
	 */
	if ( true === $attribute->is_true ) {
		return true;
	}

	// The raw bytes from the document still contain character references; decode them.
	$raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );

	return WP_HTML_Decoder::decode_attribute( $raw_value );
}
/**
* Gets lowercase names of all attributes matching a given prefix in the current tag.
*
* Note that matching is case-insensitive. This is in accordance with the spec:
*
* > There must never be two or more attributes on
* > the same start tag whose names are an ASCII
* > case-insensitive match for each other.
* - HTML 5 spec
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' );
* $p->next_tag( array( 'class_name' => 'test' ) ) === true;
* $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' );
*
* $p->next_tag() === false;
* $p->get_attribute_names_with_prefix( 'data-' ) === null;
*
* @since 6.2.0
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
* @param string $prefix Prefix of requested attribute names.
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ) {
	$is_on_tag_opener = (
		self::STATE_MATCHED_TAG === $this->parser_state &&
		! $this->is_closing_tag
	);

	if ( ! $is_on_tag_opener ) {
		return null;
	}

	// Attributes are stored under lowercase names, so lowercase the prefix before comparing.
	$comparable = strtolower( $prefix );

	$matching_names = array_filter(
		array_keys( $this->attributes ),
		static function ( $attr_name ) use ( $comparable ) {
			return str_starts_with( $attr_name, $comparable );
		}
	);

	// Filtering keeps the original indices; renumber so the result is a plain list.
	return array_values( $matching_names );
}
/**
* Returns the uppercase name of the matched tag.
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
* $p->next_tag() === true;
* $p->get_tag() === 'DIV';
*
* $p->next_tag() === false;
* $p->get_tag() === null;
*
* @since 6.2.0
*
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
public function get_tag() {
	// Without a recorded tag name span there is nothing to report.
	if ( null === $this->tag_name_starts_at ) {
		return null;
	}

	$raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );

	// Tag names are reported in uppercase.
	if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
		return strtoupper( $raw_name );
	}

	// A comment that resembles a processing instruction reports its name exactly as written.
	$is_pi_lookalike = (
		self::STATE_COMMENT === $this->parser_state &&
		self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type()
	);

	return $is_pi_lookalike ? $raw_name : null;
}
/**
* Indicates if the currently matched tag contains the self-closing flag.
*
* No HTML elements ought to have the self-closing flag and for those, the self-closing
* flag will be ignored. For void elements this is benign because they "self close"
* automatically. For non-void HTML elements though problems will appear if someone
* intends to use a self-closing element in place of that element with an empty body.
* For HTML foreign elements and custom elements the self-closing flag determines if
* they self-close or not.
*
* This function does not determine if a tag is self-closing,
* but only if the self-closing flag is present in the syntax.
*
* @since 6.3.0
*
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag() {
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return false;
	}

	/*
	 * The self-closing flag is the solidus at the _end_ of the tag, not the beginning.
	 *
	 * Example:
	 *
	 *     <figure />
	 *             ^ this appears one character before the closing ">".
	 */
	$token_ends_at       = $this->token_starts_at + $this->token_length;
	$possible_solidus_at = $token_ends_at - 2;

	return '/' === $this->html[ $possible_solidus_at ];
}
/**
* Indicates if the current tag token is a tag closer.
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div></div>' );
* $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
* $p->is_tag_closer() === false;
*
* $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
* $p->is_tag_closer() === true;
*
* @since 6.2.0
*
* @return bool Whether the current tag is a tag closer.
*/
public function is_tag_closer() {
	// Only a matched tag can be a closer; every other token type is not.
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return false;
	}

	return (bool) $this->is_closing_tag;
}
/**
* Indicates the kind of matched token, if any.
*
* This differs from `get_token_name()` in that it always
* returns a static string indicating the type, whereas
* `get_token_name()` may return values derived from the
* token itself, such as a tag name or processing
* instruction tag.
*
* Possible values:
* - `#tag` when matched on a tag.
* - `#text` when matched on a text node.
* - `#cdata-section` when matched on a CDATA node.
* - `#comment` when matched on a comment.
* - `#doctype` when matched on a DOCTYPE declaration.
* - `#presumptuous-tag` when matched on an empty tag closer.
* - `#funky-comment` when matched on a funky comment.
*
* @since 6.5.0
*
* @return string|null What kind of token is matched, or null.
*/
public function get_token_type() {
	// Tags report a fixed type instead of their tag name.
	if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
		return '#tag';
	}

	// DOCTYPE declarations report a fixed type instead of the `html` node name.
	if ( self::STATE_DOCTYPE === $this->parser_state ) {
		return '#doctype';
	}

	// For every other state the token name is used as the type.
	return $this->get_token_name();
}
/**
* Returns the node name represented by the token.
*
* This matches the DOM API value `nodeName`. Some values
* are static, such as `#text` for a text node, while others
* are dynamically generated from the token itself.
*
* Dynamic names:
* - Uppercase tag name for tag matches.
* - `html` for DOCTYPE declarations.
*
* Note that if the Tag Processor is not matched on a token
* then this function will return `null`, either because it
* hasn't yet found a token or because it reached the end
* of the document without matching a token.
*
* @since 6.5.0
*
* @return string|null Name of the matched token.
*/
public function get_token_name() {
	// Stays `null` when the parser state matches none of the cases below.
	$token_name = null;

	switch ( $this->parser_state ) {
		case self::STATE_MATCHED_TAG:
			// The only dynamic name: derived from the tag itself.
			$token_name = $this->get_tag();
			break;

		case self::STATE_TEXT_NODE:
			$token_name = '#text';
			break;

		case self::STATE_CDATA_NODE:
			$token_name = '#cdata-section';
			break;

		case self::STATE_COMMENT:
			$token_name = '#comment';
			break;

		case self::STATE_DOCTYPE:
			$token_name = 'html';
			break;

		case self::STATE_PRESUMPTUOUS_TAG:
			$token_name = '#presumptuous-tag';
			break;

		case self::STATE_FUNKY_COMMENT:
			$token_name = '#funky-comment';
			break;
	}

	return $token_name;
}
/**
* Indicates what kind of comment produced the comment node.
*
* Because there are different kinds of HTML syntax which produce
* comments, the Tag Processor tracks and exposes this as a type
* for the comment. Nominally only regular HTML comments exist as
* they are commonly known, but a number of unrelated syntax errors
* also produce comments.
*
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
* @see self::COMMENT_AS_CDATA_LOOKALIKE
* @see self::COMMENT_AS_INVALID_HTML
* @see self::COMMENT_AS_HTML_COMMENT
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
*
* @since 6.5.0
*
* @return string|null
*/
public function get_comment_type() {
	// A comment type is only reported while a comment token is matched.
	$is_on_comment = self::STATE_COMMENT === $this->parser_state;

	return $is_on_comment ? $this->comment_type : null;
}
/**
* Returns the modifiable text for a matched token, or an empty string.
*
* Modifiable text is text content that may be read and changed without
* changing the HTML structure of the document around it. This includes
* the contents of `#text` nodes in the HTML as well as the inner
* contents of HTML comments, Processing Instructions, and others, even
* though these nodes aren't part of a parsed DOM tree. They also contain
* the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
* other section in an HTML document which cannot contain HTML markup (DATA).
*
* If a token has no modifiable text then an empty string is returned to
* avoid needless crashing or type errors. An empty string does not mean
* that a token has modifiable text, and a token with modifiable text may
* have an empty string (e.g. a comment with no contents).
*
* @since 6.5.0
*
* @return string
*/
public function get_modifiable_text() {
	if ( null === $this->text_starts_at ) {
		return '';
	}

	$raw_text = substr( $this->html, $this->text_starts_at, $this->text_length );

	// CDATA, comment, DOCTYPE, and funky comment contents are returned without decoding.
	$undecoded_states = array(
		self::STATE_CDATA_NODE,
		self::STATE_COMMENT,
		self::STATE_DOCTYPE,
		self::STATE_FUNKY_COMMENT,
	);

	if ( in_array( $this->parser_state, $undecoded_states, true ) ) {
		return $raw_text;
	}

	$tag_name = $this->get_tag();

	// Script data and RAWTEXT element contents are returned without decoding.
	$undecoded_tags = array(
		'SCRIPT',
		'IFRAME',
		'NOEMBED',
		'NOFRAMES',
		'STYLE',
		'XMP',
	);

	if ( in_array( $tag_name, $undecoded_tags, true ) ) {
		return $raw_text;
	}

	$decoded_text = WP_HTML_Decoder::decode_text_node( $raw_text );

	/*
	 * TEXTAREA skips a leading newline, but this newline may appear not only as the
	 * literal character `\n`, but also as a character reference, such as in the
	 * following markup: `<textarea>&#x0a;Content</textarea>`.
	 *
	 * That is why the check happens on the decoded text rather than the raw text.
	 */
	$skips_leading_newline = (
		self::STATE_MATCHED_TAG === $this->parser_state &&
		'TEXTAREA' === $tag_name &&
		strlen( $decoded_text ) > 0 &&
		"\n" === $decoded_text[0]
	);

	if ( $skips_leading_newline ) {
		return substr( $decoded_text, 1 );
	}

	return $decoded_text;
}
/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
* For boolean attributes special handling is provided:
* - When `true` is passed as the value, then only the attribute name is added to the tag.
* - When `false` is passed, the attribute gets removed if it existed before.
*
* For string attributes, the value is escaped using the `esc_attr` function.
*
* @since 6.2.0
* @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names.
*
* @param string $name The attribute name to target.
* @param string|bool $value The new attribute value.
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ) {
	// Attributes can only be changed on a matched tag opener.
	if (
		self::STATE_MATCHED_TAG !== $this->parser_state ||
		$this->is_closing_tag
	) {
		return false;
	}

	/*
	 * WordPress rejects more characters than are strictly forbidden
	 * in HTML5. This is to prevent additional security risks deeper
	 * in the WordPress and plugin stack. Specifically the
	 * less-than (<) greater-than (>) and ampersand (&) aren't allowed.
	 *
	 * The solidus (/) is rejected as well: the HTML specification forbids
	 * it in attribute names, and a name containing one would not survive
	 * being written into the tag as a single attribute.
	 *
	 * The use of a PCRE match enables looking for specific Unicode
	 * code points without writing a UTF-8 decoder. Whereas scanning
	 * for one-byte characters is trivial (with `strcspn`), scanning
	 * for the longer byte sequences would be more complicated. Given
	 * that this shouldn't be in the hot path for execution, it's a
	 * reasonable compromise in efficiency without introducing a
	 * noticeable impact on the overall system.
	 *
	 * @see https://html.spec.whatwg.org/#attributes-2
	 *
	 * @todo As the only regex pattern maybe we should take it out?
	 *       Are Unicode patterns available broadly in Core?
	 */
	if ( preg_match(
		'~[' .
			// Syntax-like characters.
			'"\'>&</ =' .
			// Control characters.
			'\x{00}-\x{1F}' .
			// HTML noncharacters.
			'\x{FDD0}-\x{FDEF}' .
			'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
			'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
			'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
			'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
			'\x{10FFFE}\x{10FFFF}' .
		']~Ssu',
		$name
	) ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Invalid attribute name.' ),
			'6.2.0'
		);

		return false;
	}

	/*
	 * > The values "true" and "false" are not allowed on boolean attributes.
	 * > To represent a false value, the attribute has to be omitted altogether.
	 *     - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes
	 */
	if ( false === $value ) {
		return $this->remove_attribute( $name );
	}

	/*
	 * > There must never be two or more attributes on
	 * > the same start tag whose names are an ASCII
	 * > case-insensitive match for each other.
	 *     - HTML 5 spec
	 *
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
	 */
	$comparable_name = strtolower( $name );

	if ( true === $value ) {
		// A boolean attribute is written as its bare name.
		$updated_attribute = $name;
	} else {
		/*
		 * Escape URL attributes.
		 *
		 * @see https://html.spec.whatwg.org/#attributes-3
		 */
		$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true ) ? esc_url( $value ) : esc_attr( $value );

		// If the escaping functions wiped out the update, reject it and indicate it was rejected.
		if ( '' === $escaped_new_value && '' !== $value ) {
			return false;
		}

		$updated_attribute = "{$name}=\"{$escaped_new_value}\"";
	}

	if ( isset( $this->attributes[ $comparable_name ] ) ) {
		/*
		 * Update an existing attribute.
		 *
		 * Example – set attribute id to "new" in <div id="initial_id" />:
		 *
		 *     <div id="initial_id"/>
		 *          ^-------------^
		 *          start         end
		 *     replacement: `id="new"`
		 *
		 *     Result: <div id="new"/>
		 */
		$existing_attribute                        = $this->attributes[ $comparable_name ];
		$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
			$existing_attribute->start,
			$existing_attribute->length,
			$updated_attribute
		);
	} else {
		/*
		 * Create a new attribute at the tag's name end.
		 *
		 * Example – add attribute id="new" to <div />:
		 *
		 *     <div/>
		 *         ^
		 *         start and end
		 *     replacement: ` id="new"`
		 *
		 *     Result: <div id="new"/>
		 */
		$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
			$this->tag_name_starts_at + $this->tag_name_length,
			0,
			' ' . $updated_attribute
		);
	}

	/*
	 * Any calls to update the `class` attribute directly should wipe out any
	 * enqueued class changes from `add_class` and `remove_class`.
	 */
	if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) {
		$this->classname_updates = array();
	}

	return true;
}
/**
* Remove an attribute from the currently-matched tag.
*
* @since 6.2.0
*
* @param string $name The attribute name to remove.
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ) {
	// Attributes can only be changed on a matched tag opener.
	if (
		self::STATE_MATCHED_TAG !== $this->parser_state ||
		$this->is_closing_tag
	) {
		return false;
	}

	/*
	 * > There must never be two or more attributes on
	 * > the same start tag whose names are an ASCII
	 * > case-insensitive match for each other.
	 *     - HTML 5 spec
	 *
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
	 */
	$name = strtolower( $name );

	/*
	 * Any calls to update the `class` attribute directly should wipe out any
	 * enqueued class changes from `add_class` and `remove_class`.
	 */
	if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) {
		$this->classname_updates = array();
	}

	/*
	 * If updating an attribute that didn't exist in the input
	 * document, then remove the enqueued update and move on.
	 *
	 * For example, this might occur when calling `remove_attribute()`
	 * after calling `set_attribute()` for the same attribute
	 * and when that attribute wasn't originally present.
	 */
	if ( ! isset( $this->attributes[ $name ] ) ) {
		if ( isset( $this->lexical_updates[ $name ] ) ) {
			unset( $this->lexical_updates[ $name ] );
		}
		return false;
	}

	/*
	 * Removes an existing tag attribute.
	 *
	 * Example – remove the attribute id from <div id="main"/>:
	 *
	 *     <div id="initial_id"/>
	 *          ^-------------^
	 *          start         end
	 *     replacement: ``
	 *
	 *     Result: <div />
	 */
	$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
		$this->attributes[ $name ]->start,
		$this->attributes[ $name ]->length,
		''
	);

	/*
	 * Removes any duplicated attributes if they were also present.
	 *
	 * These removals are enqueued under integer keys, so unlike the named
	 * update above they are not overwritten when this method runs again for
	 * the same attribute. Two identical replacements in the queue would be
	 * applied twice to the same span and corrupt the output, so a duplicate
	 * whose removal is already enqueued is skipped.
	 */
	if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
		foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
			foreach ( $this->lexical_updates as $enqueued_update ) {
				if (
					'' === $enqueued_update->text &&
					$enqueued_update->start === $attribute_token->start &&
					$enqueued_update->length === $attribute_token->length
				) {
					continue 2;
				}
			}

			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
				$attribute_token->start,
				$attribute_token->length,
				''
			);
		}
	}

	return true;
}
/**
* Adds a new class name to the currently matched tag.
*
* @since 6.2.0
*
* @param string $class_name The class name to add.
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ) {
	// Classes can only be changed on a matched tag opener.
	$is_on_tag_opener = (
		self::STATE_MATCHED_TAG === $this->parser_state &&
		! $this->is_closing_tag
	);

	if ( ! $is_on_tag_opener ) {
		return false;
	}

	// Record the request; a later request for the same class name replaces this one.
	$this->classname_updates[ $class_name ] = self::ADD_CLASS;

	return true;
}
/**
 * Enqueues the removal of a class name from the currently matched tag.
 *
 * The request is recorded in the pending class name updates, keyed by the
 * class name. Nothing is recorded unless the processor is paused on a tag
 * and that tag is not a tag closer.
 *
 * @since 6.2.0
 *
 * @param string $class_name The class name to remove.
 * @return bool Whether the class was set to be removed.
 */
public function remove_class( $class_name ) {
	$is_on_tag_opener = self::STATE_MATCHED_TAG === $this->parser_state && ! $this->is_closing_tag;

	// Classes can only be changed while paused on a tag which is not a closer.
	if ( ! $is_on_tag_opener ) {
		return false;
	}

	/*
	 * Without a known tag name position no removal is enqueued,
	 * though the call is still reported as successful.
	 */
	if ( null === $this->tag_name_starts_at ) {
		return true;
	}

	$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;

	return true;
}
/**
 * Returns the string representation of the HTML Tag Processor.
 *
 * Casting the processor to a string is equivalent to calling
 * `get_updated_html()`; this method only delegates to it.
 *
 * @since 6.2.0
 *
 * @see WP_HTML_Tag_Processor::get_updated_html()
 *
 * @return string The processed HTML.
 */
public function __toString() {
return $this->get_updated_html();
}
/**
 * Returns the HTML document with every enqueued update applied.
 *
 * @since 6.2.0
 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates.
 * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML.
 *
 * @return string The processed HTML.
 */
public function get_updated_html() {
	$has_pending_updates = count( $this->classname_updates ) > 0 || count( $this->lexical_updates ) > 0;

	/*
	 * With nothing enqueued there is nothing to apply: hand back the
	 * document as-is and avoid a string copy.
	 */
	if ( ! $has_pending_updates ) {
		return $this->html;
	}

	/*
	 * Remember where the current token starts (or the start of the document
	 * when no token has been matched) so that it can be reparsed once the
	 * document has been updated.
	 */
	$reparse_from = $this->token_starts_at ?? 0;

	/*
	 * 1. Convert the enqueued class name changes into attribute updates, then
	 *    apply all of the enqueued edits. The value returned from applying the
	 *    edits is added to the remembered offset so that it keeps pointing at
	 *    the start of the same token in the updated document.
	 */
	$this->class_name_updates_to_attributes_updates();
	$reparse_from += $this->apply_attributes_updates( $reparse_from );

	/*
	 * 2. Rewind to before the current token and reparse it so that its updated
	 *    attributes are available and further modifications apply at this same
	 *    location. The internal tokenizer is called directly instead of
	 *    `next_tag()` in order to avoid issues with subclasses that might add
	 *    behaviors to `next_tag()`.
	 */
	$this->bytes_already_parsed = $reparse_from;
	$this->base_class_next_token();

	return $this->html;
}
/**
 * Translates a tag query into the internal search criteria.
 *
 * Repeating the previous non-null query is a no-op. Otherwise every search
 * criterion is first reset to its default and then overridden by whichever
 * valid values the query supplies.
 *
 * @since 6.2.0
 *
 * @param array|string|null $query {
 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
 *
 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
 *                                     1 for "first" tag, 3 for "third," etc.
 *                                     Defaults to first tag.
 *     @type string|null $class_name   Tag must contain this class name to match.
 *     @type string      $tag_closers  "visit" or "skip": whether to stop on tag closers, e.g. </div>.
 * }
 */
private function parse_query( $query ) {
	$is_repeated_query = null !== $query && $query === $this->last_query;
	if ( $is_repeated_query ) {
		return;
	}

	// Remember the query and start over from the default criteria.
	$this->last_query          = $query;
	$this->sought_tag_name     = null;
	$this->sought_class_name   = null;
	$this->sought_match_offset = 1;
	$this->stop_on_tag_closers = false;

	// An empty query parameter applies no restrictions on the search.
	if ( null === $query ) {
		return;
	}

	// A single string value means "find the tag of this name".
	if ( is_string( $query ) ) {
		$this->sought_tag_name = $query;
		return;
	}

	// If not using the string interface, an associative array is required.
	if ( ! is_array( $query ) ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'The query argument must be an array or a tag name.' ),
			'6.2.0'
		);
		return;
	}

	// Values which are absent or of the wrong type leave the defaults in place.
	$tag_name = $query['tag_name'] ?? null;
	if ( is_string( $tag_name ) ) {
		$this->sought_tag_name = $tag_name;
	}

	$class_name = $query['class_name'] ?? null;
	if ( is_string( $class_name ) ) {
		$this->sought_class_name = $class_name;
	}

	$match_offset = $query['match_offset'] ?? null;
	if ( is_int( $match_offset ) && $match_offset > 0 ) {
		$this->sought_match_offset = $match_offset;
	}

	// Only the exact value 'visit' makes the search pause at tag closers.
	$this->stop_on_tag_closers = 'visit' === ( $query['tag_closers'] ?? null );
}
/**
 * Checks whether the currently matched tag and its attributes match the search criteria.
 *
 * A tag closer only matches when the query asked to stop on tag closers.
 * When a tag name is sought it is compared against the tag name in the
 * document in a case-insensitive manner, and when a class name is sought
 * the tag must contain that class.
 *
 * @since 6.2.0
 *
 * @return bool Whether the given tag and its attribute match the search criteria.
 */
private function matches() {
	if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
		return false;
	}

	// Does the tag name match the requested tag name in a case-insensitive manner?
	if ( null !== $this->sought_tag_name ) {
		/*
		 * String (byte) length lookup is fast. If they aren't the
		 * same length then they can't be the same string values.
		 */
		if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
			return false;
		}

		/*
		 * Compare the tag name in place, without extracting a substring from
		 * the document and without regard to letter case on either side.
		 *
		 * Previously only the byte from the document was upper-cased before
		 * comparing, so a lower-case query such as `div` could never match
		 * `<DIV>` even though `DIV` matched `<div>`.
		 *
		 * The length guard preserves the prior result for an empty name:
		 * two zero-length names have no differing bytes.
		 */
		if (
			$this->tag_name_length > 0 &&
			0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
		) {
			return false;
		}
	}

	if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
		return false;
	}

	return true;
}
/**
 * Parser Ready State.
 *
 * Indicates that the parser is ready to run and waiting for a state transition.
 * It may not have started yet, or it may have just finished parsing a token and
 * is ready to find the next one.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_READY = 'STATE_READY';
/**
 * Parser Complete State.
 *
 * Indicates that the parser has reached the end of the document and there is
 * nothing left to scan. It finished parsing the last token completely.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_COMPLETE = 'STATE_COMPLETE';
/**
 * Parser Incomplete Input State.
 *
 * Indicates that the parser has reached the end of the document before finishing
 * a token. It started parsing a token but there is a possibility that the input
 * HTML document was truncated in the middle of a token.
 *
 * The parser is reset at the start of the incomplete token and has paused. There
 * is nothing more that can be scanned unless provided a more complete document.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT';
/**
 * Parser Matched Tag State.
 *
 * Indicates that the parser has found an HTML tag and it's possible to get
 * the tag name and read or modify its attributes (if it's not a closing tag).
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG';
/**
 * Parser Text Node State.
 *
 * Indicates that the parser has found a text node and it's possible
 * to read and modify that text.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
/**
 * Parser CDATA Node State.
 *
 * Indicates that the parser has found a CDATA node and it's possible
 * to read and modify its modifiable text. Note that in HTML there are
 * no CDATA nodes outside of foreign content (SVG and MathML). Outside
 * of foreign content, they are treated as HTML comments.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
/**
 * Parser Comment State.
 *
 * Indicates that the parser has found an HTML comment and it's
 * possible to read and modify its modifiable text.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_COMMENT = 'STATE_COMMENT';
/**
 * Parser DOCTYPE State.
 *
 * Indicates that the parser has found a DOCTYPE node and it's
 * possible to read and modify its modifiable text.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_DOCTYPE = 'STATE_DOCTYPE';
/**
 * Parser Presumptuous Tag State.
 *
 * Indicates that the parser has found an empty tag closer `</>`.
 *
 * Note that in HTML there are no empty tag closers, and they
 * are ignored. Nonetheless, the Tag Processor still
 * recognizes them as they appear in the HTML stream.
 *
 * These were historically discussed as a "presumptuous tag
 * closer," which would close the nearest open tag, but were
 * dismissed in favor of explicitly-closing tags.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG';
/**
 * Parser Funky Comment State.
 *
 * Indicates that the parser has found a "funky comment"
 * and it's possible to read and modify its modifiable text.
 *
 * Example:
 *
 *     </%url>
 *     </{"wp-bit":"query/post-author"}>
 *     </2>
 *
 * Funky comments are tag closers with invalid tag names. Note
 * that in HTML these are turned into bogus comments. Nonetheless,
 * the Tag Processor recognizes them in a stream of HTML and
 * exposes them for inspection and modification.
 *
 * Note that the constant's value, 'STATE_WP_FUNKY', does not
 * follow the pattern of the other states, whose values equal
 * their names.
 *
 * @since 6.5.0
 *
 * @access private
 */
const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
/**
 * Indicates that a comment was created when encountering abruptly-closed HTML comment.
 *
 * Example:
 *
 *     <!-->
 *     <!--->
 *
 * @since 6.5.0
 */
const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
/**
 * Indicates that a comment would be parsed as a CDATA node,
 * were HTML to allow CDATA nodes outside of foreign content.
 *
 * Example:
 *
 *     <![CDATA[This is a CDATA node.]]>
 *
 * This is an HTML comment, but it looks like a CDATA node.
 *
 * @since 6.5.0
 */
const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
/**
 * Indicates that a comment was created when encountering
 * normative HTML comment syntax.
 *
 * Example:
 *
 *     <!-- this is a comment -->
 *
 * @since 6.5.0
 */
const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
/**
 * Indicates that a comment would be parsed as a Processing
 * Instruction node, were they to exist within HTML.
 *
 * Example:
 *
 *     <?wp __( 'Like' ); ?>
 *
 * This is an HTML comment, but it looks like a Processing Instruction node.
 *
 * @since 6.5.0
 */
const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
/**
 * Indicates that a comment was created when encountering invalid
 * HTML input, a so-called "bogus comment."
 *
 * Example:
 *
 *     <?nothing special>
 *     <!{nothing special}>
 *
 * @since 6.5.0
 */
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
}
class-wp-html-open-elements.php 0000644 00000034157 14717700467 0012550 0 ustar 00 Initially, the stack of open elements is empty. The stack grows
* > downwards; the topmost node on the stack is the first one added
* > to the stack, and the bottommost node of the stack is the most
* > recently added node in the stack (notwithstanding when the stack
* > is manipulated in a random access fashion as part of the handling
* > for misnested tags).
*
* @since 6.4.0
*
* @access private
*
* @see https://html.spec.whatwg.org/#stack-of-open-elements
* @see WP_HTML_Processor
*/
class WP_HTML_Open_Elements {
/**
 * Holds the stack of open element references.
 *
 * Nodes are appended as they are pushed, so the first entry is the
 * node added first and the last entry is the most recently added node.
 *
 * @since 6.4.0
 *
 * @var WP_HTML_Token[]
 */
public $stack = array();
/**
 * Whether a P element is in button scope currently.
 *
 * This class optimizes scope lookup by pre-calculating
 * this value when elements are added and removed to the
 * stack of open elements which might change its value.
 * This avoids frequent iteration over the stack.
 *
 * @since 6.4.0
 *
 * @var bool
 */
private $has_p_in_button_scope = false;
/**
 * A function that will be called when an item is popped off the stack of open elements.
 *
 * The function will be called with the popped item as its argument.
 * Remains `null` until a handler has been set.
 *
 * @since 6.6.0
 *
 * @var Closure|null
 */
private $pop_handler = null;
/**
 * A function that will be called when an item is pushed onto the stack of open elements.
 *
 * The function will be called with the pushed item as its argument.
 * Remains `null` until a handler has been set.
 *
 * @since 6.6.0
 *
 * @var Closure|null
 */
private $push_handler = null;
/**
 * Sets a pop handler that will be called when an item is popped off the stack of
 * open elements.
 *
 * The function will be called with the popped item as its argument.
 * Setting a handler replaces any handler that was set before.
 *
 * @since 6.6.0
 *
 * @param Closure $handler The handler function.
 */
public function set_pop_handler( Closure $handler ) {
$this->pop_handler = $handler;
}
/**
 * Sets a push handler that will be called when an item is pushed onto the stack of
 * open elements.
 *
 * The function will be called with the pushed item as its argument.
 * Setting a handler replaces any handler that was set before.
 *
 * @since 6.6.0
 *
 * @param Closure $handler The handler function.
 */
public function set_push_handler( Closure $handler ) {
$this->push_handler = $handler;
}
/**
 * Reports if a specific node is in the stack of open elements.
 *
 * Nodes are identified by their bookmark name: the given token is
 * considered present when any open element carries the same one.
 *
 * @since 6.4.0
 *
 * @param WP_HTML_Token $token Look for this node in the stack.
 * @return bool Whether the referenced node is in the stack of open elements.
 */
public function contains_node( $token ) {
	foreach ( $this->walk_up() as $open_element ) {
		$is_same_node = $open_element->bookmark_name === $token->bookmark_name;

		if ( $is_same_node ) {
			return true;
		}
	}

	return false;
}
/**
 * Returns how many nodes are currently in the stack of open elements.
 *
 * @since 6.4.0
 *
 * @return int How many nodes are in the stack of open elements.
 */
public function count() {
return count( $this->stack );
}
/**
 * Returns the most recently added node in the stack of open elements,
 * if one exists. If the stack is empty, returns null.
 *
 * @since 6.4.0
 *
 * @return WP_HTML_Token|null Last node in the stack of open elements, if one exists, otherwise null.
 */
public function current_node() {
	// `end()` reports `false` for an empty array; callers are given `null` instead.
	$last_node = end( $this->stack );

	if ( ! $last_node ) {
		return null;
	}

	return $last_node;
}
/**
 * Returns whether an element is in a specific scope.
 *
 * The stack of open elements is searched starting from the most recently
 * added node. The search succeeds on the first node with the given tag name
 * and fails as soon as an HTML element or one of the elements from the
 * termination list is reached, or when the stack has been exhausted.
 *
 * The special tag name `(internal: H1 through H6 - do not use)` matches any
 * of the H1 through H6 elements.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-the-specific-scope
 *
 * @param string   $tag_name         Name of tag check.
 * @param string[] $termination_list List of elements that terminate the search.
 * @return bool Whether the element was found in a specific scope.
 */
public function has_element_in_specific_scope( $tag_name, $termination_list ) {
	$heading_names     = array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' );
	$seeks_any_heading = '(internal: H1 through H6 - do not use)' === $tag_name;

	foreach ( $this->walk_up() as $open_element ) {
		$node_name = $open_element->node_name;

		// The sought element was found before anything ended the scope.
		if ( $tag_name === $node_name ) {
			return true;
		}

		if ( $seeks_any_heading && in_array( $node_name, $heading_names, true ) ) {
			return true;
		}

		// The HTML element always ends the scope, as does any terminating element.
		if ( 'HTML' === $node_name || in_array( $node_name, $termination_list, true ) ) {
			return false;
		}
	}

	return false;
}
/**
 * Returns whether a particular element is in scope.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-scope
 *
 * @param string $tag_name Name of tag to check.
 * @return bool Whether given element is in scope.
 */
public function has_element_in_scope( $tag_name ) {
	/*
	 * Because it's not currently possible to encounter
	 * one of the termination elements, they don't need
	 * to be listed here. If they were, they would be
	 * unreachable and only waste CPU cycles while
	 * scanning through HTML.
	 */
	$termination_list = array();

	return $this->has_element_in_specific_scope( $tag_name, $termination_list );
}
/**
 * Returns whether a particular element is in list item scope.
 *
 * The search for the element ends at the nearest OL or UL element.
 *
 * @since 6.4.0
 * @since 6.5.0 Implemented: no longer throws on every invocation.
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
 *
 * @param string $tag_name Name of tag to check.
 * @return bool Whether given element is in scope.
 */
public function has_element_in_list_item_scope( $tag_name ) {
	// There are more elements that belong here which aren't currently supported.
	$termination_list = array( 'OL', 'UL' );

	return $this->has_element_in_specific_scope( $tag_name, $termination_list );
}
/**
 * Returns whether a particular element is in button scope.
 *
 * The search for the element ends at the nearest BUTTON element.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope
 *
 * @param string $tag_name Name of tag to check.
 * @return bool Whether given element is in scope.
 */
public function has_element_in_button_scope( $tag_name ) {
	$termination_list = array( 'BUTTON' );

	return $this->has_element_in_specific_scope( $tag_name, $termination_list );
}
/**
 * Returns whether a particular element is in table scope.
 *
 * This check is not implemented: every call throws, and the
 * given tag name is never examined.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope
 *
 * @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
 *
 * @param string $tag_name Name of tag to check.
 * @return bool Whether given element is in scope.
 */
public function has_element_in_table_scope( $tag_name ) {
throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' );
return false; // The linter requires this unreachable code until the function is implemented and can return.
}
/**
 * Returns whether a particular element is in select scope.
 *
 * This check is not implemented: every call throws, and the
 * given tag name is never examined.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-select-scope
 *
 * @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
 *
 * @param string $tag_name Name of tag to check.
 * @return bool Whether given element is in scope.
 */
public function has_element_in_select_scope( $tag_name ) {
throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on select scope.' );
return false; // The linter requires this unreachable code until the function is implemented and can return.
}
/**
 * Returns whether a P is in BUTTON scope.
 *
 * The stack is not searched here; this reports the pre-calculated flag
 * which is updated as elements are pushed onto and popped off the stack.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope
 *
 * @return bool Whether a P is in BUTTON scope.
 */
public function has_p_in_button_scope() {
return $this->has_p_in_button_scope;
}
/**
 * Pops a node off of the stack of open elements.
 *
 * Nothing is popped when the stack is empty or when the most recently
 * added node is the context node, which always stays on the stack.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#stack-of-open-elements
 *
 * @return bool Whether a node was popped off of the stack.
 */
public function pop() {
	$popped_node = array_pop( $this->stack );

	// `array_pop()` reports `null` when there was nothing to remove.
	if ( null === $popped_node ) {
		return false;
	}

	// The context node must remain: put it back where it was.
	$is_context_node = 'context-node' === $popped_node->bookmark_name;
	if ( $is_context_node ) {
		$this->stack[] = $popped_node;
		return false;
	}

	$this->after_element_pop( $popped_node );

	return true;
}
/**
 * Pops nodes off of the stack of open elements until one with the given tag name has been popped.
 *
 * The special tag name `(internal: H1 through H6 - do not use)` stops at
 * the first H1 through H6 element which is popped.
 *
 * Popping also stops upon reaching the context node, which is left on the
 * stack; in that case `true` is returned as well.
 *
 * @since 6.4.0
 *
 * @see WP_HTML_Open_Elements::pop
 *
 * @param string $tag_name Name of tag that needs to be popped off of the stack of open elements.
 * @return bool Whether a tag of the given name was found and popped off of the stack of open elements.
 */
public function pop_until( $tag_name ) {
	$heading_names     = array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' );
	$seeks_any_heading = '(internal: H1 through H6 - do not use)' === $tag_name;

	foreach ( $this->walk_up() as $open_element ) {
		if ( 'context-node' === $open_element->bookmark_name ) {
			return true;
		}

		$this->pop();

		$popped_name = $open_element->node_name;
		$was_sought  = $tag_name === $popped_name || ( $seeks_any_heading && in_array( $popped_name, $heading_names, true ) );

		if ( $was_sought ) {
			return true;
		}
	}

	return false;
}
/**
 * Pushes a node onto the stack of open elements.
 *
 * The node is appended to the end of the stack, after which the
 * internal flags are updated and the push handler, if set, is called.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#stack-of-open-elements
 *
 * @param WP_HTML_Token $stack_item Item to add onto stack.
 */
public function push( $stack_item ) {
$this->stack[] = $stack_item;
$this->after_element_push( $stack_item );
}
/**
 * Removes a specific node from the stack of open elements.
 *
 * The node is identified by its bookmark name and may sit anywhere in the
 * stack. The context node is never removed.
 *
 * @since 6.4.0
 *
 * @param WP_HTML_Token $token The node to remove from the stack of open elements.
 * @return bool Whether the node was found and removed from the stack of open elements.
 */
public function remove_node( $token ) {
	if ( 'context-node' === $token->bookmark_name ) {
		return false;
	}

	foreach ( $this->walk_up() as $distance_from_end => $open_element ) {
		if ( $open_element->bookmark_name === $token->bookmark_name ) {
			/*
			 * The walk counts nodes starting from the end of the stack,
			 * so convert that count into an index from the start.
			 */
			$stack_index = $this->count() - 1 - $distance_from_end;

			array_splice( $this->stack, $stack_index, 1 );
			$this->after_element_pop( $open_element );

			return true;
		}
	}

	return false;
}
/**
 * Steps through the stack of open elements, starting with the top element
 * (added first) and walking downwards to the one added last.
 *
 * This generator function is designed to be used inside a "foreach" loop.
 *
 * Example:
 *
 *     $html = '<em><strong><a>We are here';
 *     foreach ( $stack->walk_down() as $node ) {
 *         echo "{$node->node_name} -> ";
 *     }
 *     > EM -> STRONG -> A ->
 *
 * To start with the most-recently added element and walk towards the top,
 * see WP_HTML_Open_Elements::walk_up().
 *
 * @since 6.4.0
 */
public function walk_down() {
	// The number of nodes to visit is fixed when the walk begins.
	$node_count = count( $this->stack );
	$index      = 0;

	while ( $index < $node_count ) {
		yield $this->stack[ $index ];
		++$index;
	}
}
/**
 * Steps through the stack of open elements, starting with the bottom element
 * (added last) and walking upwards to the one added first.
 *
 * This generator function is designed to be used inside a "foreach" loop.
 *
 * Example:
 *
 *     $html = '<em><strong><a>We are here';
 *     foreach ( $stack->walk_up() as $node ) {
 *         echo "{$node->node_name} -> ";
 *     }
 *     > A -> STRONG -> EM ->
 *
 * To start with the first added element and walk towards the bottom,
 * see WP_HTML_Open_Elements::walk_down().
 *
 * @since 6.4.0
 * @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists.
 *
 * @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists.
 */
public function walk_up( $above_this_node = null ) {
	// With no starting node given, every node is visited.
	$is_seeking_start = null !== $above_this_node;

	for ( $index = count( $this->stack ) - 1; $index >= 0; $index-- ) {
		$candidate = $this->stack[ $index ];

		/*
		 * Skip nodes until the starting node has been passed. The starting
		 * node itself is skipped too: the walk begins above it.
		 */
		if ( $is_seeking_start ) {
			$is_seeking_start = $candidate !== $above_this_node;
			continue;
		}

		yield $candidate;
	}
}
/*
* Internal helpers.
*/
/**
 * Updates internal flags after adding an element.
 *
 * Certain conditions (such as "has_p_in_button_scope") are maintained here as
 * flags that are only modified when adding and removing elements. This allows
 * the HTML Processor to quickly check for these conditions instead of iterating
 * over the open stack elements upon each new tag it encounters. These flags,
 * however, need to be maintained as items are added and removed from the stack.
 *
 * Once the flags are updated the push handler, if one has been set, is called
 * with the added element.
 *
 * @since 6.4.0
 *
 * @param WP_HTML_Token $item Element that was added to the stack of open elements.
 */
public function after_element_push( $item ) {
	/*
	 * When adding support for new elements, expand these conditions to trap
	 * cases where the precalculated value needs to change.
	 */
	$node_name = $item->node_name;

	if ( 'BUTTON' === $node_name ) {
		// A newly-opened BUTTON hides any P which was opened before it.
		$this->has_p_in_button_scope = false;
	} elseif ( 'P' === $node_name ) {
		$this->has_p_in_button_scope = true;
	}

	$on_push = $this->push_handler;
	if ( null !== $on_push ) {
		$on_push( $item );
	}
}
/**
 * Updates internal flags after removing an element.
 *
 * Certain conditions (such as "has_p_in_button_scope") are maintained here as
 * flags that are only modified when adding and removing elements. This allows
 * the HTML Processor to quickly check for these conditions instead of iterating
 * over the open stack elements upon each new tag it encounters. These flags,
 * however, need to be maintained as items are added and removed from the stack.
 *
 * Once the flags are updated the pop handler, if one has been set, is called
 * with the removed element.
 *
 * @since 6.4.0
 *
 * @param WP_HTML_Token $item Element that was removed from the stack of open elements.
 */
public function after_element_pop( $item ) {
	/*
	 * When adding support for new elements, expand this list to trap
	 * cases where the precalculated value needs to change.
	 *
	 * Removing either a BUTTON or a P may change whether a P remains
	 * in button scope, so the stack is searched again in both cases.
	 */
	$affects_p_in_button_scope = in_array( $item->node_name, array( 'BUTTON', 'P' ), true );

	if ( $affects_p_in_button_scope ) {
		$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
	}

	$on_pop = $this->pop_handler;
	if ( null !== $on_pop ) {
		$on_pop( $item );
	}
}
/**
 * Wakeup magic method.
 *
 * Instances of this class should never be unserialized;
 * attempting to do so always throws.
 *
 * @since 6.6.0
 *
 * @throws \LogicException Always.
 */
public function __wakeup() {
throw new \LogicException( __CLASS__ . ' should never be unserialized' );
}
}
class-wp-html-unsupported-exception.php 0000644 00000001504 14717700467 0014347 0 ustar 00 token = $token;
$this->operation = $operation;
$this->provenance = $provenance;
}
}
class-wp-html-doctype-info.php 0000604 00000061415 14717700467 0012366 0 ustar 00 `.
*
* > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
* > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
* > document ensures that the browser makes a best-effort attempt at following the
* > relevant specifications.
*
* @see https://html.spec.whatwg.org/#the-doctype
*
* DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
* and an indication of which document compatibility mode they would imply if an HTML parser
* hadn't already determined it from other information.
*
* @see https://html.spec.whatwg.org/#the-initial-insertion-mode
*
* Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
* to interpret the various tags and entities within a document. Its role in HTML diverged
* from how it was used in SGML and no meaning should be back-read into HTML based on how it
* is used in SGML, XML, or XHTML documents.
*
* @see https://www.iso.org/standard/16387.html
*
* @since 6.7.0
*
* @see WP_HTML_Processor
*/
class WP_HTML_Doctype_Info {
/**
 * Name of the DOCTYPE: should be "html" for HTML documents.
 *
 * This value should be considered "read only" and not modified.
 *
 * Historically the DOCTYPE name indicates name of the document's root element.
 *
 *     <!DOCTYPE html>
 *               ╰──┴── name is "html".
 *
 * @see https://html.spec.whatwg.org/#tokenization
 *
 * @since 6.7.0
 *
 * @var string|null
 */
public $name = null;
/**
 * Public identifier of the DOCTYPE.
 *
 * This value should be considered "read only" and not modified.
 *
 * The public identifier is optional and should not appear in HTML documents.
 * A `null` value indicates that no public identifier was present in the DOCTYPE.
 *
 * Historically the presence of the public identifier indicated that a document
 * was meant to be shared between computer systems and the value indicated to a
 * knowledgeable parser how to find the relevant document type definition (DTD).
 *
 *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
 *               │  │         ╰─── public identifier ─────╯
 *               ╰──┴── name is "html".
 *
 * @see https://html.spec.whatwg.org/#tokenization
 *
 * @since 6.7.0
 *
 * @var string|null
 */
public $public_identifier = null;
/**
 * System identifier of the DOCTYPE.
 *
 * This value should be considered "read only" and not modified.
 *
 * The system identifier is optional and should not appear in HTML documents.
 * A `null` value indicates that no system identifier was present in the DOCTYPE.
 *
 * Historically the system identifier specified where a relevant document type
 * declaration for the given document is stored and may be retrieved.
 *
 *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
 *               │  │         ╰──── system identifier ────╯
 *               ╰──┴── name is "html".
 *
 * If a public identifier were provided it would indicate to a knowledgeable
 * parser how to interpret the system identifier.
 *
 *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
 *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
 *               ╰──┴── name is "html".
 *
 * @see https://html.spec.whatwg.org/#tokenization
 *
 * @since 6.7.0
 *
 * @var string|null
 */
public $system_identifier = null;
/**
 * Which document compatibility mode this DOCTYPE declaration indicates.
 *
 * Note that the property name retains the historical spelling "compatability".
 *
 * This value should be considered "read only" and not modified.
 *
 * When an HTML parser has not already set the document compatibility mode,
 * (e.g. "quirks" or "no-quirks" mode), it will infer it from the properties
 * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
 * indicate one of three possible document compatibility modes:
 *
 *  - "no-quirks" mode (also called "standards" mode).
 *  - "limited-quirks" mode (also called "almost standards" mode).
 *  - "quirks" mode.
 *
 * In browsers, `document.compatMode` reports `BackCompat` for "quirks" mode
 * and `CSS1Compat` for the other two modes.
 *
 * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
 * before the HTML element has been opened and before finding any other
 * DOCTYPE declaration tokens.
 *
 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 *
 * @since 6.7.0
 *
 * @var string One of "no-quirks", "limited-quirks", or "quirks".
 */
public $indicated_compatability_mode;
/**
* Constructor.
*
* This class should not be instantiated directly.
* Use the static {@see self::from_doctype_token} method instead.
*
* The arguments to this constructor correspond to the "DOCTYPE token"
* as defined in the HTML specification.
*
* > DOCTYPE tokens have a name, a public identifier, a system identifier,
* > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
* > and system identifier must be marked as missing (which is a distinct state from the
* > empty string), and the force-quirks flag must be set to off (its other state is on).
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
*
* @since 6.7.0
*
* @param string|null $name Name of the DOCTYPE.
* @param string|null $public_identifier Public identifier of the DOCTYPE.
* @param string|null $system_identifier System identifier of the DOCTYPE.
* @param bool $force_quirks_flag Whether the force-quirks flag is set for the token.
*/
private function __construct(
	?string $name,
	?string $public_identifier,
	?string $system_identifier,
	bool $force_quirks_flag
) {
	// Store the token fields untouched; `null` keeps meaning "missing".
	$this->name              = $name;
	$this->public_identifier = $public_identifier;
	$this->system_identifier = $system_identifier;

	/*
	 * > If the DOCTYPE token matches one of the conditions in the following list,
	 * > then set the Document to quirks mode:
	 * >
	 * > The force-quirks flag is set to on.
	 * > The name is not "html".
	 *
	 * The name is compared as given: the tokenizer reports it in lower case
	 * even when the document spelled it in upper case.
	 */
	if ( $force_quirks_flag || 'html' !== $name ) {
		$this->indicated_compatability_mode = 'quirks';
		return;
	}

	$has_public_identifier = null !== $public_identifier;
	$has_system_identifier = null !== $system_identifier;

	/*
	 * Fast path: normative documents contain the literal `<!DOCTYPE html>`
	 * with neither a public nor a system identifier.
	 */
	if ( ! $has_public_identifier && ! $has_system_identifier ) {
		$this->indicated_compatability_mode = 'no-quirks';
		return;
	}

	/*
	 * > set...the public identifier...to...the empty string if the public identifier was missing.
	 * > set...the system identifier...to...the empty string if the system identifier was missing.
	 * >
	 * > The system identifier and public identifier strings must be compared...
	 * > in an ASCII case-insensitive manner.
	 * >
	 * > A system identifier whose value is the empty string is not considered missing
	 * > for the purposes of the conditions above.
	 *
	 * Lower-casing both identifiers once lets every comparison below be a plain
	 * byte comparison; `$has_system_identifier` remembers the missing/empty distinction.
	 */
	$public_id = $has_public_identifier ? strtolower( $public_identifier ) : '';
	$system_id = $has_system_identifier ? strtolower( $system_identifier ) : '';

	/*
	 * > The public identifier is set to…
	 * > The system identifier is set to…
	 */
	$quirks_public_ids = array(
		'-//w3o//dtd w3 html strict 3.0//en//',
		'-/w3c/dtd html 4.0 transitional/en',
		'html',
	);

	if (
		in_array( $public_id, $quirks_public_ids, true ) ||
		'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_id
	) {
		$this->indicated_compatability_mode = 'quirks';
		return;
	}

	// Every remaining rule looks at a public identifier prefix; an empty one cannot match.
	if ( '' === $public_id ) {
		$this->indicated_compatability_mode = 'no-quirks';
		return;
	}

	/*
	 * > The public identifier starts with…
	 *
	 * @todo Optimize this matching. It shouldn't be a large overall performance issue,
	 *       however, as only a single DOCTYPE declaration token should ever be parsed,
	 *       and normative documents will have exited before reaching this condition.
	 */
	$quirks_public_prefixes = array(
		'+//silmaril//dtd html pro v0r11 19970101//',
		'-//as//dtd html 3.0 aswedit + extensions//',
		'-//advasoft ltd//dtd html 3.0 aswedit + extensions//',
		'-//ietf//dtd html 2.0 level 1//',
		'-//ietf//dtd html 2.0 level 2//',
		'-//ietf//dtd html 2.0 strict level 1//',
		'-//ietf//dtd html 2.0 strict level 2//',
		'-//ietf//dtd html 2.0 strict//',
		'-//ietf//dtd html 2.0//',
		'-//ietf//dtd html 2.1e//',
		'-//ietf//dtd html 3.0//',
		'-//ietf//dtd html 3.2 final//',
		'-//ietf//dtd html 3.2//',
		'-//ietf//dtd html 3//',
		'-//ietf//dtd html level 0//',
		'-//ietf//dtd html level 1//',
		'-//ietf//dtd html level 2//',
		'-//ietf//dtd html level 3//',
		'-//ietf//dtd html strict level 0//',
		'-//ietf//dtd html strict level 1//',
		'-//ietf//dtd html strict level 2//',
		'-//ietf//dtd html strict level 3//',
		'-//ietf//dtd html strict//',
		'-//ietf//dtd html//',
		'-//metrius//dtd metrius presentational//',
		'-//microsoft//dtd internet explorer 2.0 html strict//',
		'-//microsoft//dtd internet explorer 2.0 html//',
		'-//microsoft//dtd internet explorer 2.0 tables//',
		'-//microsoft//dtd internet explorer 3.0 html strict//',
		'-//microsoft//dtd internet explorer 3.0 html//',
		'-//microsoft//dtd internet explorer 3.0 tables//',
		'-//netscape comm. corp.//dtd html//',
		'-//netscape comm. corp.//dtd strict html//',
		"-//o'reilly and associates//dtd html 2.0//",
		"-//o'reilly and associates//dtd html extended 1.0//",
		"-//o'reilly and associates//dtd html extended relaxed 1.0//",
		'-//sq//dtd html 2.0 hotmetal + extensions//',
		'-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//',
		'-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//',
		'-//spyglass//dtd html 2.0 extended//',
		'-//sun microsystems corp.//dtd hotjava html//',
		'-//sun microsystems corp.//dtd hotjava strict html//',
		'-//w3c//dtd html 3 1995-03-24//',
		'-//w3c//dtd html 3.2 draft//',
		'-//w3c//dtd html 3.2 final//',
		'-//w3c//dtd html 3.2//',
		'-//w3c//dtd html 3.2s draft//',
		'-//w3c//dtd html 4.0 frameset//',
		'-//w3c//dtd html 4.0 transitional//',
		'-//w3c//dtd html experimental 19960712//',
		'-//w3c//dtd html experimental 970421//',
		'-//w3c//dtd w3 html//',
		'-//w3o//dtd w3 html 3.0//',
		'-//webtechs//dtd mozilla html 2.0//',
		'-//webtechs//dtd mozilla html//',
	);

	foreach ( $quirks_public_prefixes as $quirks_prefix ) {
		if ( str_starts_with( $public_id, $quirks_prefix ) ) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}
	}

	/*
	 * > The system identifier is missing and the public identifier starts with…
	 * > The system identifier is not missing and the public identifier starts with…
	 *
	 * The HTML 4.01 Frameset and Transitional identifiers indicate quirks mode
	 * without a system identifier and limited-quirks mode with one.
	 */
	$is_html_401_frameset_or_transitional = (
		str_starts_with( $public_id, '-//w3c//dtd html 4.01 frameset//' ) ||
		str_starts_with( $public_id, '-//w3c//dtd html 4.01 transitional//' )
	);

	if ( $is_html_401_frameset_or_transitional ) {
		$this->indicated_compatability_mode = $has_system_identifier ? 'limited-quirks' : 'quirks';
		return;
	}

	/*
	 * > Otherwise, if the DOCTYPE token matches one of the conditions in
	 * > the following list, then set the Document to limited-quirks mode.
	 * >
	 * > The public identifier starts with…
	 *
	 * Anything that reaches this point without matching is a no-quirks document.
	 */
	$is_xhtml_10_frameset_or_transitional = (
		str_starts_with( $public_id, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
		str_starts_with( $public_id, '-//w3c//dtd xhtml 1.0 transitional//' )
	);

	$this->indicated_compatability_mode = $is_xhtml_10_frameset_or_transitional
		? 'limited-quirks'
		: 'no-quirks';
}
/**
* Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
*
* Use this method to parse a DOCTYPE declaration token and get access to its properties
* via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
* properly as a DOCTYPE declaration, though it need not represent a valid DOCTYPE.
*
* Example:
*
*     // Normative HTML DOCTYPE declaration.
*     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
*     'no-quirks' === $doctype->indicated_compatability_mode;
*
*     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
*     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
*     'quirks' === $doctype->indicated_compatability_mode;
*
*     // Textual quirks present in raw HTML are handled appropriately.
*     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
*     'no-quirks' === $doctype->indicated_compatability_mode;
*
*     // Anything other than a proper DOCTYPE declaration token fails to parse.
*     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
*     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
*     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
*     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
*     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE' );
*
* @since 6.7.0
*
* @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
*
* @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
* provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
*/
public static function from_doctype_token( string $doctype_html ): ?self {
	// Every part of the DOCTYPE token starts out "missing" (null), per the HTML specification.
	$doctype_name      = null;
	$doctype_public_id = null;
	$doctype_system_id = null;

	// Byte offset of the final byte, which must be the terminating `>`.
	$end = strlen( $doctype_html ) - 1;

	/*
	 * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
	 * specification for the DOCTYPE related tokenizer states.
	 *
	 * @see https://html.spec.whatwg.org/#doctype-state
	 */

	/*
	 * - A valid DOCTYPE HTML token must be at least `<!DOCTYPE>`, assuming a complete
	 *   token not ending in end-of-file.
	 * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
	 * - The only occurrence of `>` must be the final byte in the HTML string.
	 */
	if (
		$end < 9 ||
		0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
	) {
		return null;
	}

	// Continue from the byte immediately following the nine-byte `<!DOCTYPE` opener.
	$at = 9;

	// Is there one and only one `>`?
	if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
		return null;
	}

	/*
	 * Perform newline normalization and ensure the $end value is correct after normalization.
	 *
	 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
	 * @see https://infra.spec.whatwg.org/#normalize-newlines
	 */
	$doctype_html = str_replace( "\r\n", "\n", $doctype_html );
	$doctype_html = str_replace( "\r", "\n", $doctype_html );
	$end          = strlen( $doctype_html ) - 1;

	/*
	 * In this state, the doctype token has been found and its "content" optionally including the
	 * name, public identifier, and system identifier is between the current position and the end.
	 *
	 *     "<!DOCTYPE...declaration...>"
	 *               ╰─ $at           ╰─ $end
	 *
	 * It's also possible that the declaration part is empty.
	 *
	 *               ╭─ $at
	 *     "<!DOCTYPE>"
	 *               ╰─ $end
	 *
	 * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
	 * have been handled above in the condition that the provided DOCTYPE HTML must contain
	 * exactly one ">" character in the final position.
	 */

	/*
	 * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
	 * proceed to the next state.
	 *
	 * @see https://html.spec.whatwg.org/#before-doctype-name-state
	 */
	$at += strspn( $doctype_html, " \t\n\f\r", $at );

	// Nothing but whitespace before the closing `>`: the name is missing, which forces quirks.
	if ( $at >= $end ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	// The name runs until the next whitespace byte; it is lower-cased and NUL bytes are replaced.
	$name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
	$doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );

	$at += $name_length;
	$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
	if ( $at >= $end ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
	}

	/*
	 * "After DOCTYPE name state"
	 *
	 * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
	 * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
	 *
	 * @see https://html.spec.whatwg.org/#after-doctype-name-state
	 */

	// Too few bytes remain for a keyword followed by anything else: force quirks.
	if ( $at + 6 >= $end ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	/*
	 * > If the six characters starting from the current input character are an ASCII
	 * > case-insensitive match for the word "PUBLIC", then consume those characters
	 * > and switch to the after DOCTYPE public keyword state.
	 */
	if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
		$at += 6;
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}
		goto parse_doctype_public_identifier;
	}

	/*
	 * > Otherwise, if the six characters starting from the current input character are an ASCII
	 * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
	 * > to the after DOCTYPE system keyword state.
	 */
	if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
		$at += 6;
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}
		goto parse_doctype_system_identifier;
	}

	/*
	 * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
	 * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
	 * > DOCTYPE state.
	 */
	return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );

	parse_doctype_public_identifier:
	/*
	 * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
	 * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
	 * Anything else forces quirks mode and ignores the rest of the contents.
	 *
	 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
	 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
	 */
	$closer_quote = $doctype_html[ $at ];

	/*
	 * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
	 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
	 */
	if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	++$at;

	$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
	$doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

	// An identifier that reaches the closing `>` without its closing quote forces quirks.
	$at += $identifier_length;
	if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	++$at;

	/*
	 * "Between DOCTYPE public and system identifiers state"
	 *
	 * Advance through whitespace between public and system identifiers.
	 *
	 * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
	 */
	$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
	if ( $at >= $end ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
	}

	parse_doctype_system_identifier:
	/*
	 * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
	 * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
	 * Anything else forces quirks mode and ignores the rest of the contents.
	 *
	 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
	 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
	 */
	$closer_quote = $doctype_html[ $at ];

	/*
	 * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
	 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
	 */
	if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	++$at;

	$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
	$doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

	// As with the public identifier, a missing closing quote forces quirks.
	$at += $identifier_length;
	if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
	}

	// Whatever follows the system identifier is ignored and does not force quirks.
	return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
}
}