another-boids-in-rust/vendor/roxmltree/src/tokenizer.rs

use core::ops::Range;
use core::str;

use crate::{Error, TextPos};

type Result<T> = core::result::Result<T, Error>;

/// Extension methods for XML-subset only operations.
///
/// Each method classifies a single value against one of the character
/// productions of the XML specification linked in its documentation.
trait XmlCharExt {
    /// Checks if the value is within the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range,
    /// i.e. whether it may be the first character of an XML name.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the value is within the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range,
    /// i.e. whether it may appear in an XML name after its first character.
    fn is_xml_name(&self) -> bool;

    /// Checks if the value is within the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range,
    /// i.e. whether it is allowed to appear in an XML document at all.
    fn is_xml_char(&self) -> bool;
}

impl XmlCharExt for char {
    #[inline]
    fn is_xml_name_start(&self) -> bool {
        // Check for ASCII first.
        if *self as u32 <= 128 {
            return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
        }

        matches!(*self as u32,
            0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        // Check for ASCII first.
        if *self as u32 <= 128 {
            return (*self as u8).is_xml_name();
        }

        matches!(*self as u32, 0x0000B7
                | 0x0000C0..=0x0000D6
                | 0x0000D8..=0x0000F6
                | 0x0000F8..=0x0002FF
                | 0x000300..=0x00036F
                | 0x000370..=0x00037D
                | 0x00037F..=0x001FFF
                | 0x00200C..=0x00200D
                | 0x00203F..=0x002040
                | 0x002070..=0x00218F
                | 0x002C00..=0x002FEF
                | 0x003001..=0x00D7FF
                | 0x00F900..=0x00FDCF
                | 0x00FDF0..=0x00FFFD
                | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        // Does not check for surrogate code points U+D800-U+DFFF,
        // since that check was performed by Rust when the `&str` was constructed.
        if (*self as u32) < 0x20 {
            return (*self as u8).is_xml_space();
        }

        !matches!(*self as u32, 0xFFFF | 0xFFFE)
    }
}

/// Byte-level (ASCII-only) counterparts of the XML character checks.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII subset of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    /// True for space, tab, line feed and carriage return.
    #[inline]
    fn is_xml_space(&self) -> bool {
        let byte = *self;
        byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r'
    }

    /// True for ASCII letters and digits and for `:`, `_`, `-`, `.`.
    #[inline]
    fn is_xml_name(&self) -> bool {
        self.is_ascii_alphanumeric() || matches!(*self, b':' | b'_' | b'-' | b'.')
    }
}

/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    // The sliced text itself.
    text: &'input str,
    // Byte offset at which `text` begins; `range()` reports
    // `start..start + text.len()`.
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    #[inline]
    fn from(text: &'input str) -> Self {
        StrSpan { text, start: 0 }
    }
}

impl<'input> StrSpan<'input> {
    /// Builds a span for `text[start..end]`, remembering `start` as its position.
    ///
    /// Panics if the range is out of bounds or does not fall on
    /// character boundaries (regular string slicing rules).
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan {
        debug_assert!(start <= end);
        let text = &text[start..end];
        StrSpan { text, start }
    }

    /// Byte range covered by this span, relative to the text it was cut from.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        let end = self.start + self.text.len();
        Range {
            start: self.start,
            end,
        }
    }

    /// The spanned text.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Sub-slice `start..end` of the spanned text.
    ///
    /// The indices are relative to the spanned text, not to `self.start`.
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        let whole: &'input str = self.text;
        &whole[start..end]
    }
}

// A single token produced by the tokenizer and passed to `XmlEvents::token`.
//
// `Range<usize>` and `usize` payloads are byte positions in the input text.
pub enum Token<'input> {
    // <?target content?>
    //
    // Fields: target, content (`None` when empty), range of the whole instruction.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    // <!-- text -->
    //
    // Fields: text between `<!--` and `-->`, range of the whole comment.
    Comment(&'input str, Range<usize>),

    // <!ENTITY ns_extend "http://test.com">
    //
    // Fields: entity name, quoted entity value (without the quotes).
    // Emitted only for entities defined by a quoted value;
    // declarations that use an external ID produce no token.
    EntityDeclaration(&'input str, StrSpan<'input>),

    // <ns:elem
    //
    // Fields: prefix (empty when absent), local name, position of the `<`.
    ElementStart(&'input str, &'input str, usize),

    // ns:attr="value"
    //
    // Fields, in order:
    // - range of the whole attribute, from the name to the closing quote;
    // - byte length of the qualified name, saturated at `u16::MAX`;
    // - byte length of the `=` including surrounding whitespace,
    //   saturated at `u8::MAX`;
    // - prefix (empty when absent);
    // - local name;
    // - value without the quotes.
    Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),

    // `>`, `/>` or `</ns:name>`, together with its range.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    // Contains text between elements including whitespaces.
    // Basically everything between `>` and `<`.
    // Except `]]>`, which is not allowed and will lead to an error.
    //
    // Fields: the text, its range.
    Text(&'input str, Range<usize>),

    // <![CDATA[text]]>
    //
    // Fields: text between `<![CDATA[` and `]]>`, range of the whole section.
    Cdata(&'input str, Range<usize>),
}

/// `ElementEnd` token.
///
/// Describes how an element's start tag, or the element itself, was terminated.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`
    ///
    /// The start tag is complete and element content follows.
    Open,
    /// Indicates `</ns:name>`
    ///
    /// Holds the prefix (empty when absent) and the local name of the closing tag.
    Close(&'input str, &'input str),
    /// Indicates `/>`
    ///
    /// The element has no content and no closing tag.
    Empty,
}

/// Receiver of the tokenizer output.
///
/// The parsing functions in this file call [`XmlEvents::token`] once per
/// token, in document order.
pub trait XmlEvents<'input> {
    /// Handles a single token.
    ///
    /// Returning an error stops tokenization; the error is propagated
    /// to the caller of the parsing function.
    fn token(&mut self, token: Token<'input>) -> Result<()>;
}

// document ::= prolog element Misc*
pub fn parse<'input>(
    text: &'input str,
    allow_dtd: bool,
    events: &mut dyn XmlEvents<'input>,
) -> Result<()> {
    let s = &mut Stream::new(text);

    // Skip UTF-8 BOM.
    if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
        s.advance(3);
    }

    if s.starts_with(b"<?xml ") {
        parse_declaration(s)?;
    }

    parse_misc(s, events)?;

    s.skip_spaces();
    if s.starts_with(b"<!DOCTYPE") {
        if !allow_dtd {
            return Err(Error::DtdDetected);
        }

        parse_doctype(s, events)?;
        parse_misc(s, events)?;
    }

    s.skip_spaces();
    if s.curr_byte().ok() == Some(b'<') {
        parse_element(s, events)?;
    }

    parse_misc(s, events)?;

    if !s.at_end() {
        return Err(Error::UnknownToken(s.gen_text_pos()));
    }

    Ok(())
}

// Misc ::= Comment | PI | S
//
// Consumes any run of whitespace, comments and processing instructions,
// stopping at the first thing that is none of those (or at the end of input).
fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    loop {
        if s.at_end() {
            return Ok(());
        }

        s.skip_spaces();
        if s.starts_with(b"<!--") {
            parse_comment(s, events)?;
        } else if s.starts_with(b"<?") {
            parse_pi(s, events)?;
        } else {
            return Ok(());
        }
    }
}

// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
//
// We don't actually return a token for the XML declaration and only validate it.
fn parse_declaration(s: &mut Stream) -> Result<()> {
    fn consume_spaces(s: &mut Stream) -> Result<()> {
        if s.starts_with_space() {
            s.skip_spaces();
        } else if !s.starts_with(b"?>") && !s.at_end() {
            return Err(Error::InvalidChar2(
                "a whitespace",
                s.curr_byte_unchecked(),
                s.gen_text_pos(),
            ));
        }

        Ok(())
    }

    s.advance(5); // <?xml
    consume_spaces(s)?;

    // The `version` "attribute" is mandatory.
    if !s.starts_with(b"version") {
        // Will trigger the InvalidString error, which is what we want.
        return s.skip_string(b"version");
    }
    let _ = parse_attribute(s)?;
    consume_spaces(s)?;

    if s.starts_with(b"encoding") {
        let _ = parse_attribute(s)?;
        consume_spaces(s)?;
    }

    if s.starts_with(b"standalone") {
        let _ = parse_attribute(s)?;
    }

    s.skip_spaces();
    s.skip_string(b"?>")?;

    Ok(())
}

// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let start = s.pos();
    s.advance(4);
    let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
    s.skip_string(b"-->")?;

    if text.contains("--") {
        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
    }

    if text.ends_with('-') {
        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
    }

    let range = s.range_from(start);
    events.token(Token::Comment(text, range))?;

    Ok(())
}

// PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
//
// Emits `Token::ProcessingInstruction`. An XML declaration found here
// (`<?xml ` anywhere but the very start of the document) is an error.
fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    if s.starts_with(b"<?xml ") {
        return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
    }

    let pi_start = s.pos();
    s.advance(2); // <?
    let target = s.consume_name()?;
    s.skip_spaces();

    // Everything up to `?>`; an empty body is reported as `None`.
    let raw = s.consume_chars(|stream, ch| ch != '?' || !stream.starts_with(b"?>"))?;
    let content = Some(raw).filter(|text| !text.is_empty());

    s.skip_string(b"?>")?;

    events.token(Token::ProcessingInstruction(
        target,
        content,
        s.range_from(pi_start),
    ))
}

fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let start = s.pos();
    parse_doctype_start(s)?;
    s.skip_spaces();

    if s.curr_byte() == Ok(b'>') {
        s.advance(1);
        return Ok(());
    }

    s.advance(1); // [
    while !s.at_end() {
        s.skip_spaces();
        if s.starts_with(b"<!ENTITY") {
            parse_entity_decl(s, events)?;
        } else if s.starts_with(b"<!--") {
            parse_comment(s, events)?;
        } else if s.starts_with(b"<?") {
            parse_pi(s, events)?;
        } else if s.starts_with(b"]") {
            // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
            s.advance(1);
            s.skip_spaces();
            match s.curr_byte() {
                Ok(b'>') => {
                    s.advance(1);
                    break;
                }
                Ok(c) => {
                    return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
                }
                Err(_) => {
                    return Err(Error::UnexpectedEndOfStream);
                }
            }
        } else if s.starts_with(b"<!ELEMENT")
            || s.starts_with(b"<!ATTLIST")
            || s.starts_with(b"<!NOTATION")
        {
            if consume_decl(s).is_err() {
                let pos = s.gen_text_pos_from(start);
                return Err(Error::UnknownToken(pos));
            }
        } else {
            return Err(Error::UnknownToken(s.gen_text_pos()));
        }
    }

    Ok(())
}

// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
fn parse_doctype_start(s: &mut Stream) -> Result<()> {
    s.advance(9);

    s.consume_spaces()?;
    s.skip_name()?;
    s.skip_spaces();

    let _ = parse_external_id(s)?;
    s.skip_spaces();

    let c = s.curr_byte()?;
    if c != b'[' && c != b'>' {
        return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
    }

    Ok(())
}

// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
//
// Returns `Ok(true)` when an external ID was consumed and `Ok(false)`,
// without consuming anything, when the input starts with neither keyword.
fn parse_external_id(s: &mut Stream) -> Result<bool> {
    // Consumes mandatory whitespace followed by a quoted literal.
    fn skip_literal(s: &mut Stream) -> Result<()> {
        s.consume_spaces()?;
        let quote = s.consume_quote()?;
        let _ = s.consume_bytes(|byte| byte != quote);
        s.consume_byte(quote)
    }

    let is_public = s.starts_with(b"PUBLIC");
    if !is_public && !s.starts_with(b"SYSTEM") {
        return Ok(false);
    }

    // Both keywords are six bytes long.
    s.advance(6);

    // SYSTEM has a single literal, PUBLIC has two.
    skip_literal(s)?;
    if is_public {
        skip_literal(s)?;
    }

    Ok(true)
}

// EntityDecl  ::= GEDecl | PEDecl
// GEDecl      ::= '<!ENTITY' S Name S EntityDef S? '>'
// PEDecl      ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
//
// Emits `Token::EntityDeclaration` only when the entity is defined by
// a quoted value; externally defined entities are parsed and dropped.
fn parse_entity_decl<'input>(
    s: &mut Stream<'input>,
    events: &mut dyn XmlEvents<'input>,
) -> Result<()> {
    s.advance(8); // <!ENTITY
    s.consume_spaces()?;

    // A leading `%` marks a parameter entity; otherwise it is a general one.
    let is_pe = s.try_consume_byte(b'%');
    if is_pe {
        s.consume_spaces()?;
    }

    let name = s.consume_name()?;
    s.consume_spaces()?;

    match parse_entity_def(s, !is_pe)? {
        Some(definition) => events.token(Token::EntityDeclaration(name, definition))?,
        None => {}
    }

    s.skip_spaces();
    s.consume_byte(b'>')
}

// EntityDef   ::= EntityValue | (ExternalID NDataDecl?)
// PEDef       ::= EntityValue | ExternalID
// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |  "'" ([^%&']
//                             | PEReference | Reference)* "'"
// ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
// NDataDecl   ::= S 'NDATA' S Name
fn parse_entity_def<'input>(
    s: &mut Stream<'input>,
    is_ge: bool,
) -> Result<Option<StrSpan<'input>>> {
    let c = s.curr_byte()?;
    match c {
        b'"' | b'\'' => {
            let quote = s.consume_quote()?;
            let start = s.pos();
            s.skip_bytes(|c| c != quote);
            let value = s.slice_back_span(start);
            s.consume_byte(quote)?;
            Ok(Some(value))
        }
        b'S' | b'P' => {
            if parse_external_id(s)? {
                if is_ge {
                    s.skip_spaces();
                    if s.starts_with(b"NDATA") {
                        s.advance(5);
                        s.consume_spaces()?;
                        s.skip_name()?;
                        // TODO: NDataDecl is not supported
                    }
                }

                Ok(None)
            } else {
                Err(Error::InvalidExternalID(s.gen_text_pos()))
            }
        }
        _ => {
            let pos = s.gen_text_pos();
            Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
        }
    }
}

// Skips a declaration we do not interpret: everything up to and including
// the next `>`. Fails if the input ends before a `>` is found.
fn consume_decl(s: &mut Stream) -> Result<()> {
    let not_closing = |byte: u8| byte != b'>';
    s.skip_bytes(not_closing);
    s.consume_byte(b'>')
}

// element ::= EmptyElemTag | STag content ETag
// '<' Name (S Attribute)* S? '>'
//
// Emits `ElementStart`, one `Attribute` per attribute and the terminating
// `ElementEnd`. When the tag ends with `>` the element content is parsed too.
fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let tag_start = s.pos();
    s.advance(1); // <
    let (tag_prefix, tag_local) = s.consume_qname()?;
    events.token(Token::ElementStart(tag_prefix, tag_local, tag_start))?;

    // Evaluates to `true` only when the start tag was closed with a plain `>`.
    let has_content = loop {
        if s.at_end() {
            break false;
        }

        let preceded_by_space = s.starts_with_space();
        s.skip_spaces();
        let item_start = s.pos();

        match s.curr_byte()? {
            b'/' => {
                s.advance(1);
                s.consume_byte(b'>')?;
                let range = s.range_from(item_start);
                events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
                break false;
            }
            b'>' => {
                s.advance(1);
                let range = s.range_from(item_start);
                events.token(Token::ElementEnd(ElementEnd::Open, range))?;
                break true;
            }
            _ => {
                // An attribute must be preceded with a whitespace.
                if !preceded_by_space {
                    // We are not at a space, so this always fails with the error we want.
                    s.consume_spaces()?;
                }

                // Manual inlining of `parse_attribute` for performance.
                // We cannot mark `parse_attribute` as `#[inline(always)]`
                // because it will blow up the binary size.
                let (prefix, local) = s.consume_qname()?;
                let name_end = s.pos();
                let qname_len = u16::try_from(name_end - item_start).unwrap_or(u16::MAX);

                s.consume_eq()?;
                let eq_len = u8::try_from(s.pos() - name_end).unwrap_or(u8::MAX);

                let quote = s.consume_quote()?;
                let closing = char::from(quote);

                // The attribute value must not contain the < character.
                let value_start = s.pos();
                s.skip_chars(|_, ch| ch != closing && ch != '<')?;
                let value = s.slice_back_span(value_start);
                s.consume_byte(quote)?;

                let range = item_start..s.pos();
                events.token(Token::Attribute(range, qname_len, eq_len, prefix, local, value))?;
            }
        }
    };

    if has_content {
        parse_content(s, events)?;
    }

    Ok(())
}

// Attribute ::= Name Eq AttValue
//
// Returns `(prefix, local name, value)`; the value excludes the quotes.
fn parse_attribute<'input>(
    s: &mut Stream<'input>,
) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
    let (prefix, local) = s.consume_qname()?;
    s.consume_eq()?;

    let quote = s.consume_quote()?;
    let closing = char::from(quote);

    // The attribute value must not contain the < character,
    // so scanning stops there as well and the quote check below fails.
    let value_start = s.pos();
    s.skip_chars(|_, ch| !(ch == closing || ch == '<'))?;
    let value = s.slice_back_span(value_start);

    s.consume_byte(quote).map(|()| (prefix, local, value))
}

// content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
//
// Parses element content until the closing tag of the enclosing element
// has been consumed or the input ends.
pub fn parse_content<'input>(
    s: &mut Stream<'input>,
    events: &mut dyn XmlEvents<'input>,
) -> Result<()> {
    while !s.at_end() {
        let current = match s.curr_byte() {
            Ok(byte) => byte,
            Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
        };

        // Anything that does not open markup is character data.
        if current != b'<' {
            parse_text(s, events)?;
            continue;
        }

        // The byte after `<` decides what kind of markup this is.
        let lookahead = match s.next_byte() {
            Ok(byte) => byte,
            Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
        };

        match lookahead {
            b'!' => {
                if s.starts_with(b"<!--") {
                    parse_comment(s, events)?;
                } else if s.starts_with(b"<![CDATA[") {
                    parse_cdata(s, events)?;
                } else {
                    return Err(Error::UnknownToken(s.gen_text_pos()));
                }
            }
            b'?' => parse_pi(s, events)?,
            // The closing tag ends the content of the current element.
            b'/' => return parse_close_element(s, events),
            _ => parse_element(s, events)?,
        }
    }

    Ok(())
}

// CDSect  ::= CDStart CData CDEnd
// CDStart ::= '<![CDATA['
// CData   ::= (Char* - (Char* ']]>' Char*))
// CDEnd   ::= ']]>'
//
// Emits `Token::Cdata` with the text between the delimiters.
fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let section_start = s.pos();
    s.advance(9); // <![CDATA[
    let data = s.consume_chars(|stream, ch| ch != ']' || !stream.starts_with(b"]]>"))?;
    s.skip_string(b"]]>")?;
    events.token(Token::Cdata(data, s.range_from(section_start)))
}

// '</' Name S? '>'
//
// Emits `Token::ElementEnd` with `ElementEnd::Close(prefix, local)`.
fn parse_close_element<'input>(
    s: &mut Stream<'input>,
    events: &mut dyn XmlEvents<'input>,
) -> Result<()> {
    let tag_start = s.pos();
    s.advance(2); // </

    let (prefix, local) = s.consume_qname()?;
    s.skip_spaces();
    s.consume_byte(b'>')?;

    let close = ElementEnd::Close(prefix, local);
    events.token(Token::ElementEnd(close, s.range_from(tag_start)))
}

// Emits `Token::Text` for everything up to the next `<` (or the end of input).
fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let text_start = s.pos();
    let text = s.consume_chars(|_, ch| ch != '<')?;

    // According to the spec, `]]>` must not appear inside a Text node.
    // https://www.w3.org/TR/xml/#syntax
    //
    // The cheaper single-character search for `>` runs first and
    // short-circuits the substring search for most text.
    let has_cdata_end = text.contains('>') && text.contains("]]>");
    if has_cdata_end {
        return Err(Error::InvalidCharacterData(s.gen_text_pos()));
    }

    events.token(Token::Text(text, s.range_from(text_start)))
}

/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
///
/// Produced by `Stream::try_consume_reference`.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference.
    ///
    /// Holds the entity name, i.e. the text between `&` and `;`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference.
    ///
    /// Also used for the predefined entities `quot`, `amp`, `apos`,
    /// `lt` and `gt`, which are resolved to their character right away.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}

/// A byte-position cursor over the input text.
#[derive(Clone)]
pub struct Stream<'input> {
    // Current byte offset into `span`.
    pos: usize,
    // Byte offset at which reading stops (exclusive).
    end: usize,
    // The text being read; `pos` and `end` index into it.
    span: StrSpan<'input>,
}

impl<'input> Stream<'input> {
    /// Creates a stream that covers the whole `text`, positioned at its start.
    #[inline]
    pub fn new(text: &'input str) -> Self {
        Stream {
            pos: 0,
            end: text.len(),
            span: text.into(),
        }
    }

    /// Creates a stream over `text` that starts reading at `fragment.start`
    /// and stops at `fragment.end`.
    ///
    /// Positions reported by the stream remain offsets into the whole `text`.
    #[inline]
    pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
        Stream {
            pos: fragment.start,
            end: fragment.end,
            span: text.into(),
        }
    }

    /// Returns the current byte offset.
    #[inline]
    pub fn pos(&self) -> usize {
        self.pos
    }

    /// Checks whether the current position has reached the end of the stream.
    #[inline]
    pub fn at_end(&self) -> bool {
        self.pos >= self.end
    }

    /// Returns the byte at the current position.
    ///
    /// Fails with `Error::UnexpectedEndOfStream` when the stream is at its end.
    #[inline]
    pub fn curr_byte(&self) -> Result<u8> {
        if self.at_end() {
            return Err(Error::UnexpectedEndOfStream);
        }

        Ok(self.curr_byte_unchecked())
    }

    /// Returns the byte at the current position without an end-of-stream check.
    ///
    /// Panics if the current position is outside of the underlying text.
    #[inline]
    pub fn curr_byte_unchecked(&self) -> u8 {
        self.span.text.as_bytes()[self.pos]
    }

    /// Returns the byte following the current one, without moving the stream.
    ///
    /// Fails with `Error::UnexpectedEndOfStream` when there is no such byte.
    #[inline]
    fn next_byte(&self) -> Result<u8> {
        if self.pos + 1 >= self.end {
            return Err(Error::UnexpectedEndOfStream);
        }

        Ok(self.span.as_str().as_bytes()[self.pos + 1])
    }

    /// Moves the current position forward by `n` bytes.
    ///
    /// Staying within the stream is verified only by a debug assertion.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        debug_assert!(self.pos + n <= self.end);
        self.pos += n;
    }

    /// Checks whether the unread part of the stream starts with `text`.
    #[inline]
    fn starts_with(&self, text: &[u8]) -> bool {
        self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
    }

    /// Consumes the current byte, which must be equal to `c`.
    ///
    /// Fails with `Error::UnexpectedEndOfStream` at the end of the stream and
    /// with `Error::InvalidChar` when the current byte is a different one.
    fn consume_byte(&mut self, c: u8) -> Result<()> {
        let curr = self.curr_byte()?;
        if curr != c {
            return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
        }

        self.advance(1);
        Ok(())
    }

    // Unlike `consume_byte()` will not return any errors.
    //
    // Returns `true` and advances by one byte when the current byte is `c`;
    // otherwise returns `false` and leaves the stream untouched.
    fn try_consume_byte(&mut self, c: u8) -> bool {
        match self.curr_byte() {
            Ok(b) if b == c => {
                self.advance(1);
                true
            }
            _ => false,
        }
    }

    /// Consumes `text`, which must be exactly what the stream starts with.
    ///
    /// Fails with `Error::InvalidString` otherwise.
    fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
        if !self.starts_with(text) {
            let pos = self.gen_text_pos();

            // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
            let expected = str::from_utf8(text).unwrap();

            return Err(Error::InvalidString(expected, pos));
        }

        self.advance(text.len());
        Ok(())
    }

    /// Consumes bytes while `f` returns `true` and returns the consumed text.
    #[inline]
    fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
        let start = self.pos;
        self.skip_bytes(f);
        self.slice_back(start)
    }

    /// Skips bytes while `f` returns `true` or until the end of the stream.
    fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
        while !self.at_end() && f(self.curr_byte_unchecked()) {
            self.advance(1);
        }
    }

    /// Consumes characters while `f` returns `true` and returns the consumed text.
    ///
    /// Fails like `skip_chars()` does.
    #[inline]
    fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
    where
        F: Fn(&Stream, char) -> bool,
    {
        let start = self.pos;
        self.skip_chars(f)?;
        Ok(self.slice_back(start))
    }

    /// Skips characters while `f` returns `true` or until the end of the stream.
    ///
    /// `f` receives the stream positioned at the character being tested,
    /// followed by that character. Every character is validated before `f`
    /// is asked about it; the first one outside of the XML `Char` range
    /// fails with `Error::NonXmlChar`.
    #[inline]
    fn skip_chars<F>(&mut self, f: F) -> Result<()>
    where
        F: Fn(&Stream, char) -> bool,
    {
        // The iterator borrows the input text and not `self`,
        // which is why `self` can be advanced inside of the loop.
        for c in self.chars() {
            if !c.is_xml_char() {
                return Err(Error::NonXmlChar(c, self.gen_text_pos()));
            } else if f(self, c) {
                self.advance(c.len_utf8());
            } else {
                break;
            }
        }

        Ok(())
    }

    /// Returns an iterator over the characters of the unread part of the stream.
    #[inline]
    fn chars(&self) -> str::Chars<'input> {
        self.span.as_str()[self.pos..self.end].chars()
    }

    /// Returns the text between `pos` and the current position.
    #[inline]
    fn slice_back(&self, pos: usize) -> &'input str {
        self.span.slice_region(pos, self.pos)
    }

    /// Same as `slice_back()`, but the result also remembers that it starts at `pos`.
    #[inline]
    fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
        StrSpan::from_substr(self.span.text, pos, self.pos)
    }

    /// Returns the byte range from `start` up to the current position.
    #[inline]
    fn range_from(&self, start: usize) -> Range<usize> {
        start..self.pos
    }

    /// Skips any number of XML whitespace bytes, including none.
    #[inline]
    fn skip_spaces(&mut self) {
        while self.starts_with_space() {
            self.advance(1);
        }
    }

    /// Checks whether the stream is not at its end and the current byte
    /// is an XML whitespace.
    #[inline]
    fn starts_with_space(&self) -> bool {
        !self.at_end() && self.curr_byte_unchecked().is_xml_space()
    }

    // Like `skip_spaces()`, but checks that first char is actually a space.
    //
    // Fails with `Error::UnexpectedEndOfStream` at the end of the stream and
    // with `Error::InvalidChar2` when the current byte is not a whitespace.
    fn consume_spaces(&mut self) -> Result<()> {
        if self.at_end() {
            return Err(Error::UnexpectedEndOfStream);
        }

        if !self.starts_with_space() {
            return Err(Error::InvalidChar2(
                "a whitespace",
                self.curr_byte_unchecked(),
                self.gen_text_pos(),
            ));
        }

        self.skip_spaces();
        Ok(())
    }

    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
    ///
    /// Returns `None` and leaves the stream untouched when it does not
    /// start with a well-formed reference.
    pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
        let start = self.pos();

        // Consume reference on a substream.
        let mut s = self.clone();
        let result = s.consume_reference()?;

        // If the current data is a reference then advance the current stream
        // by number of bytes read by substream.
        self.advance(s.pos() - start);
        Some(result)
    }

    /// Parses `&name;`, `&#digits;` or `&#xhex;` at the current position.
    ///
    /// On failure returns `None`; the stream may already have been advanced
    /// by then, which is why `try_consume_reference()` runs this on a clone.
    #[inline(never)]
    fn consume_reference(&mut self) -> Option<Reference<'input>> {
        if !self.try_consume_byte(b'&') {
            return None;
        }

        let reference = if self.try_consume_byte(b'#') {
            // Character reference: hexadecimal after `x`, decimal otherwise.
            let (value, radix) = if self.try_consume_byte(b'x') {
                let value =
                    self.consume_bytes(|c| matches!(c, b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f'));
                (value, 16)
            } else {
                let value = self.consume_bytes(|c| c.is_ascii_digit());
                (value, 10)
            };

            // Fails for an empty digit run and for values that do not fit into `u32`.
            let n = u32::from_str_radix(value, radix).ok()?;

            // A number that is not a valid `char` becomes U+FFFD.
            let c = char::from_u32(n).unwrap_or('\u{FFFD}');
            if !c.is_xml_char() {
                return None;
            }

            Reference::Char(c)
        } else {
            // Entity reference. The predefined entities are resolved in place.
            let name = self.consume_name().ok()?;
            match name {
                "quot" => Reference::Char('"'),
                "amp" => Reference::Char('&'),
                "apos" => Reference::Char('\''),
                "lt" => Reference::Char('<'),
                "gt" => Reference::Char('>'),
                _ => Reference::Entity(name),
            }
        };

        self.consume_byte(b';').ok()?;

        Some(reference)
    }

    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
    ///
    /// Fails with `Error::InvalidName` when the name is empty or does not
    /// start with a `NameStartChar`.
    fn consume_name(&mut self) -> Result<&'input str> {
        let start = self.pos();
        self.skip_name()?;

        let name = self.slice_back(start);
        if name.is_empty() {
            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
        }

        Ok(name)
    }

    /// The same as `consume_name()`, but does not return a consumed name.
    ///
    /// Note that at the end of the stream this succeeds without consuming
    /// anything; only `consume_name()` rejects an empty name.
    fn skip_name(&mut self) -> Result<()> {
        let start = self.pos();
        let mut iter = self.chars();
        if let Some(c) = iter.next() {
            if c.is_xml_name_start() {
                self.advance(c.len_utf8());
            } else {
                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
            }
        }

        for c in iter {
            if c.is_xml_name() {
                self.advance(c.len_utf8());
            } else {
                break;
            }
        }

        Ok(())
    }

    /// Consumes a qualified XML name and returns it.
    ///
    /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
    ///
    /// Returns `(prefix, local)`. Without a `:` the prefix is an empty slice
    /// located at the start of the name.
    ///
    /// Fails with `Error::InvalidName` when the name contains more than one
    /// `:`, when the prefix or the local name does not start with
    /// a `NameStartChar`, or when the local name is empty.
    #[inline(never)]
    fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
        let start = self.pos();

        // Position of the `:` that separates the prefix from the local name.
        let mut splitter = None;

        while !self.at_end() {
            // Check for ASCII first for performance reasons.
            let b = self.curr_byte_unchecked();
            if b < 128 {
                if b == b':' {
                    if splitter.is_none() {
                        splitter = Some(self.pos());
                        self.advance(1);
                    } else {
                        // Multiple `:` is an error.
                        return Err(Error::InvalidName(self.gen_text_pos_from(start)));
                    }
                } else if b.is_xml_name() {
                    self.advance(1);
                } else {
                    break;
                }
            } else {
                // Fallback to Unicode code point.
                match self.chars().nth(0) {
                    Some(c) if c.is_xml_name() => {
                        self.advance(c.len_utf8());
                    }
                    _ => break,
                }
            }
        }

        let (prefix, local) = if let Some(splitter) = splitter {
            let prefix = self.span.slice_region(start, splitter);
            let local = self.slice_back(splitter + 1);
            (prefix, local)
        } else {
            let local = self.slice_back(start);
            // Slice an empty prefix. This way we can preserve attribute start position.
            (self.span.slice_region(start, start), local)
        };

        // Prefix must start with a `NameStartChar`.
        if let Some(c) = prefix.chars().nth(0) {
            if !c.is_xml_name_start() {
                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
            }
        }

        // Local name must start with a `NameStartChar`.
        if let Some(c) = local.chars().nth(0) {
            if !c.is_xml_name_start() {
                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
            }
        } else {
            // If empty - error.
            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
        }

        Ok((prefix, local))
    }

    /// Consumes a `=` together with any whitespace around it.
    fn consume_eq(&mut self) -> Result<()> {
        self.skip_spaces();
        self.consume_byte(b'=')?;
        self.skip_spaces();

        Ok(())
    }

    /// Consumes a single or a double quote and returns it.
    ///
    /// Fails with `Error::UnexpectedEndOfStream` at the end of the stream and
    /// with `Error::InvalidChar2` when the current byte is not a quote.
    fn consume_quote(&mut self) -> Result<u8> {
        let c = self.curr_byte()?;
        if c == b'\'' || c == b'"' {
            self.advance(1);
            Ok(c)
        } else {
            Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
        }
    }

    /// Calculates a current absolute position.
    ///
    /// Both the row and the column are counted from 1.
    ///
    /// This operation is very expensive. Use only for errors.
    #[inline(never)]
    pub fn gen_text_pos(&self) -> TextPos {
        let text = self.span.as_str();
        let end = self.pos;

        let row = Self::calc_curr_row(text, end);
        let col = Self::calc_curr_col(text, end);
        TextPos::new(row, col)
    }

    /// Calculates an absolute position at `pos`.
    ///
    /// `pos` is clamped to the length of the text.
    ///
    /// This operation is very expensive. Use only for errors.
    #[inline(never)]
    pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
        let mut s = self.clone();
        s.pos = core::cmp::min(pos, s.span.as_str().len());
        s.gen_text_pos()
    }

    /// Returns the 1-based row of byte offset `end`:
    /// one plus the number of `\n` bytes in front of it.
    fn calc_curr_row(text: &str, end: usize) -> u32 {
        let mut row = 1;
        for c in &text.as_bytes()[..end] {
            if *c == b'\n' {
                row += 1;
            }
        }

        row
    }

    /// Returns the 1-based column of byte offset `end`: one plus the number
    /// of characters (not bytes) between the preceding `\n` and `end`.
    fn calc_curr_col(text: &str, end: usize) -> u32 {
        let mut col = 1;
        for c in text[..end].chars().rev() {
            if c == '\n' {
                break;
            } else {
                col += 1;
            }
        }

        col
    }
}