1113 lines
30 KiB
Rust
1113 lines
30 KiB
Rust
use core::ops::Range;
|
|
use core::str;
|
|
|
|
use crate::{Error, TextPos};
|
|
|
|
type Result<T> = core::result::Result<T, Error>;
|
|
|
|
/// Extension methods for XML-subset only operations.
|
|
trait XmlCharExt {
|
|
/// Checks if the value is within the
|
|
/// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
|
|
fn is_xml_name_start(&self) -> bool;
|
|
|
|
/// Checks if the value is within the
|
|
/// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
|
|
fn is_xml_name(&self) -> bool;
|
|
|
|
/// Checks if the value is within the
|
|
/// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
|
|
fn is_xml_char(&self) -> bool;
|
|
}
|
|
|
|
impl XmlCharExt for char {
|
|
#[inline]
|
|
fn is_xml_name_start(&self) -> bool {
|
|
// Check for ASCII first.
|
|
if *self as u32 <= 128 {
|
|
return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
|
|
}
|
|
|
|
matches!(*self as u32,
|
|
0x0000C0..=0x0000D6
|
|
| 0x0000D8..=0x0000F6
|
|
| 0x0000F8..=0x0002FF
|
|
| 0x000370..=0x00037D
|
|
| 0x00037F..=0x001FFF
|
|
| 0x00200C..=0x00200D
|
|
| 0x002070..=0x00218F
|
|
| 0x002C00..=0x002FEF
|
|
| 0x003001..=0x00D7FF
|
|
| 0x00F900..=0x00FDCF
|
|
| 0x00FDF0..=0x00FFFD
|
|
| 0x010000..=0x0EFFFF)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_xml_name(&self) -> bool {
|
|
// Check for ASCII first.
|
|
if *self as u32 <= 128 {
|
|
return (*self as u8).is_xml_name();
|
|
}
|
|
|
|
matches!(*self as u32, 0x0000B7
|
|
| 0x0000C0..=0x0000D6
|
|
| 0x0000D8..=0x0000F6
|
|
| 0x0000F8..=0x0002FF
|
|
| 0x000300..=0x00036F
|
|
| 0x000370..=0x00037D
|
|
| 0x00037F..=0x001FFF
|
|
| 0x00200C..=0x00200D
|
|
| 0x00203F..=0x002040
|
|
| 0x002070..=0x00218F
|
|
| 0x002C00..=0x002FEF
|
|
| 0x003001..=0x00D7FF
|
|
| 0x00F900..=0x00FDCF
|
|
| 0x00FDF0..=0x00FFFD
|
|
| 0x010000..=0x0EFFFF)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_xml_char(&self) -> bool {
|
|
// Does not check for surrogate code points U+D800-U+DFFF,
|
|
// since that check was performed by Rust when the `&str` was constructed.
|
|
if (*self as u32) < 0x20 {
|
|
return (*self as u8).is_xml_space();
|
|
}
|
|
|
|
!matches!(*self as u32, 0xFFFF | 0xFFFE)
|
|
}
|
|
}
|
|
|
|
trait XmlByteExt {
|
|
/// Checks if byte is a space.
|
|
///
|
|
/// `[ \r\n\t]`
|
|
fn is_xml_space(&self) -> bool;
|
|
|
|
/// Checks if byte is within the ASCII
|
|
/// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
|
|
fn is_xml_name(&self) -> bool;
|
|
}
|
|
|
|
impl XmlByteExt for u8 {
|
|
#[inline]
|
|
fn is_xml_space(&self) -> bool {
|
|
matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
|
|
}
|
|
|
|
#[inline]
|
|
fn is_xml_name(&self) -> bool {
|
|
matches!(*self, b'A'..=b'Z' | b'a'..=b'z'| b'0'..=b'9'| b':' | b'_' | b'-' | b'.')
|
|
}
|
|
}
|
|
|
|
/// A string slice.
|
|
///
|
|
/// Like `&str`, but also contains the position in the input XML
|
|
/// from which it was parsed.
|
|
#[must_use]
|
|
#[derive(Clone, Copy)]
|
|
pub struct StrSpan<'input> {
|
|
text: &'input str,
|
|
start: usize,
|
|
}
|
|
|
|
impl<'input> From<&'input str> for StrSpan<'input> {
|
|
#[inline]
|
|
fn from(text: &'input str) -> Self {
|
|
StrSpan { text, start: 0 }
|
|
}
|
|
}
|
|
|
|
impl<'input> StrSpan<'input> {
|
|
#[inline]
|
|
pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan {
|
|
debug_assert!(start <= end);
|
|
StrSpan {
|
|
text: &text[start..end],
|
|
start,
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
pub fn range(&self) -> Range<usize> {
|
|
self.start..(self.start + self.text.len())
|
|
}
|
|
|
|
#[inline]
|
|
pub fn as_str(&self) -> &'input str {
|
|
self.text
|
|
}
|
|
|
|
#[inline]
|
|
fn slice_region(&self, start: usize, end: usize) -> &'input str {
|
|
&self.text[start..end]
|
|
}
|
|
}
|
|
|
|
pub enum Token<'input> {
|
|
// <?target content?>
|
|
ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),
|
|
|
|
// <!-- text -->
|
|
Comment(&'input str, Range<usize>),
|
|
|
|
// <!ENTITY ns_extend "http://test.com">
|
|
EntityDeclaration(&'input str, StrSpan<'input>),
|
|
|
|
// <ns:elem
|
|
ElementStart(&'input str, &'input str, usize),
|
|
|
|
// ns:attr="value"
|
|
Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),
|
|
|
|
ElementEnd(ElementEnd<'input>, Range<usize>),
|
|
|
|
// Contains text between elements including whitespaces.
|
|
// Basically everything between `>` and `<`.
|
|
// Except `]]>`, which is not allowed and will lead to an error.
|
|
Text(&'input str, Range<usize>),
|
|
|
|
// <![CDATA[text]]>
|
|
Cdata(&'input str, Range<usize>),
|
|
}
|
|
|
|
/// `ElementEnd` token.
|
|
#[derive(Clone, Copy)]
|
|
pub enum ElementEnd<'input> {
|
|
/// Indicates `>`
|
|
Open,
|
|
/// Indicates `</ns:name>`
|
|
Close(&'input str, &'input str),
|
|
/// Indicates `/>`
|
|
Empty,
|
|
}
|
|
|
|
pub trait XmlEvents<'input> {
|
|
fn token(&mut self, token: Token<'input>) -> Result<()>;
|
|
}
|
|
|
|
// document ::= prolog element Misc*
|
|
pub fn parse<'input>(
|
|
text: &'input str,
|
|
allow_dtd: bool,
|
|
events: &mut dyn XmlEvents<'input>,
|
|
) -> Result<()> {
|
|
let s = &mut Stream::new(text);
|
|
|
|
// Skip UTF-8 BOM.
|
|
if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
|
|
s.advance(3);
|
|
}
|
|
|
|
if s.starts_with(b"<?xml ") {
|
|
parse_declaration(s)?;
|
|
}
|
|
|
|
parse_misc(s, events)?;
|
|
|
|
s.skip_spaces();
|
|
if s.starts_with(b"<!DOCTYPE") {
|
|
if !allow_dtd {
|
|
return Err(Error::DtdDetected);
|
|
}
|
|
|
|
parse_doctype(s, events)?;
|
|
parse_misc(s, events)?;
|
|
}
|
|
|
|
s.skip_spaces();
|
|
if s.curr_byte().ok() == Some(b'<') {
|
|
parse_element(s, events)?;
|
|
}
|
|
|
|
parse_misc(s, events)?;
|
|
|
|
if !s.at_end() {
|
|
return Err(Error::UnknownToken(s.gen_text_pos()));
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Misc ::= Comment | PI | S
|
|
fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
while !s.at_end() {
|
|
s.skip_spaces();
|
|
if s.starts_with(b"<!--") {
|
|
parse_comment(s, events)?;
|
|
} else if s.starts_with(b"<?") {
|
|
parse_pi(s, events)?;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
|
|
//
|
|
// We don't actually return a token for the XML declaration and only validate it.
|
|
fn parse_declaration(s: &mut Stream) -> Result<()> {
|
|
fn consume_spaces(s: &mut Stream) -> Result<()> {
|
|
if s.starts_with_space() {
|
|
s.skip_spaces();
|
|
} else if !s.starts_with(b"?>") && !s.at_end() {
|
|
return Err(Error::InvalidChar2(
|
|
"a whitespace",
|
|
s.curr_byte_unchecked(),
|
|
s.gen_text_pos(),
|
|
));
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
s.advance(5); // <?xml
|
|
consume_spaces(s)?;
|
|
|
|
// The `version` "attribute" is mandatory.
|
|
if !s.starts_with(b"version") {
|
|
// Will trigger the InvalidString error, which is what we want.
|
|
return s.skip_string(b"version");
|
|
}
|
|
let _ = parse_attribute(s)?;
|
|
consume_spaces(s)?;
|
|
|
|
if s.starts_with(b"encoding") {
|
|
let _ = parse_attribute(s)?;
|
|
consume_spaces(s)?;
|
|
}
|
|
|
|
if s.starts_with(b"standalone") {
|
|
let _ = parse_attribute(s)?;
|
|
}
|
|
|
|
s.skip_spaces();
|
|
s.skip_string(b"?>")?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
|
|
fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
let start = s.pos();
|
|
s.advance(4);
|
|
let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
|
|
s.skip_string(b"-->")?;
|
|
|
|
if text.contains("--") {
|
|
return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
|
|
}
|
|
|
|
if text.ends_with('-') {
|
|
return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
|
|
}
|
|
|
|
let range = s.range_from(start);
|
|
events.token(Token::Comment(text, range))?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
|
|
// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
|
|
fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
if s.starts_with(b"<?xml ") {
|
|
return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
|
|
}
|
|
|
|
let start = s.pos();
|
|
s.advance(2);
|
|
let target = s.consume_name()?;
|
|
s.skip_spaces();
|
|
let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
|
|
let content = if !content.is_empty() {
|
|
Some(content)
|
|
} else {
|
|
None
|
|
};
|
|
|
|
s.skip_string(b"?>")?;
|
|
|
|
let range = s.range_from(start);
|
|
events.token(Token::ProcessingInstruction(target, content, range))?;
|
|
Ok(())
|
|
}
|
|
|
|
fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
let start = s.pos();
|
|
parse_doctype_start(s)?;
|
|
s.skip_spaces();
|
|
|
|
if s.curr_byte() == Ok(b'>') {
|
|
s.advance(1);
|
|
return Ok(());
|
|
}
|
|
|
|
s.advance(1); // [
|
|
while !s.at_end() {
|
|
s.skip_spaces();
|
|
if s.starts_with(b"<!ENTITY") {
|
|
parse_entity_decl(s, events)?;
|
|
} else if s.starts_with(b"<!--") {
|
|
parse_comment(s, events)?;
|
|
} else if s.starts_with(b"<?") {
|
|
parse_pi(s, events)?;
|
|
} else if s.starts_with(b"]") {
|
|
// DTD ends with ']' S? '>', therefore we have to skip possible spaces.
|
|
s.advance(1);
|
|
s.skip_spaces();
|
|
match s.curr_byte() {
|
|
Ok(b'>') => {
|
|
s.advance(1);
|
|
break;
|
|
}
|
|
Ok(c) => {
|
|
return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
|
|
}
|
|
Err(_) => {
|
|
return Err(Error::UnexpectedEndOfStream);
|
|
}
|
|
}
|
|
} else if s.starts_with(b"<!ELEMENT")
|
|
|| s.starts_with(b"<!ATTLIST")
|
|
|| s.starts_with(b"<!NOTATION")
|
|
{
|
|
if consume_decl(s).is_err() {
|
|
let pos = s.gen_text_pos_from(start);
|
|
return Err(Error::UnknownToken(pos));
|
|
}
|
|
} else {
|
|
return Err(Error::UnknownToken(s.gen_text_pos()));
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
|
|
fn parse_doctype_start(s: &mut Stream) -> Result<()> {
|
|
s.advance(9);
|
|
|
|
s.consume_spaces()?;
|
|
s.skip_name()?;
|
|
s.skip_spaces();
|
|
|
|
let _ = parse_external_id(s)?;
|
|
s.skip_spaces();
|
|
|
|
let c = s.curr_byte()?;
|
|
if c != b'[' && c != b'>' {
|
|
return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
|
|
fn parse_external_id(s: &mut Stream) -> Result<bool> {
|
|
let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
|
|
let start = s.pos();
|
|
s.advance(6);
|
|
let id = s.slice_back(start);
|
|
|
|
s.consume_spaces()?;
|
|
let quote = s.consume_quote()?;
|
|
let _ = s.consume_bytes(|c| c != quote);
|
|
s.consume_byte(quote)?;
|
|
|
|
if id == "SYSTEM" {
|
|
// Ok
|
|
} else {
|
|
s.consume_spaces()?;
|
|
let quote = s.consume_quote()?;
|
|
let _ = s.consume_bytes(|c| c != quote);
|
|
s.consume_byte(quote)?;
|
|
}
|
|
|
|
true
|
|
} else {
|
|
false
|
|
};
|
|
|
|
Ok(v)
|
|
}
|
|
|
|
// EntityDecl ::= GEDecl | PEDecl
|
|
// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
|
|
// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
|
|
fn parse_entity_decl<'input>(
|
|
s: &mut Stream<'input>,
|
|
events: &mut dyn XmlEvents<'input>,
|
|
) -> Result<()> {
|
|
s.advance(8);
|
|
s.consume_spaces()?;
|
|
|
|
let is_ge = if s.try_consume_byte(b'%') {
|
|
s.consume_spaces()?;
|
|
false
|
|
} else {
|
|
true
|
|
};
|
|
|
|
let name = s.consume_name()?;
|
|
s.consume_spaces()?;
|
|
if let Some(definition) = parse_entity_def(s, is_ge)? {
|
|
events.token(Token::EntityDeclaration(name, definition))?;
|
|
}
|
|
s.skip_spaces();
|
|
s.consume_byte(b'>')?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// EntityDef ::= EntityValue | (ExternalID NDataDecl?)
|
|
// PEDef ::= EntityValue | ExternalID
|
|
// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&']
|
|
// | PEReference | Reference)* "'"
|
|
// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
|
|
// NDataDecl ::= S 'NDATA' S Name
|
|
fn parse_entity_def<'input>(
|
|
s: &mut Stream<'input>,
|
|
is_ge: bool,
|
|
) -> Result<Option<StrSpan<'input>>> {
|
|
let c = s.curr_byte()?;
|
|
match c {
|
|
b'"' | b'\'' => {
|
|
let quote = s.consume_quote()?;
|
|
let start = s.pos();
|
|
s.skip_bytes(|c| c != quote);
|
|
let value = s.slice_back_span(start);
|
|
s.consume_byte(quote)?;
|
|
Ok(Some(value))
|
|
}
|
|
b'S' | b'P' => {
|
|
if parse_external_id(s)? {
|
|
if is_ge {
|
|
s.skip_spaces();
|
|
if s.starts_with(b"NDATA") {
|
|
s.advance(5);
|
|
s.consume_spaces()?;
|
|
s.skip_name()?;
|
|
// TODO: NDataDecl is not supported
|
|
}
|
|
}
|
|
|
|
Ok(None)
|
|
} else {
|
|
Err(Error::InvalidExternalID(s.gen_text_pos()))
|
|
}
|
|
}
|
|
_ => {
|
|
let pos = s.gen_text_pos();
|
|
Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
|
|
}
|
|
}
|
|
}
|
|
|
|
fn consume_decl(s: &mut Stream) -> Result<()> {
|
|
s.skip_bytes(|c| c != b'>');
|
|
s.consume_byte(b'>')?;
|
|
Ok(())
|
|
}
|
|
|
|
// element ::= EmptyElemTag | STag content ETag
|
|
// '<' Name (S Attribute)* S? '>'
|
|
fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
let start = s.pos();
|
|
s.advance(1); // <
|
|
let (prefix, local) = s.consume_qname()?;
|
|
events.token(Token::ElementStart(prefix, local, start))?;
|
|
|
|
let mut open = false;
|
|
while !s.at_end() {
|
|
let has_space = s.starts_with_space();
|
|
s.skip_spaces();
|
|
let start = s.pos();
|
|
match s.curr_byte()? {
|
|
b'/' => {
|
|
s.advance(1);
|
|
s.consume_byte(b'>')?;
|
|
let range = s.range_from(start);
|
|
events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
|
|
break;
|
|
}
|
|
b'>' => {
|
|
s.advance(1);
|
|
let range = s.range_from(start);
|
|
events.token(Token::ElementEnd(ElementEnd::Open, range))?;
|
|
open = true;
|
|
break;
|
|
}
|
|
_ => {
|
|
// An attribute must be preceded with a whitespace.
|
|
if !has_space {
|
|
// Will always trigger an error. Which is what we want.
|
|
s.consume_spaces()?;
|
|
}
|
|
|
|
// Manual inlining of `parse_attribute` for performance.
|
|
// We cannot mark `parse_attribute` as `#[inline(always)]`
|
|
// because it will blow up the binary size.
|
|
let (prefix, local) = s.consume_qname()?;
|
|
let qname_end = s.pos();
|
|
let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
|
|
s.consume_eq()?;
|
|
let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
|
|
let quote = s.consume_quote()?;
|
|
let quote_c = quote as char;
|
|
// The attribute value must not contain the < character.
|
|
let value_start = s.pos();
|
|
s.skip_chars(|_, c| c != quote_c && c != '<')?;
|
|
let value = s.slice_back_span(value_start);
|
|
s.consume_byte(quote)?;
|
|
let end = s.pos();
|
|
events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
|
|
}
|
|
}
|
|
}
|
|
|
|
if open {
|
|
parse_content(s, events)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Attribute ::= Name Eq AttValue
|
|
fn parse_attribute<'input>(
|
|
s: &mut Stream<'input>,
|
|
) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
|
|
let (prefix, local) = s.consume_qname()?;
|
|
s.consume_eq()?;
|
|
let quote = s.consume_quote()?;
|
|
let quote_c = quote as char;
|
|
// The attribute value must not contain the < character.
|
|
let value_start = s.pos();
|
|
s.skip_chars(|_, c| c != quote_c && c != '<')?;
|
|
let value = s.slice_back_span(value_start);
|
|
s.consume_byte(quote)?;
|
|
Ok((prefix, local, value))
|
|
}
|
|
|
|
// content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
|
|
pub fn parse_content<'input>(
|
|
s: &mut Stream<'input>,
|
|
events: &mut dyn XmlEvents<'input>,
|
|
) -> Result<()> {
|
|
while !s.at_end() {
|
|
match s.curr_byte() {
|
|
Ok(b'<') => match s.next_byte() {
|
|
Ok(b'!') => {
|
|
if s.starts_with(b"<!--") {
|
|
parse_comment(s, events)?;
|
|
} else if s.starts_with(b"<![CDATA[") {
|
|
parse_cdata(s, events)?;
|
|
} else {
|
|
return Err(Error::UnknownToken(s.gen_text_pos()));
|
|
}
|
|
}
|
|
Ok(b'?') => parse_pi(s, events)?,
|
|
Ok(b'/') => {
|
|
parse_close_element(s, events)?;
|
|
break;
|
|
}
|
|
Ok(_) => parse_element(s, events)?,
|
|
Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
|
|
},
|
|
Ok(_) => parse_text(s, events)?,
|
|
Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// CDSect ::= CDStart CData CDEnd
|
|
// CDStart ::= '<![CDATA['
|
|
// CData ::= (Char* - (Char* ']]>' Char*))
|
|
// CDEnd ::= ']]>'
|
|
fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
let start = s.pos();
|
|
s.advance(9); // <![CDATA[
|
|
let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
|
|
s.skip_string(b"]]>")?;
|
|
let range = s.range_from(start);
|
|
events.token(Token::Cdata(text, range))?;
|
|
Ok(())
|
|
}
|
|
|
|
// '</' Name S? '>'
|
|
fn parse_close_element<'input>(
|
|
s: &mut Stream<'input>,
|
|
events: &mut dyn XmlEvents<'input>,
|
|
) -> Result<()> {
|
|
let start = s.pos();
|
|
s.advance(2); // </
|
|
|
|
let (prefix, tag_name) = s.consume_qname()?;
|
|
s.skip_spaces();
|
|
s.consume_byte(b'>')?;
|
|
|
|
let range = s.range_from(start);
|
|
events.token(Token::ElementEnd(
|
|
ElementEnd::Close(prefix, tag_name),
|
|
range,
|
|
))?;
|
|
Ok(())
|
|
}
|
|
|
|
fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
|
|
let start = s.pos();
|
|
let text = s.consume_chars(|_, c| c != '<')?;
|
|
|
|
// According to the spec, `]]>` must not appear inside a Text node.
|
|
// https://www.w3.org/TR/xml/#syntax
|
|
//
|
|
// Search for `>` first, since it's a bit faster than looking for `]]>`.
|
|
if text.contains('>') && text.contains("]]>") {
|
|
return Err(Error::InvalidCharacterData(s.gen_text_pos()));
|
|
}
|
|
|
|
let range = s.range_from(start);
|
|
events.token(Token::Text(text, range))?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
|
|
#[derive(Clone, Copy)]
|
|
pub enum Reference<'input> {
|
|
/// An entity reference.
|
|
///
|
|
/// <https://www.w3.org/TR/xml/#NT-EntityRef>
|
|
Entity(&'input str),
|
|
|
|
/// A character reference.
|
|
///
|
|
/// <https://www.w3.org/TR/xml/#NT-CharRef>
|
|
Char(char),
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct Stream<'input> {
|
|
pos: usize,
|
|
end: usize,
|
|
span: StrSpan<'input>,
|
|
}
|
|
|
|
impl<'input> Stream<'input> {
|
|
#[inline]
|
|
pub fn new(text: &'input str) -> Self {
|
|
Stream {
|
|
pos: 0,
|
|
end: text.len(),
|
|
span: text.into(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
|
|
Stream {
|
|
pos: fragment.start,
|
|
end: fragment.end,
|
|
span: text.into(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
pub fn pos(&self) -> usize {
|
|
self.pos
|
|
}
|
|
|
|
#[inline]
|
|
pub fn at_end(&self) -> bool {
|
|
self.pos >= self.end
|
|
}
|
|
|
|
#[inline]
|
|
pub fn curr_byte(&self) -> Result<u8> {
|
|
if self.at_end() {
|
|
return Err(Error::UnexpectedEndOfStream);
|
|
}
|
|
|
|
Ok(self.curr_byte_unchecked())
|
|
}
|
|
|
|
#[inline]
|
|
pub fn curr_byte_unchecked(&self) -> u8 {
|
|
self.span.text.as_bytes()[self.pos]
|
|
}
|
|
|
|
#[inline]
|
|
fn next_byte(&self) -> Result<u8> {
|
|
if self.pos + 1 >= self.end {
|
|
return Err(Error::UnexpectedEndOfStream);
|
|
}
|
|
|
|
Ok(self.span.as_str().as_bytes()[self.pos + 1])
|
|
}
|
|
|
|
#[inline]
|
|
pub fn advance(&mut self, n: usize) {
|
|
debug_assert!(self.pos + n <= self.end);
|
|
self.pos += n;
|
|
}
|
|
|
|
#[inline]
|
|
fn starts_with(&self, text: &[u8]) -> bool {
|
|
self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
|
|
}
|
|
|
|
fn consume_byte(&mut self, c: u8) -> Result<()> {
|
|
let curr = self.curr_byte()?;
|
|
if curr != c {
|
|
return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
|
|
}
|
|
|
|
self.advance(1);
|
|
Ok(())
|
|
}
|
|
|
|
// Unlike `consume_byte()` will not return any errors.
|
|
fn try_consume_byte(&mut self, c: u8) -> bool {
|
|
match self.curr_byte() {
|
|
Ok(b) if b == c => {
|
|
self.advance(1);
|
|
true
|
|
}
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
|
|
if !self.starts_with(text) {
|
|
let pos = self.gen_text_pos();
|
|
|
|
// Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
|
|
let expected = str::from_utf8(text).unwrap();
|
|
|
|
return Err(Error::InvalidString(expected, pos));
|
|
}
|
|
|
|
self.advance(text.len());
|
|
Ok(())
|
|
}
|
|
|
|
#[inline]
|
|
fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
|
|
let start = self.pos;
|
|
self.skip_bytes(f);
|
|
self.slice_back(start)
|
|
}
|
|
|
|
fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
|
|
while !self.at_end() && f(self.curr_byte_unchecked()) {
|
|
self.advance(1);
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
|
|
where
|
|
F: Fn(&Stream, char) -> bool,
|
|
{
|
|
let start = self.pos;
|
|
self.skip_chars(f)?;
|
|
Ok(self.slice_back(start))
|
|
}
|
|
|
|
#[inline]
|
|
fn skip_chars<F>(&mut self, f: F) -> Result<()>
|
|
where
|
|
F: Fn(&Stream, char) -> bool,
|
|
{
|
|
for c in self.chars() {
|
|
if !c.is_xml_char() {
|
|
return Err(Error::NonXmlChar(c, self.gen_text_pos()));
|
|
} else if f(self, c) {
|
|
self.advance(c.len_utf8());
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[inline]
|
|
fn chars(&self) -> str::Chars<'input> {
|
|
self.span.as_str()[self.pos..self.end].chars()
|
|
}
|
|
|
|
#[inline]
|
|
fn slice_back(&self, pos: usize) -> &'input str {
|
|
self.span.slice_region(pos, self.pos)
|
|
}
|
|
|
|
#[inline]
|
|
fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
|
|
StrSpan::from_substr(self.span.text, pos, self.pos)
|
|
}
|
|
|
|
#[inline]
|
|
fn range_from(&self, start: usize) -> Range<usize> {
|
|
start..self.pos
|
|
}
|
|
|
|
#[inline]
|
|
fn skip_spaces(&mut self) {
|
|
while self.starts_with_space() {
|
|
self.advance(1);
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn starts_with_space(&self) -> bool {
|
|
!self.at_end() && self.curr_byte_unchecked().is_xml_space()
|
|
}
|
|
|
|
// Like `skip_spaces()`, but checks that first char is actually a space.
|
|
fn consume_spaces(&mut self) -> Result<()> {
|
|
if self.at_end() {
|
|
return Err(Error::UnexpectedEndOfStream);
|
|
}
|
|
|
|
if !self.starts_with_space() {
|
|
return Err(Error::InvalidChar2(
|
|
"a whitespace",
|
|
self.curr_byte_unchecked(),
|
|
self.gen_text_pos(),
|
|
));
|
|
}
|
|
|
|
self.skip_spaces();
|
|
Ok(())
|
|
}
|
|
|
|
/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
|
|
pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
|
|
let start = self.pos();
|
|
|
|
// Consume reference on a substream.
|
|
let mut s = self.clone();
|
|
let result = s.consume_reference()?;
|
|
|
|
// If the current data is a reference than advance the current stream
|
|
// by number of bytes read by substream.
|
|
self.advance(s.pos() - start);
|
|
Some(result)
|
|
}
|
|
|
|
#[inline(never)]
|
|
fn consume_reference(&mut self) -> Option<Reference<'input>> {
|
|
if !self.try_consume_byte(b'&') {
|
|
return None;
|
|
}
|
|
|
|
let reference = if self.try_consume_byte(b'#') {
|
|
let (value, radix) = if self.try_consume_byte(b'x') {
|
|
let value =
|
|
self.consume_bytes(|c| matches!(c, b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f'));
|
|
(value, 16)
|
|
} else {
|
|
let value = self.consume_bytes(|c| c.is_ascii_digit());
|
|
(value, 10)
|
|
};
|
|
|
|
let n = u32::from_str_radix(value, radix).ok()?;
|
|
|
|
let c = char::from_u32(n).unwrap_or('\u{FFFD}');
|
|
if !c.is_xml_char() {
|
|
return None;
|
|
}
|
|
|
|
Reference::Char(c)
|
|
} else {
|
|
let name = self.consume_name().ok()?;
|
|
match name {
|
|
"quot" => Reference::Char('"'),
|
|
"amp" => Reference::Char('&'),
|
|
"apos" => Reference::Char('\''),
|
|
"lt" => Reference::Char('<'),
|
|
"gt" => Reference::Char('>'),
|
|
_ => Reference::Entity(name),
|
|
}
|
|
};
|
|
|
|
self.consume_byte(b';').ok()?;
|
|
|
|
Some(reference)
|
|
}
|
|
|
|
/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
|
|
fn consume_name(&mut self) -> Result<&'input str> {
|
|
let start = self.pos();
|
|
self.skip_name()?;
|
|
|
|
let name = self.slice_back(start);
|
|
if name.is_empty() {
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
|
|
Ok(name)
|
|
}
|
|
|
|
/// The same as `consume_name()`, but does not return a consumed name.
|
|
fn skip_name(&mut self) -> Result<()> {
|
|
let start = self.pos();
|
|
let mut iter = self.chars();
|
|
if let Some(c) = iter.next() {
|
|
if c.is_xml_name_start() {
|
|
self.advance(c.len_utf8());
|
|
} else {
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
}
|
|
|
|
for c in iter {
|
|
if c.is_xml_name() {
|
|
self.advance(c.len_utf8());
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Consumes a qualified XML name and returns it.
|
|
///
|
|
/// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
|
|
#[inline(never)]
|
|
fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
|
|
let start = self.pos();
|
|
|
|
let mut splitter = None;
|
|
|
|
while !self.at_end() {
|
|
// Check for ASCII first for performance reasons.
|
|
let b = self.curr_byte_unchecked();
|
|
if b < 128 {
|
|
if b == b':' {
|
|
if splitter.is_none() {
|
|
splitter = Some(self.pos());
|
|
self.advance(1);
|
|
} else {
|
|
// Multiple `:` is an error.
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
} else if b.is_xml_name() {
|
|
self.advance(1);
|
|
} else {
|
|
break;
|
|
}
|
|
} else {
|
|
// Fallback to Unicode code point.
|
|
match self.chars().nth(0) {
|
|
Some(c) if c.is_xml_name() => {
|
|
self.advance(c.len_utf8());
|
|
}
|
|
_ => break,
|
|
}
|
|
}
|
|
}
|
|
|
|
let (prefix, local) = if let Some(splitter) = splitter {
|
|
let prefix = self.span.slice_region(start, splitter);
|
|
let local = self.slice_back(splitter + 1);
|
|
(prefix, local)
|
|
} else {
|
|
let local = self.slice_back(start);
|
|
// Slice an empty prefix. This way we can preserve attribute start position.
|
|
(self.span.slice_region(start, start), local)
|
|
};
|
|
|
|
// Prefix must start with a `NameStartChar`.
|
|
if let Some(c) = prefix.chars().nth(0) {
|
|
if !c.is_xml_name_start() {
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
}
|
|
|
|
// Local name must start with a `NameStartChar`.
|
|
if let Some(c) = local.chars().nth(0) {
|
|
if !c.is_xml_name_start() {
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
} else {
|
|
// If empty - error.
|
|
return Err(Error::InvalidName(self.gen_text_pos_from(start)));
|
|
}
|
|
|
|
Ok((prefix, local))
|
|
}
|
|
|
|
fn consume_eq(&mut self) -> Result<()> {
|
|
self.skip_spaces();
|
|
self.consume_byte(b'=')?;
|
|
self.skip_spaces();
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn consume_quote(&mut self) -> Result<u8> {
|
|
let c = self.curr_byte()?;
|
|
if c == b'\'' || c == b'"' {
|
|
self.advance(1);
|
|
Ok(c)
|
|
} else {
|
|
Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
|
|
}
|
|
}
|
|
|
|
/// Calculates a current absolute position.
|
|
///
|
|
/// This operation is very expensive. Use only for errors.
|
|
#[inline(never)]
|
|
pub fn gen_text_pos(&self) -> TextPos {
|
|
let text = self.span.as_str();
|
|
let end = self.pos;
|
|
|
|
let row = Self::calc_curr_row(text, end);
|
|
let col = Self::calc_curr_col(text, end);
|
|
TextPos::new(row, col)
|
|
}
|
|
|
|
/// Calculates an absolute position at `pos`.
|
|
///
|
|
/// This operation is very expensive. Use only for errors.
|
|
#[inline(never)]
|
|
pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
|
|
let mut s = self.clone();
|
|
s.pos = core::cmp::min(pos, s.span.as_str().len());
|
|
s.gen_text_pos()
|
|
}
|
|
|
|
fn calc_curr_row(text: &str, end: usize) -> u32 {
|
|
let mut row = 1;
|
|
for c in &text.as_bytes()[..end] {
|
|
if *c == b'\n' {
|
|
row += 1;
|
|
}
|
|
}
|
|
|
|
row
|
|
}
|
|
|
|
fn calc_curr_col(text: &str, end: usize) -> u32 {
|
|
let mut col = 1;
|
|
for c in text[..end].chars().rev() {
|
|
if c == '\n' {
|
|
break;
|
|
} else {
|
|
col += 1;
|
|
}
|
|
}
|
|
|
|
col
|
|
}
|
|
}
|