use core::ops::Range;
use core::str;

use crate::{Error, TextPos};

/// Result type used throughout the tokenizer; the error is always [`Error`].
type Result<T> = core::result::Result<T, Error>;

/// Extension methods for XML-subset only operations.
trait XmlCharExt {
    /// Checks if the value is within the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the value is within the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;

    /// Checks if the value is within the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
    fn is_xml_char(&self) -> bool;
}

impl XmlCharExt for char {
    #[inline]
    fn is_xml_name_start(&self) -> bool {
        // Check for ASCII first: a cheap byte-level match covers the common case.
        if self.is_ascii() {
            return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
        }

        matches!(*self as u32,
            0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        // Check for ASCII first and reuse the byte-level check.
        if self.is_ascii() {
            return (*self as u8).is_xml_name();
        }

        matches!(*self as u32,
            0x0000B7
            | 0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000300..=0x00036F
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x00203F..=0x002040
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        // Does not check for surrogate code points U+D800-U+DFFF,
        // since that check was performed by Rust when the `&str` was constructed.
        if (*self as u32) < 0x20 {
            // Below U+0020 only tab, line feed and carriage return are accepted.
            return (*self as u8).is_xml_space();
        }

        !matches!(*self as u32, 0xFFFF | 0xFFFE)
    }
}

/// Byte-level counterparts of the XML character checks.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII part of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        matches!(*self, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.')
    }
}

/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    text: &'input str,
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        StrSpan { text, start: 0 }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span over `text[start..end]` that remembers `start` as its offset.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid `str` slice range for `text`
    /// (out of bounds or not on a character boundary).
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan<'_> {
        debug_assert!(start <= end);
        StrSpan {
            text: &text[start..end],
            start,
        }
    }

    /// Returns the byte range this span occupies, relative to the span's start offset.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..(self.start + self.text.len())
    }

    /// Returns the underlying text.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `start..end` of the span's own text (offsets are relative to the span).
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        &self.text[start..end]
    }
}

/// Events produced by the tokenizer and passed to [`XmlEvents::token`].
pub enum Token<'input> {
    // Processing instruction: target, optional content, source range.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    // Comment: text between the comment delimiters, source range.
    Comment(&'input str, Range<usize>),

    // Entity declaration: name and its quoted value.
    EntityDeclaration(&'input str, StrSpan<'input>),

    // Element start: prefix, local name, position of the opening `<`.
    ElementStart(&'input str, &'input str, usize),

    // Attribute: source range, qualified-name length, length of the `=` part
    // (both lengths saturate at the type's maximum), prefix, local name, value.
    Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),

    // How an element tag ended, with the source range of that ending.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    // Contains text between elements including whitespaces.
    // Basically everything between `>` and `<`.
    // Except `]]>`, which is not allowed and will lead to an error.
    Text(&'input str, Range<usize>),

    // CDATA section: presumably its text and source range (producer not visible here).
    Cdata(&'input str, Range<usize>),
}

/// `ElementEnd` token.
#[derive(Clone, Copy)] pub enum ElementEnd<'input> { /// Indicates `>` Open, /// Indicates `` Close(&'input str, &'input str), /// Indicates `/>` Empty, } pub trait XmlEvents<'input> { fn token(&mut self, token: Token<'input>) -> Result<()>; } // document ::= prolog element Misc* pub fn parse<'input>( text: &'input str, allow_dtd: bool, events: &mut dyn XmlEvents<'input>, ) -> Result<()> { let s = &mut Stream::new(text); // Skip UTF-8 BOM. if s.starts_with(&[0xEF, 0xBB, 0xBF]) { s.advance(3); } if s.starts_with(b"(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { while !s.at_end() { s.skip_spaces(); if s.starts_with(b"' fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { let start = s.pos(); s.advance(4); let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?; s.skip_string(b"-->")?; if text.contains("--") { return Err(Error::InvalidComment(s.gen_text_pos_from(start))); } if text.ends_with('-') { return Err(Error::InvalidComment(s.gen_text_pos_from(start))); } let range = s.range_from(start); events.token(Token::Comment(text, range))?; Ok(()) } // PI ::= '' Char*)))? '?>' // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { if s.starts_with(b"")))?; let content = if !content.is_empty() { Some(content) } else { None }; s.skip_string(b"?>")?; let range = s.range_from(start); events.token(Token::ProcessingInstruction(target, content, range))?; Ok(()) } fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { let start = s.pos(); parse_doctype_start(s)?; s.skip_spaces(); if s.curr_byte() == Ok(b'>') { s.advance(1); return Ok(()); } s.advance(1); // [ while !s.at_end() { s.skip_spaces(); if s.starts_with(b"', therefore we have to skip possible spaces. 
s.advance(1); s.skip_spaces(); match s.curr_byte() { Ok(b'>') => { s.advance(1); break; } Ok(c) => { return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos())); } Err(_) => { return Err(Error::UnexpectedEndOfStream); } } } else if s.starts_with(b"' fn parse_doctype_start(s: &mut Stream) -> Result<()> { s.advance(9); s.consume_spaces()?; s.skip_name()?; s.skip_spaces(); let _ = parse_external_id(s)?; s.skip_spaces(); let c = s.curr_byte()?; if c != b'[' && c != b'>' { return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos())); } Ok(()) } // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral fn parse_external_id(s: &mut Stream) -> Result { let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") { let start = s.pos(); s.advance(6); let id = s.slice_back(start); s.consume_spaces()?; let quote = s.consume_quote()?; let _ = s.consume_bytes(|c| c != quote); s.consume_byte(quote)?; if id == "SYSTEM" { // Ok } else { s.consume_spaces()?; let quote = s.consume_quote()?; let _ = s.consume_bytes(|c| c != quote); s.consume_byte(quote)?; } true } else { false }; Ok(v) } // EntityDecl ::= GEDecl | PEDecl // GEDecl ::= '' // PEDecl ::= '' fn parse_entity_decl<'input>( s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>, ) -> Result<()> { s.advance(8); s.consume_spaces()?; let is_ge = if s.try_consume_byte(b'%') { s.consume_spaces()?; false } else { true }; let name = s.consume_name()?; s.consume_spaces()?; if let Some(definition) = parse_entity_def(s, is_ge)? { events.token(Token::EntityDeclaration(name, definition))?; } s.skip_spaces(); s.consume_byte(b'>')?; Ok(()) } // EntityDef ::= EntityValue | (ExternalID NDataDecl?) 
// PEDef ::= EntityValue | ExternalID // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] // | PEReference | Reference)* "'" // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral // NDataDecl ::= S 'NDATA' S Name fn parse_entity_def<'input>( s: &mut Stream<'input>, is_ge: bool, ) -> Result>> { let c = s.curr_byte()?; match c { b'"' | b'\'' => { let quote = s.consume_quote()?; let start = s.pos(); s.skip_bytes(|c| c != quote); let value = s.slice_back_span(start); s.consume_byte(quote)?; Ok(Some(value)) } b'S' | b'P' => { if parse_external_id(s)? { if is_ge { s.skip_spaces(); if s.starts_with(b"NDATA") { s.advance(5); s.consume_spaces()?; s.skip_name()?; // TODO: NDataDecl is not supported } } Ok(None) } else { Err(Error::InvalidExternalID(s.gen_text_pos())) } } _ => { let pos = s.gen_text_pos(); Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos)) } } } fn consume_decl(s: &mut Stream) -> Result<()> { s.skip_bytes(|c| c != b'>'); s.consume_byte(b'>')?; Ok(()) } // element ::= EmptyElemTag | STag content ETag // '<' Name (S Attribute)* S? '>' fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { let start = s.pos(); s.advance(1); // < let (prefix, local) = s.consume_qname()?; events.token(Token::ElementStart(prefix, local, start))?; let mut open = false; while !s.at_end() { let has_space = s.starts_with_space(); s.skip_spaces(); let start = s.pos(); match s.curr_byte()? { b'/' => { s.advance(1); s.consume_byte(b'>')?; let range = s.range_from(start); events.token(Token::ElementEnd(ElementEnd::Empty, range))?; break; } b'>' => { s.advance(1); let range = s.range_from(start); events.token(Token::ElementEnd(ElementEnd::Open, range))?; open = true; break; } _ => { // An attribute must be preceded with a whitespace. if !has_space { // Will always trigger an error. Which is what we want. 
s.consume_spaces()?; } // Manual inlining of `parse_attribute` for performance. // We cannot mark `parse_attribute` as `#[inline(always)]` // because it will blow up the binary size. let (prefix, local) = s.consume_qname()?; let qname_end = s.pos(); let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX); s.consume_eq()?; let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX); let quote = s.consume_quote()?; let quote_c = quote as char; // The attribute value must not contain the < character. let value_start = s.pos(); s.skip_chars(|_, c| c != quote_c && c != '<')?; let value = s.slice_back_span(value_start); s.consume_byte(quote)?; let end = s.pos(); events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?; } } } if open { parse_content(s, events)?; } Ok(()) } // Attribute ::= Name Eq AttValue fn parse_attribute<'input>( s: &mut Stream<'input>, ) -> Result<(&'input str, &'input str, StrSpan<'input>)> { let (prefix, local) = s.consume_qname()?; s.consume_eq()?; let quote = s.consume_quote()?; let quote_c = quote as char; // The attribute value must not contain the < character. let value_start = s.pos(); s.skip_chars(|_, c| c != quote_c && c != '<')?; let value = s.slice_back_span(value_start); s.consume_byte(quote)?; Ok((prefix, local, value)) } // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* pub fn parse_content<'input>( s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>, ) -> Result<()> { while !s.at_end() { match s.curr_byte() { Ok(b'<') => match s.next_byte() { Ok(b'!') => { if s.starts_with(b"