Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
use crate::common::is_whitespace_char;
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{PullParser, Result, State};
impl PullParser {
pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
match t {
Token::CDataEnd => {
let event = if self.config.c.cdata_to_characters {
// start called push_pos, but there will be no event to pop it
if self.buf.is_empty() {
self.next_pos();
}
None
} else {
let data = self.take_buf();
Some(Ok(XmlEvent::CData(data)))
};
self.into_state(State::OutsideTag, event)
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
if !is_whitespace_char(c) {
self.inside_whitespace = false;
}
self.buf.push(c);
None
},
_ => {
debug_assert!(false, "unreachable");
None
},
}
}
}

View File

@@ -0,0 +1,32 @@
use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State};
use crate::common::is_whitespace_char;
use crate::namespace;
use crate::reader::error::SyntaxError;
use crate::reader::lexer::Token;
impl PullParser {
pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> {
match s {
ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| {
match name.prefix_ref() {
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
Token::TagEnd => this.emit_end_element(),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)),
_ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token))),
}
}
}
}),
ClosingTagSubstate::CTAfterName => match t {
Token::TagEnd => self.emit_end_element(),
Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t))),
},
}
}
}

View File

@@ -0,0 +1,34 @@
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{PullParser, Result, State};
impl PullParser {
pub fn inside_comment(&mut self, t: Token) -> Option<Result> {
match t {
Token::CommentEnd if self.config.c.ignore_comments => {
self.into_state_continue(State::OutsideTag)
}
Token::CommentEnd => {
let data = self.take_buf();
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data)))
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
_ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment
_ => {
if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
t.push_to_string(&mut self.buf);
None
},
}
}
}

View File

@@ -0,0 +1,180 @@
use crate::common::{is_whitespace_char, XmlVersion};
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use crate::util::Encoding;
use super::{
DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State,
DEFAULT_VERSION,
};
impl PullParser {
/// Finishes the XML declaration and emits `XmlEvent::StartDocument`.
///
/// Marks the declaration as encountered, reconciles a declared encoding with the encoding the
/// lexer is currently using, and moves to `State::OutsideTag`. The emitted event carries the
/// parsed version (or `DEFAULT_VERSION`), the declared encoding (or the lexer's current encoding
/// rendered as a string) and the `standalone` flag.
///
/// Returns an error when the declared encoding cannot be parsed or conflicts with the current
/// one, unless `ignore_invalid_encoding_declarations` is set.
#[inline(never)]
fn emit_start_document(&mut self) -> Option<Result> {
debug_assert!(self.encountered == Encountered::None);
self.encountered = Encountered::Declaration;
let version = self.data.version;
let encoding = self.data.take_encoding();
let standalone = self.data.standalone;
if let Some(new_encoding) = encoding.as_deref() {
// An unparsable encoding name falls back to Latin1 only when invalid declarations are ignored.
let new_encoding = match new_encoding.parse() {
Ok(e) => e,
Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1,
Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))),
};
let current_encoding = self.lexer.encoding();
if current_encoding != new_encoding {
// Decide which encoding the lexer should use from now on:
// - nothing specific known yet and the declaration is not plain UTF-16: trust the declaration;
// - lexer already uses a specific UTF-16 byte order and the declaration says UTF-16: keep it;
// - any other mismatch is an error unless invalid declarations are ignored.
let set = match (current_encoding, new_encoding) {
(Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new,
(Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding,
_ if self.config.ignore_invalid_encoding_declarations => current_encoding,
_ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))),
};
self.lexer.set_encoding(set);
}
}
// Re-read the encoding: it may have been changed just above.
let current_encoding = self.lexer.encoding();
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
version: version.unwrap_or(DEFAULT_VERSION),
encoding: encoding.unwrap_or_else(move || current_encoding.to_string()),
standalone
}))
}
// TODO: remove redundancy via macros or extra methods
/// Processes one token inside the XML declaration, dispatching on the declaration substate.
///
/// The substates walk through the `version`, `encoding` and `standalone` pseudo-attributes in
/// that order; each has a "before / inside name / after name / inside value / after value"
/// step. A `Token::ProcessingInstructionEnd` in an accepting substate finishes the declaration
/// via `emit_start_document`. Unexpected tokens or names produce a syntax error.
pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> {
match s {
// Skip whitespace until the first letter of `version`.
DeclarationSubstate::BeforeVersion => match t {
Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)),
Token::Character(c) if is_whitespace_char(c) => None, // continue
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// The name is compared against "ersion", presumably because the leading 'v' was consumed
// by `BeforeVersion` and is not part of the name read here.
DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"ersion" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign {
DeclarationSubstate::InsideVersionValue
} else {
DeclarationSubstate::AfterVersion
}
)),
_ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))),
}
}),
// Between the `version` name and its `=`.
DeclarationSubstate::AfterVersion => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Only "1.0" and "1.1" are accepted as version values.
DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| {
this.data.version = match &*value {
"1.0" => Some(XmlVersion::Version10),
"1.1" => Some(XmlVersion::Version11),
_ => None
};
if this.data.version.is_some() {
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue))
} else {
Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into())))
}
}),
// After the version value: whitespace must precede a further pseudo-attribute.
DeclarationSubstate::AfterVersionValue => match t {
Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Either `encoding` ('e') or `standalone` ('s') may follow, or the declaration may end.
DeclarationSubstate::BeforeEncoding => match t {
Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)),
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Same first-letter handling as for `version`: the name is compared against "ncoding".
DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"ncoding" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding }
)),
_ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into())))
}
}),
// Between the `encoding` name and its `=`.
DeclarationSubstate::AfterEncoding => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// The encoding value is stored as-is; it is validated later in `emit_start_document`.
DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| {
this.data.encoding = Some(value);
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue))
}),
// After the encoding value: whitespace must precede `standalone`.
DeclarationSubstate::AfterEncodingValue => match t {
Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Only `standalone` ('s') may follow the encoding, or the declaration may end.
DeclarationSubstate::BeforeStandaloneDecl => match t {
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Same first-letter handling again: the name is compared against "tandalone".
DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"tandalone" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign {
DeclarationSubstate::InsideStandaloneDeclValue
} else {
DeclarationSubstate::AfterStandaloneDecl
}
)),
_ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))),
}
}),
// Between the `standalone` name and its `=`.
DeclarationSubstate::AfterStandaloneDecl => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Only "yes" and "no" are accepted as standalone values.
DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| {
let standalone = match &*value {
"yes" => Some(true),
"no" => Some(false),
_ => None
};
if standalone.is_some() {
this.data.standalone = standalone;
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue))
} else {
Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into())))
}
}),
// Nothing but whitespace and the end of the declaration may follow `standalone`.
DeclarationSubstate::AfterStandaloneDeclValue => match t {
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
}
}
}

View File

@@ -0,0 +1,251 @@
use std::fmt::Write;
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::error::SyntaxError;
use crate::reader::lexer::Token;
use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State};
impl PullParser {
pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> {
if let Some(ref mut doctype) = self.data.doctype {
write!(doctype, "{t}").ok()?;
if doctype.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
}
match substate {
DoctypeSubstate::Outside => match t {
Token::TagEnd => self.into_state_continue(State::OutsideTag),
Token::MarkupDeclarationStart => {
self.buf.clear();
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName))
},
Token::Character('%') => {
self.data.ref_data.clear();
self.data.ref_data.push('%');
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd))
},
Token::CommentStart => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment))
},
Token::SingleQuote | Token::DoubleQuote => {
// just discard string literals
self.data.quote = super::QuoteToken::from_token(t);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String))
},
Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))),
// TODO: parse SYSTEM, and [
_ => None,
},
DoctypeSubstate::String => match t {
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None,
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None,
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = None;
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
DoctypeSubstate::Comment => match t {
Token::CommentEnd => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
DoctypeSubstate::InsideName => match t {
Token::Character(c @ 'A'..='Z') => {
self.buf.push(c);
None
},
Token::Character(c) if is_whitespace_char(c) => {
let buf = self.take_buf();
match buf.as_str() {
"ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
"NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
_ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DoctypeSubstate::BeforeEntityName => {
self.data.name.clear();
match t {
Token::Character(c) if is_whitespace_char(c) => None,
Token::Character('%') => { // % is for PEDecl
self.data.name.push('%');
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
},
Token::Character(c) if is_name_start_char(c) => {
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
}
},
DoctypeSubstate::EntityName => match t {
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
},
Token::Character(c) if is_name_char(c) => {
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::BeforeEntityValue => {
self.buf.clear();
match t {
Token::Character(c) if is_whitespace_char(c) => None,
// SYSTEM/PUBLIC not supported
Token::Character('S' | 'P') => {
let name = self.data.take_name();
self.entities.entry(name).or_default(); // Dummy value, but at least the name is recognized
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration))
},
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = super::QuoteToken::from_token(t);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
}
},
DoctypeSubstate::EntityValue => match t {
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None },
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None },
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = None;
let name = self.data.take_name();
let val = self.take_buf();
self.entities.entry(name).or_insert(val); // First wins
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME
},
Token::ReferenceStart | Token::Character('&') => {
self.data.ref_data.clear();
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart))
},
Token::Character('%') => {
self.data.ref_data.clear();
self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue))
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.buf.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::PEReferenceDefinitionStart => match t {
Token::Character(c) if is_whitespace_char(c) => None,
Token::Character(c) if is_name_start_char(c) => {
debug_assert_eq!(self.data.name, "%");
self.data.name.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::PEReferenceDefinition => match t {
Token::Character(c) if is_name_char(c) => {
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
None
},
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::PEReferenceInDtd => match t {
Token::Character(c) if is_name_char(c) => {
self.data.ref_data.push(c);
None
},
Token::ReferenceEnd | Token::Character(';') => {
let name = self.data.take_ref_data();
match self.entities.get(&name) {
Some(ent) => {
if let Err(e) = self.lexer.reparse(ent) {
return Some(Err(e));
}
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::PEReferenceInValue => match t {
Token::Character(c) if is_name_char(c) => {
self.data.ref_data.push(c);
None
},
Token::ReferenceEnd | Token::Character(';') => {
let name = self.data.take_ref_data();
match self.entities.get(&name) {
Some(ent) => {
self.buf.push_str(ent);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::NumericReferenceStart => match t {
Token::Character('#') => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference))
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.buf.push('&');
self.buf.push(c);
// named entities are not expanded inside doctype
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::NumericReference => match t {
Token::ReferenceEnd | Token::Character(';') => {
let r = self.data.take_ref_data();
// https://www.w3.org/TR/xml/#sec-entexpand
match self.numeric_reference_from_str(&r) {
Ok(c) => {
self.buf.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
Err(e) => Some(self.error(e)),
}
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.data.ref_data.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
DoctypeSubstate::SkipDeclaration => match t {
Token::TagEnd => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
}
}
}

View File

@@ -0,0 +1,120 @@
use crate::attribute::OwnedAttribute;
use crate::common::{is_name_start_char, is_whitespace_char};
use crate::namespace;
use crate::reader::error::SyntaxError;
use crate::reader::lexer::Token;
use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
/// Processes one token inside an opening tag, dispatching on the opening-tag substate.
///
/// The substates read the element name, then zero or more `name = "value"` attributes, until
/// the tag is closed by `Token::TagEnd` or `Token::EmptyTagEnd`, at which point
/// `emit_start_element` is called (with `true` for the empty-tag form). Attributes named
/// `xmlns` or prefixed with `xmlns:` are treated as namespace declarations and stored in
/// `self.nst` instead of the attribute list.
pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
// Copied out up front so the closures below can use it without borrowing `self`.
let max_attrs = self.config.max_attributes;
match s {
// Reading the element name.
OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
match name.prefix_ref() {
// The reserved `xml` and `xmlns` prefixes are rejected on element names.
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
Token::TagEnd => this.emit_start_element(false),
Token::EmptyTagEnd => this.emit_start_element(true),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
_ => {
debug_assert!(false, "unreachable");
None
},
}
}
}
}),
// Between attributes: expect the start of an attribute name or the end of the tag.
OpeningTagSubstate::InsideTag => match t {
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
Token::Character(c) if is_name_start_char(c) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
// The first character of the attribute name is buffered before switching state.
self.buf.push(c);
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
// Reading the rest of an attribute name.
OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
// check that no attribute with such name is already present
// if there is one, XML is not well-formed
if this.data.attributes.contains(&name) {
return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
}
this.data.attr_name = Some(name);
match token {
Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
// NOTE(review): this arm reports the outer `t` rather than the closure's `token`;
// presumably both are the same token — confirm against `read_qualified_name`.
_ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable
}
}),
// Between an attribute name and its `=`.
OpeningTagSubstate::AfterAttributeName => match t {
Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
},
// Reading an attribute value; once complete it is classified by the attribute's name.
OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
let name = this.data.take_attr_name()?; // will always succeed here
match name.prefix_ref() {
// declaring a new prefix; it is sufficient to check prefix only
// because "xmlns" prefix is reserved
Some(namespace::NS_XMLNS_PREFIX) => {
let ln = &*name.local_name;
if ln == namespace::NS_XMLNS_PREFIX {
Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
} else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
} else if value.is_empty() {
Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
} else {
this.nst.put(name.local_name.clone(), value);
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
},
// declaring default namespace
None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
match &*value {
// The reserved prefixes and namespace URIs are rejected as a default namespace.
namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
_ => {
this.nst.put(namespace::NS_NO_PREFIX, value.clone());
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
},
// regular attribute
_ => {
// Only regular attributes count towards the configured attribute limit.
if this.data.attributes.len() >= max_attrs {
return Some(this.error(SyntaxError::ExceededConfiguredLimit));
}
this.data.attributes.push(OwnedAttribute { name, value });
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
},
}
}),
// After an attribute value: whitespace is required before another attribute.
OpeningTagSubstate::AfterAttributeValue => match t {
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
},
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
}
}
}

View File

@@ -0,0 +1,113 @@
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{DeclarationSubstate, Encountered, ProcessingInstructionSubstate, PullParser, Result, State};
impl PullParser {
pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> {
match s {
ProcessingInstructionSubstate::PIInsideName => match t {
Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
Token::ProcessingInstructionEnd => {
// self.buf contains PI name
let name = self.take_buf();
// Don't need to check for declaration because it has mandatory attributes
// but there is none
match &*name {
// Name is empty, it is an error
"" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)),
// Found <?xml-like PI not at the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
n if "xml".eq_ignore_ascii_case(n) =>
Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, emitting event
_ => {
debug_assert!(self.next_event.is_none(), "{:?}", self.next_event);
// can't have a PI before `<?xml`
let event1 = self.set_encountered(Encountered::Declaration);
let event2 = Some(Ok(XmlEvent::ProcessingInstruction {
name,
data: None
}));
// emitting two events at once is cumbersome
let event1 = if event1.is_some() {
self.next_event = event2;
event1
} else {
event2
};
self.into_state(State::OutsideTag, event1)
},
}
},
Token::Character(c) if is_whitespace_char(c) => {
// self.buf contains PI name
let name = self.take_buf();
match &*name {
// We have not ever encountered an element and have not parsed XML declaration
"xml" if self.encountered == Encountered::None =>
self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)),
// Found <?xml-like PI after the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
n if "xml".eq_ignore_ascii_case(n) =>
Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, starting parsing PI data
_ => {
self.data.name = name;
// can't have a PI before `<?xml`
let next_event = self.set_encountered(Encountered::Declaration);
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData), next_event)
},
}
},
_ => {
let buf = self.take_buf();
Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t)))
},
},
ProcessingInstructionSubstate::PIInsideData => match t {
Token::ProcessingInstructionEnd => {
let name = self.data.take_name();
let data = self.take_buf();
self.into_state_emit(
State::OutsideTag,
Ok(XmlEvent::ProcessingInstruction { name, data: Some(data) }),
)
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
// Any other token should be treated as plain characters
_ => {
if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
t.push_to_string(&mut self.buf);
None
},
},
}
}
}

View File

@@ -0,0 +1,76 @@
use super::{PullParser, Result, State};
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::error::SyntaxError;
use crate::reader::lexer::Token;
use std::char;
impl PullParser {
    /// Processes one token while reading an entity or character reference.
    ///
    /// Reference characters are accumulated in `self.data.ref_data` (a leading `#` is allowed
    /// for numeric references). On `Token::ReferenceEnd` the reference is resolved, in order:
    ///
    /// 1. the five predefined entities (`lt`, `gt`, `amp`, `apos`, `quot`);
    /// 2. a numeric character reference (`#...`);
    /// 3. `config.c.extra_entities`;
    /// 4. entities recorded in `self.entities` — re-lexed when the reference occurred in
    ///    `State::OutsideTag`, appended verbatim to the buffer otherwise.
    ///
    /// An empty name yields `SyntaxError::EmptyEntity`, an unknown one
    /// `SyntaxError::UnexpectedEntity`. Afterwards the parser returns to
    /// `self.state_after_reference`.
    pub fn inside_reference(&mut self, t: Token) -> Option<Result> {
        match t {
            Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) ||
                             self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => {
                self.data.ref_data.push(c);
                None
            },

            Token::ReferenceEnd => {
                let name = self.data.take_ref_data();
                if name.is_empty() {
                    return Some(self.error(SyntaxError::EmptyEntity));
                }

                let c = match &*name {
                    "lt" => Some('<'),
                    "gt" => Some('>'),
                    "amp" => Some('&'),
                    "apos" => Some('\''),
                    "quot" => Some('"'),
                    _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) {
                        Ok(c) => Some(c),
                        Err(e) => return Some(self.error(e)),
                    },
                    _ => None,
                };

                if let Some(c) = c {
                    self.buf.push(c);
                } else if let Some(v) = self.config.c.extra_entities.get(&name) {
                    self.buf.push_str(v);
                } else if let Some(v) = self.entities.get(&name) {
                    if self.state_after_reference == State::OutsideTag {
                        // an entity can expand to *elements*, so outside of a tag it needs a full reparse
                        if let Err(e) = self.lexer.reparse(v) {
                            return Some(Err(e));
                        }
                    } else {
                        // however, inside attributes it's not allowed to affect attribute quoting,
                        // so it can't be fed to the lexer
                        self.buf.push_str(v);
                    }
                } else {
                    return Some(self.error(SyntaxError::UnexpectedEntity(name.into())));
                }

                let prev_st = self.state_after_reference;
                // Text that now ends in a non-whitespace character is no longer pure whitespace.
                if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) {
                    self.inside_whitespace = false;
                }
                self.into_state_continue(prev_st)
            },

            _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
        }
    }

    /// Converts the body of a numeric character reference into a `char`.
    ///
    /// `num_str` is the text between `&#` and `;`: decimal digits, or `x` followed by
    /// hexadecimal digits.
    ///
    /// # Errors
    ///
    /// * `SyntaxError::InvalidNumericEntity` when the text is not made up solely of digits of
    ///   the selected radix, or the number does not fit in a `u32`.
    /// * `SyntaxError::InvalidCharacterEntity` when the number is not a valid XML character and
    ///   `replace_unknown_entity_references` is off; when it is on, U+FFFD is returned instead.
    pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result<char, SyntaxError> {
        let (digits, radix) = match num_str.strip_prefix('x') {
            Some(hex) => (hex, 16),
            None => (num_str, 10),
        };
        let invalid = || SyntaxError::InvalidNumericEntity(num_str.into());

        // The integer parsers accept an optional leading `+`, which a character reference
        // does not allow, so the digits are validated before parsing. An empty string passes
        // this check and is rejected by the parser below.
        if !digits.chars().all(|d| d.is_digit(radix)) {
            return Err(invalid());
        }
        let val = u32::from_str_radix(digits, radix).map_err(|_| invalid())?;

        match char::from_u32(val) {
            Some(c) if self.is_valid_xml_char(c) => Ok(c),
            _ if self.config.c.replace_unknown_entity_references => Ok('\u{fffd}'),
            _ => Err(SyntaxError::InvalidCharacterEntity(val)),
        }
    }
}

View File

@@ -0,0 +1,211 @@
use crate::common::is_whitespace_char;
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{
ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate,
ProcessingInstructionSubstate, PullParser, Result, State,
};
impl PullParser {
pub fn outside_tag(&mut self, t: Token) -> Option<Result> {
match t {
Token::Character(c) => {
if is_whitespace_char(c) {
// skip whitespace outside of the root element
if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
(self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
return None;
}
} else {
self.inside_whitespace = false;
if self.depth() == 0 {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
}
if !self.is_valid_xml_char_not_restricted(c) {
return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
}
if self.buf.is_empty() {
self.push_pos();
} else if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
Token::DoubleQuote | Token::SingleQuote |
Token::ProcessingInstructionEnd | Token::EmptyTagEnd => {
if self.depth() == 0 {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
self.inside_whitespace = false;
if let Some(s) = t.as_static_str() {
if self.buf.is_empty() {
self.push_pos();
} else if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push_str(s);
}
None
},
Token::ReferenceStart if self.depth() > 0 => {
self.state_after_reference = State::OutsideTag;
self.into_state_continue(State::InsideReference)
},
Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
self.inside_whitespace = false;
if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
Token::ReferenceEnd.push_to_string(&mut self.buf);
None
},
Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => {
let next_event = self.set_encountered(Encountered::Comment);
// We need to switch the lexer into a comment mode inside comments
self.into_state(State::InsideComment, next_event)
}
Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => {
if self.buf.is_empty() {
self.push_pos(); // CDataEnd will pop pos if the buffer remains empty
}
// if coalescing chars, continue without event
self.into_state_continue(State::InsideCData)
},
_ => {
// Encountered some markup event, flush the buffer as characters
// or a whitespace
let mut next_event = if self.buf_has_data() {
let buf = self.take_buf();
if self.inside_whitespace && self.config.c.trim_whitespace {
// there will be no event emitted for this, but start of buffering has pushed a pos
self.next_pos();
None
} else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
Some(Ok(XmlEvent::Whitespace(buf)))
} else if self.config.c.trim_whitespace {
Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
} else {
Some(Ok(XmlEvent::Characters(buf)))
}
} else { None };
self.inside_whitespace = true; // Reset inside_whitespace flag
// pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it
// and ignored comments don't pop
if t != Token::CommentStart || !self.config.c.ignore_comments {
self.push_pos();
}
match t {
Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => {
if let Some(e) = self.set_encountered(Encountered::Element) {
next_event = Some(e);
}
self.nst.push_empty();
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
},
Token::ClosingTagStart if self.depth() > 0 =>
self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
Token::CommentStart => {
if let Some(e) = self.set_encountered(Encountered::Comment) {
next_event = Some(e);
}
// We need to switch the lexer into a comment mode inside comments
self.into_state(State::InsideComment, next_event)
},
Token::DoctypeStart if self.encountered < Encountered::Doctype => {
if let Some(e) = self.set_encountered(Encountered::Doctype) {
next_event = Some(e);
}
self.data.doctype = Some(Token::DoctypeStart.to_string());
// We don't have a doctype event so skip this position
// FIXME: update when we have a doctype event
self.next_pos();
self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
},
Token::ProcessingInstructionStart =>
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
Token::CDataStart if self.depth() > 0 => {
self.into_state(State::InsideCData, next_event)
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
}
},
}
}
pub fn document_start(&mut self, t: Token) -> Option<Result> {
debug_assert!(self.encountered < Encountered::Declaration);
match t {
Token::Character(c) => {
let next_event = self.set_encountered(Encountered::AnyChars);
if !is_whitespace_char(c) {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
self.inside_whitespace = true;
// skip whitespace outside of the root element
if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
(self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
return self.into_state(State::OutsideTag, next_event);
}
self.push_pos();
self.buf.push(c);
self.into_state(State::OutsideTag, next_event)
},
Token::CommentStart => {
let next_event = self.set_encountered(Encountered::Comment);
self.into_state(State::InsideComment, next_event)
},
Token::OpeningTagStart => {
let next_event = self.set_encountered(Encountered::Element);
self.nst.push_empty();
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
},
Token::DoctypeStart => {
let next_event = self.set_encountered(Encountered::Doctype);
self.data.doctype = Some(Token::DoctypeStart.to_string());
// We don't have a doctype event so skip this position
// FIXME: update when we have a doctype event
self.next_pos();
self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
},
Token::ProcessingInstructionStart => {
self.push_pos();
self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
}
}
}