use crate::token::{Float, Integer, Location, PreprocessorError, Punct}; use std::str::Chars; use unicode_xid::UnicodeXID; type CharAndLine = (char, u32); // GLSL ES 3.20 specification section 3.10. Logical Phases of Compilation // This iterator implements phases 4 and 5 of the logical phases of compilation: // // 4. Each {carriage-return, line-feed} and {line-feed, carriage return} sequence is replaced by // a single newline. All remaining carriage-return and line-feed characters are then each // replaced by a newline. // // 5. Line numbering for each character, which is equal to the number of preceding newlines plus // one, is noted. Note this can only be subsequently changed by the #line directive and is not // affected by the removal of newlines in phase 6 of compilation. // // It expects that phases 1 to 3 are already done and that valid utf8 is passed in. #[derive(Clone)] pub struct CharsAndLine<'a> { inner: Chars<'a>, line: u32, } impl<'a> CharsAndLine<'a> { pub fn new(input: &'a str) -> Self { CharsAndLine { inner: input.chars(), line: 1, } } pub fn get_current_ptr(&self) -> *const u8 { self.inner.as_str().as_ptr() } } impl<'a> Iterator for CharsAndLine<'a> { type Item = CharAndLine; fn next(&mut self) -> Option { let current = self.inner.next()?; match current { '\n' => { // Consume the token but see if we can grab a \r that follows let mut peek_inner = self.inner.clone(); if peek_inner.next() == Some('\r') { self.inner = peek_inner; } let res = Some(('\n', self.line)); self.line += 1; res } '\r' => { // Consume the token but see if we can grab a \n that follows let mut peek_inner = self.inner.clone(); if peek_inner.next() == Some('\n') { self.inner = peek_inner; } let res = Some(('\n', self.line)); self.line += 1; res } _ => Some((current, self.line)), } } } // An iterator that adds stage 6 on top of CharsAndLocation: // // 6. Wherever a backslash ('\') occurs immediately before a newline, both are deleted. Note that // no whitespace is substituted, thereby allowing a single preprocessing token to span a // newline. This operation is not recursive; any new {backslash newline} sequences generated // are not removed. #[derive(Clone)] pub struct SkipBackslashNewline<'a> { inner: CharsAndLine<'a>, } impl<'a> SkipBackslashNewline<'a> { pub fn new(input: &'a str) -> Self { SkipBackslashNewline { inner: CharsAndLine::new(input), } } pub fn get_current_ptr(&self) -> *const u8 { self.inner.get_current_ptr() } } impl<'a> Iterator for SkipBackslashNewline<'a> { type Item = CharAndLine; fn next(&mut self) -> Option { let mut current = self.inner.next()?; while current.0 == '\\' { let mut peek_inner = self.inner.clone(); if let Some(('\n', _)) = peek_inner.next() { self.inner = peek_inner; current = self.next()?; } else { return Some(current); } } Some(current) } } // An iterator that adds stage 7 on top of SkipBackslashNewline: // // 7. All comments are replaced with a single space. All (non-zero) characters and invalid UTF-8 // byte sequences are allowed within comments. '//' style comments include the initial '//' // marker and continue up to, but not including, the terminating newline. '/…/' comments // include both the start and end marker. #[derive(Clone)] pub struct ReplaceComments<'a> { inner: SkipBackslashNewline<'a>, } // The lexer wants to know when whitespace is a comment to know if a comment was ever processed. // To avoid adding state we use a sentinel value of '\r' because all '\r' have been consumed and // turned into '\n' by CharsAndLocation. pub const COMMENT_SENTINEL_VALUE: char = '\r'; impl<'a> ReplaceComments<'a> { pub fn new(input: &'a str) -> Self { ReplaceComments { inner: SkipBackslashNewline::new(input), } } pub fn get_current_ptr(&self) -> *const u8 { self.inner.get_current_ptr() } } impl<'a> Iterator for ReplaceComments<'a> { type Item = CharAndLine; fn next(&mut self) -> Option { let current = self.inner.next()?; if current.0 != '/' { debug_assert!(current.0 != COMMENT_SENTINEL_VALUE); return Some(current); } let mut peek_inner = self.inner.clone(); match peek_inner.next() { // The // case, consume until but not including the next \n Some(('/', _)) => { self.inner = peek_inner.clone(); while let Some((next, _)) = peek_inner.next() { if next == '\n' { break; } self.inner = peek_inner.clone(); } Some((COMMENT_SENTINEL_VALUE, current.1)) } // The /*, consume until the next */ Some(('*', _)) => { let mut was_star = false; while let Some((next, _)) = peek_inner.next() { if was_star && next == '/' { break; } was_star = next == '*'; } self.inner = peek_inner; Some((COMMENT_SENTINEL_VALUE, current.1)) } // Not // or /*, do nothing _ => Some(current), } } } // A lexer for GLSL tokens that also emits a couple extra tokens that are useful to the // preprocessor: # and newlines. It also include metadata for the token for whether it is at the // start of the line, or if it has leading whitespace. // This is a helper iterator to abstract away the tracking of location data (offset, line) from // `Lexer`. It looks like a Peekable> with `next_char` and `peek_char` but also // allows querying the last seen/consumed lines / offset. #[derive(Clone)] struct LexerCharIterator<'a> { inner: ReplaceComments<'a>, peeked: Option<(CharAndLine, *const u8)>, last_consumed: (CharAndLine, *const u8), input_start: *const u8, } pub const NONE_CONSUMED_SENTINEL_VALUE: char = '\r'; impl<'a> LexerCharIterator<'a> { pub fn new(input: &'a str) -> Self { LexerCharIterator { inner: ReplaceComments::new(input), peeked: None, last_consumed: ((NONE_CONSUMED_SENTINEL_VALUE, 0), input.as_bytes().as_ptr()), input_start: input.as_bytes().as_ptr(), } } fn next_char(&mut self) -> Option { self.last_consumed = match self.peeked.take() { Some(v) => v, None => { let ptr = self.inner.get_current_ptr(); (self.inner.next()?, ptr) } }; Some(self.last_consumed.0 .0) } fn peek_char(&mut self) -> Option { match self.peeked { Some(v) => Some(v.0 .0), None => { let ptr = self.inner.get_current_ptr(); let next = self.inner.next()?; self.peeked = Some((next, ptr)); Some(next.0) } } } fn get_last_seen_line(&self) -> u32 { self.peeked.unwrap_or(self.last_consumed).0 .1 } fn get_last_seen_start_offset(&self) -> usize { self.peeked.unwrap_or(self.last_consumed).1 as usize - self.input_start as usize } fn get_last_consumed_end_offset(&self) -> usize { self.last_consumed.1 as usize - self.input_start as usize + self.last_consumed.0 .0.len_utf8() } } // A superset of the token value returned by the preprocessor #[derive(Clone, PartialEq, Debug)] pub enum TokenValue { // Preprocessor specific token values Hash, NewLine, // Regular token values Ident(String), Integer(Integer), Float(Float), Punct(Punct), } impl From for TokenValue { fn from(punct: Punct) -> Self { TokenValue::Punct(punct) } } #[derive(Clone, PartialEq, Debug)] pub struct Token { pub value: TokenValue, pub location: Location, pub leading_whitespace: bool, pub start_of_line: bool, } pub type LexerItem = Result; pub struct Lexer<'a> { inner: LexerCharIterator<'a>, leading_whitespace: bool, start_of_line: bool, had_comments: bool, } impl<'a> Lexer<'a> { pub fn new(input: &'a str) -> Self { // TODO bail out on source that is too large. Lexer { inner: LexerCharIterator::new(input), leading_whitespace: true, start_of_line: true, had_comments: false, } } pub fn had_comments(&self) -> bool { self.had_comments } // TODO: Make a runtime flag to toggle unicode identifiers support // The glsl spec only allows ascii identifiers fn parse_identifier(&mut self) -> Result { let mut identifier = String::default(); if let Some(c) = self.next_char_if(|c| c.is_xid_start() || c == '_') { identifier.push(c); } let rest = self.consume_chars(|c| c.is_xid_continue()); identifier.push_str(&rest); // TODO check if identifier is larger than the limit. Ok(TokenValue::Ident(identifier)) } fn parse_integer_signedness_suffix(&mut self) -> bool { self.next_char_if(|c| c == 'u' || c == 'U').is_none() } fn parse_integer_width_suffix(&mut self) -> Result { match self.inner.peek_char() { Some('l') | Some('L') => Err(PreprocessorError::NotSupported64BitLiteral), Some('s') | Some('S') => Err(PreprocessorError::NotSupported16BitLiteral), _ => Ok(32), } } fn parse_float_width_suffix(&mut self) -> Result { match self.inner.peek_char() { Some('l') | Some('L') => Err(PreprocessorError::NotSupported64BitLiteral), Some('h') | Some('H') => Err(PreprocessorError::NotSupported16BitLiteral), Some('f') | Some('F') => { self.inner.next_char(); Ok(32) } _ => Ok(32), } } fn next_char_if(&mut self, predicate: impl FnOnce(char) -> bool) -> Option { if let Some(c) = self.inner.peek_char() { if predicate(c) { return self.inner.next_char(); } } None } fn consume_chars(&mut self, filter: impl Fn(char) -> bool) -> String { let mut result: String = Default::default(); while let Some(c) = self.next_char_if(&filter) { result.push(c); } result } fn parse_number(&mut self, first_char: char) -> Result { let mut is_float = false; let mut integer_radix = 10; let mut raw: String = Default::default(); raw.push(first_char); // Handle hexadecimal numbers that needs to consume a..f in addition to digits. if first_char == '0' { match self.inner.peek_char() { Some('x') | Some('X') => { self.inner.next_char(); raw += &self.consume_chars(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')); integer_radix = 16; } // Octal numbers can also be the prefix of floats, so we need to parse all digits // and not just 0..7 in case it is a float like 00009.0f, the parsing of all digits // is done below, but we still need to remember the radix. Some('0'..='9') => { integer_radix = 8; } _ => {} }; } if first_char == '.' { is_float = true; } else { // Parse any digits at the end of integers, or for the non-fractional part of floats. raw += &self.consume_chars(|c| ('0'..='9').contains(&c)); if self.next_char_if(|c| c == '.').is_some() { raw.push('.'); is_float = true; } } // At this point either we're an integer missing only suffixes, or we're a float with // everything up to the . consumed. if is_float { raw += &self.consume_chars(|c| ('0'..='9').contains(&c)); } // Handle scientific notation with a (e|E)(+|-|)\d+ suffix when we're a float or an // an integer that could turn into a float if we add a exponent to it (so 0x1E-1 // isn't recognized as a float). if (is_float || integer_radix == 8 || integer_radix == 10) && self.next_char_if(|c| c == 'e' || c == 'E').is_some() { raw.push('e'); is_float = true; match self.inner.peek_char() { Some('+') => { self.inner.next_char(); raw.push('+'); } Some('-') => { self.inner.next_char(); raw.push('-'); } _ => {} } // TODO: what should we do when there is no number after the exponent? raw += &self.consume_chars(|c| ('0'..='9').contains(&c)); } if is_float { // TODO: Depending on the GLSL version make it an error to not have the suffix. let width = self.parse_float_width_suffix()?; Ok(TokenValue::Float(Float { value: raw .parse::() .map_err(|_| PreprocessorError::FloatParsingError)?, width, })) } else { let signed = self.parse_integer_signedness_suffix(); let width = self.parse_integer_width_suffix()?; // Skip the initial 0 in hexa or octal (in hexa we never added the 'x'). if integer_radix != 10 { raw = raw.split_off(1); } Ok(TokenValue::Integer(Integer { value: u64::from_str_radix(&raw, integer_radix) .map_err(|_err| PreprocessorError::IntegerOverflow)?, signed, width, })) } } fn parse_punctuation(&mut self) -> Result { let save_point = self.inner.clone(); let char0 = self.inner.next_char().unwrap_or('\0'); let char1 = self.inner.next_char().unwrap_or('\0'); let char2 = self.inner.next_char().unwrap_or('\0'); let maybe_punct = match (char0, char1, char2) { ('<', '<', '=') => Some((Punct::LeftShiftAssign, 3)), ('<', '<', _) => Some((Punct::LeftShift, 2)), ('<', '=', _) => Some((Punct::LessEqual, 2)), ('<', _, _) => Some((Punct::LeftAngle, 1)), ('>', '>', '=') => Some((Punct::RightShiftAssign, 3)), ('>', '>', _) => Some((Punct::RightShift, 2)), ('>', '=', _) => Some((Punct::GreaterEqual, 2)), ('>', _, _) => Some((Punct::RightAngle, 1)), ('+', '+', _) => Some((Punct::Increment, 2)), ('+', '=', _) => Some((Punct::AddAssign, 2)), ('+', _, _) => Some((Punct::Plus, 1)), ('-', '-', _) => Some((Punct::Decrement, 2)), ('-', '=', _) => Some((Punct::SubAssign, 2)), ('-', _, _) => Some((Punct::Minus, 1)), ('&', '&', _) => Some((Punct::LogicalAnd, 2)), ('&', '=', _) => Some((Punct::AndAssign, 2)), ('&', _, _) => Some((Punct::Ampersand, 1)), ('|', '|', _) => Some((Punct::LogicalOr, 2)), ('|', '=', _) => Some((Punct::OrAssign, 2)), ('|', _, _) => Some((Punct::Pipe, 1)), ('^', '^', _) => Some((Punct::LogicalXor, 2)), ('^', '=', _) => Some((Punct::XorAssign, 2)), ('^', _, _) => Some((Punct::Caret, 1)), ('=', '=', _) => Some((Punct::EqualEqual, 2)), ('=', _, _) => Some((Punct::Equal, 1)), ('!', '=', _) => Some((Punct::NotEqual, 2)), ('!', _, _) => Some((Punct::Bang, 1)), ('*', '=', _) => Some((Punct::MulAssign, 2)), ('*', _, _) => Some((Punct::Star, 1)), ('/', '=', _) => Some((Punct::DivAssign, 2)), ('/', _, _) => Some((Punct::Slash, 1)), ('%', '=', _) => Some((Punct::ModAssign, 2)), ('%', _, _) => Some((Punct::Percent, 1)), ('(', _, _) => Some((Punct::LeftParen, 1)), (')', _, _) => Some((Punct::RightParen, 1)), ('{', _, _) => Some((Punct::LeftBrace, 1)), ('}', _, _) => Some((Punct::RightBrace, 1)), ('[', _, _) => Some((Punct::LeftBracket, 1)), (']', _, _) => Some((Punct::RightBracket, 1)), (',', _, _) => Some((Punct::Comma, 1)), (';', _, _) => Some((Punct::Semicolon, 1)), (':', _, _) => Some((Punct::Colon, 1)), ('~', _, _) => Some((Punct::Tilde, 1)), ('?', _, _) => Some((Punct::Question, 1)), // Note that Dot (".") is handled in Lexer::next since it can be // either punctuation or the start of a floating point number. _ => None, }; if let Some((punct, size)) = maybe_punct { self.inner = save_point; for _ in 0..size { self.inner.next_char(); } Ok(punct.into()) } else if char0 == '#' { self.inner = save_point; self.inner.next_char(); Ok(TokenValue::Hash) } else { Err(PreprocessorError::UnexpectedCharacter) } } } impl<'a> Iterator for Lexer<'a> { type Item = LexerItem; fn next(&mut self) -> Option { while let Some(current_char) = self.inner.peek_char() { let had_leading_whitespace = self.leading_whitespace; self.leading_whitespace = false; let mut location = Location { line: self.inner.get_last_seen_line(), start: self.inner.get_last_seen_start_offset() as u32, end: 0, }; let was_start_of_line = self.start_of_line; self.start_of_line = false; let value = match current_char { ' ' | '\t' | '\x0b' | '\x0c' | COMMENT_SENTINEL_VALUE => { if current_char == COMMENT_SENTINEL_VALUE { self.had_comments = true; } self.start_of_line = was_start_of_line; self.leading_whitespace = true; self.inner.next_char(); continue; } '\n' => { self.leading_whitespace = true; self.start_of_line = true; self.inner.next_char(); Ok(TokenValue::NewLine) } c @ '0'..='9' => { self.inner.next_char(); self.parse_number(c) } // Special case . as a punctuation because it can be the start of a float. '.' => { self.inner.next_char(); match self.inner.peek_char() { Some('0'..='9') => self.parse_number('.'), _ => Ok(TokenValue::Punct(Punct::Dot)), } } _ => { // TODO: see todo in `parse_identifier` for information if current_char.is_xid_start() || current_char == '_' { self.parse_identifier() } else { self.parse_punctuation() } } }; location.end = self.inner.get_last_consumed_end_offset() as u32; return Some(value.map_err(|e| (e, Default::default())).map(|t| Token { value: t, location, leading_whitespace: had_leading_whitespace, start_of_line: was_start_of_line, })); } // Do the C hack of always ending with a newline so that preprocessor directives are ended. if !self.start_of_line { self.start_of_line = true; let end_offset = self.inner.get_last_consumed_end_offset() as u32; Some(Ok(Token { value: TokenValue::NewLine, location: Location { line: self.inner.get_last_seen_line(), start: end_offset, end: end_offset, }, leading_whitespace: self.leading_whitespace, start_of_line: false, })) } else { None } } }