362 lines
11 KiB
Rust
362 lines
11 KiB
Rust
// (C) Copyright 2016 Jethro G. Beekman
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
// option. This file may not be copied, modified, or distributed
|
|
// except according to those terms.
|
|
//! Parsing C literals from byte slices.
|
|
//!
|
|
//! This will parse a representation of a C literal into a Rust type.
|
|
//!
|
|
//! # characters
|
|
//! Character literals are stored into the `CChar` type, which can hold values
|
|
//! that are not valid Unicode code points. ASCII characters are represented as
|
|
//! `char`, literal bytes with the high bit set are converted into the raw
|
|
//! representation. Escape sequences are supported. If hex and octal escapes
|
|
//! map to an ASCII character, that is used, otherwise, the raw encoding is
|
|
//! used, including for values over 255. Unicode escapes are checked for
|
|
//! validity and mapped to `char`. Character sequences are not supported. Width
|
|
//! prefixes are ignored.
|
|
//!
|
|
//! # strings
|
|
//! Strings are interpreted as byte vectors. Escape sequences are supported. If
|
|
//! hex and octal escapes map onto multi-byte characters, they are truncated to
|
|
//! one 8-bit character. Unicode escapes are converted into their UTF-8
|
|
//! encoding. Width prefixes are ignored.
|
|
//!
|
|
//! # integers
|
|
//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
|
|
//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
|
|
//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
|
|
//! sign suffixes are ignored. Sign prefixes are not supported.
|
|
//!
|
|
//! # real numbers
|
|
//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
|
|
//! not supported in the significand. Hexadecimal floating points are not
|
|
//! supported.
|
|
|
|
use std::char;
|
|
use std::str::{self, FromStr};
|
|
|
|
use nom::branch::alt;
|
|
use nom::bytes::complete::is_not;
|
|
use nom::bytes::complete::tag;
|
|
use nom::character::complete::{char, one_of};
|
|
use nom::combinator::{complete, map, map_opt, opt, recognize};
|
|
use nom::multi::{fold_many0, many0, many1, many_m_n};
|
|
use nom::sequence::{delimited, pair, preceded, terminated, tuple};
|
|
use nom::*;
|
|
|
|
use crate::expr::EvalResult;
|
|
use crate::ToCexprResult;
|
|
|
|
/// Representation of a C character.
///
/// C character literals can carry values that are not valid Unicode scalar
/// values (for example 8-bit characters or the result of a large hex escape),
/// so a plain `char` is not enough to represent them.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum CChar {
    /// A character that can be represented as a `char`
    Char(char),
    /// Any other character (8-bit characters, unicode surrogates, etc.)
    Raw(u64),
}

impl From<u8> for CChar {
    /// Converts a single byte: ASCII bytes (`0..=0x7f`) become
    /// [`CChar::Char`], every other byte is kept as [`CChar::Raw`].
    fn from(i: u8) -> CChar {
        if i.is_ascii() {
            CChar::Char(i as char)
        } else {
            CChar::Raw(u64::from(i))
        }
    }
}

// A non-allocating version of this would be nice...
//
// Implemented as `From<CChar> for Vec<u8>` rather than a hand-written `Into`:
// the standard blanket impl still provides `Into<Vec<u8>> for CChar`, so
// existing `.into()` callers keep working, and `Vec::from(c)` works as well.
impl From<CChar> for Vec<u8> {
    /// Converts the character into the bytes it contributes to a C string.
    ///
    /// * [`CChar::Char`] yields the UTF-8 encoding of the character.
    /// * [`CChar::Raw`] yields exactly one byte; values above `0xff` are
    ///   truncated to their low 8 bits.
    fn from(c: CChar) -> Vec<u8> {
        match c {
            CChar::Char(ch) => {
                // A `char` encodes to at most four UTF-8 bytes.
                let mut buf = [0u8; 4];
                ch.encode_utf8(&mut buf).as_bytes().to_vec()
            }
            // Deliberate truncation: a raw value occupies one 8-bit character.
            CChar::Raw(raw) => vec![raw as u8],
        }
    }
}
|
|
|
|
/// ensures the child parser consumes the whole input
|
|
pub fn full<I: Clone, O, F>(
|
|
f: F,
|
|
) -> impl Fn(I) -> nom::IResult<I, O>
|
|
where
|
|
I: nom::InputLength,
|
|
F: Fn(I) -> nom::IResult<I, O>,
|
|
{
|
|
move |input| {
|
|
let res = f(input);
|
|
match res {
|
|
Ok((i, o)) => {
|
|
if i.input_len() == 0 {
|
|
Ok((i, o))
|
|
} else {
|
|
Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete)))
|
|
}
|
|
}
|
|
r => r,
|
|
}
|
|
}
|
|
}
|
|
|
|
// =================================
|
|
// ======== matching digits ========
|
|
// =================================
|
|
|
|
// Expands to a parser function that matches exactly one byte.
//
// The macro takes one or more `|`-separated patterns and evaluates to a
// nested `fn parser(&[u8]) -> IResult<&[u8], u8>` which:
//   * returns the remaining input and the matched byte when the first byte
//     of the input matches any of the patterns,
//   * fails with `ErrorKind::OneOf` when the first byte matches none of them,
//   * reports `Incomplete(Needed::new(1))` when the input is empty.
macro_rules! byte {
    ($($p: pat)|* ) => {{
        fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> {
            match i.split_first() {
                // One `Some((&c @ pattern, rest))` arm alternative per supplied pattern.
                $(Some((&c @ $p,rest)))|* => Ok((rest,c)),
                Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))),
                None => Err(nom::Err::Incomplete(Needed::new(1))),
            }
        }

        parser
    }}
}
|
|
|
|
fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> {
|
|
byte!(b'0'..=b'1')(i)
|
|
}
|
|
|
|
fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> {
|
|
byte!(b'0'..=b'7')(i)
|
|
}
|
|
|
|
fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
|
|
byte!(b'0'..=b'9')(i)
|
|
}
|
|
|
|
fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
|
|
byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i)
|
|
}
|
|
|
|
// ========================================
|
|
// ======== characters and strings ========
|
|
// ========================================
|
|
|
|
fn escape2char(c: char) -> CChar {
|
|
CChar::Char(match c {
|
|
'a' => '\x07',
|
|
'b' => '\x08',
|
|
'f' => '\x0c',
|
|
'n' => '\n',
|
|
'r' => '\r',
|
|
't' => '\t',
|
|
'v' => '\x0b',
|
|
_ => unreachable!("invalid escape {}", c),
|
|
})
|
|
}
|
|
|
|
fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> {
|
|
str::from_utf8(&n)
|
|
.ok()
|
|
.and_then(|i| u64::from_str_radix(i, radix).ok())
|
|
.map(|i| match i {
|
|
0..=0x7f => CChar::Char(i as u8 as char),
|
|
_ => CChar::Raw(i),
|
|
})
|
|
}
|
|
|
|
fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> {
|
|
str::from_utf8(&n)
|
|
.ok()
|
|
.and_then(|i| u32::from_str_radix(i, 16).ok())
|
|
.and_then(char::from_u32)
|
|
.map(CChar::Char)
|
|
}
|
|
|
|
fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
|
|
preceded(
|
|
char('\\'),
|
|
alt((
|
|
map(one_of(r#"'"?\"#), CChar::Char),
|
|
map(one_of("abfnrtv"), escape2char),
|
|
map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)),
|
|
map_opt(preceded(char('x'), many1(hexadecimal)), |v| {
|
|
c_raw_escape(v, 16)
|
|
}),
|
|
map_opt(
|
|
preceded(char('u'), many_m_n(4, 4, hexadecimal)),
|
|
c_unicode_escape,
|
|
),
|
|
map_opt(
|
|
preceded(char('U'), many_m_n(8, 8, hexadecimal)),
|
|
c_unicode_escape,
|
|
),
|
|
)),
|
|
)(i)
|
|
}
|
|
|
|
fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> {
|
|
alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i)
|
|
}
|
|
|
|
fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
|
|
delimited(
|
|
terminated(opt(c_width_prefix), char('\'')),
|
|
alt((
|
|
escaped_char,
|
|
map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from),
|
|
)),
|
|
char('\''),
|
|
)(i)
|
|
}
|
|
|
|
fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> {
|
|
delimited(
|
|
alt((preceded(c_width_prefix, char('"')), char('"'))),
|
|
fold_many0(
|
|
alt((
|
|
map(escaped_char, |c: CChar| c.into()),
|
|
map(is_not([b'\\', b'"']), |c: &[u8]| c.into()),
|
|
)),
|
|
Vec::new,
|
|
|mut v: Vec<u8>, res: Vec<u8>| {
|
|
v.extend_from_slice(&res);
|
|
v
|
|
},
|
|
),
|
|
char('"'),
|
|
)(i)
|
|
}
|
|
|
|
// ================================
|
|
// ======== parse integers ========
|
|
// ================================
|
|
|
|
/// Converts a run of digit bytes into a `u64` using the given radix.
///
/// Returns `None` when the bytes are not valid UTF-8 or when they cannot be
/// parsed as a `u64` in that radix (empty input, a digit that is invalid for
/// the radix, or a value above `u64::MAX`).
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
|
|
|
|
fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
|
let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L');
|
|
match r {
|
|
Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)),
|
|
res => res,
|
|
}
|
|
}
|
|
|
|
fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> {
|
|
map(
|
|
terminated(
|
|
alt((
|
|
map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| {
|
|
c_int_radix(v, 16)
|
|
}),
|
|
map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| {
|
|
c_int_radix(v, 16)
|
|
}),
|
|
map_opt(preceded(tag("0b"), many1(complete(binary))), |v| {
|
|
c_int_radix(v, 2)
|
|
}),
|
|
map_opt(preceded(tag("0B"), many1(complete(binary))), |v| {
|
|
c_int_radix(v, 2)
|
|
}),
|
|
map_opt(preceded(char('0'), many1(complete(octal))), |v| {
|
|
c_int_radix(v, 8)
|
|
}),
|
|
map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)),
|
|
|input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))),
|
|
)),
|
|
opt(take_ul),
|
|
),
|
|
|i| i as i64,
|
|
)(i)
|
|
}
|
|
|
|
// ==============================
|
|
// ======== parse floats ========
|
|
// ==============================
|
|
|
|
fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> {
|
|
nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i)
|
|
}
|
|
|
|
fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> {
|
|
preceded(
|
|
byte!(b'e' | b'E'),
|
|
pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))),
|
|
)(i)
|
|
}
|
|
|
|
fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> {
|
|
map_opt(
|
|
alt((
|
|
terminated(
|
|
recognize(tuple((
|
|
many1(complete(decimal)),
|
|
byte!(b'.'),
|
|
many0(complete(decimal)),
|
|
))),
|
|
opt(float_width),
|
|
),
|
|
terminated(
|
|
recognize(tuple((
|
|
many0(complete(decimal)),
|
|
byte!(b'.'),
|
|
many1(complete(decimal)),
|
|
))),
|
|
opt(float_width),
|
|
),
|
|
terminated(
|
|
recognize(tuple((
|
|
many0(complete(decimal)),
|
|
opt(byte!(b'.')),
|
|
many1(complete(decimal)),
|
|
float_exp,
|
|
))),
|
|
opt(float_width),
|
|
),
|
|
terminated(
|
|
recognize(tuple((
|
|
many1(complete(decimal)),
|
|
opt(byte!(b'.')),
|
|
many0(complete(decimal)),
|
|
float_exp,
|
|
))),
|
|
opt(float_width),
|
|
),
|
|
terminated(recognize(many1(complete(decimal))), float_width),
|
|
)),
|
|
|v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()),
|
|
)(i)
|
|
}
|
|
|
|
// ================================
|
|
// ======== main interface ========
|
|
// ================================
|
|
|
|
fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
|
|
alt((
|
|
map(full(c_char), EvalResult::Char),
|
|
map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))),
|
|
map(full(c_float), EvalResult::Float),
|
|
map(full(c_string), EvalResult::Str),
|
|
))(input)
|
|
.to_cexpr_result()
|
|
}
|
|
|
|
/// Parse a C literal.
|
|
///
|
|
/// The input must contain exactly the representation of a single literal
|
|
/// token, and in particular no whitespace or sign prefixes.
|
|
pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
|
|
crate::assert_full_parse(one_literal(input))
|
|
}
|