vrl/compiler/conversion/
mod.rs

1use std::{
2    collections::{HashMap, HashSet},
3    fmt::Debug,
4    num::{ParseFloatError, ParseIntError},
5};
6
7use bytes::Bytes;
8use chrono::{DateTime, LocalResult, ParseError as ChronoParseError, TimeZone as _, Utc};
9use ordered_float::NotNan;
10use snafu::{ResultExt, Snafu};
11
12use super::datetime::{TimeZone, datetime_to_utc};
13
14#[cfg(test)]
15mod tests;
16
/// Error produced when a textual conversion description cannot be turned
/// into a `Conversion`.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Snafu)]
pub enum ConversionError {
    /// The description did not match any form accepted by
    /// `Conversion::parse`. `name` holds the complete description string as
    /// it was passed in.
    #[snafu(display("Unknown conversion name {:?}", name))]
    UnknownConversion { name: String },
}
23
/// `Conversion` is a place-holder for a type conversion operation, to convert
/// from a plain `Bytes` into another type. The inner type of every `Value`
/// variant is represented here.
#[derive(Clone, Debug)]
pub enum Conversion {
    /// Pass the input bytes through without any conversion.
    Bytes,
    /// Parse the input as a signed 64-bit integer.
    Integer,
    /// Parse the input as a 64-bit floating point number; NaN is rejected.
    Float,
    /// Parse the input as a boolean using the spellings accepted by
    /// `parse_bool`.
    Boolean,
    /// Parse the input as a timestamp by trying a predefined set of formats.
    /// The time zone is used for formats that carry no zone information.
    Timestamp(TimeZone),
    /// Parse the input as a timestamp using the given format, interpreting
    /// the result in the given time zone.
    TimestampFmt(String, TimeZone),
    /// Parse the input as a timestamp using the given format, which is
    /// expected to contain its own time zone specifier.
    TimestampTzFmt(String),
}
37
/// Errors produced while converting input bytes with a `Conversion`.
///
/// Every variant carries `s`, the (lossily UTF-8 decoded) input text that
/// failed to convert.
#[derive(Debug, Eq, PartialEq, Snafu)]
pub enum Error {
    /// The text is not one of the accepted boolean spellings and is not an
    /// integer.
    #[snafu(display("Invalid boolean value {:?}", s))]
    BoolParse { s: String },
    /// The text could not be parsed as an `i64`.
    #[snafu(display("Invalid integer {:?}: {}", s, source))]
    IntParse { s: String, source: ParseIntError },
    /// The text parsed as a float, but the value is NaN.
    #[snafu(display("NaN number not supported {:?}", s))]
    NanFloat { s: String },
    /// The text could not be parsed as an `f64`.
    #[snafu(display("Invalid floating point number {:?}: {}", s, source))]
    FloatParse { s: String, source: ParseFloatError },
    /// The text did not match the explicitly configured timestamp format.
    /// The generated context selector is visible to the parent module.
    #[snafu(
        display("Invalid timestamp {:?}: {}", s, source),
        visibility(pub(super))
    )]
    TimestampParse { s: String, source: ChronoParseError },
    /// None of the predefined "automatic" timestamp formats matched the text.
    #[snafu(display("No matching timestamp format found for {:?}", s))]
    AutoTimestampParse { s: String },
}
56
57/// Helper function to parse a conversion map and check against a list of names
58///
59/// # Errors
60///
61/// See `fn Conversion::parse`.
62#[allow(clippy::implicit_hasher)]
63pub fn parse_check_conversion_map(
64    types: &HashMap<String, String>,
65    names: &[impl AsRef<str>],
66    tz: TimeZone,
67) -> Result<HashMap<String, Conversion>, ConversionError> {
68    // Check if any named type references a nonexistent field
69    let names = names
70        .iter()
71        .map(std::convert::AsRef::as_ref)
72        .collect::<HashSet<_>>();
73    for name in types.keys() {
74        if !names.contains(name.as_str()) {
75            tracing::warn!(
76                message = "Field was specified in the types but is not a valid field name.",
77                field = &name[..]
78            );
79        }
80    }
81
82    parse_conversion_map(types, tz)
83}
84
85/// Helper function to parse a mapping of conversion descriptions into actual Conversion values.
86///
87/// # Errors
88///
89/// See `fn Conversion::parse`.
90#[allow(clippy::implicit_hasher)]
91pub fn parse_conversion_map(
92    types: &HashMap<String, String>,
93    tz: TimeZone,
94) -> Result<HashMap<String, Conversion>, ConversionError> {
95    types
96        .iter()
97        .map(|(field, typename)| Conversion::parse(typename, tz).map(|conv| (field.clone(), conv)))
98        .collect()
99}
100
101impl Conversion {
102    /// Convert the string into a type conversion. The following
103    /// conversion names are supported:
104    ///
105    ///  * `"asis"`, `"bytes"`, or `"string"` => As-is (no conversion)
106    ///  * `"int"` or `"integer"` => Signed integer
107    ///  * `"float"` => Floating point number
108    ///  * `"bool"` or `"boolean"` => Boolean
109    ///  * `"timestamp"` => Timestamp, guessed using a set of formats
110    ///  * `"timestamp|FORMAT"` => Timestamp using the given format
111    ///
112    /// # Errors
113    ///
114    /// Returns an error if the conversion name is unknown.
115    pub fn parse(s: impl AsRef<str>, tz: TimeZone) -> Result<Self, ConversionError> {
116        let s = s.as_ref();
117        let mut split = s.splitn(2, '|').map(str::trim);
118        match (split.next(), split.next()) {
119            (Some("asis" | "bytes" | "string"), None) => Ok(Self::Bytes),
120            (Some("integer" | "int"), None) => Ok(Self::Integer),
121            (Some("float"), None) => Ok(Self::Float),
122            (Some("bool" | "boolean"), None) => Ok(Self::Boolean),
123            (Some("timestamp"), None) => Ok(Self::Timestamp(tz)),
124            (Some("timestamp"), Some(fmt)) => Ok(Self::timestamp(fmt, tz)),
125            _ => Err(ConversionError::UnknownConversion { name: s.into() }),
126        }
127    }
128
129    /// Convert the string into timestamp
130    #[must_use]
131    pub fn timestamp(fmt: &str, tz: TimeZone) -> Self {
132        // DateTime<Utc> can only convert timestamps without
133        // time zones, and DateTime<FixedOffset> can only
134        // convert with tone zones, so this has to distinguish
135        // between the two types of formats.
136        if format_has_zone(fmt) {
137            Self::TimestampTzFmt(fmt.into())
138        } else {
139            Self::TimestampFmt(fmt.into(), tz)
140        }
141    }
142
143    /// Use this `Conversion` variant to turn the given `bytes` into a new `T`.
144    ///
145    /// # Errors
146    ///
147    /// Returns errors from the underlying conversion functions. See `enum Error`.
148    #[allow(clippy::trait_duplication_in_bounds)] // appears to be a false positive
149    pub fn convert<T>(&self, bytes: Bytes) -> Result<T, Error>
150    where
151        T: From<Bytes> + From<i64> + From<NotNan<f64>> + From<bool> + From<DateTime<Utc>>,
152    {
153        Ok(match self {
154            Self::Bytes => bytes.into(),
155            Self::Integer => {
156                let s = String::from_utf8_lossy(&bytes);
157                s.parse::<i64>()
158                    .with_context(|_| IntParseSnafu { s })?
159                    .into()
160            }
161            Self::Float => {
162                let s = String::from_utf8_lossy(&bytes);
163                let parsed = s
164                    .parse::<f64>()
165                    .with_context(|_| FloatParseSnafu { s: s.clone() })?;
166                let f = NotNan::new(parsed).map_err(|_| Error::NanFloat { s: s.to_string() })?;
167                f.into()
168            }
169            Self::Boolean => parse_bool(&String::from_utf8_lossy(&bytes))?.into(),
170            Self::Timestamp(tz) => parse_timestamp(*tz, &String::from_utf8_lossy(&bytes))?.into(),
171            Self::TimestampFmt(format, tz) => {
172                let s = String::from_utf8_lossy(&bytes);
173                let dt = tz
174                    .datetime_from_str(&s, format)
175                    .context(TimestampParseSnafu { s })?;
176
177                datetime_to_utc(&dt).into()
178            }
179            Self::TimestampTzFmt(format) => {
180                let s = String::from_utf8_lossy(&bytes);
181                let dt = DateTime::parse_from_str(&s, format)
182                    .with_context(|_| TimestampParseSnafu { s })?;
183
184                datetime_to_utc(&dt).into()
185            }
186        })
187    }
188}
189
190/// Parse a string into a native `bool`. The built in `bool::from_str`
191/// only handles two cases, `"true"` and `"false"`. We want to be able
192/// to convert from a more diverse set of strings. In particular, the
193/// following set of source strings are allowed:
194///
195///  * `"true"`, `"t"`, `"yes"`, `"y"` (all case-insensitive), and
196///    non-zero integers all convert to `true`.
197///
198///  * `"false"`, `"f"`, `"no"`, `"n"` (all case-insensitive), and `"0"`
199///    all convert to `false`.
200///
201/// # Errors
202///
203/// Any input value besides those described above result in a parse error.
204fn parse_bool(s: &str) -> Result<bool, Error> {
205    match s {
206        "true" | "t" | "yes" | "y" => Ok(true),
207        "false" | "f" | "no" | "n" | "0" => Ok(false),
208        _ => {
209            if let Ok(n) = s.parse::<isize>() {
210                Ok(n != 0)
211            } else {
212                // Do the case conversion only if simple matches fail,
213                // since this operation can be expensive.
214                match s.to_lowercase().as_str() {
215                    "true" | "t" | "yes" | "y" => Ok(true),
216                    "false" | "f" | "no" | "n" => Ok(false),
217                    _ => Err(Error::BoolParse { s: s.into() }),
218                }
219            }
220        }
221    }
222}
223
/// Does the format specifier have a time zone option?
///
/// Scans `fmt` as a `strftime`-style pattern and returns `true` if it
/// contains any of the zone/offset specifiers `%Z`, `%z`, `%:z`, `%::z`,
/// `%:::z`, `%#z`, or the combined `%+`.
///
/// The pattern is consumed one `%x` pair at a time, so an escaped percent
/// sign (`%%`) followed by a literal `Z`, `z` or `+` is not mistaken for a
/// zone specifier.
fn format_has_zone(fmt: &str) -> bool {
    // Text that may follow a `%` to form a zone/offset specifier.
    const ZONE_SPECIFIERS: [&str; 7] = ["Z", "z", ":z", "::z", ":::z", "#z", "+"];

    let mut rest = fmt;
    while let Some(percent) = rest.find('%') {
        // `%` is a single byte, so `percent + 1` is always a char boundary.
        let spec = &rest[percent + 1..];
        if ZONE_SPECIFIERS.iter().any(|zone| spec.starts_with(*zone)) {
            return true;
        }

        // Skip the character after `%` so that `%%` is treated as a unit and
        // its second `%` cannot start a new specifier.
        let mut chars = spec.chars();
        chars.next();
        rest = chars.as_str();
    }

    false
}
232
/// The list of allowed "automatic" timestamp formats with assumed local time zone
///
/// None of these formats contains a zone specifier. `parse_timestamp` tries
/// them in the order listed and interprets a match in the `TimeZone` it was
/// given.
const TIMESTAMP_LOCAL_FORMATS: &[&str] = &[
    "%F %T",           // YYYY-MM-DD HH:MM:SS
    "%v %T",           // DD-Mmm-YYYY HH:MM:SS
    "%FT%T",           // ISO 8601 / RFC 3339 without TZ
    "%m/%d/%Y:%T",     // MM/DD/YYYY:HH:MM:SS
    "%a, %d %b %Y %T", // RFC 822/2822 without TZ
    "%a %d %b %T %Y",  // `date` command output without TZ
    "%A %d %B %T %Y",  // `date` command output without TZ, long names
    "%a %b %e %T %Y",  // ctime format
];
244
/// The list of allowed "automatic" timestamp formats with time zones
///
/// `parse_timestamp` tries these in the order listed, after its attempts at
/// the zone-less formats, Unix seconds, RFC 3339 and RFC 2822 have all
/// failed, and converts a match to UTC.
const TIMESTAMP_TZ_FORMATS: &[&str] = &[
    "%+",                 // ISO 8601 / RFC 3339
    "%a %d %b %T %Z %Y",  // `date` command output
    "%a %d %b %T %z %Y",  // `date` command output, numeric TZ
    "%a %d %b %T %#z %Y", // `date` command output, numeric TZ with optional minutes
    "%d/%b/%Y:%T %z",     // Common Log
];
253
254fn parse_unix_timestamp(timestamp_str: &str) -> LocalResult<DateTime<Utc>> {
255    if let Ok(seconds_since_epoch) = timestamp_str.parse::<i64>() {
256        Utc.timestamp_opt(seconds_since_epoch, 0)
257    } else {
258        LocalResult::None
259    }
260}
261
262/// Parse a string into a timestamp using one of a set of formats
263///
264/// # Errors
265///
266/// Returns an error if the string could not be matched by one of the
267/// predefined timestamp formats.
268fn parse_timestamp(tz: TimeZone, s: &str) -> Result<DateTime<Utc>, Error> {
269    for format in TIMESTAMP_LOCAL_FORMATS {
270        if let Ok(result) = tz.datetime_from_str(s, format) {
271            return Ok(result);
272        }
273    }
274
275    // This is equivalent to the "%s" format.
276    if let LocalResult::Single(result) = parse_unix_timestamp(s) {
277        return Ok(result);
278    }
279
280    // This also handles "%FT%TZ" formats.
281    if let Ok(result) = DateTime::parse_from_rfc3339(s) {
282        return Ok(datetime_to_utc(&result));
283    }
284
285    if let Ok(result) = DateTime::parse_from_rfc2822(s) {
286        return Ok(datetime_to_utc(&result));
287    }
288
289    for format in TIMESTAMP_TZ_FORMATS {
290        if let Ok(result) = DateTime::parse_from_str(s, format) {
291            return Ok(datetime_to_utc(&result));
292        }
293    }
294
295    Err(Error::AutoTimestampParse { s: s.into() })
296}