// vector/transforms/sample/config.rs

use snafu::Snafu;
use vector_lib::config::{LegacyKey, LogNamespace};
use vector_lib::configurable::configurable_component;
use vector_lib::lookup::{lookup_v2::OptionalValuePath, owned_value_path};
use vrl::value::Kind;

use crate::{
    conditions::AnyCondition,
    config::{
        DataType, GenerateConfig, Input, OutputId, TransformConfig, TransformContext,
        TransformOutput,
    },
    schema,
    template::Template,
    transforms::Transform,
};

use super::transform::{Sample, SampleMode};

/// Errors describing an invalid `rate` / `ratio` combination in [`SampleConfig`].
#[derive(Debug, Snafu)]
pub enum SampleError {
    // All variants below are constructed by `SampleConfig::sample_rate`.
    /// `ratio` was the only option set, but its value was rejected as not being a
    /// positive, non-zero number. Carries the offending value for the message.
    #[snafu(display(
        "Only positive, non-zero numbers are allowed values for `ratio`, value: {ratio}"
    ))]
    InvalidRatio { ratio: f64 },

    /// `rate` was the only option set, but its value was zero.
    #[snafu(display("Only non-zero numbers are allowed values for `rate`"))]
    InvalidRate,

    /// Both `rate` and `ratio` were set, or neither was.
    #[snafu(display(
        "Exactly one value must be provided for either 'rate' or 'ratio', but not both"
    ))]
    InvalidConfiguration,
}

// NOTE(review): the `///` comments and `#[configurable(...)]` attributes in this struct are
// presumably consumed by `configurable_component` to produce user-facing documentation —
// confirm before rewording them. Maintainer-only notes below use plain `//` for that reason.
/// Configuration for the `sample` transform.
#[configurable_component(transform(
    "sample",
    "Sample events from an event stream based on supplied criteria and at a configurable rate."
))]
#[derive(Clone, Debug)]
#[serde(deny_unknown_fields)]
pub struct SampleConfig {
    // Mutually exclusive with `ratio`: `sample_rate()` rejects `Some(0)` as well as the
    // cases where both or neither of the two options are set.
    /// The rate at which events are forwarded, expressed as `1/N`.
    ///
    /// For example, `rate = 1500` means 1 out of every 1500 events are forwarded and the rest are
    /// dropped. This differs from `ratio` which allows more precise control over the number of events
    /// retained and values greater than 1/2. It is an error to provide a value for both `rate` and `ratio`.
    #[configurable(metadata(docs::examples = 1500))]
    pub rate: Option<u64>,

    // The validation range below admits `0.0`, but `sample_rate()` additionally rejects any
    // ratio `<= 0.0`, so zero is not usable in practice.
    /// The rate at which events are forwarded, expressed as a percentage
    ///
    /// For example, `ratio = .13` means that 13% out of all events on the stream are forwarded and
    /// the rest are dropped. This differs from `rate` allowing the configuration of a higher
    /// precision value and also the ability to retain values of greater than 50% of all events. It is
    /// an error to provide a value for both `rate` and `ratio`.
    #[configurable(metadata(docs::examples = 0.13))]
    #[configurable(validation(range(min = 0.0, max = 1.0)))]
    pub ratio: Option<f64>,

    // Cloned and handed to `Sample::new` unchanged by `build`.
    /// The name of the field whose value is hashed to determine if the event should be
    /// sampled.
    ///
    /// Each unique value for the key creates a bucket of related events to be sampled together
    /// and the rate is applied to the buckets themselves to sample `1/N` buckets.  The overall rate
    /// of sampling may differ from the configured one if values in the field are not uniformly
    /// distributed. If left unspecified, or if the event doesn’t have `key_field`, then the
    /// event is sampled independently.
    ///
    /// This can be useful to, for example, ensure that all logs for a given transaction are
    /// sampled together, but that overall `1/N` transactions are sampled.
    #[configurable(metadata(docs::examples = "message"))]
    pub key_field: Option<String>,

    // When omitted from the configuration, serde fills this in with
    // `default_sample_rate_key()`, i.e. the `sample_rate` path.
    /// The event key in which the sample rate is stored. If set to an empty string, the sample rate will not be added to the event.
    #[configurable(metadata(docs::examples = "sample_rate"))]
    #[serde(default = "default_sample_rate_key")]
    pub sample_rate_key: OptionalValuePath,

    // Cloned and handed to `Sample::new` unchanged by `build`.
    /// The value to group events into separate buckets to be sampled independently.
    ///
    /// If left unspecified, or if the event doesn't have `group_by`, then the event is not
    /// sampled separately.
    #[configurable(metadata(
        docs::examples = "{{ service }}",
        docs::examples = "{{ hostname }}-{{ service }}"
    ))]
    pub group_by: Option<Template>,

    // Built into a runtime condition in `build`, using the context's enrichment tables.
    /// A logical condition used to exclude events from sampling.
    pub exclude: Option<AnyCondition>,
}

impl SampleConfig {
    /// Resolves the configured `rate` / `ratio` pair into a [`SampleMode`].
    ///
    /// Exactly one of the two options must be set:
    ///
    /// - only `ratio`: the value must be a number strictly greater than zero;
    /// - only `rate`: the value must be non-zero.
    ///
    /// # Errors
    ///
    /// - [`SampleError::InvalidRatio`] if `ratio` is zero, negative, or NaN.
    /// - [`SampleError::InvalidRate`] if `rate` is zero.
    /// - [`SampleError::InvalidConfiguration`] if both options are set, or neither is.
    fn sample_rate(&self) -> Result<SampleMode, SampleError> {
        match (self.rate, self.ratio) {
            // NaN compares false against every number, so `ratio <= 0.0` alone would let
            // it slip through as a "valid" ratio; reject it explicitly.
            (None, Some(ratio)) if ratio.is_nan() || ratio <= 0.0 => {
                Err(SampleError::InvalidRatio { ratio })
            }
            (None, Some(ratio)) => Ok(SampleMode::new_ratio(ratio)),
            (Some(0), None) => Err(SampleError::InvalidRate),
            (Some(rate), None) => Ok(SampleMode::new_rate(rate)),
            // Spelled out instead of `_` so the "exactly one" rule is visible at a glance.
            (Some(_), Some(_)) | (None, None) => Err(SampleError::InvalidConfiguration),
        }
    }
}

impl GenerateConfig for SampleConfig {
    /// Produces the example configuration for this transform as a TOML value.
    ///
    /// The example samples with `ratio = 0.1`, leaves `rate`, `key_field`, `group_by` and
    /// `exclude` unset, and uses the default `sample_rate_key`.
    ///
    /// # Panics
    ///
    /// Panics if the example value cannot be converted into a `toml::Value`.
    fn generate_config() -> toml::Value {
        let example = Self {
            ratio: Some(0.1),
            rate: None,
            key_field: None,
            sample_rate_key: default_sample_rate_key(),
            group_by: None,
            exclude: None,
        };

        toml::Value::try_from(example).unwrap()
    }
}

#[async_trait::async_trait]
#[typetag::serde(name = "sample")]
impl TransformConfig for SampleConfig {
    /// Builds the `Sample` function transform described by this configuration.
    ///
    /// # Errors
    ///
    /// Fails if `sample_rate()` rejects the `rate` / `ratio` settings, or if the `exclude`
    /// condition cannot be built against the context's enrichment tables.
    async fn build(&self, context: &TransformContext) -> crate::Result<Transform> {
        // Resolve the sampling mode before the condition so that, as before, an invalid
        // `rate` / `ratio` is the first error reported.
        let mode = self.sample_rate()?;
        let exclude = match &self.exclude {
            Some(condition) => Some(condition.build(&context.enrichment_tables)?),
            None => None,
        };

        let sampler = Sample::new(
            Self::NAME.to_string(),
            mode,
            self.key_field.clone(),
            self.group_by.clone(),
            exclude,
            self.sample_rate_key.clone(),
        );
        Ok(Transform::function(sampler))
    }

    /// Declares that this transform accepts log and trace events.
    fn input(&self) -> Input {
        let accepted = DataType::Log | DataType::Trace;
        Input::new(accepted)
    }

    /// Checks the `rate` / `ratio` settings, reporting any problem as a single message.
    fn validate(&self, _: &schema::Definition) -> Result<(), Vec<String>> {
        match self.sample_rate() {
            Ok(_) => Ok(()),
            Err(error) => Err(vec![error.to_string()]),
        }
    }

    /// Describes the single log/trace output, with one schema definition per input.
    ///
    /// Each input definition is cloned and extended with source metadata for the sample
    /// rate, registered under this transform's name.
    ///
    /// NOTE(review): the metadata path is the literal `sample_rate` rather than the
    /// configured `sample_rate_key` — confirm this is intended.
    fn outputs(
        &self,
        _: vector_lib::enrichment::TableRegistry,
        input_definitions: &[(OutputId, schema::Definition)],
        _: LogNamespace,
    ) -> Vec<TransformOutput> {
        let annotated = input_definitions
            .iter()
            .map(|(id, definition)| {
                let key = id.clone();
                let with_sample_rate = definition.clone().with_source_metadata(
                    SampleConfig::NAME,
                    Some(LegacyKey::Overwrite(owned_value_path!("sample_rate"))),
                    &owned_value_path!("sample_rate"),
                    Kind::bytes(),
                    None,
                );
                (key, with_sample_rate)
            })
            .collect();

        vec![TransformOutput::new(
            DataType::Log | DataType::Trace,
            annotated,
        )]
    }
}

/// Returns the default value for `sample_rate_key`: the `sample_rate` path.
///
/// Used as the serde default for [`SampleConfig`] and by its generated example config.
pub fn default_sample_rate_key() -> OptionalValuePath {
    let path = owned_value_path!("sample_rate");
    OptionalValuePath::from(path)
}

#[cfg(test)]
mod tests {
    use crate::{test_util::test_generate_config, transforms::sample::config::SampleConfig};

    /// Runs the shared `test_generate_config` check against `SampleConfig`.
    #[test]
    fn generate_config() {
        test_generate_config::<SampleConfig>();
    }
}