vector/transforms/dedupe/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
// `config` is compiled only when the `transforms-dedupe` feature is enabled.
// NOTE(review): `transform` and `common` below are gated on a different feature
// (`transforms-impl-dedupe`) — confirm that `transforms-dedupe` implies it if `config` relies on
// items from those modules.
#[cfg(feature = "transforms-dedupe")]
pub mod config;
// `transform` is compiled only when the `transforms-impl-dedupe` feature is enabled.
#[cfg(feature = "transforms-impl-dedupe")]
pub mod transform;
#[cfg(feature = "transforms-impl-dedupe")]
pub mod common {
use std::num::NonZeroUsize;
use vector_lib::{configurable::configurable_component, lookup::lookup_v2::ConfigTargetPath};
use crate::config::log_schema;
// NOTE(review): the `///` text on this type and its field is presumably consumed by
// `#[configurable_component]` when generating the configuration schema/docs — confirm before
// rewording it.
/// Caching configuration for deduplication.
#[configurable_component]
#[derive(Clone, Debug)]
// Deserialization fails if the input contains keys other than the fields declared below.
#[serde(deny_unknown_fields)]
pub struct CacheConfig {
/// Number of events to cache and use for comparing incoming events to previously seen events.
// `NonZeroUsize` makes a cache size of zero unrepresentable at the type level.
pub num_events: NonZeroUsize,
}
/// Returns the default cache configuration: a cache sized for 5000 events.
pub fn default_cache_config() -> CacheConfig {
    // `NonZeroUsize::new` only returns `None` for zero, so the `expect` below cannot fire for
    // this non-zero literal.
    let num_events = NonZeroUsize::new(5000).expect("static non-zero number");

    CacheConfig { num_events }
}
// NOTE(review): the `///` text on this enum and its variants is presumably consumed by
// `#[configurable_component]` when generating the configuration schema/docs — confirm before
// rewording it.
/// Options to control what fields to match against.
///
/// When no field matching configuration is specified, events are matched using the `timestamp`,
/// `host`, and `message` fields from an event. The specific field names used are those set in
/// the global [`log schema`][global_log_schema] configuration.
///
/// [global_log_schema]: https://vector.dev/docs/reference/configuration/global-options/#log_schema
// TODO: This enum renders correctly in terms of providing equivalent Cue output when using the
// machine-generated stuff vs the previously-hand-written Cue... but what it _doesn't_ have in the
// machine-generated output is any sort of blurb that these "fields" (`match` and `ignore`) are
// actually mutually exclusive.
//
// We know that to be the case when we're generating the output from the configuration schema, so we
// need to emit something in that output to indicate as much, and further, actually use it on the
// Cue side to add some sort of boilerplate about them being mutually exclusive, etc.
#[configurable_component]
#[derive(Clone, Debug)]
// Deserialization fails if the input contains keys other than the variant names declared below.
#[serde(deny_unknown_fields)]
pub enum FieldMatchConfig {
/// Matches events using only the specified fields.
// The serde name of this variant is `match` rather than `MatchFields`.
#[serde(rename = "match")]
MatchFields(
#[configurable(metadata(
docs::examples = "field1",
docs::examples = "parent.child_field"
))]
Vec<ConfigTargetPath>,
),
/// Matches events using all fields except for the ignored ones.
// The serde name of this variant is `ignore` rather than `IgnoreFields`.
#[serde(rename = "ignore")]
IgnoreFields(
#[configurable(metadata(
docs::examples = "field1",
docs::examples = "parent.child_field",
docs::examples = "host",
docs::examples = "hostname"
))]
Vec<ConfigTargetPath>,
),
}
/// Resolves the field-matching configuration to use.
///
/// Returns a clone of `maybe_fields` when it is `Some`. When it is `None`, returns
/// `FieldMatchConfig::MatchFields` populated with the paths from `default_match_fields`.
pub fn fill_default_fields_match(maybe_fields: Option<&FieldMatchConfig>) -> FieldMatchConfig {
    // A default value for `fields`, built on `default_match_fields`, is provided in order to
    // drive the configuration schema and documentation. Because those values are read from the
    // configured log schema, the default field values shown in the configuration
    // schema/documentation may not be the same as an actual user's Vector configuration.
    maybe_fields
        .cloned()
        .unwrap_or_else(|| FieldMatchConfig::MatchFields(default_match_fields()))
}
// TODO: Add support to the `configurable(metadata(..))` helper attribute for passing an expression
// that will provide the value for the metadata attribute's value, as well as letting all metadata
// attributes have whatever value they want, so long as it can be serialized by `serde_json`.
//
// Once we have that, we could curry these default values (and others) via a metadata attribute
// instead of via `serde(default = "...")` to allow for displaying default values in the
// configuration schema _without_ actually changing how a field is populated during deserialization.
//
// See the comment in `fill_default_fields_match` for more information on why this is required.
//
// TODO: These values are used even for events with the new "Vector" log namespace.
// These aren't great defaults in that case, but hard-coding isn't much better since the
// structure can vary significantly. This should probably either become a required field
// in the future, or maybe the "semantic meaning" can be utilized here.
fn default_match_fields() -> Vec<ConfigTargetPath> {
let mut fields = Vec::new();
if let Some(message_key) = log_schema().message_key_target_path() {
fields.push(ConfigTargetPath(message_key.clone()));
}
if let Some(host_key) = log_schema().host_key_target_path() {
fields.push(ConfigTargetPath(host_key.clone()));
}
if let Some(timestamp_key) = log_schema().timestamp_key_target_path() {
fields.push(ConfigTargetPath(timestamp_key.clone()));
}
fields
}
}