1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
use std::sync::Arc;
use azure_storage_blobs::prelude::*;
use tower::ServiceBuilder;
use vector_lib::codecs::{encoding::Framer, JsonSerializerConfig, NewlineDelimitedEncoderConfig};
use vector_lib::configurable::configurable_component;
use vector_lib::sensitive_string::SensitiveString;
use super::request_builder::AzureBlobRequestOptions;
use crate::sinks::util::service::TowerRequestConfigDefaults;
use crate::{
codecs::{Encoder, EncodingConfigWithFraming, SinkType},
config::{AcknowledgementsConfig, DataType, GenerateConfig, Input, SinkConfig, SinkContext},
sinks::{
azure_common::{
self, config::AzureBlobRetryLogic, service::AzureBlobService, sink::AzureBlobSink,
},
util::{
partitioner::KeyPartitioner, BatchConfig, BulkSizeBasedDefaultBatchSettings,
Compression, ServiceBuilderExt, TowerRequestConfig,
},
Healthcheck, VectorSink,
},
template::Template,
Result,
};
#[derive(Clone, Copy, Debug)]
pub struct AzureBlobTowerRequestConfigDefaults;
impl TowerRequestConfigDefaults for AzureBlobTowerRequestConfigDefaults {
const RATE_LIMIT_NUM: u64 = 250;
}
/// Configuration for the `azure_blob` sink.
#[configurable_component(sink(
"azure_blob",
"Store your observability data in Azure Blob Storage."
))]
#[derive(Clone, Debug)]
#[serde(deny_unknown_fields)]
pub struct AzureBlobSinkConfig {
/// The Azure Blob Storage Account connection string.
///
/// Authentication with an access key or shared access signature (SAS)
/// are supported authentication methods. If using a non-account SAS,
/// healthchecks will fail and will need to be disabled by setting
/// `healthcheck.enabled` to `false` for this sink
///
/// When generating an account SAS, the following are the minimum required option
/// settings for Vector to access blob storage and pass a health check.
/// | Option | Value |
/// | ---------------------- | ------------------ |
/// | Allowed services | Blob |
/// | Allowed resource types | Container & Object |
/// | Allowed permissions | Read & Create |
///
/// Either `storage_account`, or this field, must be specified.
#[configurable(metadata(
docs::examples = "DefaultEndpointsProtocol=https;AccountName=mylogstorage;AccountKey=storageaccountkeybase64encoded;EndpointSuffix=core.windows.net"
))]
#[configurable(metadata(
docs::examples = "BlobEndpoint=https://mylogstorage.blob.core.windows.net/;SharedAccessSignature=generatedsastoken"
))]
pub connection_string: Option<SensitiveString>,
/// The Azure Blob Storage Account name.
///
/// Attempts to load credentials for the account in the following ways, in order:
///
/// - read from environment variables ([more information][env_cred_docs])
/// - looks for a [Managed Identity][managed_ident_docs]
/// - uses the `az` CLI tool to get an access token ([more information][az_cli_docs])
///
/// Either `connection_string`, or this field, must be specified.
///
/// [env_cred_docs]: https://docs.rs/azure_identity/latest/azure_identity/struct.EnvironmentCredential.html
/// [managed_ident_docs]: https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview
/// [az_cli_docs]: https://docs.microsoft.com/en-us/cli/azure/account?view=azure-cli-latest#az-account-get-access-token
#[configurable(metadata(docs::examples = "mylogstorage"))]
pub storage_account: Option<String>,
/// The Azure Blob Storage Endpoint URL.
///
/// This is used to override the default blob storage endpoint URL in cases where you are using
/// credentials read from the environment/managed identities or access tokens without using an
/// explicit connection_string (which already explicitly supports overriding the blob endpoint
/// URL).
///
/// This may only be used with `storage_account` and is ignored when used with
/// `connection_string`.
#[configurable(metadata(docs::examples = "https://test.blob.core.usgovcloudapi.net/"))]
#[configurable(metadata(docs::examples = "https://test.blob.core.windows.net/"))]
pub endpoint: Option<String>,
/// The Azure Blob Storage Account container name.
#[configurable(metadata(docs::examples = "my-logs"))]
pub(super) container_name: String,
/// A prefix to apply to all blob keys.
///
/// Prefixes are useful for partitioning objects, such as by creating a blob key that
/// stores blobs under a particular directory. If using a prefix for this purpose, it must end
/// in `/` to act as a directory path. A trailing `/` is **not** automatically added.
#[configurable(metadata(docs::examples = "date/%F/hour/%H/"))]
#[configurable(metadata(docs::examples = "year=%Y/month=%m/day=%d/"))]
#[configurable(metadata(
docs::examples = "kubernetes/{{ metadata.cluster }}/{{ metadata.application_name }}/"
))]
#[serde(default = "default_blob_prefix")]
pub blob_prefix: Template,
/// The timestamp format for the time component of the blob key.
///
/// By default, blob keys are appended with a timestamp that reflects when the blob are sent to
/// Azure Blob Storage, such that the resulting blob key is functionally equivalent to joining
/// the blob prefix with the formatted timestamp, such as `date=2022-07-18/1658176486`.
///
/// This would represent a `blob_prefix` set to `date=%F/` and the timestamp of Mon Jul 18 2022
/// 20:34:44 GMT+0000, with the `filename_time_format` being set to `%s`, which renders
/// timestamps in seconds since the Unix epoch.
///
/// Supports the common [`strftime`][chrono_strftime_specifiers] specifiers found in most
/// languages.
///
/// When set to an empty string, no timestamp is appended to the blob prefix.
///
/// [chrono_strftime_specifiers]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html#specifiers
#[configurable(metadata(docs::syntax_override = "strftime"))]
pub blob_time_format: Option<String>,
/// Whether or not to append a UUID v4 token to the end of the blob key.
///
/// The UUID is appended to the timestamp portion of the object key, such that if the blob key
/// generated is `date=2022-07-18/1658176486`, setting this field to `true` results
/// in an blob key that looks like
/// `date=2022-07-18/1658176486-30f6652c-71da-4f9f-800d-a1189c47c547`.
///
/// This ensures there are no name collisions, and can be useful in high-volume workloads where
/// blob keys must be unique.
pub blob_append_uuid: Option<bool>,
#[serde(flatten)]
pub encoding: EncodingConfigWithFraming,
/// Compression configuration.
///
/// All compression algorithms use the default compression level unless otherwise specified.
///
/// Some cloud storage API clients and browsers handle decompression transparently, so
/// depending on how they are accessed, files may not always appear to be compressed.
#[configurable(derived)]
#[serde(default = "Compression::gzip_default")]
pub compression: Compression,
#[configurable(derived)]
#[serde(default)]
pub batch: BatchConfig<BulkSizeBasedDefaultBatchSettings>,
#[configurable(derived)]
#[serde(default)]
pub request: TowerRequestConfig<AzureBlobTowerRequestConfigDefaults>,
#[configurable(derived)]
#[serde(
default,
deserialize_with = "crate::serde::bool_or_struct",
skip_serializing_if = "crate::serde::is_default"
)]
pub(super) acknowledgements: AcknowledgementsConfig,
}
pub fn default_blob_prefix() -> Template {
Template::try_from(DEFAULT_KEY_PREFIX).unwrap()
}
impl GenerateConfig for AzureBlobSinkConfig {
fn generate_config() -> toml::Value {
toml::Value::try_from(Self {
connection_string: Some(String::from("DefaultEndpointsProtocol=https;AccountName=some-account-name;AccountKey=some-account-key;").into()),
storage_account: Some(String::from("some-account-name")),
container_name: String::from("logs"),
endpoint: None,
blob_prefix: default_blob_prefix(),
blob_time_format: Some(String::from("%s")),
blob_append_uuid: Some(true),
encoding: (Some(NewlineDelimitedEncoderConfig::new()), JsonSerializerConfig::default()).into(),
compression: Compression::gzip_default(),
batch: BatchConfig::default(),
request: TowerRequestConfig::default(),
acknowledgements: Default::default(),
})
.unwrap()
}
}
#[async_trait::async_trait]
#[typetag::serde(name = "azure_blob")]
impl SinkConfig for AzureBlobSinkConfig {
async fn build(&self, _cx: SinkContext) -> Result<(VectorSink, Healthcheck)> {
let client = azure_common::config::build_client(
self.connection_string
.as_ref()
.map(|v| v.inner().to_string()),
self.storage_account.as_ref().map(|v| v.to_string()),
self.container_name.clone(),
self.endpoint.clone(),
)?;
let healthcheck = azure_common::config::build_healthcheck(
self.container_name.clone(),
Arc::clone(&client),
)?;
let sink = self.build_processor(client)?;
Ok((sink, healthcheck))
}
fn input(&self) -> Input {
Input::new(self.encoding.config().1.input_type() & DataType::Log)
}
fn acknowledgements(&self) -> &AcknowledgementsConfig {
&self.acknowledgements
}
}
const DEFAULT_KEY_PREFIX: &str = "blob/%F/";
const DEFAULT_FILENAME_TIME_FORMAT: &str = "%s";
const DEFAULT_FILENAME_APPEND_UUID: bool = true;
impl AzureBlobSinkConfig {
pub fn build_processor(&self, client: Arc<ContainerClient>) -> crate::Result<VectorSink> {
let request_limits = self.request.into_settings();
let service = ServiceBuilder::new()
.settings(request_limits, AzureBlobRetryLogic)
.service(AzureBlobService::new(client));
// Configure our partitioning/batching.
let batcher_settings = self.batch.into_batcher_settings()?;
let blob_time_format = self
.blob_time_format
.as_ref()
.cloned()
.unwrap_or_else(|| DEFAULT_FILENAME_TIME_FORMAT.into());
let blob_append_uuid = self
.blob_append_uuid
.unwrap_or(DEFAULT_FILENAME_APPEND_UUID);
let transformer = self.encoding.transformer();
let (framer, serializer) = self.encoding.build(SinkType::MessageBased)?;
let encoder = Encoder::<Framer>::new(framer, serializer);
let request_options = AzureBlobRequestOptions {
container_name: self.container_name.clone(),
blob_time_format,
blob_append_uuid,
encoder: (transformer, encoder),
compression: self.compression,
};
let sink = AzureBlobSink::new(
service,
request_options,
self.key_partitioner()?,
batcher_settings,
);
Ok(VectorSink::from_event_streamsink(sink))
}
pub fn key_partitioner(&self) -> crate::Result<KeyPartitioner> {
Ok(KeyPartitioner::new(self.blob_prefix.clone(), None))
}
}