From 4078094970cc93fe44898d063cf10eaf2cf764ca Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 16:55:50 +0100 Subject: [PATCH 01/12] Create `crates_io_cdn_logs` package --- Cargo.lock | 9 +++++++++ crates_io_cdn_logs/Cargo.toml | 12 ++++++++++++ crates_io_cdn_logs/README.md | 6 ++++++ crates_io_cdn_logs/src/lib.rs | 0 4 files changed, 27 insertions(+) create mode 100644 crates_io_cdn_logs/Cargo.toml create mode 100644 crates_io_cdn_logs/README.md create mode 100644 crates_io_cdn_logs/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 62a0a078aaf..5807e99af86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -984,6 +984,15 @@ dependencies = [ "url", ] +[[package]] +name = "crates_io_cdn_logs" +version = "0.0.0" +dependencies = [ + "chrono", + "insta", + "semver", +] + [[package]] name = "crates_io_env_vars" version = "0.0.0" diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml new file mode 100644 index 00000000000..14f377f8cf3 --- /dev/null +++ b/crates_io_cdn_logs/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "crates_io_cdn_logs" +version = "0.0.0" +license = "MIT OR Apache-2.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] + +[dev-dependencies] diff --git a/crates_io_cdn_logs/README.md b/crates_io_cdn_logs/README.md new file mode 100644 index 00000000000..bb0260176b3 --- /dev/null +++ b/crates_io_cdn_logs/README.md @@ -0,0 +1,6 @@ +crates_io_cdn_logs +=============================================================================== + +This package contains code to parse the log files from the crates.io CDNs +(AWS CloudFront and Fastly) and to count how often crates/versions are +downloaded each day. 
diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs new file mode 100644 index 00000000000..e69de29bb2d From 4a77054d750b4417b5fe74b0d35410c00d872a9c Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 16:56:52 +0100 Subject: [PATCH 02/12] cdn_logs: Implement `DownloadsMap` wrapper struct This provides a slightly nicer API on top of the regular `HashMap`, but, most importantly, provides a decently useful and concise `Debug` output. --- crates_io_cdn_logs/Cargo.toml | 3 + crates_io_cdn_logs/src/download_map.rs | 98 ++++++++++++++++++++++++++ crates_io_cdn_logs/src/lib.rs | 3 + 3 files changed, 104 insertions(+) create mode 100644 crates_io_cdn_logs/src/download_map.rs diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index 14f377f8cf3..c4ddcdefcb5 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -8,5 +8,8 @@ edition = "2021" workspace = true [dependencies] +chrono = { version = "=0.4.33", features = ["serde"] } +semver = "=1.0.21" [dev-dependencies] +insta = "=1.34.0" diff --git a/crates_io_cdn_logs/src/download_map.rs b/crates_io_cdn_logs/src/download_map.rs new file mode 100644 index 00000000000..a5fc7e69433 --- /dev/null +++ b/crates_io_cdn_logs/src/download_map.rs @@ -0,0 +1,98 @@ +use chrono::NaiveDate; +use semver::Version; +use std::collections::HashMap; +use std::fmt::Debug; + +#[derive(Clone, Default)] +pub struct DownloadsMap(HashMap<(String, Version, NaiveDate), u64>); + +impl DownloadsMap { + pub fn new() -> Self { + Self(HashMap::new()) + } + + /// Increments the download count for the given crate version on the given date. 
+ pub fn add(&mut self, name: String, version: Version, date: NaiveDate) { + *self.0.entry((name, version, date)).or_default() += 1; + } + + pub fn as_inner(&self) -> &HashMap<(String, Version, NaiveDate), u64> { + &self.0 + } + + pub fn into_inner(self) -> HashMap<(String, Version, NaiveDate), u64> { + self.0 + } +} + +impl Debug for DownloadsMap { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut downloads = self + .0 + .iter() + .map(|((krate, version, date), downloads)| (date, krate, version, downloads)) + .collect::<Vec<_>>(); + + downloads.sort(); + + f.write_str("DownloadsMap {\n")?; + for (date, krate, version, downloads) in downloads { + f.write_str(" ")?; + f.write_fmt(format_args!("{date} {krate}@{version} .. {downloads}"))?; + f.write_str("\n")?; + } + f.write_str("}")?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::NaiveDate; + use insta::assert_debug_snapshot; + use semver::Version; + + fn add(downloads: &mut DownloadsMap, name: &str, version: &str, date: &str) { + downloads.add( + name.to_string(), + version.parse::<Version>().unwrap(), + date.parse::<NaiveDate>().unwrap(), + ); + } + + #[test] + fn test_downloads_map() { + let mut downloads = DownloadsMap::new(); + + // Add an entry to the map + add(&mut downloads, "xmas", "2.0.0", "2023-12-25"); + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2023-12-25 xmas@2.0.0 .. 1 + } + "###); + + // Add the same entry again + add(&mut downloads, "xmas", "2.0.0", "2023-12-25"); + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2023-12-25 xmas@2.0.0 .. 2 + } + "###); + + // Add other entries + add(&mut downloads, "foo", "2.0.0", "2023-12-25"); + add(&mut downloads, "xmas", "1.0.0", "2023-12-25"); + add(&mut downloads, "xmas", "2.0.0", "2023-12-26"); + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2023-12-25 foo@2.0.0 .. 1 + 2023-12-25 xmas@1.0.0 .. 1 + 2023-12-25 xmas@2.0.0 .. 2 + 2023-12-26 xmas@2.0.0 ..
1 + } + "###); + } +} diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index e69de29bb2d..624bb259359 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -0,0 +1,3 @@ +mod download_map; + +pub use crate::download_map::DownloadsMap; From a129d627fa19bc132fd4c486a74bad4fb48a687e Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 16:59:02 +0100 Subject: [PATCH 03/12] cdn_logs: Implement `enable_tracing_output()` test utility --- Cargo.lock | 2 ++ crates_io_cdn_logs/Cargo.toml | 2 ++ crates_io_cdn_logs/src/lib.rs | 2 ++ crates_io_cdn_logs/src/test_utils.rs | 11 +++++++++++ 4 files changed, 17 insertions(+) create mode 100644 crates_io_cdn_logs/src/test_utils.rs diff --git a/Cargo.lock b/Cargo.lock index 5807e99af86..e5ee3f0b29c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -991,6 +991,8 @@ dependencies = [ "chrono", "insta", "semver", + "tracing", + "tracing-subscriber", ] [[package]] diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index c4ddcdefcb5..a19217e00dd 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -10,6 +10,8 @@ workspace = true [dependencies] chrono = { version = "=0.4.33", features = ["serde"] } semver = "=1.0.21" +tracing = "=0.1.40" [dev-dependencies] insta = "=1.34.0" +tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index 624bb259359..a35ef5583a5 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -1,3 +1,5 @@ mod download_map; +#[cfg(test)] +mod test_utils; pub use crate::download_map::DownloadsMap; diff --git a/crates_io_cdn_logs/src/test_utils.rs b/crates_io_cdn_logs/src/test_utils.rs new file mode 100644 index 00000000000..2b29f324bed --- /dev/null +++ b/crates_io_cdn_logs/src/test_utils.rs @@ -0,0 +1,11 @@ +use tracing::dispatcher::DefaultGuard; +use tracing::subscriber; +use 
tracing_subscriber::fmt; + +/// Enable tracing output for tests. +/// +/// The tracing test output is only enabled as long as the returned guard +/// is not dropped. +pub fn enable_tracing_output() -> DefaultGuard { + subscriber::set_default(fmt().compact().with_test_writer().finish()) +} From 0c13c83ffa648e4eda623496271bbb55992717a3 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:07:04 +0100 Subject: [PATCH 04/12] cdn_logs: Implement URL/path parsing --- Cargo.lock | 1 + crates_io_cdn_logs/Cargo.toml | 1 + crates_io_cdn_logs/src/lib.rs | 1 + crates_io_cdn_logs/src/paths.rs | 113 ++++++++++++++++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 crates_io_cdn_logs/src/paths.rs diff --git a/Cargo.lock b/Cargo.lock index e5ee3f0b29c..bcf2f5c311d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -989,6 +989,7 @@ name = "crates_io_cdn_logs" version = "0.0.0" dependencies = [ "chrono", + "claims", "insta", "semver", "tracing", diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index a19217e00dd..47cac61e56d 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -13,5 +13,6 @@ semver = "=1.0.21" tracing = "=0.1.40" [dev-dependencies] +claims = "=0.7.1" insta = "=1.34.0" tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index a35ef5583a5..07e54c89ff4 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -1,4 +1,5 @@ mod download_map; +mod paths; #[cfg(test)] mod test_utils; diff --git a/crates_io_cdn_logs/src/paths.rs b/crates_io_cdn_logs/src/paths.rs new file mode 100644 index 00000000000..48d95840c0e --- /dev/null +++ b/crates_io_cdn_logs/src/paths.rs @@ -0,0 +1,113 @@ +use semver::Version; +use tracing::instrument; + +/// Parse crate name and version from a download URL or URL path. 
+#[instrument(level = "debug")] +pub fn parse_path(mut path: &str) -> Option<(String, Version)> { + // This would ideally use a regular expression to simplify the code, but + // regexes are slow, and we want to keep this code as fast as possible. + + // Remove any query parameters. + if let Some(pos) = path.find('?') { + path = &path[..pos]; + } + + // Find the start of the path. We assume that we don't have any nested + // `crates` folders on the server (e.g. `/foo/crates/...`). + let pos = path.find("/crates/")?; + let path = &path[pos + 8..]; + + let (folder, filename) = path.split_once('/')?; + let filename = filename.strip_suffix(".crate")?; + + let version = filename.strip_prefix(folder)?; + let version = version.strip_prefix('-')?; + let version = Version::parse(version).ok()?; + + Some((folder.to_owned(), version)) +} + +#[cfg(test)] +mod tests { + use super::*; + use claims::{assert_none, assert_some}; + use semver::Version; + + fn format((name, version): &(String, Version)) -> String { + format!("{name}@{version}") + } + + #[test] + fn test_parse_path_valid() { + let result = assert_some!(parse_path("/crates/foo/foo-1.2.3.crate")); + assert_eq!(format(&result), "foo@1.2.3"); + } + + #[test] + fn test_parse_path_with_query_params() { + let result = assert_some!(parse_path("/crates/foo/foo-1.2.3.crate?param=value")); + assert_eq!(format(&result), "foo@1.2.3"); + } + + #[test] + fn test_parse_path_with_full_url() { + let path = "https://static.crates.io/crates/foo/foo-1.2.3.crate"; + let result = assert_some!(parse_path(path)); + assert_eq!(format(&result), "foo@1.2.3"); + } + + #[test] + fn test_parse_path_empty() { + assert_none!(parse_path("")); + } + + #[test] + fn test_parse_path_only_query_params() { + assert_none!(parse_path("?param=value")); + } + + #[test] + fn test_parse_path_only_crates_prefix() { + assert_none!(parse_path("/crates/")); + } + + #[test] + fn test_parse_path_unrelated_path() { + 
assert_none!(parse_path("/readmes/foo/foo-1.2.3.crate")); + } + + #[test] + fn test_parse_path_no_folder() { + assert_none!(parse_path("/crates/foo-1.2.3.crate")); + } + + #[test] + fn test_parse_path_no_file_extension() { + assert_none!(parse_path("/crates/foo/foo-1.2.3")); + } + + #[test] + fn test_parse_path_wrong_file_extension() { + assert_none!(parse_path("/crates/foo/foo-1.2.3.html")); + } + + #[test] + fn test_parse_path_bad_crate_name() { + assert_none!(parse_path("/crates/foo/bar-1.2.3.crate")); + } + + #[test] + fn test_parse_path_invalid_separator() { + assert_none!(parse_path("/crates/foo/foo@1.2.3.crate")); + } + + #[test] + fn test_parse_path_no_version() { + assert_none!(parse_path("/crates/foo/foo.crate")); + } + + #[test] + fn test_parse_path_invalid_version() { + assert_none!(parse_path("/crates/foo/foo-1.2.3§foo.crate")); + } +} From bfed9075157e5fda1e6a88cb91a36471f0d20195 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:10:03 +0100 Subject: [PATCH 05/12] cdn_logs: Implement CloudFront download counting --- Cargo.lock | 3 + crates_io_cdn_logs/Cargo.toml | 4 + crates_io_cdn_logs/src/cloudfront.rs | 217 ++++++++++++++++++ crates_io_cdn_logs/src/lib.rs | 1 + .../test_data/cloudfront/basic.log | 22 ++ .../test_data/cloudfront/percent-encoding.log | 5 + .../cloudfront/recoverable-errors.log | 20 ++ .../test_data/cloudfront/unknown-version.log | 3 + .../cloudfront/unrelated-traffic.log | 7 + 9 files changed, 282 insertions(+) create mode 100644 crates_io_cdn_logs/src/cloudfront.rs create mode 100644 crates_io_cdn_logs/test_data/cloudfront/basic.log create mode 100644 crates_io_cdn_logs/test_data/cloudfront/percent-encoding.log create mode 100644 crates_io_cdn_logs/test_data/cloudfront/recoverable-errors.log create mode 100644 crates_io_cdn_logs/test_data/cloudfront/unknown-version.log create mode 100644 crates_io_cdn_logs/test_data/cloudfront/unrelated-traffic.log diff --git a/Cargo.lock b/Cargo.lock index 
bcf2f5c311d..ad66cff37f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -988,10 +988,13 @@ dependencies = [ name = "crates_io_cdn_logs" version = "0.0.0" dependencies = [ + "anyhow", "chrono", "claims", "insta", + "percent-encoding", "semver", + "tokio", "tracing", "tracing-subscriber", ] diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index 47cac61e56d..ed8ec3445a5 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -8,11 +8,15 @@ edition = "2021" workspace = true [dependencies] +anyhow = "=1.0.79" chrono = { version = "=0.4.33", features = ["serde"] } +percent-encoding = "=2.3.1" semver = "=1.0.21" +tokio = { version = "=1.35.1", features = ["io-util"] } tracing = "=0.1.40" [dev-dependencies] claims = "=0.7.1" insta = "=1.34.0" +tokio = { version = "=1.35.1", features = ["macros", "rt"] } tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } diff --git a/crates_io_cdn_logs/src/cloudfront.rs b/crates_io_cdn_logs/src/cloudfront.rs new file mode 100644 index 00000000000..1876c1a17bc --- /dev/null +++ b/crates_io_cdn_logs/src/cloudfront.rs @@ -0,0 +1,217 @@ +use crate::paths::parse_path; +use crate::DownloadsMap; +use chrono::NaiveDate; +use std::borrow::Cow; +use tokio::io::{AsyncBufRead, AsyncBufReadExt}; +use tracing::{instrument, warn}; + +const HEADER_PREFIX: char = '#'; +const HEADER_VERSION: &str = "#Version:"; +const HEADER_FIELDS: &str = "#Fields:"; + +const FIELD_DATE: &str = "date"; +const FIELD_METHOD: &str = "cs-method"; +const FIELD_PATH: &str = "cs-uri-stem"; +const FIELD_STATUS: &str = "sc-status"; + +#[instrument(level = "debug", skip(reader))] +pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result<DownloadsMap> { + let mut num_fields = 0; + let mut date_index = None; + let mut method_index = None; + let mut path_index = None; + let mut status_index = None; + + let mut downloads = DownloadsMap::new(); + + let mut lines = reader.lines(); + while let Some(line) =
lines.next_line().await? { + if let Some(version) = line.strip_prefix(HEADER_VERSION) { + let version = version.trim(); + if version != "1.0" { + anyhow::bail!("Unsupported version: {}", version); + } + continue; + } + + if let Some(fields_str) = line.strip_prefix(HEADER_FIELDS) { + let fields = fields_str.trim().split(' ').collect::<Vec<_>>(); + + num_fields = fields.len(); + date_index = fields.iter().position(|f| f == &FIELD_DATE); + method_index = fields.iter().position(|f| f == &FIELD_METHOD); + path_index = fields.iter().position(|f| f == &FIELD_PATH); + status_index = fields.iter().position(|f| f == &FIELD_STATUS); + + continue; + } + + if line.starts_with(HEADER_PREFIX) { + warn!("Unexpected log header line: {}", line); + continue; + } + + let values = line.split('\t').collect::<Vec<_>>(); + + let num_values = values.len(); + if num_values != num_fields { + warn!("Expected {num_fields} fields, but found {num_values}"); + continue; + } + + let method = get_value(&values, method_index, FIELD_METHOD); + if method != "GET" { + // Ignore non-GET requests. + continue; + } + + let status = get_value(&values, status_index, FIELD_STATUS); + if status != "200" { + // Ignore non-200 responses. + continue; + } + + let path = get_value(&values, path_index, FIELD_PATH); + + // Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`. + // + // Yes, the second round of decoding is intentional, since cargo is + // requesting crates with a percent-encoded path, and then CloudFront + // is percent-encoding that percent-encoded path again when logging it.
+ let path = decode_path(path); + let path = decode_path(&path); + + let Some((name, version)) = parse_path(&path) else { + continue; + }; + + let date = get_value(&values, date_index, FIELD_DATE); + let date = match date.parse::<NaiveDate>() { + Ok(date) => date, + Err(error) => { + warn!(%date, %error, "Failed to parse date"); + continue; + } + }; + + downloads.add(name, version, date); + } + + Ok(downloads) +} + +#[instrument(level = "debug", skip(path))] +fn decode_path(path: &str) -> Cow<'_, str> { + percent_encoding::percent_decode_str(path).decode_utf8_lossy() +} + +fn get_value<'a>(values: &'a [&'a str], index: Option<usize>, field_name: &'static str) -> &'a str { + index + .and_then(|i| values.get(i)) + .copied() + .unwrap_or_else(|| { + warn!(?index, "Failed to find {field_name} field."); + "" + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::*; + use claims::{assert_err, assert_ok}; + use insta::{assert_debug_snapshot, assert_display_snapshot}; + use std::io::Cursor; + + #[tokio::test] + async fn test_basic() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!("../test_data/cloudfront/basic.log")); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 bindgen@0.65.1 .. 1 + 2024-01-16 cumulus-primitives-core@0.4.0 .. 1 + 2024-01-16 derive_more@0.99.17 .. 1 + 2024-01-16 hash-db@0.15.2 .. 1 + 2024-01-16 hyper-rustls@0.24.2 .. 1 + 2024-01-16 jsonrpsee-server@0.16.3 .. 1 + 2024-01-16 peeking_take_while@0.1.2 .. 1 + 2024-01-16 quick-error@1.2.3 .. 2 + 2024-01-16 tracing-core@0.1.32 .. 1 + 2024-01-17 flatbuffers@23.1.21 .. 1 + 2024-01-17 jemallocator@0.5.4 .. 1 + 2024-01-17 leveldb-sys@2.0.9 .. 1 + 2024-01-17 num_cpus@1.15.0 .. 1 + 2024-01-17 paste@1.0.12 .. 1 + 2024-01-17 quick-error@1.2.3 .. 1 + 2024-01-17 rand@0.8.5 .. 1 + 2024-01-17 serde_derive@1.0.163 .. 1 + 2024-01-17 smallvec@1.10.0 .. 1 + 2024-01-17 tar@0.4.38 ..
1 + } + "###); + } + + #[tokio::test] + async fn test_percent_encoding() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../test_data/cloudfront/percent-encoding.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-17 zstd-sys@2.0.8+zstd.1.5.5 .. 3 + } + "###); + } + + #[tokio::test] + async fn test_unrelated_traffic() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../test_data/cloudfront/unrelated-traffic.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 bindgen@0.65.1 .. 2 + } + "###); + } + + #[tokio::test] + async fn test_recoverable_errors() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../test_data/cloudfront/recoverable-errors.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 bindgen@0.65.1 .. 
1 + } + "###); + } + + #[tokio::test] + async fn test_unknown_version() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../test_data/cloudfront/unknown-version.log" + )); + let error = assert_err!(count_downloads(&mut cursor).await); + + assert_display_snapshot!(error, @"Unsupported version: 2.0"); + } +} diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index 07e54c89ff4..a0bde1baa4a 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -1,3 +1,4 @@ +pub mod cloudfront; mod download_map; mod paths; #[cfg(test)] diff --git a/crates_io_cdn_logs/test_data/cloudfront/basic.log b/crates_io_cdn_logs/test_data/cloudfront/basic.log new file mode 100644 index 00000000000..9a961853c05 --- /dev/null +++ b/crates_io_cdn_logs/test_data/cloudfront/basic.log @@ -0,0 +1,22 @@ +#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol ssl-cipher x-edge-response-result-type cs-protocol-version fle-status fle-encrypted-fields c-port time-to-first-byte x-edge-detailed-result-type sc-content-type sc-content-len sc-range-start sc-range-end +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.1.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - +2024-01-16 23:56:42 CMH68-P2 7086 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/peeking_take_while/peeking_take_while-0.1.2.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit 7uIQOqT8RjS2a8wieP36WZnCUiYp6uWF_l-RRcf2iwc9GhxgA2mftw== static.crates.io https 57 0.018 - TLSv1.3 
TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.018 Hit application/x-tar 6697 - - +2024-01-16 23:56:42 CMH68-P2 30688 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/hyper-rustls/hyper-rustls-0.24.2.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit lIl6i1qQ4uobuaJWGB5tBVM0-1hbdZGav7dhvGroX8MQu_lUK-TgPA== static.crates.io https 50 0.018 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.018 Hit application/gzip 30195 - - +2024-01-16 23:56:42 CMH68-P2 28089 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/jsonrpsee-server/jsonrpsee-server-0.16.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit rqAHuO_tJeVQA6ggKgifVvMGVv57PWMwJfL-anEcFRihqjstrRa_Zg== static.crates.io https 55 0.019 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.019 Hit application/gzip 27595 - - +2024-01-16 23:56:42 CMH68-P2 61750 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/tracing-core/tracing-core-0.1.32.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit tQy5zezloJS1nPgF1BKE5BG12iVjiyw-HyGZZ09CSRnvlFM0IrxTEw== static.crates.io https 49 0.020 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.020 Hit application/gzip 61221 - - +2024-01-16 23:56:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - - +2024-01-16 23:56:42 CMH68-P2 56237 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/derive_more/derive_more-0.99.17.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit XnGcfcl5zVqdp6dculUru98eQXOCqhgbvqcQ90p_EATSKrVDKZZ0tg== static.crates.io https 49 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 55771 - - +2024-01-16 23:56:42 CMH68-P2 5767 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net 
/crates/cumulus-primitives-core/cumulus-primitives-core-0.4.0.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit y6F8cWgKHDKvlwjhIUxeFO2iR8VCqrPahYcGBe9z72tBWa8iDAGI8w== static.crates.io https 64 0.025 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.025 Hit application/gzip 5301 - - +2024-01-16 23:56:42 CMH68-P2 3597 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/hash-db/hash-db-0.15.2.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit kiLod5aHqFX-syXZ1HMU2qykOWo2T88PKxnBagAgA7RsgI8nR_R_yg== static.crates.io https 42 0.029 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.029 Hit application/x-tar 3224 - - +2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - - +2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - - +2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - - +2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 
0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - - +2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - - +2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - - +2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - - +2024-01-17 00:41:12 HIO50-C2 32030 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/smallvec/smallvec-1.10.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit h4eeKqlsnOS3ws5PuPqCRWQxUcgriXgnKdCIz77o4lvu7VztQv1Ebg== static.crates.io https 44 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 31564 - - +2024-01-17 00:41:12 HIO50-C2 49623 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/tar/tar-0.4.38.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 5mYmtPCMX1KGY9vQfEldnCGTZAAAFnibp97GUKoMPgP8rpM9eRcLHQ== static.crates.io https 37 0.058 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 49158 - - +2024-01-17 00:41:12 HIO50-C2 1322578 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/leveldb-sys/leveldb-sys-2.0.9.crate 200 - 
cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 06YCYVLUfQ1ne9ATFuNPfZBobMNbwEBAOScTV3FaXmmGu3Eyu8zBpA== static.crates.io https 48 0.072 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.010 Hit application/x-tar 1320697 - - +2024-01-17 00:41:12 HIO50-C2 15495 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - - diff --git a/crates_io_cdn_logs/test_data/cloudfront/percent-encoding.log b/crates_io_cdn_logs/test_data/cloudfront/percent-encoding.log new file mode 100644 index 00000000000..2cf9337ee59 --- /dev/null +++ b/crates_io_cdn_logs/test_data/cloudfront/percent-encoding.log @@ -0,0 +1,5 @@ +#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol ssl-cipher x-edge-response-result-type cs-protocol-version fle-status fle-encrypted-fields c-port time-to-first-byte x-edge-detailed-result-type sc-content-type sc-content-len sc-range-start sc-range-end +2024-01-17 00:41:12 HIO50-C2 737549 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/zstd-sys/zstd-sys-2.0.8%252Bzstd.1.5.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit lq9xG92EbzMk6CPAWkMUm6sDiTVvPWb3mQPrndUwu4y0HRzx99RPqA== static.crates.io https 53 0.058 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 736270 - - +2024-01-17 00:41:12 HIO50-C2 737549 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/zstd-sys/zstd-sys-2.0.8%2Bzstd.1.5.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit lq9xG92EbzMk6CPAWkMUm6sDiTVvPWb3mQPrndUwu4y0HRzx99RPqA== static.crates.io https 53 
0.058 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 736270 - - +2024-01-17 00:41:12 HIO50-C2 737549 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/zstd-sys/zstd-sys-2.0.8+zstd.1.5.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit lq9xG92EbzMk6CPAWkMUm6sDiTVvPWb3mQPrndUwu4y0HRzx99RPqA== static.crates.io https 53 0.058 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 736270 - - diff --git a/crates_io_cdn_logs/test_data/cloudfront/recoverable-errors.log b/crates_io_cdn_logs/test_data/cloudfront/recoverable-errors.log new file mode 100644 index 00000000000..e1d900fd420 --- /dev/null +++ b/crates_io_cdn_logs/test_data/cloudfront/recoverable-errors.log @@ -0,0 +1,20 @@ +#Version: 1.0 +#Foo? +#Fields: date time cs-method cs-uri-stem sc-status sc-foo +2024-01-16 23:56:42 GET /crates/bindgen/bindgen-0.65.1.crate 200 +#Fields: date time cs-uri-stem sc-status +2024-01-16 23:56:42 /crates/bindgen/bindgen-0.65.1.crate 200 +#Fields: date time cs-method cs-uri-stem +2024-01-16 23:56:42 GET /crates/bindgen/bindgen-0.65.1.crate +#Fields: date time cs-method sc-status +2024-01-16 23:56:42 GET 200 +#Fields: cs-method cs-uri-stem sc-status +GET /crates/bindgen/bindgen-0.65.1.crate 200 +#Fields: date time cs-method cs-uri-stem sc-status +foo 23:56:42 GET /crates/bindgen/bindgen-0.65.1.crate 200 +#Fields: date time cs-method cs-uri-stem sc-status +2024-01-16 23:56:42 GET /crates/foo/bindgen-0.65.1.crate 200 +#Fields: date time cs-method cs-uri-stem sc-status +2024-01-16 23:56:42 GET /crates/bindgen/bindgen-0.0.0§foo.crate 200 +#Fields: date time cs-method cs-uri-stem sc-status +2024-01-16 23:56:42 GET /crates/bindgen/bindgen-0.65.1.crate 200 diff --git a/crates_io_cdn_logs/test_data/cloudfront/unknown-version.log b/crates_io_cdn_logs/test_data/cloudfront/unknown-version.log new file mode 100644 index 00000000000..36ad50af6e2 --- /dev/null +++ 
b/crates_io_cdn_logs/test_data/cloudfront/unknown-version.log @@ -0,0 +1,3 @@ +#Version: 2.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol ssl-cipher x-edge-response-result-type cs-protocol-version fle-status fle-encrypted-fields c-port time-to-first-byte x-edge-detailed-result-type sc-content-type sc-content-len sc-range-start sc-range-end +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.1.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - diff --git a/crates_io_cdn_logs/test_data/cloudfront/unrelated-traffic.log b/crates_io_cdn_logs/test_data/cloudfront/unrelated-traffic.log new file mode 100644 index 00000000000..96adfa1917c --- /dev/null +++ b/crates_io_cdn_logs/test_data/cloudfront/unrelated-traffic.log @@ -0,0 +1,7 @@ +#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol ssl-cipher x-edge-response-result-type cs-protocol-version fle-status fle-encrypted-fields c-port time-to-first-byte x-edge-detailed-result-type sc-content-type sc-content-len sc-range-start sc-range-end +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.1.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit 
application/gzip 213479 - - +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.2.crate 404 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 HEAD d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.1.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /readmes/bindgen/bindgen-0.65.1.html 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - +2024-01-16 23:56:42 CMH68-P2 214182 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/bindgen/bindgen-0.65.1.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit eGC6xGseFkxo1BMAlPTAqh0w9-Bxi9fsSLT2MZWcPcqdjNjngxfOvQ== static.crates.io https 97 0.017 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.017 Hit application/gzip 213479 - - From cf25315c276910199dbd5639f2bbcaf08d9edf79 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:11:56 +0100 Subject: [PATCH 06/12] cdn_logs: Implement Fastly download counting --- Cargo.lock | 2 + crates_io_cdn_logs/Cargo.toml | 2 + crates_io_cdn_logs/src/fastly/json.rs | 109 +++++++++++ crates_io_cdn_logs/src/fastly/mod.rs | 170 ++++++++++++++++++ crates_io_cdn_logs/src/lib.rs | 1 + crates_io_cdn_logs/test_data/fastly/basic.log | 23 +++ .../test_data/fastly/percent-encoding.log | 2 + 
.../test_data/fastly/recoverable-errors.log | 7 + .../test_data/fastly/unrelated-traffic.log | 5 + 9 files changed, 321 insertions(+) create mode 100644 crates_io_cdn_logs/src/fastly/json.rs create mode 100644 crates_io_cdn_logs/src/fastly/mod.rs create mode 100644 crates_io_cdn_logs/test_data/fastly/basic.log create mode 100644 crates_io_cdn_logs/test_data/fastly/percent-encoding.log create mode 100644 crates_io_cdn_logs/test_data/fastly/recoverable-errors.log create mode 100644 crates_io_cdn_logs/test_data/fastly/unrelated-traffic.log diff --git a/Cargo.lock b/Cargo.lock index ad66cff37f0..75a738a49d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -994,6 +994,8 @@ dependencies = [ "insta", "percent-encoding", "semver", + "serde", + "serde_json", "tokio", "tracing", "tracing-subscriber", diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index ed8ec3445a5..e109df1cd8a 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -12,6 +12,8 @@ anyhow = "=1.0.79" chrono = { version = "=0.4.33", features = ["serde"] } percent-encoding = "=2.3.1" semver = "=1.0.21" +serde = { version = "=1.0.196", features = ["derive"] } +serde_json = "=1.0.113" tokio = { version = "=1.35.1", features = ["io-util"] } tracing = "=0.1.40" diff --git a/crates_io_cdn_logs/src/fastly/json.rs b/crates_io_cdn_logs/src/fastly/json.rs new file mode 100644 index 00000000000..c2f3c8a6953 --- /dev/null +++ b/crates_io_cdn_logs/src/fastly/json.rs @@ -0,0 +1,109 @@ +//! Imported from + +use chrono::{DateTime, Utc}; +use serde::Deserialize; +use std::borrow::Cow; + +/// This struct corresponds to the JSON payload of a log line from +/// Fastly's CDN logs. 
+#[derive(Debug, Deserialize)] +#[serde(tag = "version")] +pub enum LogLine<'a> { + #[serde(borrow, rename = "1")] + V1(LogLineV1<'a>), +} + +impl LogLine<'_> { + pub fn date_time(&self) -> DateTime { + match self { + LogLine::V1(line) => line.date_time, + } + } + + pub fn method(&self) -> &str { + match self { + LogLine::V1(line) => &line.method, + } + } + + pub fn url(&self) -> &str { + match self { + LogLine::V1(line) => &line.url, + } + } + + pub fn status(&self) -> u16 { + match self { + LogLine::V1(line) => line.status, + } + } +} + +/// This struct corresponds to the `"version": "1"` variant of the [LogLine] enum. +/// +/// Compared to the implementation in the [rust-lang/simpleinfra](https://github.com/rust-lang/simpleinfra/) +/// repository, there are a couple of differences: +/// +/// - The `bytes` field is not included, because we don't need it. +/// - The `ip` field is not included, because we don't need it. +/// - The `method` and `status` fields are not optional, because we handle +/// parsing errors gracefully. +/// - The `date_time` field is using `chrono` like the rest of the +/// crates.io codebase. +/// - The `method` and `url` fields are using `Cow` to avoid +/// unnecessary allocations. 
+#[derive(Debug, Deserialize)] +pub struct LogLineV1<'a> { + pub date_time: DateTime, + #[serde(borrow)] + pub method: Cow<'a, str>, + #[serde(borrow)] + pub url: Cow<'a, str>, + pub status: u16, +} + +#[cfg(test)] +mod tests { + use super::*; + use claims::assert_ok; + use insta::assert_debug_snapshot; + + #[test] + fn test_parse() { + let input = r#"{"bytes":null,"date_time":"2024-01-16T16:03:04.44007323Z","ip":"45.79.107.220","method":"GET","status":403,"url":"https://static.staging.crates.io/?1705420437","version":"1"}"#; + let output = assert_ok!(serde_json::from_str::>(input)); + assert_debug_snapshot!(output, @r###" + V1( + LogLineV1 { + date_time: 2024-01-16T16:03:04.440073230Z, + method: "GET", + url: "https://static.staging.crates.io/?1705420437", + status: 403, + }, + ) + "###); + + assert_eq!( + output.date_time().to_string(), + "2024-01-16 16:03:04.440073230 UTC" + ); + assert_eq!(output.method(), "GET"); + assert_eq!(output.url(), "https://static.staging.crates.io/?1705420437"); + assert_eq!(output.status(), 403); + + match output { + LogLine::V1(l) => { + assert!(is_borrowed(&l.method)); + assert!(is_borrowed(&l.url)); + } + } + } + + #[allow(clippy::ptr_arg)] + fn is_borrowed(s: &Cow<'_, str>) -> bool { + match s { + Cow::Borrowed(_) => true, + Cow::Owned(_) => false, + } + } +} diff --git a/crates_io_cdn_logs/src/fastly/mod.rs b/crates_io_cdn_logs/src/fastly/mod.rs new file mode 100644 index 00000000000..8457922d081 --- /dev/null +++ b/crates_io_cdn_logs/src/fastly/mod.rs @@ -0,0 +1,170 @@ +mod json; + +use crate::paths::parse_path; +use crate::DownloadsMap; +use std::borrow::Cow; +use tokio::io::{AsyncBufRead, AsyncBufReadExt}; +use tracing::{debug_span, instrument, warn}; + +#[instrument(level = "debug", skip(reader))] +pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result { + let mut downloads = DownloadsMap::new(); + + let mut lines = reader.lines(); + while let Some(line) = lines.next_line().await? 
{ + let span = debug_span!("process_line"); + let _guard = span.enter(); + + let Some(json) = parse_line(&line) else { + warn!("Failed to find JSON start"); + continue; + }; + + let json = match parse_json(json) { + Ok(json) => json, + Err(error) => { + warn!("Failed to parse JSON: {error}"); + continue; + } + }; + + if json.method() != "GET" { + // Ignore non-GET requests. + continue; + } + + if json.status() != 200 { + // Ignore non-200 responses. + continue; + } + + let url = decode_url(json.url()); + + // We're avoiding parsing to `url::Url` here for performance reasons. + // Since we're already filtering out non-200 responses, we can assume + // that the URL is valid. + + let Some((name, version)) = parse_path(&url) else { + continue; + }; + + let date = json.date_time().date_naive(); + + downloads.add(name, version, date); + } + + Ok(downloads) +} + +#[instrument(level = "debug", skip(line))] +fn parse_line(line: &str) -> Option<&str> { + // A regex could also be used here, but the `find()` call appears to + // be roughly 10x faster. + line.find(r#"]: {"#).map(|pos| &line[pos + 3..]) +} + +#[instrument(level = "debug", skip(json))] +fn parse_json(json: &str) -> Result, serde_json::Error> { + serde_json::from_str(json) +} + +/// Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%2B5.3.0-patched.crate`. +/// +/// Compared to the CloudFront logs, we only need a single round of +/// percent-decoding here, since JSON has its own escaping rules. 
+#[instrument(level = "debug", skip(url))] +fn decode_url(url: &str) -> Cow<'_, str> { + percent_encoding::percent_decode_str(url).decode_utf8_lossy() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::*; + use claims::assert_ok; + use insta::assert_debug_snapshot; + use std::io::Cursor; + + #[tokio::test] + async fn test_basic() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!("../../test_data/fastly/basic.log")); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 strsim@0.10.0 .. 1 + 2024-01-16 tikv-jemalloc-sys@0.5.2+5.3.0-patched .. 1 + 2024-01-16 tinyvec@1.6.0 .. 1 + 2024-01-16 winapi-x86_64-pc-windows-gnu@0.4.0 .. 1 + 2024-01-16 windows_x86_64_gnu@0.48.0 .. 1 + 2024-01-16 windows_x86_64_gnullvm@0.42.2 .. 1 + 2024-01-16 winnow@0.5.4 .. 1 + 2024-01-17 anstyle@1.0.1 .. 1 + 2024-01-17 cast@0.3.0 .. 1 + 2024-01-17 cc@1.0.73 .. 1 + 2024-01-17 croaring-sys@1.1.0 .. 1 + 2024-01-17 half@1.8.2 .. 1 + 2024-01-17 jemalloc-sys@0.3.2 .. 1 + 2024-01-17 lazy_static@1.4.0 .. 1 + 2024-01-17 libc@0.2.126 .. 1 + 2024-01-17 lzma-sys@0.1.20 .. 1 + 2024-01-17 sqlparser@0.40.0 .. 1 + 2024-01-17 synchronized-writer@1.1.11 .. 1 + 2024-01-17 tikv-jemalloc-sys@0.5.4+5.3.0-patched .. 1 + 2024-01-17 windows_x86_64_gnu@0.48.0 .. 2 + 2024-01-17 xz2@0.1.7 .. 1 + 2024-01-17 zstd-safe@7.0.0 .. 1 + } + "###); + } + + #[tokio::test] + async fn test_percent_encoding() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../../test_data/fastly/percent-encoding.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 tikv-jemalloc-sys@0.5.2+5.3.0-patched .. 
2 + } + "###); + } + + #[tokio::test] + async fn test_unrelated_traffic() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../../test_data/fastly/unrelated-traffic.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 strsim@0.10.0 .. 2 + } + "###); + } + + #[tokio::test] + async fn test_recoverable_errors() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!( + "../../test_data/fastly/recoverable-errors.log" + )); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 strsim@0.10.0 .. 1 + } + "###); + } +} diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index a0bde1baa4a..317fefb59a3 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -1,5 +1,6 @@ pub mod cloudfront; mod download_map; +pub mod fastly; mod paths; #[cfg(test)] mod test_utils; diff --git a/crates_io_cdn_logs/test_data/fastly/basic.log b/crates_io_cdn_logs/test_data/fastly/basic.log new file mode 100644 index 00000000000..af73b73e503 --- /dev/null +++ b/crates_io_cdn_logs/test_data/fastly/basic.log @@ -0,0 +1,23 @@ +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":45991,"date_time":"2024-01-16T23:53:20.463371599Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/tinyvec/tinyvec-1.6.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: 
{"bytes":880664,"date_time":"2024-01-16T23:53:20.463067918Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.2%2B5.3.0-patched.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":142308,"date_time":"2024-01-16T23:53:20.463371469Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/winnow/winnow-0.5.4.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":2947998,"date_time":"2024-01-16T23:53:20.464702435Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/winapi-x86_64-pc-windows-gnu/winapi-x86_64-pc-windows-gnu-0.4.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":364068,"date_time":"2024-01-16T23:53:20.465564579Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/windows_x86_64_gnullvm/windows_x86_64_gnullvm-0.42.2.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":703595,"date_time":"2024-01-16T23:53:20.465360118Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/windows_x86_64_gnu/windows_x86_64_gnu-0.48.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-bfi-kbfi7400035 s3-request-logs[322614]: {"bytes":590481,"date_time":"2024-01-17T00:36:43.114288133Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/libc/libc-0.2.126.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-bfi-kbfi7400035 s3-request-logs[322614]: {"bytes":57880,"date_time":"2024-01-17T00:36:43.11526654Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/cc/cc-1.0.73.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-bfi-kbfi7400035 s3-request-logs[322614]: 
{"bytes":10443,"date_time":"2024-01-17T00:36:43.115641947Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/lazy_static/lazy_static-1.4.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-bfi-kbfi7400035 s3-request-logs[322614]: {"bytes":1344660,"date_time":"2024-01-17T00:36:43.115208269Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/jemalloc-sys/jemalloc-sys-0.3.2.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-sjc1000089 s3-request-logs[322614]: {"bytes":13977,"date_time":"2024-01-17T00:36:43.260687891Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/anstyle/anstyle-1.0.1.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":11452,"date_time":"2024-01-17T00:36:43.429574011Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/cast/cast-0.3.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":884765,"date_time":"2024-01-17T00:36:43.430515963Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%2B5.3.0-patched.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":2512,"date_time":"2024-01-17T00:36:43.431066209Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/synchronized-writer/synchronized-writer-1.1.11.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":41874,"date_time":"2024-01-17T00:36:43.432765119Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/half/half-1.8.2.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kiad7000128 s3-request-logs[322614]: 
{"bytes":703595,"date_time":"2024-01-17T00:36:43.43284285Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/windows_x86_64_gnu/windows_x86_64_gnu-0.48.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":703595,"date_time":"2024-01-17T00:36:43.43284285Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/windows_x86_64_gnu/windows_x86_64_gnu-0.48.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":20463,"date_time":"2024-01-17T00:36:43.43284285Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/zstd-safe/zstd-safe-7.0.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":23892,"date_time":"2024-01-17T00:36:43.433042882Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/xz2/xz2-0.1.7.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":158273,"date_time":"2024-01-17T00:36:43.433175064Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/sqlparser/sqlparser-0.40.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":163754,"date_time":"2024-01-17T00:36:43.43364737Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/croaring-sys/croaring-sys-1.1.0.crate","version":"1"} +<134>2024-01-17T00:36:43Z cache-iad-kcgs7200055 s3-request-logs[322614]: {"bytes":760045,"date_time":"2024-01-17T00:36:43.43449992Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/lzma-sys/lzma-sys-0.1.20.crate","version":"1"} diff --git a/crates_io_cdn_logs/test_data/fastly/percent-encoding.log b/crates_io_cdn_logs/test_data/fastly/percent-encoding.log new file mode 100644 index 00000000000..3ac8f7382c6 --- 
/dev/null +++ b/crates_io_cdn_logs/test_data/fastly/percent-encoding.log @@ -0,0 +1,2 @@ +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":880664,"date_time":"2024-01-16T23:53:20.463067918Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.2%2B5.3.0-patched.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":880664,"date_time":"2024-01-16T23:53:20.463067918Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.2+5.3.0-patched.crate","version":"1"} diff --git a/crates_io_cdn_logs/test_data/fastly/recoverable-errors.log b/crates_io_cdn_logs/test_data/fastly/recoverable-errors.log new file mode 100644 index 00000000000..d4f9c1ab3f1 --- /dev/null +++ b/crates_io_cdn_logs/test_data/fastly/recoverable-errors.log @@ -0,0 +1,7 @@ +foo: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1" +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"42"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"foo","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: 
{"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/foo/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.0.0§foo.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} diff --git a/crates_io_cdn_logs/test_data/fastly/unrelated-traffic.log b/crates_io_cdn_logs/test_data/fastly/unrelated-traffic.log new file mode 100644 index 00000000000..e4807061d48 --- /dev/null +++ b/crates_io_cdn_logs/test_data/fastly/unrelated-traffic.log @@ -0,0 +1,5 @@ +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":404,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"HEAD","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: 
{"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/readmes/bindgen/bindgen-0.65.1.html","version":"1"} +<134>2024-01-16T23:53:20Z cache-iad-kiad7000128 s3-request-logs[322614]: {"bytes":11355,"date_time":"2024-01-16T23:53:20.460557177Z","ip":"1.2.3.4","method":"GET","status":200,"url":"https://static.crates.io/crates/strsim/strsim-0.10.0.crate","version":"1"} From 5c1d75358e8142cd68b837732c6667a1a7c38322 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:13:51 +0100 Subject: [PATCH 07/12] cdn_logs: Implement download counting benchmarks --- Cargo.lock | 158 ++++++++++++++++++ crates_io_cdn_logs/Cargo.toml | 5 + crates_io_cdn_logs/benches/count_downloads.rs | 29 ++++ 3 files changed, 192 insertions(+) create mode 100644 crates_io_cdn_logs/benches/count_downloads.rs diff --git a/Cargo.lock b/Cargo.lock index 75a738a49d1..c98623b002d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.11" @@ -712,6 +718,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.0.83" @@ -753,6 +765,33 @@ dependencies = [ "stacker", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "cipher" version = "0.4.4" @@ -991,6 +1030,7 @@ dependencies = [ "anyhow", "chrono", "claims", + "criterion", "insta", "percent-encoding", "semver", @@ -1135,6 +1175,44 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "futures", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "tokio", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + [[package]] name = "crossbeam-channel" version = "0.5.11" @@ -1169,6 +1247,12 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.5.5" @@ -1897,6 +1981,16 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "hashbrown" version = "0.14.3" @@ -2282,6 +2376,26 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" +dependencies = [ + "hermit-abi", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.11.0" @@ -2765,6 +2879,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -3019,6 +3139,34 @@ version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = 
"plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + [[package]] name = "polyval" version = "0.6.1" @@ -4149,6 +4297,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.6.0" diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index e109df1cd8a..e1bb53ae07b 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -19,6 +19,11 @@ tracing = "=0.1.40" [dev-dependencies] claims = "=0.7.1" +criterion = { version = "=0.5.1", features = ["async_tokio"] } insta = "=1.34.0" tokio = { version = "=1.35.1", features = ["macros", "rt"] } tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } + +[[bench]] +name = "count_downloads" +harness = false diff --git a/crates_io_cdn_logs/benches/count_downloads.rs b/crates_io_cdn_logs/benches/count_downloads.rs new file mode 100644 index 00000000000..f968f500c12 --- /dev/null +++ b/crates_io_cdn_logs/benches/count_downloads.rs @@ -0,0 +1,29 @@ +use crates_io_cdn_logs::{cloudfront, fastly}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::io::Cursor; + +fn criterion_benchmark(c: &mut Criterion) { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let bytes = include_bytes!("../test_data/cloudfront/basic.log"); + c.bench_function("cloudfront", |b| { + // Insert a call to `to_async` to convert the bencher to async mode. + // The timing loops are the same as with the normal bencher. 
+ b.to_async(&rt) + .iter(|| cloudfront::count_downloads(black_box(Cursor::new(bytes)))); + }); + + let bytes = include_bytes!("../test_data/fastly/basic.log"); + c.bench_function("fastly", |b| { + // Insert a call to `to_async` to convert the bencher to async mode. + // The timing loops are the same as with the normal bencher. + b.to_async(&rt) + .iter(|| fastly::count_downloads(black_box(Cursor::new(bytes)))); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 64d2cef91fe53ae156a42c0958ba6015a3137504 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:15:15 +0100 Subject: [PATCH 08/12] cdn_logs: Implement download counting with file format detection --- crates_io_cdn_logs/src/lib.rs | 116 ++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index 317fefb59a3..fbba9762d03 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -6,3 +6,119 @@ mod paths; mod test_utils; pub use crate::download_map::DownloadsMap; +use std::io::Cursor; +use tokio::io::{AsyncBufRead, AsyncReadExt}; + +pub async fn count_downloads(mut reader: R) -> anyhow::Result +where + R: AsyncBufRead + Unpin, +{ + // Read the first byte to determine the file format. + match reader.read_u8().await? { + // CloudFront log files start with a `#Version` header. + b'#' => { + // We can't use `AsyncSeek` here because `async-compression` does + // not support it, but we can use `Cursor` to prepend the `#` back + // onto the reader. + let reader = Cursor::new(b"#").chain(reader); + cloudfront::count_downloads(reader).await + } + // Fastly log lines start with a `<123>` field. + b'<' => { + // We can't use `AsyncSeek` here because `async-compression` does + // not support it, but we can use `Cursor` to prepend the `<` back + // onto the reader. 
+ let reader = Cursor::new(b"<").chain(reader); + fastly::count_downloads(reader).await + } + // Anything else is rejected. + byte => { + anyhow::bail!("Failed to determine log file format. Unrecognized first byte: {byte:?}.") + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::*; + use claims::{assert_err, assert_ok}; + use insta::{assert_debug_snapshot, assert_display_snapshot}; + use std::io::Cursor; + + #[tokio::test] + async fn test_cloudfront() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!("../test_data/cloudfront/basic.log")); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 bindgen@0.65.1 .. 1 + 2024-01-16 cumulus-primitives-core@0.4.0 .. 1 + 2024-01-16 derive_more@0.99.17 .. 1 + 2024-01-16 hash-db@0.15.2 .. 1 + 2024-01-16 hyper-rustls@0.24.2 .. 1 + 2024-01-16 jsonrpsee-server@0.16.3 .. 1 + 2024-01-16 peeking_take_while@0.1.2 .. 1 + 2024-01-16 quick-error@1.2.3 .. 2 + 2024-01-16 tracing-core@0.1.32 .. 1 + 2024-01-17 flatbuffers@23.1.21 .. 1 + 2024-01-17 jemallocator@0.5.4 .. 1 + 2024-01-17 leveldb-sys@2.0.9 .. 1 + 2024-01-17 num_cpus@1.15.0 .. 1 + 2024-01-17 paste@1.0.12 .. 1 + 2024-01-17 quick-error@1.2.3 .. 1 + 2024-01-17 rand@0.8.5 .. 1 + 2024-01-17 serde_derive@1.0.163 .. 1 + 2024-01-17 smallvec@1.10.0 .. 1 + 2024-01-17 tar@0.4.38 .. 1 + } + "###); + } + + #[tokio::test] + async fn test_fastly() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(include_bytes!("../test_data/fastly/basic.log")); + let downloads = assert_ok!(count_downloads(&mut cursor).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 strsim@0.10.0 .. 1 + 2024-01-16 tikv-jemalloc-sys@0.5.2+5.3.0-patched .. 1 + 2024-01-16 tinyvec@1.6.0 .. 1 + 2024-01-16 winapi-x86_64-pc-windows-gnu@0.4.0 .. 1 + 2024-01-16 windows_x86_64_gnu@0.48.0 .. 
1 + 2024-01-16 windows_x86_64_gnullvm@0.42.2 .. 1 + 2024-01-16 winnow@0.5.4 .. 1 + 2024-01-17 anstyle@1.0.1 .. 1 + 2024-01-17 cast@0.3.0 .. 1 + 2024-01-17 cc@1.0.73 .. 1 + 2024-01-17 croaring-sys@1.1.0 .. 1 + 2024-01-17 half@1.8.2 .. 1 + 2024-01-17 jemalloc-sys@0.3.2 .. 1 + 2024-01-17 lazy_static@1.4.0 .. 1 + 2024-01-17 libc@0.2.126 .. 1 + 2024-01-17 lzma-sys@0.1.20 .. 1 + 2024-01-17 sqlparser@0.40.0 .. 1 + 2024-01-17 synchronized-writer@1.1.11 .. 1 + 2024-01-17 tikv-jemalloc-sys@0.5.4+5.3.0-patched .. 1 + 2024-01-17 windows_x86_64_gnu@0.48.0 .. 2 + 2024-01-17 xz2@0.1.7 .. 1 + 2024-01-17 zstd-safe@7.0.0 .. 1 + } + "###); + } + + #[tokio::test] + async fn test_unknown() { + let _guard = enable_tracing_output(); + + let mut cursor = Cursor::new(b"foo"); + let error = assert_err!(count_downloads(&mut cursor).await); + assert_display_snapshot!(error, @"Failed to determine log file format. Unrecognized first byte: 102."); + } +} From 854d926f9c6d49434e035ab0bf95dafaa8c43265 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:16:37 +0100 Subject: [PATCH 09/12] cdn_logs: Implement gzip and zstd compression support --- Cargo.lock | 1 + crates_io_cdn_logs/Cargo.toml | 1 + crates_io_cdn_logs/src/compression.rs | 42 ++++++++++ crates_io_cdn_logs/src/lib.rs | 78 ++++++++++++++++++ .../test_data/cloudfront/basic.log.gz | Bin 0 -> 2050 bytes .../test_data/fastly/basic.log.zst | Bin 0 -> 886 bytes 6 files changed, 122 insertions(+) create mode 100644 crates_io_cdn_logs/src/compression.rs create mode 100644 crates_io_cdn_logs/test_data/cloudfront/basic.log.gz create mode 100644 crates_io_cdn_logs/test_data/fastly/basic.log.zst diff --git a/Cargo.lock b/Cargo.lock index c98623b002d..99c8d2e10ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1028,6 +1028,7 @@ name = "crates_io_cdn_logs" version = "0.0.0" dependencies = [ "anyhow", + "async-compression", "chrono", "claims", "criterion", diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index 
e1bb53ae07b..12cc5f92022 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] anyhow = "=1.0.79" +async-compression = { version = "=0.4.6", features = ["gzip", "tokio", "zstd"] } chrono = { version = "=0.4.33", features = ["serde"] } percent-encoding = "=2.3.1" semver = "=1.0.21" diff --git a/crates_io_cdn_logs/src/compression.rs b/crates_io_cdn_logs/src/compression.rs new file mode 100644 index 00000000000..9a7c2cb8427 --- /dev/null +++ b/crates_io_cdn_logs/src/compression.rs @@ -0,0 +1,42 @@ +use async_compression::tokio::bufread::{GzipDecoder, ZstdDecoder}; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncBufRead, AsyncRead, ReadBuf}; + +/// A wrapper for the compression formats that CDN logs are currently stored in. +pub enum Decompressor { + Gzip(GzipDecoder), + Zstd(ZstdDecoder), +} + +impl Decompressor { + pub fn from_extension(inner: T, extension: Option<&str>) -> anyhow::Result { + match extension { + Some("gz") => Ok(Decompressor::gzip(inner)), + Some("zst") => Ok(Decompressor::zstd(inner)), + Some(ext) => anyhow::bail!("Unexpected file extension: {}", ext), + None => anyhow::bail!("Unexpected missing file extension"), + } + } + + pub fn gzip(inner: T) -> Self { + Decompressor::Gzip(GzipDecoder::new(inner)) + } + + pub fn zstd(inner: T) -> Self { + Decompressor::Zstd(ZstdDecoder::new(inner)) + } +} + +impl AsyncRead for Decompressor { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match &mut *self { + Decompressor::Gzip(inner) => Pin::new(inner).poll_read(cx, buf), + Decompressor::Zstd(inner) => Pin::new(inner).poll_read(cx, buf), + } + } +} diff --git a/crates_io_cdn_logs/src/lib.rs b/crates_io_cdn_logs/src/lib.rs index fbba9762d03..e4543483dd5 100644 --- a/crates_io_cdn_logs/src/lib.rs +++ b/crates_io_cdn_logs/src/lib.rs @@ -1,10 +1,12 @@ pub mod cloudfront; +mod compression; mod download_map; 
pub mod fastly; mod paths; #[cfg(test)] mod test_utils; +pub use crate::compression::Decompressor; pub use crate::download_map::DownloadsMap; use std::io::Cursor; use tokio::io::{AsyncBufRead, AsyncReadExt}; @@ -41,6 +43,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::compression::Decompressor; use crate::test_utils::*; use claims::{assert_err, assert_ok}; use insta::{assert_debug_snapshot, assert_display_snapshot}; @@ -78,6 +81,42 @@ mod tests { "###); } + #[tokio::test] + async fn test_compressed_cloudfront() { + let _guard = enable_tracing_output(); + + let cursor = Cursor::new(include_bytes!("../test_data/cloudfront/basic.log.gz")); + + let decompressor = assert_ok!(Decompressor::from_extension(cursor, Some("gz"))); + let reader = tokio::io::BufReader::new(decompressor); + + let downloads = assert_ok!(count_downloads(reader).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 bindgen@0.65.1 .. 1 + 2024-01-16 cumulus-primitives-core@0.4.0 .. 1 + 2024-01-16 derive_more@0.99.17 .. 1 + 2024-01-16 hash-db@0.15.2 .. 1 + 2024-01-16 hyper-rustls@0.24.2 .. 1 + 2024-01-16 jsonrpsee-server@0.16.3 .. 1 + 2024-01-16 peeking_take_while@0.1.2 .. 1 + 2024-01-16 quick-error@1.2.3 .. 2 + 2024-01-16 tracing-core@0.1.32 .. 1 + 2024-01-17 flatbuffers@23.1.21 .. 1 + 2024-01-17 jemallocator@0.5.4 .. 1 + 2024-01-17 leveldb-sys@2.0.9 .. 1 + 2024-01-17 num_cpus@1.15.0 .. 1 + 2024-01-17 paste@1.0.12 .. 1 + 2024-01-17 quick-error@1.2.3 .. 1 + 2024-01-17 rand@0.8.5 .. 1 + 2024-01-17 serde_derive@1.0.163 .. 1 + 2024-01-17 smallvec@1.10.0 .. 1 + 2024-01-17 tar@0.4.38 .. 
1 + } + "###); + } + #[tokio::test] async fn test_fastly() { let _guard = enable_tracing_output(); @@ -113,6 +152,45 @@ mod tests { "###); } + #[tokio::test] + async fn test_compressed_fastly() { + let _guard = enable_tracing_output(); + + let cursor = Cursor::new(include_bytes!("../test_data/fastly/basic.log.zst")); + + let decompressor = assert_ok!(Decompressor::from_extension(cursor, Some("zst"))); + let reader = tokio::io::BufReader::new(decompressor); + + let downloads = assert_ok!(count_downloads(reader).await); + + assert_debug_snapshot!(downloads, @r###" + DownloadsMap { + 2024-01-16 strsim@0.10.0 .. 1 + 2024-01-16 tikv-jemalloc-sys@0.5.2+5.3.0-patched .. 1 + 2024-01-16 tinyvec@1.6.0 .. 1 + 2024-01-16 winapi-x86_64-pc-windows-gnu@0.4.0 .. 1 + 2024-01-16 windows_x86_64_gnu@0.48.0 .. 1 + 2024-01-16 windows_x86_64_gnullvm@0.42.2 .. 1 + 2024-01-16 winnow@0.5.4 .. 1 + 2024-01-17 anstyle@1.0.1 .. 1 + 2024-01-17 cast@0.3.0 .. 1 + 2024-01-17 cc@1.0.73 .. 1 + 2024-01-17 croaring-sys@1.1.0 .. 1 + 2024-01-17 half@1.8.2 .. 1 + 2024-01-17 jemalloc-sys@0.3.2 .. 1 + 2024-01-17 lazy_static@1.4.0 .. 1 + 2024-01-17 libc@0.2.126 .. 1 + 2024-01-17 lzma-sys@0.1.20 .. 1 + 2024-01-17 sqlparser@0.40.0 .. 1 + 2024-01-17 synchronized-writer@1.1.11 .. 1 + 2024-01-17 tikv-jemalloc-sys@0.5.4+5.3.0-patched .. 1 + 2024-01-17 windows_x86_64_gnu@0.48.0 .. 2 + 2024-01-17 xz2@0.1.7 .. 1 + 2024-01-17 zstd-safe@7.0.0 .. 
1 + } + "###); + } + #[tokio::test] async fn test_unknown() { let _guard = enable_tracing_output(); diff --git a/crates_io_cdn_logs/test_data/cloudfront/basic.log.gz b/crates_io_cdn_logs/test_data/cloudfront/basic.log.gz new file mode 100644 index 0000000000000000000000000000000000000000..2dbddb5d54fb275202dfb03f724f0a47516794f0 GIT binary patch literal 2050 zcmV+d2>tgTiwForE3st&17cxwX=5&IZ)X7Qm|1V)IuysBxu2ra)sbFZ!&*NZ}tMUT`B~TWoEqJO==12Lq^jZOYq|xD;2gh^CFvJ|0nrGRn#xUnVJKg67ij*MKCa>`*?dN718Czy4U6(KHV-Av?tM zR}=4?W`b|?7LTtgc+|8hE2oNCltoq)l;zV#%by`+<|7SxZzp0qliAcROE+m6{2EYJ zEI;=6JinHXqcDBBhi7XD#R=Yx6VM+&v@FulVuv^@7di6etm24e-**!6R>t>S?(WA* zHqA^w-~*no#OYa4m~lw2>H$5z31)`wab-UF&ngY=zYsu@0H6Tn3KDOzd@G?!z1Nb7 zFhCUqB}h;OR#8=~N)^3fRBWj36H^Q<(F!6`<(J%c7+5 z1yGf76;_W=D+qv!P_azrM*koHtBO zSw)sr<$dNNkO{c}y2n2)Ca^rsf|nHz8cA1^4%&h2!(=Gs(Kxi>NYvuh_)}hm(CcAFe^_^+a)29Vi2D9F-7F5u zkIdAoh^uc1ih@5vT!spN@1G~0G1KDe!QBRcoY^vvj^I2Z#2%Rh@_UR5V(dkAN^a>GDHZzPa0y0zmU(9 zPIBLR6ewd6dtMe!GhK(-zjN9kwVY-x=~~;c8CXLR865!)ha+n_?>A9wKqnb=z<9UF zh+CvaOFW191WLLJv-E~oLkwj3ea=`$qH+Oe9_9H3U2RK)epcr8tg2P0TuFK!>Xu^# z_-UHhv20s;Fk!h$=y2YzC!V`55^Jb}c-7F1ahFZ+y4<|8Gt%+hp#KbMj1}br7L678 z63|w@&4av*b^WcMan9*UMc!!VFMuIwGx$Guc*e03!9 z=-)>+Hd{$^E~JO~61I91lpG%Wvj`c447&TUX1bcIDWlYFlW?>etq$jm8y)#v3Sn2XluL2u~R431|s+@haX zxO8AX+jXF4D^9p;cJ223VdQy%HZ_;q&EBFL!CxN?JRi-Hf#Z{jz1bI}`yiN_?4JXD zLUoZz{{;Qoq>rGX2nhKAXc=Ph0?=W;U0HGdtH3pbYxA3)?}LPU_3+^Oq-{3=lPudE ztVLD?`+Pk{+16i2cB;)AH8H;f-HGevJb6?fl`~nQbo6}EMV{{`;Ws=V#4-UNAWUQ~ z)t@IEn`uT*SI(AuzBm2e2ibOrxL+XK7xR9y(Q5}uO&-M_2pga%_76t8olYOyMbOCb z@=mE2SX$o=&dG#$lnE?f!xq2AmO%UgJtm4QT_O|A4DI8E%L3tJuLEn*ad)y^5WU~N z-?oNi1}D~R73&SA8wI>;t<2Wlyghs9QdLLoP8!5d&BZyeCo~tG+?@e?@^nOjP<)>( z4=7~y0d)lHXGD?bJ*1vYB6*gfgRR$W2tFcGxXQ7s_2t$)El@1!s4Dv zN7jAo%=x}}JZi=l9e}>85|Rq>N1_Lwo?zvYh&Z4H4R}VE9$u6MzQw9~twzX;`eJ%N zafUFYs%A9vhk>)KMeE+fde^9F{jp_C#ilvmZuMMj9CGqhi_ZlSM^*~DhBSPA9QQr* g;Gh4ibN%14KIV9zC;fkp_bSKx4@K@4_3a%10K=0382|tP literal 0 HcmV?d00001 diff --git 
a/crates_io_cdn_logs/test_data/fastly/basic.log.zst b/crates_io_cdn_logs/test_data/fastly/basic.log.zst new file mode 100644 index 0000000000000000000000000000000000000000..6c6d61a2e1c2cc5283e90a3803bc3e943b844284 GIT binary patch literal 886 zcmV-+1Bv`7wJ-f-I~PS80QReVBp{8}0jx1x33GhI$4dEn6l<&iaS0kQQgc&Mg6$tc zU;uy!9(e$C0Br!wVj_dFNXeX#v?M2*q=hOkNlqr5GA$B(_igR8+1JrPm+@>u^>eDK%Q(nhs@Unjd+(}7Wdb_>*1z3#yDeoH4CCydnL5{=ckxoiFhr+& zonzl!>s%jWt*YK)1H*%g4Gite_o9l>-0C-bthMptNQC1|o=2nc8uMq&1q>q-4!`%@ znX7(#JTZb$#B}p-XRUu#Mo{8F$XC|38#}gb@q`3x^K-qLY8s$ajwvVJ)v-HnUv8~x za3JLEvTxkp-1@FvtDc>m8GHX@%e23_b2=qtR=u&u&H;dn zs6iD`PC-?oq#!A@8E^t25fLPM5`eKN2qWMaiXu=L2qDN2gg_y<#S~GB5yN^63A!F5 zWOs%^GB)Occx2kbz|etEvaw`lnL7|eaOtD@_IF6$7)0)l0^_QG;w(7)DTI>K0;Q+? zVC3GJPC&#ex50SHq`jh68$-hwGC$#Cjt;l`nwD|g$||QbUM`f+4A?}Pw?p>r(;uN% z^uU`lbX4$XNe1nSF-4IFK+5Im2h=IVa#{=GfG?G0PXpK$gZ`WY)(Ovm5)7yWEE!_} z367d*^|Ejt9+?qz>nN09ikQ9~>}i5D-EGzq4D!Kjkf)bkin7!bsWvPRW`Jxe4Sx%g zA4z~-f%m#${2A~XqKd~6kTgRhv*#v)Y&@QbAjF;ifRUq(iB*aCSNF MuF6qBtKnn%-JRL0CIA2c literal 0 HcmV?d00001 From e2304357699bb7f74622741e6bcda55734484e03 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 26 Jan 2024 17:17:19 +0100 Subject: [PATCH 10/12] cdn_logs: Implement "count_downloads" example binary --- Cargo.lock | 1 + crates_io_cdn_logs/Cargo.toml | 3 +- .../examples/count_downloads.rs | 78 +++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 crates_io_cdn_logs/examples/count_downloads.rs diff --git a/Cargo.lock b/Cargo.lock index 99c8d2e10ab..947956744b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,6 +1031,7 @@ dependencies = [ "async-compression", "chrono", "claims", + "clap", "criterion", "insta", "percent-encoding", diff --git a/crates_io_cdn_logs/Cargo.toml b/crates_io_cdn_logs/Cargo.toml index 12cc5f92022..cbb3b8db1f9 100644 --- a/crates_io_cdn_logs/Cargo.toml +++ b/crates_io_cdn_logs/Cargo.toml @@ -20,9 +20,10 @@ tracing = "=0.1.40" [dev-dependencies] claims = "=0.7.1" +clap = { version = 
"=4.4.18", features = ["derive"] } criterion = { version = "=0.5.1", features = ["async_tokio"] } insta = "=1.34.0" -tokio = { version = "=1.35.1", features = ["macros", "rt"] } +tokio = { version = "=1.35.1", features = ["fs", "macros", "rt", "rt-multi-thread"] } tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } [[bench]] diff --git a/crates_io_cdn_logs/examples/count_downloads.rs b/crates_io_cdn_logs/examples/count_downloads.rs new file mode 100644 index 00000000000..1156ce098fd --- /dev/null +++ b/crates_io_cdn_logs/examples/count_downloads.rs @@ -0,0 +1,78 @@ +use anyhow::Context; +use clap::Parser; +use crates_io_cdn_logs::{count_downloads, Decompressor}; +use std::collections::HashSet; +use std::path::PathBuf; +use std::time::SystemTime; +use tokio::fs::File; +use tokio::io::BufReader; +use tracing::level_filters::LevelFilter; +use tracing_subscriber::{fmt, EnvFilter}; + +#[derive(Debug, clap::Parser)] +struct Options { + /// The path to the CDN log file to parse + path: PathBuf, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + init_tracing(); + + let options = Options::parse(); + + let file = File::open(&options.path) + .await + .with_context(|| format!("Failed to open {}", options.path.display()))?; + + let reader = BufReader::new(file); + + let extension = options + .path + .extension() + .and_then(|ext| ext.to_str()) + .unwrap_or_default(); + + let start = SystemTime::now(); + let downloads = match extension { + "gz" | "zst" => { + let decompressor = Decompressor::from_extension(reader, Some(extension))?; + let reader = BufReader::new(decompressor); + count_downloads(reader).await? 
+ } + _ => count_downloads(reader).await?, + }; + let duration = start.elapsed()?; + println!("{downloads:?}"); + println!(); + + let num_crates = downloads + .as_inner() + .iter() + .map(|((_, krate, _), _)| krate) + .collect::>() + .len(); + + let total_inserts = downloads.as_inner().len(); + + let total_downloads = downloads + .as_inner() + .iter() + .map(|(_, downloads)| downloads) + .sum::(); + + println!("Number of crates: {num_crates}"); + println!("Number of needed inserts: {total_inserts}"); + println!("Total number of downloads: {total_downloads}"); + println!("Time to parse: {duration:?}"); + + Ok(()) +} + +fn init_tracing() { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + fmt().compact().with_env_filter(env_filter).init(); +} From aaa86b67751a1d38f593151101d52168faaeaecb Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Wed, 31 Jan 2024 09:39:07 +0100 Subject: [PATCH 11/12] cdn_logs/paths: Add additional "path with dashes" test --- crates_io_cdn_logs/src/paths.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates_io_cdn_logs/src/paths.rs b/crates_io_cdn_logs/src/paths.rs index 48d95840c0e..e22e57ce0bd 100644 --- a/crates_io_cdn_logs/src/paths.rs +++ b/crates_io_cdn_logs/src/paths.rs @@ -56,6 +56,13 @@ mod tests { assert_eq!(format(&result), "foo@1.2.3"); } + #[test] + fn test_parse_path_with_dashes() { + let path = "/crates/foo-bar/foo-bar-1.0.0-rc.1.crate"; + let result = assert_some!(parse_path(path)); + assert_eq!(format(&result), "foo-bar@1.0.0-rc.1"); + } + #[test] fn test_parse_path_empty() { assert_none!(parse_path("")); From c5fd44535af7ea8725b72b3fc05bc53a6202936e Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Wed, 31 Jan 2024 09:44:30 +0100 Subject: [PATCH 12/12] cdn_logs: Add log file format documentation links --- crates_io_cdn_logs/src/cloudfront.rs | 5 +++++ crates_io_cdn_logs/src/fastly/mod.rs | 4 ++++ 2 files changed, 9 insertions(+) diff --git 
a/crates_io_cdn_logs/src/cloudfront.rs b/crates_io_cdn_logs/src/cloudfront.rs index 1876c1a17bc..026c8ac17b5 100644 --- a/crates_io_cdn_logs/src/cloudfront.rs +++ b/crates_io_cdn_logs/src/cloudfront.rs @@ -1,3 +1,8 @@ +//! # CloudFront log parsing +//! +//! see +//! and . + use crate::paths::parse_path; use crate::DownloadsMap; use chrono::NaiveDate; diff --git a/crates_io_cdn_logs/src/fastly/mod.rs b/crates_io_cdn_logs/src/fastly/mod.rs index 8457922d081..0a3c8989eb4 100644 --- a/crates_io_cdn_logs/src/fastly/mod.rs +++ b/crates_io_cdn_logs/src/fastly/mod.rs @@ -1,3 +1,7 @@ +//! # Fastly CDN log parsing +//! +//! see . + mod json; use crate::paths::parse_path;