From 23bdede2e38bf093d676bae3f081fa88f6ce9afb Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Tue, 12 Nov 2024 13:11:11 +0100 Subject: [PATCH] Implement missing chunks parser features This should implement everything except for the `complexity` parser. --- core/benches/pyreport.rs | 92 ++------- core/src/parsers/pyreport/chunks_serde.rs | 226 +++++++++++++--------- core/src/parsers/pyreport/report_json.rs | 2 +- core/src/report/models.rs | 5 +- core/src/report/pyreport/types.rs | 10 +- test_utils/src/fixtures.rs | 8 +- 6 files changed, 161 insertions(+), 182 deletions(-) diff --git a/core/benches/pyreport.rs b/core/benches/pyreport.rs index 21621c7..fceffef 100644 --- a/core/benches/pyreport.rs +++ b/core/benches/pyreport.rs @@ -1,12 +1,11 @@ use std::collections::HashMap; use codecov_rs::{ - parsers::pyreport::{chunks, chunks_serde, report_json}, - test_utils::test_report::{TestReport, TestReportBuilder}, + parsers::pyreport::{chunks_serde, report_json}, + test_utils::test_report::TestReportBuilder, }; use criterion::{criterion_group, criterion_main, Criterion}; use test_utils::fixtures::{read_fixture, FixtureFormat::Pyreport, FixtureSize::Large}; -use winnow::Parser as _; criterion_group!( benches, @@ -55,24 +54,25 @@ fn parse_report_json(input: &[u8]) -> report_json::ParsedReportJson { } fn simple_chunks(c: &mut Criterion) { - let chunks = &[ + let chunks: &[&[u8]] = &[ // Header and one chunk with an empty line - "{}\n<<<<< end_of_header >>>>>\n{}\n", + b"{}\n<<<<< end_of_header >>>>>\n{}\n", // No header, one chunk with a populated line and an empty line - "{}\n[1, null, [[0, 1]]]\n", + b"{}\n[1, null, [[0, 1]]]\n", // No header, two chunks, the second having just one empty line - "{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n", + b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n", // Header, two chunks, the second having multiple data lines and an empty line - "{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n", + b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n", ]; let files = HashMap::from([(0, 0), (1, 1), (2, 2)]); let sessions = HashMap::from([(0, 0), (1, 1), (2, 2)]); + let report_json = report_json::ParsedReportJson { files, sessions }; c.bench_function("simple_chunks", |b| { b.iter(|| { for input in chunks { - parse_chunks_file(input, files.clone(), sessions.clone()) + parse_chunks_file_serde(input, report_json.clone()); } }) }); @@ -87,7 +87,6 @@ fn complex_chunks(c: &mut Criterion) { "worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt", ) .unwrap(); - let chunks = std::str::from_utf8(&chunks).unwrap(); // parsing the chunks depends on having loaded the `report_json` let report = read_fixture( @@ -96,79 +95,14 @@ fn complex_chunks(c: &mut Criterion) { "worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-report_json.json", ) .unwrap(); - let report_json::ParsedReportJson { files, sessions } = parse_report_json(&report); + let report_json = parse_report_json(&report); c.bench_function("complex_chunks", |b| { - b.iter(|| parse_chunks_file(chunks, files.clone(), sessions.clone())) + b.iter(|| parse_chunks_file_serde(&chunks, report_json.clone())) }); } -fn parse_chunks_file(input: &str, files: HashMap, sessions: HashMap) { +fn parse_chunks_file_serde(input: &[u8], report_json: report_json::ParsedReportJson) { let report_builder = TestReportBuilder::default(); - - let chunks_ctx = chunks::ParseCtx::new(report_builder, files, sessions); - let mut chunks_stream = chunks::ReportOutputStream::<&str, TestReport, TestReportBuilder> { - input, - state: chunks_ctx, - }; - - chunks::parse_chunks_file - .parse_next(&mut chunks_stream) - .unwrap(); -} - -#[divan::bench] -fn simple_chunks_serde() { - let chunks: &[&[u8]] = &[ - // Header and one chunk with an empty line - b"{}\n<<<<< end_of_header >>>>>\n{}\n", - // No header, one chunk with a populated line and an empty line - b"{}\n[1, null, [[0, 1]]]\n", - // No header, two chunks, the second having just one empty line - b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n", - // Header, two chunks, the second having multiple data lines and an empty line - b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n", - ]; - - let report_json = report_json::ParsedReportJson { - files: Default::default(), - sessions: Default::default(), - }; - - for input in chunks { - parse_chunks_file_serde(input, &report_json); - } -} - -// this is currently <300 ms on my machine -#[divan::bench(sample_count = 10)] -fn complex_chunks_serde(bencher: Bencher) { - // this is a ~96M `chunks` file - let chunks = - load_fixture("pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt"); - - // parsing the chunks depends on having loaded the `report_json` - let report = load_fixture( - "pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-report_json.json", - ); - let report_json = parse_report_json(&report); - - bencher.bench(|| parse_chunks_file_serde(&chunks, &report_json)); -} - -fn parse_chunks_file_serde(input: &[u8], report_json: &report_json::ParsedReportJson) { - let mut report_builder = TestReportBuilder::default(); - chunks_serde::parse_chunks_file(input, report_json, &mut report_builder).unwrap(); -} - -#[track_caller] -fn load_fixture(path: &str) -> Vec { - let path = format!("./fixtures/{path}"); - let contents = std::fs::read(path).unwrap(); - - if contents.starts_with(b"version https://git-lfs.github.com/spec/v1") { - panic!("Fixture has not been pulled from Git LFS"); - } - - contents + chunks_serde::parse_chunks_file(input, report_json, report_builder).unwrap(); } diff --git a/core/src/parsers/pyreport/chunks_serde.rs b/core/src/parsers/pyreport/chunks_serde.rs index 9372e7b..2ec60fa 100644 --- a/core/src/parsers/pyreport/chunks_serde.rs +++ b/core/src/parsers/pyreport/chunks_serde.rs @@ -9,11 +9,12 @@ //! - `"labels_index"`: assigns a numeric ID to each label to save space //! //! If the `"labels_index"` key is present, this parser will insert each label -//! into the report as a [`crate::report::models::Context`] and create a mapping +//! into the report as a [`Context`](models::Context) and create a mapping //! in `buf.state.labels_index` from numeric ID in the header to the -//! new `Context`'s ID in the output report. If the `"labels_index"` key is -//! _not_ present, we will populate `buf.state.labels_index` gradually as we -//! encounter new labels during parsing. +//! new [`Context`](models::Context)'s ID in the output report. If the +//! `"labels_index"` key is _not_ present, we will populate +//! `buf.state.labels_index` gradually as we encounter new labels during +//! parsing. //! //! A chunk contains all of the line-by-line measurements for //! a file. The Nth chunk corresponds to the file whose entry in @@ -26,24 +27,24 @@ //! A line may be empty, or it may contain a [`LineRecord`]. //! A [`LineRecord`] itself does not correspond to anything in the output, //! but it's an umbrella that includes all of the data -//! tied to a line/[`CoverageSample`]. +//! tied to a line/[`CoverageSample`](models::CoverageSample). //! //! This parser performs all the writes it can to the output -//! stream and only returns a `ReportLine` for tests. The `report_line_or_empty` -//! parser which wraps this and supports empty lines returns `Ok(())`. +//! stream and only returns a [`ReportLine`] for tests. The +//! `report_line_or_empty` parser which wraps this and supports empty lines +//! returns `Ok(())`. use std::{collections::HashMap, fmt, mem, sync::OnceLock}; use memchr::{memchr, memmem}; use serde::{de, de::IgnoredAny, Deserialize}; -use super::report_json::ParsedReportJson; +use super::{chunks::ParseCtx, report_json::ParsedReportJson, utils}; use crate::{ error::CodecovError, report::{ - models, pyreport::{ - types::{self, PyreportCoverage, ReportLine}, + types::{self, CoverageType, MissingBranch, Partial, PyreportCoverage, ReportLine}, CHUNKS_FILE_END_OF_CHUNK, CHUNKS_FILE_HEADER_TERMINATOR, }, Report, ReportBuilder, @@ -52,8 +53,8 @@ use crate::{ pub fn parse_chunks_file( input: &[u8], - _report_json: &ParsedReportJson, - builder: &mut B, + report_json: ParsedReportJson, + mut builder: B, ) -> Result<(), CodecovError> where B: ReportBuilder, @@ -67,47 +68,50 @@ where labels_index.insert(index.clone(), context.id); } + let mut ctx = ParseCtx::new(builder, report_json.files, report_json.sessions); + let mut report_lines = vec![]; let mut chunks = chunks_file.chunks(); + let mut chunk_no = 0; while let Some(mut chunk) = chunks.next_chunk()? { let mut line_no = 0; report_lines.clear(); while let Some(line) = chunk.next_line()? { line_no += 1; if let Some(line) = line { - let coverage_type = match line.1.unwrap_or_default() { - CoverageType::Line => models::CoverageType::Line, - CoverageType::Branch => models::CoverageType::Branch, - CoverageType::Method => models::CoverageType::Method, - }; let sessions = line .2 .into_iter() .map(|session| types::LineSession { session_id: session.0, - coverage: session.1.into(), - branches: None, // TODO - partials: None, // TODO + coverage: session.1, + branches: session.2.into(), + partials: session.3.into(), complexity: None, // TODO }) .collect(); + let datapoints = line + .5 + .map(|dps| dps.into_iter().map(|dp| (dp.0, dp.into())).collect()); let mut report_line = ReportLine { line_no, - coverage: line.0.into(), - coverage_type, + coverage: line.0, + coverage_type: line.1.unwrap_or_default(), sessions, _messages: None, _complexity: None, - datapoints: None, // TODO + datapoints: Some(datapoints), }; report_line.normalize(); report_lines.push(report_line); } } - // TODO: - // utils::save_report_lines()?; + + ctx.chunk.index = chunk_no; + utils::save_report_lines(&report_lines, &mut ctx)?; + chunk_no += 1; } Ok(()) @@ -214,7 +218,7 @@ pub struct Chunk<'d> { input: &'d [u8], } -impl<'d> Chunk<'d> { +impl Chunk<'_> { pub fn present_sessions(&self) -> &[u32] { &self.chunk_header.present_sessions } @@ -230,7 +234,7 @@ impl<'d> Chunk<'d> { let line_record: LineRecord = serde_json::from_slice(line).map_err(ChunksFileParseError::InvalidLineRecord)?; - return Ok(Some(Some(line_record))); + Ok(Some(Some(line_record))) } } @@ -273,7 +277,7 @@ impl Eq for IgnoredAnyEq {} #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] pub struct LineRecord( /// coverage - Coverage, + PyreportCoverage, /// coverage type Option, /// sessions @@ -284,9 +288,9 @@ pub struct LineRecord( /// complexity #[serde(default)] Option, - /// TODO: datapoints + /// datapoints #[serde(default)] - Option, + Option>, ); #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] @@ -294,25 +298,41 @@ pub struct LineSession( /// session id usize, /// coverage - Coverage, - /// TODO: branches + PyreportCoverage, + /// branches #[serde(default)] - Option, - /// TODO: partials + Option>, + /// partials #[serde(default)] - Option, + Option>, /// TODO: complexity #[serde(default)] Option, ); -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Deserialize)] -#[serde(try_from = "&str")] -pub enum CoverageType { - #[default] - Line, - Branch, - Method, +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +pub struct CoverageDatapoint( + /// session id + u32, + /// coverage + PyreportCoverage, + /// coverage type + #[serde(default)] + Option, + /// labels + #[serde(default)] + Option>, +); + +impl From for types::CoverageDatapoint { + fn from(datapoint: CoverageDatapoint) -> Self { + Self { + session_id: datapoint.0, + _coverage: datapoint.1, + _coverage_type: datapoint.2, + labels: datapoint.3.unwrap_or_default(), + } + } } impl<'s> TryFrom<&'s str> for CoverageType { @@ -328,33 +348,14 @@ impl<'s> TryFrom<&'s str> for CoverageType { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Coverage { - Partial, - BranchTaken(u32, u32), - HitCount(u32), -} - -impl Into for Coverage { - fn into(self) -> PyreportCoverage { - match self { - Coverage::Partial => PyreportCoverage::Partial(), - Coverage::BranchTaken(covered, total) => { - PyreportCoverage::BranchesTaken { covered, total } - } - Coverage::HitCount(hits) => PyreportCoverage::HitCount(hits), - } - } -} - -impl<'de> Deserialize<'de> for Coverage { - fn deserialize(deserializer: D) -> Result +impl<'de> Deserialize<'de> for PyreportCoverage { + fn deserialize(deserializer: D) -> Result where D: de::Deserializer<'de>, { struct CoverageVisitor; - impl<'de> de::Visitor<'de> for CoverageVisitor { - type Value = Coverage; + impl de::Visitor<'_> for CoverageVisitor { + type Value = PyreportCoverage; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a coverage value") @@ -365,7 +366,7 @@ impl<'de> Deserialize<'de> for Coverage { E: de::Error, { if v { - Ok(Coverage::Partial) + Ok(PyreportCoverage::Partial()) } else { Err(de::Error::invalid_value(de::Unexpected::Bool(v), &self)) } @@ -375,7 +376,7 @@ impl<'de> Deserialize<'de> for Coverage { where E: de::Error, { - Ok(Coverage::HitCount(value as u32)) + Ok(PyreportCoverage::HitCount(value as u32)) } fn visit_str(self, v: &str) -> Result @@ -387,7 +388,7 @@ impl<'de> Deserialize<'de> for Coverage { let covered: u32 = covered.parse().map_err(|_| invalid())?; let total: u32 = total.parse().map_err(|_| invalid())?; - Ok(Coverage::BranchTaken(covered, total)) + Ok(PyreportCoverage::BranchesTaken { covered, total }) } } @@ -395,72 +396,109 @@ impl<'de> Deserialize<'de> for Coverage { } } +impl<'de> Deserialize<'de> for MissingBranch { + fn deserialize(deserializer: D) -> Result + where + D: de::Deserializer<'de>, + { + struct MissingBranchVisitor; + impl de::Visitor<'_> for MissingBranchVisitor { + type Value = MissingBranch; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a missing branch value") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + let invalid = || de::Error::invalid_value(de::Unexpected::Str(v), &self); + + if let Some((block, branch)) = v.split_once(":") { + let block: u32 = block.parse().map_err(|_| invalid())?; + let branch: u32 = branch.parse().map_err(|_| invalid())?; + + return Ok(MissingBranch::BlockAndBranch(block, branch)); + } + + if let Some(condition) = v.strip_suffix(":jump") { + let condition: u32 = condition.parse().map_err(|_| invalid())?; + + // TODO(swatinem): can we skip saving the `jump` here? + return Ok(MissingBranch::Condition(condition, Some("jump".into()))); + } + + let line: u32 = v.parse().map_err(|_| invalid())?; + Ok(MissingBranch::Line(line)) + } + } + + deserializer.deserialize_any(MissingBranchVisitor) + } +} + #[cfg(test)] mod tests { use super::*; #[test] - fn test_parsing_events() { + fn test_parsing_chunks() { let simple_line_record = LineRecord( - Coverage::HitCount(1), + PyreportCoverage::HitCount(1), None, - vec![LineSession(0, Coverage::HitCount(1), None, None, None)], + vec![LineSession( + 0, + PyreportCoverage::HitCount(1), + None, + None, + None, + )], None, None, None, ); + #[allow(clippy::type_complexity)] let cases: &[( &[u8], // input - HashMap, // labels index - &[(&[u32], &[Option])], // chunks: session ids, line records + &[&[Option]], // chunks: line records )] = &[ ( // Header and one chunk with an empty line b"{}\n<<<<< end_of_header >>>>>\n{}\n", - HashMap::default(), - &[(&[], &[])], + &[&[]], ), ( // No header, one chunk with a populated line and an empty line b"{}\n[1, null, [[0, 1]]]\n", - HashMap::default(), - &[(&[], &[Some(simple_line_record.clone())])], + &[&[Some(simple_line_record.clone())]], ), ( // No header, two chunks, the second having just one empty line b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n", - HashMap::default(), - &[(&[], &[Some(simple_line_record.clone())]), (&[], &[])], + &[&[Some(simple_line_record.clone())], &[]], ), ( // Header, two chunks, the second having multiple data lines and an empty line b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n", - HashMap::default(), &[ - (&[], &[Some(simple_line_record.clone())]), - ( - &[], - &[ - Some(simple_line_record.clone()), - Some(simple_line_record.clone()), - ], - ), + &[Some(simple_line_record.clone())], + &[ + Some(simple_line_record.clone()), + Some(simple_line_record.clone()), + ], ], ), ]; - for (input, expected_labels_index, expected_chunks) in cases { + for (input, expected_chunks) in cases { let chunks_file = ChunksFile::new(input).unwrap(); let mut chunks = chunks_file.chunks(); - assert_eq!(chunks_file.labels_index(), expected_labels_index); - - for (expected_sessions, expected_line_records) in *expected_chunks { + for expected_line_records in *expected_chunks { let mut chunk = chunks.next_chunk().unwrap().unwrap(); - assert_eq!(chunk.present_sessions(), *expected_sessions); - let mut lines = vec![]; while let Some(line) = chunk.next_line().unwrap() { lines.push(line); diff --git a/core/src/parsers/pyreport/report_json.rs b/core/src/parsers/pyreport/report_json.rs index f9a24e0..05da433 100644 --- a/core/src/parsers/pyreport/report_json.rs +++ b/core/src/parsers/pyreport/report_json.rs @@ -214,7 +214,7 @@ struct Session { session_extras: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ParsedReportJson { pub files: HashMap, pub sessions: HashMap, diff --git a/core/src/report/models.rs b/core/src/report/models.rs index 17f63f9..41ca5d0 100644 --- a/core/src/report/models.rs +++ b/core/src/report/models.rs @@ -95,9 +95,12 @@ * and cast back to `u64` when querying. */ +use serde::Deserialize; + use crate::parsers::json::JsonVal; -#[derive(PartialEq, Debug, Clone, Copy, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Deserialize)] +#[serde(try_from = "&str")] pub enum CoverageType { #[default] Line = 1, diff --git a/core/src/report/pyreport/types.rs b/core/src/report/pyreport/types.rs index 0da5847..8402b80 100644 --- a/core/src/report/pyreport/types.rs +++ b/core/src/report/pyreport/types.rs @@ -1,5 +1,7 @@ use std::collections::HashMap; +use serde::Deserialize; + pub use super::super::models::CoverageType; use crate::parsers::json::JsonVal; #[cfg(doc)] @@ -10,7 +12,7 @@ use crate::report::models; /// /// Most of the time, we can parse this field into a `HitCount` or /// `BranchesTaken`. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum PyreportCoverage { /// Contains the number of times the target was hit (or sometimes just 0 or /// 1). Most formats represent line and method coverage this way. In some @@ -41,7 +43,7 @@ pub enum Complexity { } /// Enum representing the possible shapes of data about missing branch coverage. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum MissingBranch { /// Identifies a specific branch by its "block" and "branch" numbers chosen /// by the instrumentation. Lcov does it this way. @@ -57,7 +59,7 @@ pub enum MissingBranch { } /// Struct representing a subspan of a single line and its coverage status. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] pub struct Partial { pub start_col: Option, pub end_col: Option, @@ -122,7 +124,7 @@ pub enum RawLabel { /// An object that is similar to a [`LineSession`], containing coverage /// measurements specific to a session. It is mostly redundant and ignored in /// this parser, save for the `labels` field which is not found anywhere else. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq, Deserialize)] pub struct CoverageDatapoint { /// This ID indicates which session the measurement was taken in. It can be /// used as a key in `buf.state.report_json_sessions` to get the ID of a diff --git a/test_utils/src/fixtures.rs b/test_utils/src/fixtures.rs index cdd08c7..e917cf2 100644 --- a/test_utils/src/fixtures.rs +++ b/test_utils/src/fixtures.rs @@ -66,9 +66,11 @@ pub fn read_fixture( name: &str, ) -> Result, &'static str> { // Just make sure the file exists and that it has been pulled from Git LFS - let _file = open_fixture(format, size, name)?; + let mut file = open_fixture(format, size, name)?; // Actually read and return the contents - let path = fixture_dir(format, size).join(name); - std::fs::read(path).map_err(|_| "failed to read file") + let mut buf = Vec::new(); + file.read_to_end(&mut buf) + .map_err(|_| "failed to read file")?; + Ok(buf) }