Merge remote-tracking branch 'apache/main' into alamb/prepare_54_release
alamb committed Dec 18, 2024
2 parents b3fb421 + fc814bc commit ef479af
Showing 14 changed files with 362 additions and 33 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/rust.yml
@@ -101,18 +101,19 @@ jobs:
- name: Format arrow
run: cargo fmt --all -- --check
- name: Format parquet
# Many modules in parquet are skipped, so check parquet separately. If this check fails, run:
# cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs`
# from the top level arrow-rs directory and check in the result.
# Many modules in parquet are skipped, so check parquet separately
# https://github.com/apache/arrow-rs/issues/6179
working-directory: parquet
run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
run: |
# if this fails, run this from the parquet directory:
# cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs`
cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
- name: Format object_store
working-directory: object_store
run: cargo fmt --all -- --check

msrv:
name: Verify MSRV
name: Verify MSRV (Minimum Supported Rust Version)
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -126,13 +127,19 @@ jobs:
run: cargo update -p ahash --precise 0.8.7
- name: Check arrow
working-directory: arrow
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check parquet
working-directory: parquet
run: cargo msrv --log-target stdout verify
run: |
# run `cd parquet; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check arrow-flight
working-directory: arrow-flight
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow-flight; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Downgrade object_store dependencies
working-directory: object_store
# Necessary because tokio 1.30.0 updates MSRV to 1.63
@@ -142,4 +149,6 @@
cargo update -p url --precise 2.5.0
- name: Check object_store
working-directory: object_store
run: cargo msrv --log-target stdout verify
run: |
# run `cd object_store; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
42 changes: 35 additions & 7 deletions README.md
@@ -63,13 +63,14 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/
@@ -82,6 +83,33 @@ versions approximately every 2 months.

[`object_store`]: https://crates.io/crates/object_store

### Deprecation Guidelines

Minor releases may deprecate, but not remove, APIs. Deprecating an API allows
downstream Rust programs to keep compiling while emitting compiler warnings,
giving downstream crates time to migrate before the API is removed.

To deprecate an API:

- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated
- Concisely describe the preferred API to help the user transition

The version given in `since` is the next version that will be released
(consult the schedule above). To mark the API as deprecated, use the
`#[deprecated(since = "...", note = "...")]` attribute.

For example:

```rust
#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
```

In general, deprecated APIs will remain in the codebase for at least two major releases after
they were deprecated (typically 6 to 9 months). For example, an API
deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs
may be removed earlier or later than these guidelines at the discretion of the
maintainers.
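
For illustration, here is a minimal sketch of a complete deprecation under
these guidelines. The function name `extract_date_part` and the `since`
version are hypothetical; only the attribute form and the `date_part`
replacement named in the example above come from this document.

```rust
use arrow_arith::temporal::{date_part, DatePart};
use arrow_array::{Array, ArrayRef};
use arrow_schema::ArrowError;

/// Hypothetical older API kept around for backwards compatibility.
#[deprecated(since = "54.0.0", note = "Use `date_part` instead")]
pub fn extract_date_part(array: &dyn Array, part: DatePart) -> Result<ArrayRef, ArrowError> {
    // Delegate to the replacement so downstream code keeps compiling,
    // while the compiler emits a warning pointing at the new API.
    date_part(array, part)
}
```

Downstream callers that still use the old name continue to build, but see a
deprecation warning naming both the version and the suggested replacement.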

## Related Projects

There are several related crates in different repositories
2 changes: 1 addition & 1 deletion arrow-array/src/array/dictionary_array.rs
@@ -249,7 +249,7 @@ pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
/// map to the real values.
keys: PrimitiveArray<K>,

/// Array of dictionary values (can by any DataType).
/// Array of dictionary values (can be any DataType).
values: ArrayRef,

/// Values are ordered.
8 changes: 8 additions & 0 deletions arrow-schema/src/datatype.rs
@@ -196,6 +196,14 @@ pub enum DataType {
/// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
/// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
/// ```
///
/// Timezone string parsing
/// -----------------------
/// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
///
/// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
/// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
/// timezones.
Timestamp(TimeUnit, Option<Arc<str>>),
/// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in days.
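
As an aside on the timezone rules documented above, a minimal sketch of the
two kinds of strings (editorial example, not part of the diff; the `DataType`
only stores the string, so whether an IANA name is actually understood depends
on the `chrono-tz` feature at the point where the timezone is later parsed):

```rust
use std::sync::Arc;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Fixed-offset strings are accepted even without the `chrono-tz` feature.
    let fixed = DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("+09:00")));

    // An IANA name such as "Asia/Tokyo" is only understood when the
    // `chrono-tz` feature is enabled; constructing the DataType itself
    // does not validate the string.
    let named = DataType::Timestamp(TimeUnit::Millisecond, Some("Asia/Tokyo".into()));

    println!("{fixed:?} / {named:?}");
}
```
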
25 changes: 25 additions & 0 deletions arrow-schema/src/field.rs
@@ -426,6 +426,19 @@ impl Field {
}

/// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
///
/// # Example
/// ```
/// # use arrow_schema::{DataType, Field};
/// // non-dictionary fields do not have a dict_is_ordered flag
/// let field = Field::new("c1", DataType::Int64, false);
/// assert_eq!(field.dict_is_ordered(), None);
/// // by default a dictionary is not ordered
/// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
/// assert_eq!(field.dict_is_ordered(), Some(false));
/// let field = field.with_dict_is_ordered(true);
/// assert_eq!(field.dict_is_ordered(), Some(true));
/// ```
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
@@ -434,6 +447,18 @@ impl Field {
}
}

/// Set the `dict_is_ordered` flag for this `Field`, if it is a dictionary type.
///
/// Does nothing if this is not a dictionary type.
///
/// See [`Field::dict_is_ordered`] for more information.
pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
if matches!(self.data_type, DataType::Dictionary(_, _)) {
self.dict_is_ordered = dict_is_ordered;
};
self
}

/// Merge this field into self if it is compatible.
///
/// Struct fields are merged recursively.
2 changes: 1 addition & 1 deletion arrow/README.md
@@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `arrow` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository README]: https://github.com/apache/arrow-rs
2 changes: 1 addition & 1 deletion parquet/Cargo.toml
@@ -67,7 +67,7 @@ hashbrown = { version = "0.15", default-features = false }
twox-hash = { version = "1.6", default-features = false }
paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] }
sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] }
crc32fast = { version = "1.4.2", optional = true, default-features = false }

[dev-dependencies]
2 changes: 1 addition & 1 deletion parquet/README.md
@@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `parquet` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository readme]: https://github.com/apache/arrow-rs
5 changes: 2 additions & 3 deletions parquet/examples/write_parquet.rs
@@ -28,7 +28,7 @@ use parquet::arrow::ArrowWriter as ParquetWriter;
use parquet::basic::Encoding;
use parquet::errors::Result;
use parquet::file::properties::{BloomFilterPosition, WriterProperties};
use sysinfo::{MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};

#[derive(ValueEnum, Clone)]
enum BloomFilterPositionArg {
@@ -97,8 +97,7 @@ fn main() -> Result<()> {
let file = File::create(args.path).unwrap();
let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?;

let mut system =
System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
let mut system = System::new_with_specifics(RefreshKind::everything());
eprintln!(
"{} Writing {} batches of {} rows. RSS = {}",
now(),
68 changes: 68 additions & 0 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -989,6 +989,21 @@ mod tests {
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_arrow_reader_single_column_by_name() {
let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet");

let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
let original_schema = Arc::clone(builder.schema());

let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]);
let reader = builder.with_projection(mask).build().unwrap();

// Verify that the schema was correctly parsed
assert_eq!(1, reader.schema().fields().len());
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_null_column_reader_test() {
let mut file = tempfile::tempfile().unwrap();
@@ -2563,6 +2578,59 @@ }
}
}

#[test]
// same as test_read_structs but constructs projection mask via column names
fn test_read_structs_by_name() {
let testdata = arrow::util::test_util::parquet_test_data();
let path = format!("{testdata}/nested_structs.rust.parquet");
let file = File::open(&path).unwrap();
let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap();

for batch in record_batch_reader {
batch.unwrap();
}

let file = File::open(&path).unwrap();
let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

let mask = ProjectionMask::columns(
builder.parquet_schema(),
["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"],
);
let projected_reader = builder
.with_projection(mask)
.with_batch_size(60)
.build()
.unwrap();

let expected_schema = Schema::new(vec![
Field::new(
"roll_num",
ArrowDataType::Struct(Fields::from(vec![Field::new(
"count",
ArrowDataType::UInt64,
false,
)])),
false,
),
Field::new(
"PC_CUR",
ArrowDataType::Struct(Fields::from(vec![
Field::new("mean", ArrowDataType::Int64, false),
Field::new("sum", ArrowDataType::Int64, false),
])),
false,
),
]);

assert_eq!(&expected_schema, projected_reader.schema().as_ref());

for batch in projected_reader {
let batch = batch.unwrap();
assert_eq!(batch.schema().as_ref(), &expected_schema);
}
}

#[test]
fn test_read_maps() {
let testdata = arrow::util::test_util::parquet_test_data();
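
Outside the test module, the name-based projection exercised by these tests
might be used roughly as sketched below. The file name and column name are
hypothetical; the `ProjectionMask::columns` call mirrors the tests above.

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical input file; any parquet file with an "id" column would do.
    let file = File::open("data.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Build the projection from column names rather than leaf indices.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["id"]);
    let reader = builder.with_projection(mask).build()?;

    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```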