Merge remote-tracking branch 'apache/main' into alamb/prepare_54_release
alamb committed Dec 18, 2024
2 parents b3fb421 + fc814bc commit ef479af
Showing 14 changed files with 362 additions and 33 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/rust.yml
@@ -101,18 +101,19 @@ jobs:
- name: Format arrow
run: cargo fmt --all -- --check
- name: Format parquet
# Many modules in parquet are skipped, so check parquet separately. If this check fails, run:
# cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs`
# from the top level arrow-rs directory and check in the result.
# Many modules in parquet are skipped, so check parquet separately
# https://github.com/apache/arrow-rs/issues/6179
working-directory: parquet
run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
run: |
# if this fails, run this from the parquet directory:
# cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs`
cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
- name: Format object_store
working-directory: object_store
run: cargo fmt --all -- --check

msrv:
name: Verify MSRV
name: Verify MSRV (Minimum Supported Rust Version)
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -126,13 +127,19 @@ jobs:
run: cargo update -p ahash --precise 0.8.7
- name: Check arrow
working-directory: arrow
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check parquet
working-directory: parquet
run: cargo msrv --log-target stdout verify
run: |
# run `cd parquet; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check arrow-flight
working-directory: arrow-flight
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow-flight; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Downgrade object_store dependencies
working-directory: object_store
# Necessary because tokio 1.30.0 updates MSRV to 1.63
@@ -142,4 +149,6 @@
cargo update -p url --precise 2.5.0
- name: Check object_store
working-directory: object_store
run: cargo msrv --log-target stdout verify
run: |
# run `cd object_store; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
42 changes: 35 additions & 7 deletions README.md
@@ -63,13 +63,14 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/
@@ -82,6 +83,33 @@ versions approximately every 2 months.

[`object_store`]: https://crates.io/crates/object_store

### Deprecation Guidelines

Minor releases may deprecate, but not remove, APIs. Deprecating an API allows
downstream Rust programs to keep compiling while emitting compiler warnings,
giving downstream crates time to migrate before the API is removed.

To deprecate an API:

- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated
- Concisely describe the preferred API to help the user transition

The version given in `since` is the next version that will be released
(consult the schedule above). To mark the API as deprecated, use the
`#[deprecated(since = "...", note = "...")]` attribute.

For example:

```rust
#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
```

In general, deprecated APIs will remain in the codebase for at least two major releases after
they were deprecated (typically 6 to 9 months). For example, an API
deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs
may be removed earlier or later than these guidelines at the discretion of the
maintainers.
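
For illustration, here is a minimal sketch of a complete deprecation under
these guidelines. The function name `extract_date_part` and the `since`
version are hypothetical; only the attribute form and the `date_part`
replacement named in the example above come from this document.

```rust
use arrow_arith::temporal::{date_part, DatePart};
use arrow_array::{Array, ArrayRef};
use arrow_schema::ArrowError;

/// Hypothetical older API kept around for backwards compatibility.
#[deprecated(since = "54.0.0", note = "Use `date_part` instead")]
pub fn extract_date_part(array: &dyn Array, part: DatePart) -> Result<ArrayRef, ArrowError> {
    // Delegate to the replacement so downstream code keeps compiling,
    // while the compiler emits a warning pointing at the new API.
    date_part(array, part)
}
```

Downstream callers that still use the old name continue to build, but see a
deprecation warning naming both the version and the suggested replacement.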

## Related Projects

There are several related crates in different repositories
2 changes: 1 addition & 1 deletion arrow-array/src/array/dictionary_array.rs
@@ -249,7 +249,7 @@ pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
/// map to the real values.
keys: PrimitiveArray<K>,

/// Array of dictionary values (can by any DataType).
/// Array of dictionary values (can be any DataType).
values: ArrayRef,

/// Values are ordered.
8 changes: 8 additions & 0 deletions arrow-schema/src/datatype.rs
@@ -196,6 +196,14 @@ pub enum DataType {
/// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
/// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
/// ```
///
/// Timezone string parsing
/// -----------------------
/// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
///
/// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
/// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
/// timezones.
Timestamp(TimeUnit, Option<Arc<str>>),
/// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in days.
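
As an aside on the timezone rules documented above, a minimal sketch of the
two kinds of strings (editorial example, not part of the diff; the `DataType`
only stores the string, so whether an IANA name is actually understood depends
on the `chrono-tz` feature at the point where the timezone is later parsed):

```rust
use std::sync::Arc;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Fixed-offset strings are accepted even without the `chrono-tz` feature.
    let fixed = DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("+09:00")));

    // An IANA name such as "Asia/Tokyo" is only understood when the
    // `chrono-tz` feature is enabled; constructing the DataType itself
    // does not validate the string.
    let named = DataType::Timestamp(TimeUnit::Millisecond, Some("Asia/Tokyo".into()));

    println!("{fixed:?} / {named:?}");
}
```
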
25 changes: 25 additions & 0 deletions arrow-schema/src/field.rs
@@ -426,6 +426,19 @@ impl Field {
}

/// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
///
/// # Example
/// ```
/// # use arrow_schema::{DataType, Field};
/// // non-dictionary fields do not have a dict_is_ordered flag
/// let field = Field::new("c1", DataType::Int64, false);
/// assert_eq!(field.dict_is_ordered(), None);
/// // by default a dictionary is not ordered
/// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
/// assert_eq!(field.dict_is_ordered(), Some(false));
/// let field = field.with_dict_is_ordered(true);
/// assert_eq!(field.dict_is_ordered(), Some(true));
/// ```
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
@@ -434,6 +447,18 @@ impl Field {
}
}

/// Set the `dict_is_ordered` flag for this `Field`, if it is a dictionary type.
///
/// Does nothing if this is not a dictionary type.
///
/// See [`Field::dict_is_ordered`] for more information.
pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
if matches!(self.data_type, DataType::Dictionary(_, _)) {
self.dict_is_ordered = dict_is_ordered;
};
self
}

/// Merge this field into self if it is compatible.
///
/// Struct fields are merged recursively.
2 changes: 1 addition & 1 deletion arrow/README.md
@@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `arrow` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository README]: https://github.com/apache/arrow-rs
2 changes: 1 addition & 1 deletion parquet/Cargo.toml
@@ -67,7 +67,7 @@ hashbrown = { version = "0.15", default-features = false }
twox-hash = { version = "1.6", default-features = false }
paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] }
sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] }
crc32fast = { version = "1.4.2", optional = true, default-features = false }

[dev-dependencies]
2 changes: 1 addition & 1 deletion parquet/README.md
@@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `parquet` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository readme]: https://github.com/apache/arrow-rs
5 changes: 2 additions & 3 deletions parquet/examples/write_parquet.rs
@@ -28,7 +28,7 @@ use parquet::arrow::ArrowWriter as ParquetWriter;
use parquet::basic::Encoding;
use parquet::errors::Result;
use parquet::file::properties::{BloomFilterPosition, WriterProperties};
use sysinfo::{MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};

#[derive(ValueEnum, Clone)]
enum BloomFilterPositionArg {
@@ -97,8 +97,7 @@ fn main() -> Result<()> {
let file = File::create(args.path).unwrap();
let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?;

let mut system =
System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
let mut system = System::new_with_specifics(RefreshKind::everything());
eprintln!(
"{} Writing {} batches of {} rows. RSS = {}",
now(),
68 changes: 68 additions & 0 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -989,6 +989,21 @@ mod tests {
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_arrow_reader_single_column_by_name() {
let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet");

let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
let original_schema = Arc::clone(builder.schema());

let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]);
let reader = builder.with_projection(mask).build().unwrap();

// Verify that the schema was correctly parsed
assert_eq!(1, reader.schema().fields().len());
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_null_column_reader_test() {
let mut file = tempfile::tempfile().unwrap();
@@ -2563,6 +2578,59 @@ }
}
}

#[test]
// same as test_read_structs but constructs projection mask via column names
fn test_read_structs_by_name() {
let testdata = arrow::util::test_util::parquet_test_data();
let path = format!("{testdata}/nested_structs.rust.parquet");
let file = File::open(&path).unwrap();
let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap();

for batch in record_batch_reader {
batch.unwrap();
}

let file = File::open(&path).unwrap();
let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

let mask = ProjectionMask::columns(
builder.parquet_schema(),
["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"],
);
let projected_reader = builder
.with_projection(mask)
.with_batch_size(60)
.build()
.unwrap();

let expected_schema = Schema::new(vec![
Field::new(
"roll_num",
ArrowDataType::Struct(Fields::from(vec![Field::new(
"count",
ArrowDataType::UInt64,
false,
)])),
false,
),
Field::new(
"PC_CUR",
ArrowDataType::Struct(Fields::from(vec![
Field::new("mean", ArrowDataType::Int64, false),
Field::new("sum", ArrowDataType::Int64, false),
])),
false,
),
]);

assert_eq!(&expected_schema, projected_reader.schema().as_ref());

for batch in projected_reader {
let batch = batch.unwrap();
assert_eq!(batch.schema().as_ref(), &expected_schema);
}
}

#[test]
fn test_read_maps() {
let testdata = arrow::util::test_util::parquet_test_data();
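
Outside the test module, the name-based projection exercised by these tests
might be used roughly as sketched below. The file name and column name are
hypothetical; the `ProjectionMask::columns` call mirrors the tests above.

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical input file; any parquet file with an "id" column would do.
    let file = File::open("data.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Build the projection from column names rather than leaf indices.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["id"]);
    let reader = builder.with_projection(mask).build()?;

    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```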