Skip to content

Commit

Permalink
Implement UnitsEssentials Data Provider (#5002)
Browse files Browse the repository at this point in the history
  • Loading branch information
younies authored Jun 25, 2024
1 parent f7317af commit b888362
Show file tree
Hide file tree
Showing 3 changed files with 308 additions and 0 deletions.
2 changes: 2 additions & 0 deletions components/experimental/src/dimension/provider/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

pub mod currency;
pub mod extended_currency;
pub mod pattern_key;
pub mod percent;
pub mod ule;
pub mod units;
pub mod units_essentials;
204 changes: 204 additions & 0 deletions components/experimental/src/dimension/provider/pattern_key.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]

use zerovec::{
maps::ZeroMapKV,
ule::{AsULE, ULE},
ZeroVecError,
};

use crate::dimension::provider::units_essentials::CompoundCount;

/// The exponent used by a [`PatternKey::Power`] pattern.
///
/// In the [`PatternKeyULE`] encoding, `Two` is stored as the bit pair `10` and
/// `Three` as the bit pair `11`.
#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    // The bake path must name the module in which this type is actually defined.
    databake(path = icu_experimental::dimension::provider::pattern_key)
)]
#[repr(u8)]
pub enum PowerValue {
    /// Power of two (presumably patterns such as "square {0}").
    Two,
    /// Power of three (presumably patterns such as "cubic {0}").
    Three,
}

/// Key identifying one of the prefix/power patterns stored for units formatting.
///
/// The key is stored in a single byte (see [`PatternKeyULE`]), which limits the
/// payload of each variant:
/// - `Binary` values must fit in 6 bits (`0..=63`).
/// - `Decimal` values are stored as sign + 5-bit magnitude (`-31..=31`).
#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    // The bake path must name the module in which this type is actually defined.
    databake(path = icu_experimental::dimension::provider::pattern_key)
)]
pub enum PatternKey {
    /// A binary prefix; the value is stored verbatim in the low 6 bits.
    Binary(u8),
    /// A decimal prefix; presumably the power of ten of the prefix (TODO confirm).
    Decimal(i8),
    /// A power pattern for the given exponent and plural category.
    Power(PowerValue, CompoundCount),
}

/// [`PatternKeyULE`] is a type optimized for efficient storage and
/// deserialization of [`PatternKey`] using the `ZeroVec` model.
///
/// The serialization model packs the pattern key into a single byte.
///
/// The first two bits (b7 & b6) determine the variant of the pattern key:
/// - `00`: `Binary`
/// - `01`: `Decimal`
/// - `10`: `Power`
/// - `11`: Forbidden
///
/// The next 6 bits (b5 to b0) determine the value of the pattern key:
/// - For `Binary`, the 6 bits hold the value directly (`0..=63`).
/// - For `Decimal` (sign-magnitude, `-31..=31`):
///     - b5 is the sign: `0` means the value is positive, `1` means it is negative.
///     - b4 to b0 hold the magnitude of the value.
/// - For `Power`:
///     - b5 and b4 represent the power value: `10` represents `Two` and `11` represents `Three`.
///     - b3 to b0 represent the count value, which can be:
///         - `0000`: Zero
///         - `0001`: One
///         - `0010`: Two
///         - `0011`: Few
///         - `0100`: Many
///         - `0101`: Other
///     - Note: In the `Power` case, b3 is always 0, and when b2 is 1, b1 must be 0.
// `repr(transparent)` guarantees the same size and alignment as `u8`, which the
// `ULE` safety contract relies on (no padding, alignment of 1).
#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Debug)]
#[repr(transparent)]
pub struct PatternKeyULE(u8);

// Safety (based on the safety checklist on the `ULE` trait):
//  1. `PatternKeyULE` wraps a single `u8`, so it has no uninitialized or padding bytes.
//  2. `PatternKeyULE` wraps a single `u8`, so it has an alignment of 1 byte.
//  3. `validate_byte_slice()` rejects every byte that is not a valid encoding.
//  4. Each element is exactly one byte wide, so every slice length is a whole
//     number of elements and the slice is always usable in its entirety.
//  5. The other `ULE` methods are left with their default implementation.
//  6. Byte equality is semantic equality: each valid key has one encoding
//     (apart from a negative-zero `Decimal`, which `to_unaligned` never produces).
unsafe impl ULE for PatternKeyULE {
    /// Checks that *every* byte of `bytes` is a valid [`PatternKeyULE`] encoding.
    ///
    /// The slice may hold any number of elements (including zero), because a
    /// `ZeroVec<PatternKey>` is validated as a whole.
    ///
    /// # Errors
    ///
    /// Returns a parse error if any byte uses the forbidden `11` variant tag, or
    /// is a `Power` key with an invalid power or count field.
    fn validate_byte_slice(bytes: &[u8]) -> Result<(), zerovec::ZeroVecError> {
        let is_valid = |byte: u8| match byte >> 6 {
            // `Binary` and `Decimal`: all 6 payload bits are meaningful.
            0b00 | 0b01 => true,
            // `Power`:
            //   - b5 must be 1 (b5 & b4 are `10` or `11`).
            //   - b3 must be 0 and b2/b1 must not both be set, i.e. the count
            //     nibble must be one of 0..=5.
            0b10 => (byte & 0b0010_0000) != 0 && (byte & 0b0000_1111) <= 0b0000_0101,
            // `11` is a forbidden variant tag.
            _ => false,
        };

        if bytes.iter().copied().all(is_valid) {
            Ok(())
        } else {
            Err(ZeroVecError::parse::<Self>())
        }
    }
}

impl AsULE for PatternKey {
    type ULE = PatternKeyULE;

    /// Packs the key into the single-byte layout documented on [`PatternKeyULE`].
    ///
    /// Out-of-range payloads (`Binary` >= 64, `Decimal` outside `-31..=31`) trip a
    /// debug assertion; in release builds they are truncated to the payload bits
    /// so that the variant tag is never corrupted.
    fn to_unaligned(self) -> Self::ULE {
        let byte = match self {
            PatternKey::Binary(value) => {
                debug_assert!(value < 0b0100_0000);
                // Keep the payload out of the variant bits (b7 & b6).
                value & 0b0011_1111
            }
            PatternKey::Decimal(value) => {
                debug_assert!(value > -32 && value < 32);
                let sign = if value < 0 { 0b0010_0000 } else { 0 };
                // Sign-magnitude: store |value|, not the two's-complement bits,
                // so that `from_unaligned` can negate the magnitude back.
                (0b01 << 6) | sign | (value.unsigned_abs() & 0b0001_1111)
            }
            PatternKey::Power(power, count) => {
                let power_bits = match power {
                    PowerValue::Two => 0b10 << 4,
                    PowerValue::Three => 0b11 << 4,
                };
                // Combine the bits to form the final byte
                (0b10 << 6) | power_bits | count as u8
            }
        };

        PatternKeyULE(byte)
    }

    /// Unpacks a validated [`PatternKeyULE`] back into a [`PatternKey`].
    ///
    /// # Panics
    ///
    /// Panics if the byte is not a valid encoding; this cannot happen for values
    /// that passed `validate_byte_slice` or were produced by `to_unaligned`.
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let byte = unaligned.0;

        let variant = (byte & 0b1100_0000) >> 6;
        let value = byte & 0b0011_1111;

        match variant {
            0b00 => PatternKey::Binary(value),
            0b01 => {
                let magnitude = (value & 0b0001_1111) as i8;
                if value & 0b0010_0000 == 0 {
                    PatternKey::Decimal(magnitude)
                } else {
                    PatternKey::Decimal(-magnitude)
                }
            }
            0b10 => {
                let power = match value & 0b0011_0000 {
                    0b0010_0000 => PowerValue::Two,
                    0b0011_0000 => PowerValue::Three,
                    // Validation guarantees that b5 is set for `Power`.
                    _ => unreachable!(),
                };
                let count = value & 0b0000_1111;
                PatternKey::Power(power, count.into())
            }
            // Validation rejects the `11` variant tag.
            _ => unreachable!(),
        }
    }
}

impl<'a> ZeroMapKV<'a> for PatternKey {
type Container = zerovec::ZeroVec<'a, PatternKey>;
type Slice = zerovec::ZeroSlice<PatternKey>;
type GetType = <PatternKey as AsULE>::ULE;
type OwnedType = PatternKey;
}

/// Checks, for one key of each encoding shape, that the encoded byte is valid,
/// matches the documented bit layout, and decodes back to the original key.
#[test]
fn test_pattern_key_ule() {
    use PowerValue::{Three, Two};

    // (key, expected encoded byte)
    let cases: [(PatternKey, u8); 4] = [
        (PatternKey::Binary(0b0000_1111), 0b0000_1111),
        (PatternKey::Decimal(0b0000_1111), 0b0100_1111),
        (PatternKey::Power(Two, CompoundCount::Two), 0b1010_0010),
        (PatternKey::Power(Three, CompoundCount::Two), 0b1011_0010),
    ];

    for &(key, expected_byte) in &cases {
        let encoded = key.to_unaligned();
        PatternKeyULE::validate_byte_slice(&[encoded.0]).unwrap();
        assert_eq!(encoded.0, expected_byte);
        assert_eq!(PatternKey::from_unaligned(encoded), key);
    }
}
102 changes: 102 additions & 0 deletions components/experimental/src/dimension/provider/units_essentials.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]

//! Data provider struct definitions for this ICU4X component.
//!
//! Read more about data providers: [`icu_provider`]
use alloc::borrow::Cow;
use zerovec::ZeroMap;

use icu_provider::prelude::*;

#[cfg(feature = "compiled_data")]
/// Baked data
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub use crate::provider::Baked;

use super::pattern_key::PatternKey;

// TODO: use `Pattern`s instead of `str` for the patterns' string representations.
/// This type contains all of the essential data for units formatting such as `per`, `power`, `times`, etc.
///
/// Note:
/// The auxiliary key represents the length: e.g. `long`, `short`, `narrow`.
// TODO: Use the auxiliary key as real key in the future because the choice of the length is developer's decision.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(UnitsEssentialsV1Marker = "units/essentials@1")]
#[derive(Clone, PartialEq, Debug)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_experimental::dimension::provider::units_essentials),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct UnitsEssentialsV1<'data> {
/// Pattern strings keyed by [`PatternKey`] (a binary prefix, a decimal prefix, or a power).
// TODO: use `SinglePlaceholderPattern` instead of `str` for the patterns' string representations.
#[cfg_attr(feature = "serde", serde(borrow))]
pub prefixes: ZeroMap<'data, PatternKey, str>,

/// Pattern string for `per` compounds.
// NOTE(review): presumably a two-placeholder pattern, per the TODO below — confirm.
// TODO: use `DoublePlaceholderPattern` instead of `str` for the patterns' string representations.
#[cfg_attr(feature = "serde", serde(borrow))]
pub per: Cow<'data, str>,

/// Pattern string for `times` compounds.
// NOTE(review): presumably a two-placeholder pattern, per the TODO below — confirm.
// TODO: use `DoublePlaceholderPattern` instead of `str` for the patterns' string representations.
#[cfg_attr(feature = "serde", serde(borrow))]
pub times: Cow<'data, str>,
}

/// A CLDR plural keyword.
///
/// The discriminants (`0..=5`) are the values stored in the count bits of a
/// `Power` pattern key.
///
/// See <https://www.unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules>.
#[zerovec::make_ule(CompoundCountULE)]
#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    // The bake path must name the module in which this type is actually defined.
    databake(path = icu_experimental::dimension::provider::units_essentials)
)]
#[repr(u8)]
pub enum CompoundCount {
    /// The CLDR keyword `zero`.
    Zero = 0,
    /// The CLDR keyword `one`.
    One = 1,
    /// The CLDR keyword `two`.
    Two = 2,
    /// The CLDR keyword `few`.
    Few = 3,
    /// The CLDR keyword `many`.
    Many = 4,
    /// The CLDR keyword `other`.
    Other = 5,
}

impl From<u8> for CompoundCount {
    /// Maps a discriminant (`0..=5`) back to its [`CompoundCount`] variant.
    ///
    /// # Panics
    ///
    /// Panics if `val` is greater than 5; callers are expected to pass only
    /// values that were produced from a valid `CompoundCount`.
    fn from(val: u8) -> Self {
        // Indexed by discriminant.
        const BY_DISCRIMINANT: [CompoundCount; 6] = [
            CompoundCount::Zero,
            CompoundCount::One,
            CompoundCount::Two,
            CompoundCount::Few,
            CompoundCount::Many,
            CompoundCount::Other,
        ];

        match BY_DISCRIMINANT.get(usize::from(val)) {
            Some(&count) => count,
            None => unreachable!(),
        }
    }
}

0 comments on commit b888362

Please sign in to comment.