Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 65 additions & 73 deletions arrow-cast/src/cast/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ where

/// Parses given string to specified decimal native (i128/i256) based on given
/// scale. Returns an `Err` if it cannot parse given string.
pub(crate) fn parse_string_to_decimal_native<T: DecimalType>(
pub fn parse_string_to_decimal_native<T: DecimalType>(
value_str: &str,
scale: usize,
) -> Result<T::Native, ArrowError>
Expand Down Expand Up @@ -777,15 +777,15 @@ where
if cast_options.safe {
array
.unary_opt::<_, D>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.filter(|v| D::is_valid_decimal_precision(*v, precision))
})
.with_precision_and_scale(precision, scale)
.map(|a| Arc::new(a) as ArrayRef)
} else {
array
.try_unary::<_, D, _>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.ok_or_else(|| {
ArrowError::CastError(format!(
"Cannot cast to {}({}, {}). Overflowing on {:?}",
Expand All @@ -802,6 +802,17 @@ where
}
}

/// Converts one floating point value into the native representation of decimal type `D`.
///
/// The input is multiplied by `mul` (for example `10_f64.powi(scale)`), rounded with
/// `f64::round`, and the rounded value is converted with `D::Native::from_f64`.
///
/// Returns `None` when that conversion into `D::Native` fails. No precision check is
/// performed here; callers that need one must validate the returned value themselves.
#[inline]
pub fn single_float_to_decimal<D>(input: f64, mul: f64) -> Option<D::Native>
where
    D: DecimalType + ArrowPrimitiveType,
    <D as ArrowPrimitiveType>::Native: DecimalCast,
{
    // Scale first, then round, so the fractional part beyond the scale is rounded away.
    let scaled = (mul * input).round();
    D::Native::from_f64(scaled)
}

pub(crate) fn cast_decimal_to_integer<D, T>(
array: &dyn Array,
base: D::Native,
Expand All @@ -826,84 +837,65 @@ where

let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());

if scale < 0 {
match cast_options.safe {
true => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array
.value(i)
.mul_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
value_builder.append_option(v);
}
}
}
false => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).mul_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

value_builder.append_value(value);
}
// Helper macro for emitting nearly the same loop every time, so we can hoist branches out
// The compiler will specialize the resulting code (inlining and jump threading)
macro_rules! cast_loop {
(|$v: ident| $body:expr) => {{
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let $v = cast_single_decimal_to_integer::<D, T::Native>(
array.value(i),
div,
<i16 as From<i8>>::from(scale),
T::DATA_TYPE,
);
$body
}
}
}};
}
if scale < 0 {
if cast_options.safe {
cast_loop!(|v| value_builder.append_option(v.ok()));
} else {
cast_loop!(|v| value_builder.append_value(v?));
}
} else if cast_options.safe {
cast_loop!(|v| value_builder.append_option(v.ok()));
} else {
match cast_options.safe {
true => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array
.value(i)
.div_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
value_builder.append_option(v);
}
}
}
false => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).div_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

value_builder.append_value(value);
}
}
}
}
cast_loop!(|v| value_builder.append_value(v?));
}

Ok(Arc::new(value_builder.finish()))
}

/// Casts a single decimal native value to the integer type `T`.
///
/// The value is first rescaled with `div`: when `scale` is negative it is multiplied by
/// `div`, otherwise it is divided by `div`. Both operations are checked. The rescaled
/// value is then converted to `T`.
///
/// # Errors
///
/// Propagates the error from the checked multiplication or division, and returns
/// `ArrowError::CastError` when the rescaled value cannot be converted to `T`.
/// `type_name` is only used to build that error message.
pub fn cast_single_decimal_to_integer<D, T>(
    value: D::Native,
    div: D::Native,
    scale: i16,
    type_name: DataType,
) -> Result<T, ArrowError>
where
    T: NumCast + ToPrimitive,
    D: DecimalType + ArrowPrimitiveType,
    <D as ArrowPrimitiveType>::Native: ToPrimitive,
{
    // The sign of the scale decides the direction of the rescale.
    let rescaled = match scale {
        s if s < 0 => value.mul_checked(div),
        _ => value.div_checked(div),
    }?;

    match T::from::<D::Native>(rescaled) {
        Some(converted) => Ok(converted),
        None => Err(ArrowError::CastError(format!(
            "value of {:?} is out of range {:?}",
            rescaled, type_name
        ))),
    }
}

/// Cast a decimal array to a floating point array.
///
/// Conversion is lossy and follows standard floating point semantics. Values
Expand Down
24 changes: 21 additions & 3 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,26 @@ use arrow_schema::*;
use arrow_select::take::take;
use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};

pub use decimal::{DecimalCast, rescale_decimal};
pub use decimal::{
DecimalCast, cast_single_decimal_to_integer, parse_string_to_decimal_native, rescale_decimal,
single_float_to_decimal,
};
pub use string::cast_single_string_to_boolean_default;

/// Lossy conversion from decimal to float.
///
/// Conversion is lossy and follows standard floating point semantics. Values
/// that exceed the representable range become `INFINITY` or `-INFINITY` without
/// returning an error.
#[inline]
pub fn single_decimal_to_float_lossy<D, F>(f: &F, x: D::Native, scale: i32) -> f64
where
D: DecimalType,
F: Fn(D::Native) -> f64,
{
f(x) * 10_f64.powi(-scale)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rescuing #9689 (comment) from github oblivion...

I just remembered that floating point a * 10**-b is technically NOT equivalent to a / 10**b. The algebraic operators section of rust docs talks about it:

Algebraic operators of the form a.algebraic_*(b) allow the compiler to optimize floating point operations using all the usual algebraic properties of real numbers – despite the fact that those properties do not hold on floating point numbers. This can give a great performance boost since it may unlock vectorization.

The exact set of optimizations is unspecified but typically allows combining operations, rearranging series of operations based on mathematical properties, converting between division and reciprocal multiplication, and disregarding the sign of zero.

(emphasis mine)

Whether we think any difference matters for this specific case... I don't know. But we should probably defer a change like this to its own PR with appropriate performance evaluation and weighing of trade-offs.

At least there's now a narrow waist for such a future optimization to be made easily.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had the same question before changing the code, but the Rust Playground snippet below shows that they are equal; I'm not sure if this is enough.

Yes, if the code below is not enough, we should keep it as it is in the current PR.

    let max: u8 = u8::MAX;
    for i in 0..max {
        let left = 1f64 / 10_f64.powi(<i32 as From::<u8>>::from(i));
        let right = 1f64 * 10_f64.powi(-<i32 as From::<u8>>::from(i));
        if left != right {
            println!("No equal {:?}", i);
        }
    }
    println!("Over")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pure luck that `1f64` didn't trigger anything. A more complete example finds plenty of values just in -10..=10:

a = -10, b = -30
(a as f64) / 10_f64.powi(b)   = -1.00000000000000007618e31
(a as f64) * 10_f64.powi(-b) = -9.99999999999999963590e30
a = -10, b = -26
(a as f64) / 10_f64.powi(b)   = -1.00000000000000015073e27
(a as f64) * 10_f64.powi(-b) = -1.00000000000000001329e27
a = -10, b = -23
(a as f64) / 10_f64.powi(b)   = -9.99999999999999849005e23
(a as f64) * 10_f64.powi(-b) = -9.99999999999999983223e23
a = -10, b = -17
(a as f64) / 10_f64.powi(b)   = -9.99999999999999872000e17
(a as f64) * 10_f64.powi(-b) = -1.00000000000000000000e18
a = -10, b = -5
(a as f64) / 10_f64.powi(b)   = -9.99999999999999883585e5
(a as f64) * 10_f64.powi(-b) = -1.00000000000000000000e6
a = -10, b = 6
(a as f64) / 10_f64.powi(b)   = -1.00000000000000008180e-5
(a as f64) * 10_f64.powi(-b) = -9.99999999999999912396e-6
a = -10, b = 11
(a as f64) / 10_f64.powi(b)   = -1.00000000000000003643e-10
(a as f64) * 10_f64.powi(-b) = -9.99999999999999907185e-11
a = -10, b = 15
(a as f64) / 10_f64.powi(b)   = -9.99999999999999998819e-15
(a as f64) * 10_f64.powi(-b) = -1.00000000000000015659e-14
a = -10, b = 17
(a as f64) / 10_f64.powi(b)   = -9.99999999999999979098e-17
(a as f64) * 10_f64.powi(-b) = -1.00000000000000010236e-16
Found 246 examples in all

}

/// CastOptions provides a way to override the default cast behaviors
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CastOptions<'a> {
Expand Down Expand Up @@ -2314,10 +2331,11 @@ where
Int32 => cast_decimal_to_integer::<D, Int32Type>(array, base, *scale, cast_options),
Int64 => cast_decimal_to_integer::<D, Int64Type>(array, base, *scale, cast_options),
Float32 => cast_decimal_to_float::<D, Float32Type, _>(array, |x| {
(as_float(x) / 10_f64.powi(*scale as i32)) as f32
single_decimal_to_float_lossy::<D, F>(&as_float, x, <i32 as From<i8>>::from(*scale))
as f32
}),
Float64 => cast_decimal_to_float::<D, Float64Type, _>(array, |x| {
as_float(x) / 10_f64.powi(*scale as i32)
single_decimal_to_float_lossy::<D, F>(&as_float, x, <i32 as From<i8>>::from(*scale))
}),
Utf8View => value_to_string_view(array, cast_options),
Utf8 => value_to_string::<i32>(array, cast_options),
Expand Down
20 changes: 18 additions & 2 deletions parquet-variant-compute/src/type_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

//! Module for transforming a typed arrow `Array` to `VariantArray`.

use arrow::compute::{CastOptions, DecimalCast, rescale_decimal};
use arrow::compute::{
CastOptions, DecimalCast, parse_string_to_decimal_native, rescale_decimal,
single_float_to_decimal,
};
use arrow::datatypes::{
self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type,
DecimalType,
Expand Down Expand Up @@ -204,9 +207,12 @@ impl_timestamp_from_variant!(
///
/// - `precision` and `scale` specify the target Arrow decimal parameters
/// - Integer variants (`Int8/16/32/64`) are treated as decimals with scale 0
/// - Floating point variants (`Float/Double`) are converted to decimals with the given scale
/// - String variants (`String/ShortString`) are parsed as decimals with the given scale
/// - Decimal variants (`Decimal4/8/16`) use their embedded precision and scale
///
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` and
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` for integers,
/// `single_float_to_decimal` for floats, and `parse_string_to_decimal_native` for strings, and
/// returns `None` if it cannot fit the requested precision.
pub(crate) fn variant_to_unscaled_decimal<O>(
variant: &Variant<'_, '_>,
Expand All @@ -217,6 +223,8 @@ where
O: DecimalType,
O::Native: DecimalCast,
{
let mul = 10_f64.powi(scale as i32);

match variant {
Variant::Int8(i) => rescale_decimal::<Decimal32Type, O>(
*i as i32,
Expand Down Expand Up @@ -246,6 +254,14 @@ where
precision,
scale,
),
Variant::Float(f) => single_float_to_decimal::<O>(f64::from(*f), mul),
Variant::Double(f) => single_float_to_decimal::<O>(*f, mul),
// arrow-cast only supports casting strings to decimals with scale >= 0 for now.
// See `cast_string_to_decimal` in arrow-cast/src/cast/decimal.rs for more details.
Variant::String(v) if scale >= 0 => parse_string_to_decimal_native::<O>(v, scale as _).ok(),
Variant::ShortString(v) if scale >= 0 => {
parse_string_to_decimal_native::<O>(v, scale as _).ok()
}
Variant::Decimal4(d) => rescale_decimal::<Decimal32Type, O>(
d.integer(),
VariantDecimal4::MAX_PRECISION,
Expand Down
Loading
Loading