From fafb7e2ee0e54092137ffd205c7325bb15ae3666 Mon Sep 17 00:00:00 2001 From: 0lai0 Date: Mon, 16 Mar 2026 17:55:33 +0800 Subject: [PATCH 1/2] use match_to_type produces timestamp --- .../spark-expr/src/conversion_funcs/string.rs | 121 ++++++++++++++++-- .../apache/comet/expressions/CometCast.scala | 6 +- 2 files changed, 118 insertions(+), 9 deletions(-) diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index 7c193716d0..7f1d7a9f58 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -25,7 +25,7 @@ use arrow::datatypes::{ i256, is_validate_decimal_precision, DataType, Date32Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMicrosecondType, }; -use chrono::{DateTime, NaiveDate, TimeZone, Timelike}; +use chrono::{DateTime, NaiveDate, TimeZone, Timelike, Utc}; use num::traits::CheckedNeg; use num::{CheckedSub, Integer}; use regex::Regex; @@ -34,9 +34,13 @@ use std::str::FromStr; use std::sync::Arc; macro_rules! cast_utf8_to_timestamp { - ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{ + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $with_tz:expr) => {{ let len = $array.len(); - let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); + let mut cast_array = if let Some(tz_str) = $with_tz { + PrimitiveArray::<$array_type>::builder(len).with_timezone(tz_str) + } else { + PrimitiveArray::<$array_type>::builder(len) + }; for i in 0..len { if $array.is_null(i) { cast_array.append_null() @@ -665,16 +669,28 @@ pub(crate) fn cast_string_to_timestamp( .downcast_ref::>() .expect("Expected a string array"); - let tz = &timezone::Tz::from_str(timezone_str).unwrap(); - let cast_array: ArrayRef = match to_type { - DataType::Timestamp(_, _) => { + DataType::Timestamp(_, Some(_)) => { + let tz = &timezone::Tz::from_str(timezone_str).unwrap(); cast_utf8_to_timestamp!( string_array, eval_mode, TimestampMicrosecondType, timestamp_parser, - tz + tz, + Some("UTC") + ) + } + DataType::Timestamp(_, None) => { + // TimestampNTZ: reuse timestamp_parser with Utc (identity timezone), + // but don't set timezone metadata on the output array + cast_utf8_to_timestamp!( + string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser, + &Utc, + None::<&str> ) } _ => unreachable!("Invalid data type {:?} in cast from string", to_type), @@ -1340,7 +1356,8 @@ mod tests { eval_mode, TimestampMicrosecondType, timestamp_parser, - tz + tz, + Some("UTC") ); assert_eq!( @@ -1350,6 +1367,94 @@ mod tests { assert_eq!(result.len(), 4); } + #[test] + #[cfg_attr(miri, ignore)] + fn test_cast_string_to_timestamp_ntz() { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("not_a_timestamp"), + Some("2020-01-01"), + ])); + + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Legacy; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser, + &Utc, + None::<&str> + ); + + // Key assertion: TimestampNTZ should have NO timezone + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + assert_eq!(result.len(), 3); + + // Verify the actual values are not timezone-converted + let ts_array = result + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + + // 2020-01-01T12:34:56.123456 as naive epoch micros + assert_eq!(ts_array.value(0), 1577882096123456); + // "not_a_timestamp" is invalid and should be null + assert!(ts_array.is_null(1)); + // 2020-01-01 as naive epoch micros (midnight) + assert_eq!(ts_array.value(2), 1577836800000000); + } + + #[test] + fn test_cast_string_to_timestamp_ntz_via_cast_array() -> DataFusionResult<()> { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + ])); + + let timezone = "America/New_York".to_string(); + let cast_options = SparkCastOptions::new(EvalMode::Legacy, &timezone, false); + + // Cast to Timestamp with timezone + let result_tz = cast_array( + Arc::clone(&array), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + &cast_options, + )?; + assert_eq!( + *result_tz.data_type(), + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + + // Cast to TimestampNTZ (no timezone) + let result_ntz = cast_array( + Arc::clone(&array), + &DataType::Timestamp(TimeUnit::Microsecond, None), + &cast_options, + )?; + assert_eq!( + *result_ntz.data_type(), + DataType::Timestamp(TimeUnit::Microsecond, None) + ); + + // The NTZ result should NOT have timezone metadata + let ntz_array = result_ntz + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + // 2020-01-01T12:34:56.123456 stored as-is (no timezone conversion) + assert_eq!(ntz_array.value(0), 1577882096123456); + + Ok(()) + } + #[test] fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 95d5366907..a6d65d37af 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -21,7 +21,7 @@ package org.apache.comet.expressions import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampType} +import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampNTZType, TimestampType} import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo @@ -222,6 +222,10 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case DataTypes.TimestampType => // https://github.com/apache/datafusion-comet/issues/328 Incompatible(Some("Not all valid formats are supported")) + case _: TimestampNTZType if evalMode == CometEvalMode.ANSI => + Incompatible(Some("ANSI mode not supported")) + case _: TimestampNTZType => + Incompatible(Some("Not all valid formats are supported")) case _ => unsupported(DataTypes.StringType, toType) } From 0ae37110071b6eaa61969352c8c9722f9f49f9c1 Mon Sep 17 00:00:00 2001 From: 0lai0 Date: Tue, 17 Mar 2026 09:50:53 +0800 Subject: [PATCH 2/2] add SQL file and string test --- .../spark-expr/src/conversion_funcs/string.rs | 52 +++++++++++++++++++ .../cast/cast_string_to_timestamp_ntz.sql | 39 ++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index 7f1d7a9f58..dcfc346a36 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -1412,6 +1412,58 @@ mod tests { assert_eq!(ts_array.value(2), 1577836800000000); } + #[test] + #[cfg_attr(miri, ignore)] + fn test_cast_string_with_timezone_offset_to_timestamp_ntz() { + // Strings with explicit timezone offsets should produce null for TimestampNTZ + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56+05:00"), + Some("2020-01-01T12:34:56-08:00"), + Some("2020-01-01T12:34:56Z"), + Some("2020-01-01T12:34:56.123456+00:00"), + Some("2020-01-01T12:34:56.123456"), + ])); + + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Legacy; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser, + &Utc, + None::<&str> + ); + + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + assert_eq!(result.len(), 5); + + let ts_array = result + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + + // All strings with timezone offsets should be null + assert!(ts_array.is_null(0), "'+05:00' offset should produce null"); + assert!(ts_array.is_null(1), "'-08:00' offset should produce null"); + assert!(ts_array.is_null(2), "'Z' suffix should produce null"); + assert!( + ts_array.is_null(3), + "'+00:00' offset with micros should produce null" + ); + + // The one without offset should parse correctly + assert!(!ts_array.is_null(4)); + assert_eq!(ts_array.value(4), 1577882096123456); + } + #[test] fn test_cast_string_to_timestamp_ntz_via_cast_array() -> DataFusionResult<()> { let array: ArrayRef = Arc::new(StringArray::from(vec![ diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql new file mode 100644 index 0000000000..b43461ac4d --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql @@ -0,0 +1,39 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +-- Test casting string to timestamp_ntz +-- https://github.com/apache/datafusion-comet/issues/3179 + +statement +CREATE TABLE test_cast_ts_ntz(s string) USING parquet + +statement +INSERT INTO test_cast_ts_ntz VALUES ('2020-01-01T12:34:56.123456'), ('2020-01-01'), ('2020-01-01T12:34:56'), ('2020'), ('2020-01'), (NULL), ('not_a_timestamp'), ('2020-01-01T12:34:56+05:00') + +-- Cast string to timestamp_ntz: valid formats should parse, invalid should be null +query +SELECT s, cast(s AS timestamp_ntz) FROM test_cast_ts_ntz + +-- Verify that timestamp_ntz values are not affected by session timezone +query +SELECT s, cast(s AS timestamp_ntz) FROM test_cast_ts_ntz WHERE s = '2020-01-01T12:34:56.123456' + +-- Compare timestamp_ntz vs timestamp (with timezone) to show they differ +query +SELECT s, cast(s AS timestamp_ntz) as ts_ntz, cast(s AS timestamp) as ts FROM test_cast_ts_ntz WHERE s IS NOT NULL AND s != 'not_a_timestamp' AND s != '2020-01-01T12:34:56+05:00'