From 238d8c3352d9e9ba3d77467e7da8210561348286 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 25 Jun 2026 11:49:09 -0600 Subject: [PATCH 1/3] feat: route Unsupported through codegen dispatch for opt-in serdes CodegenDispatchFallback already routes Incompatible cases through the JVM codegen dispatcher so the projection stays inside the Comet pipeline instead of falling back to Spark. Apply the same routing to Unsupported cases: when getSupportLevel returns Unsupported and the serde mixes in CodegenDispatchFallback, run Spark's own doGenCode via the dispatcher before resorting to Spark fallback. Affects Concat (non-string children), SortArray (nested Struct/Null children), ArrayIntersect (collated strings), TruncDate and TruncTimestamp (formats outside the native set). Also refresh the expression compatibility docs: drop "faster native" wording (no measured claim), clarify that the default path runs in the JVM via Spark codegen, and render Unsupported reasons differently for CodegenDispatchFallback serdes (always JVM dispatch) vs everything else (always Spark fallback). 
--- .../latest/compatibility/expressions/index.md | 2 +- .../expressions/spark-3.4/index.md | 2 +- .../expressions/spark-3.5/index.md | 2 +- .../expressions/spark-4.0/index.md | 2 +- .../expressions/spark-4.1/index.md | 2 +- .../user-guide/latest/compatibility/index.md | 8 ++--- .../scala/org/apache/comet/GenerateDocs.scala | 32 +++++++++++++------ .../comet/serde/CometExpressionSerde.scala | 30 +++++++++++------ .../apache/comet/serde/QueryPlanSerde.scala | 15 +++++++-- .../scala/org/apache/comet/serde/arrays.scala | 3 +- 10 files changed, 67 insertions(+), 31 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index 3df1d9f4c0..3574699195 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -35,7 +35,7 @@ Spark 4.1 Expressions that are not 100% Spark-compatible fall back to Spark by default, except those with a JVM codegen-dispatch path, which stay in Comet's native pipeline and match Spark exactly. Set `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is -the Spark expression class name, to run Comet's faster native implementation despite its +the Spark expression class name, to run Comet's native implementation despite its differences from Spark. See the [Comet Supported Expressions Guide](../../expressions.md) for more information on this configuration setting. diff --git a/docs/source/user-guide/latest/compatibility/expressions/spark-3.4/index.md b/docs/source/user-guide/latest/compatibility/expressions/spark-3.4/index.md index 8dc1b26c93..40fe54c4ea 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/spark-3.4/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/spark-3.4/index.md @@ -23,7 +23,7 @@ Compatibility notes for Comet running on Apache Spark 3.4. 
Expressions that are Spark-compatible fall back to Spark by default, except those with a JVM codegen-dispatch path, which stay in Comet's native pipeline and match Spark exactly. Set `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark -expression class name, to run Comet's faster native implementation despite its differences +expression class name, to run Comet's native implementation despite its differences from Spark. See the [Comet Supported Expressions Guide](../../../expressions.md) for more information on this configuration setting. diff --git a/docs/source/user-guide/latest/compatibility/expressions/spark-3.5/index.md b/docs/source/user-guide/latest/compatibility/expressions/spark-3.5/index.md index 57536d768c..db11f25083 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/spark-3.5/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/spark-3.5/index.md @@ -23,7 +23,7 @@ Compatibility notes for Comet running on Apache Spark 3.5. Expressions that are Spark-compatible fall back to Spark by default, except those with a JVM codegen-dispatch path, which stay in Comet's native pipeline and match Spark exactly. Set `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark -expression class name, to run Comet's faster native implementation despite its differences +expression class name, to run Comet's native implementation despite its differences from Spark. See the [Comet Supported Expressions Guide](../../../expressions.md) for more information on this configuration setting. 
diff --git a/docs/source/user-guide/latest/compatibility/expressions/spark-4.0/index.md b/docs/source/user-guide/latest/compatibility/expressions/spark-4.0/index.md index 659b830e9c..371da9a49f 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/spark-4.0/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/spark-4.0/index.md @@ -23,7 +23,7 @@ Compatibility notes for Comet running on Apache Spark 4.0. Expressions that are Spark-compatible fall back to Spark by default, except those with a JVM codegen-dispatch path, which stay in Comet's native pipeline and match Spark exactly. Set `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark -expression class name, to run Comet's faster native implementation despite its differences +expression class name, to run Comet's native implementation despite its differences from Spark. See the [Comet Supported Expressions Guide](../../../expressions.md) for more information on this configuration setting. diff --git a/docs/source/user-guide/latest/compatibility/expressions/spark-4.1/index.md b/docs/source/user-guide/latest/compatibility/expressions/spark-4.1/index.md index aa2f4a130b..aaca6b485b 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/spark-4.1/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/spark-4.1/index.md @@ -23,7 +23,7 @@ Compatibility notes for Comet running on Apache Spark 4.1. Expressions that are Spark-compatible fall back to Spark by default, except those with a JVM codegen-dispatch path, which stay in Comet's native pipeline and match Spark exactly. Set `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark -expression class name, to run Comet's faster native implementation despite its differences +expression class name, to run Comet's native implementation despite its differences from Spark. 
See the [Comet Supported Expressions Guide](../../../expressions.md) for more information on this configuration setting. diff --git a/docs/source/user-guide/latest/compatibility/index.md b/docs/source/user-guide/latest/compatibility/index.md index f3b531a18f..52a5ed6850 100644 --- a/docs/source/user-guide/latest/compatibility/index.md +++ b/docs/source/user-guide/latest/compatibility/index.md @@ -34,7 +34,7 @@ This guide documents areas where Comet's behavior is known to differ from Spark. ## Compatible by default, opt in to native Comet runs a Spark-compatible implementation of every supported expression by default. Some -expressions also have a faster native implementation that can differ from Spark for certain +expressions also have a native implementation that can differ from Spark for certain inputs. These are not used unless you opt in by setting the relevant `spark.comet.expression.<EXPRNAME>.allowIncompatible=true` config (a few use a dedicated config, noted per expression below), after which you accept the documented differences. @@ -53,11 +53,11 @@ Some Spark expressions have two implementations in Comet: produces byte-exact Spark results at the cost of one JNI round-trip per batch. It is gated globally by `spark.comet.exec.scalaUDF.codegen.enabled` (enabled by default); when the dispatcher is disabled, these expressions fall back to Spark. -- A **native** (Rust / DataFusion) implementation that is faster, with no JNI overhead, but - has known semantic differences from Spark for some inputs or patterns. +- A **native** (Rust / DataFusion) implementation that avoids the JNI round-trip but has + known semantic differences from Spark for some inputs or patterns. Because the codegen-dispatch path matches Spark exactly, Comet uses it by **default**. 
The -faster native path is **opt-in per expression** via that expression's +native path is **opt-in per expression** via that expression's `spark.comet.expression.<EXPRNAME>.allowIncompatible=true` flag, which declares that you accept its differences from Spark. There is no global opt-in. When the native path is enabled but a specific input or pattern has no native implementation, Comet routes that case back diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 195e6a05bf..9ea512c40c 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.Cast import org.apache.comet.CometConf.COMET_ONHEAP_MEMORY_OVERHEAD import org.apache.comet.expressions.{CometCast, CometEvalMode} -import org.apache.comet.serde.{CometAggregateExpressionSerde, CometExpressionSerde, Compatible, Incompatible, NativeOptInAvailable, QueryPlanSerde, Unsupported} +import org.apache.comet.serde.{CodegenDispatchFallback, CometAggregateExpressionSerde, CometExpressionSerde, Compatible, Incompatible, NativeOptInAvailable, QueryPlanSerde, Unsupported} /** * Utility for generating markdown documentation from the configs. 
@@ -49,12 +49,15 @@ object GenerateDocs { * @param incompatibleReasons * reasons the native implementation is incompatible with Spark * @param unsupportedReasons - * cases that Comet does not support + * cases that Comet's native implementation does not handle * @param nativeOptIn * whether the serde implements `NativeOptInAvailable`, meaning the expression runs a - * Spark-compatible path by default and the user can opt into a faster native path + * Spark-compatible path by default and the user can opt into a native path * @param nativeOptInConfigKey * the config key the user sets to opt into the native path + * @param codegenDispatchFallback + * whether the serde mixes in `CodegenDispatchFallback`, meaning `unsupportedReasons` cases + * route through the JVM codegen dispatcher instead of falling back to Spark */ private case class ExprNotes( name: String, @@ -62,7 +65,8 @@ object GenerateDocs { incompatibleReasons: Seq[String], unsupportedReasons: Seq[String], nativeOptIn: Boolean, - nativeOptInConfigKey: String) + nativeOptInConfigKey: String, + codegenDispatchFallback: Boolean) private type CategoryNotes = Seq[ExprNotes] @@ -80,7 +84,8 @@ object GenerateDocs { serde.getIncompatibleReasons(), serde.getUnsupportedReasons(), optIn, - key) + key, + codegenDispatchFallback = serde.isInstanceOf[CodegenDispatchFallback]) } /** Build the documentation notes for a single aggregate expression serde. */ @@ -92,7 +97,8 @@ object GenerateDocs { serde.getUnsupportedReasons(), // Aggregate serdes do not have a native opt-in path. 
nativeOptIn = false, - nativeOptInConfigKey = CometConf.getExprAllowIncompatConfigKey(cls)) + nativeOptInConfigKey = CometConf.getExprAllowIncompatConfigKey(cls), + codegenDispatchFallback = false) /** * Mapping from expression category to the compatibility guide filename where that category's @@ -277,8 +283,9 @@ object GenerateDocs { } if (n.incompatibleReasons.nonEmpty) { val header = if (n.nativeOptIn) { - s"\nBy default, Comet runs a Spark-compatible implementation of `$name`. Set" + - s" `${n.nativeOptInConfigKey}=true` to use Comet's faster native implementation" + + s"\nBy default, `$name` is evaluated in the JVM using Spark's own code-generated" + + " implementation (run inside the Comet pipeline), which matches Spark exactly." + + s" Set `${n.nativeOptInConfigKey}=true` to opt into Comet's native implementation" + " instead, which has the following differences from Spark:\n\n" } else { s"\nThe following incompatibilities cause `$name` to fall back to Spark by default." + @@ -291,7 +298,14 @@ object GenerateDocs { } } if (n.unsupportedReasons.nonEmpty) { - w.write("\nThe following cases are not supported by Comet:\n\n".getBytes) + val header = if (n.codegenDispatchFallback) { + "\nThe following cases have no native implementation and always run in the JVM using" + + " Spark's code-generated implementation (inside the Comet pipeline):\n\n" + } else { + "\nThe following cases are not supported by Comet and always fall back to Spark," + + " regardless of any `allowIncompatible` setting:\n\n" + } + w.write(header.getBytes) for (reason <- n.unsupportedReasons) { w.write(s"- $reason\n".getBytes) } diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index 5baf2d9bc0..cd810e81f4 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -63,9 +63,13 @@ trait 
CometExpressionSerde[T <: Expression] { def getIncompatibleReasons(): Seq[String] = Seq.empty /** - * Get documentation for usages where this expression is unsupported with Spark. This is called - * from GenerateDocs when generating the Compatibility Guide. Each reason should be written in - * Markdown and may span multiple lines. + * Get documentation for usages of this expression that Comet's native implementation does not + * support. Cases listed here normally fall back to Spark, regardless of any `allowIncompatible` + * setting. When the serde mixes in `CodegenDispatchFallback` they are instead routed through + * the JVM codegen dispatcher (Spark's own `doGenCode` inside the Comet pipeline), so they stay + * in the Comet pipeline while still matching Spark exactly. This is called from GenerateDocs + * when generating the Compatibility Guide. Each reason should be written in Markdown and may + * span multiple lines. * * @return * List of reasons, defaulting to an empty list. @@ -101,14 +105,22 @@ trait CometExpressionSerde[T <: Expression] { } /** - * Opt-in marker for expression serdes that have a native implementation which is `Incompatible` - * with Spark for some inputs. When such an expression reports `Incompatible` and the user has not - * enabled `allowIncompatible` for it, mixing in this trait routes it through the JVM codegen - * dispatcher (running Spark's own `doGenCode` inside the Comet pipeline) instead of falling the - * projection back to Spark, so it stays native while still matching Spark exactly. + * Mixin for serdes whose native implementation cannot match Spark for some inputs. When + * `getSupportLevel` returns `Incompatible` and the user has not enabled `allowIncompatible`, the + * expression routes through the JVM codegen dispatcher (Spark's own `doGenCode` inside the Comet + * pipeline) instead of falling the projection back to Spark. 
When `getSupportLevel` returns + * `Unsupported`, the expression always routes through the dispatcher -- the serde is declaring + * "no native path exists for this case; run Spark's code in-pipeline." Spark fallback is reserved + * for the case where the dispatcher itself cannot handle the expression (e.g. the global codegen + * flag is off, or the kernel rejects the bound tree). + * + * Contract for `Unsupported` reasons on a `CodegenDispatchFallback` serde: the case must be + * something `Expression.doGenCode` can compile. If you mark something `Unsupported` because Spark + * also rejects it, that is fine -- the dispatcher will surface the same error Spark would have. * * Enrollment is opt-in: only serdes that explicitly mix this in are routed through the - * dispatcher. Every other `Incompatible` expression falls back to Spark. + * dispatcher. Every other `Incompatible` expression falls back to Spark, and every other + * `Unsupported` expression falls back to Spark. */ trait CodegenDispatchFallback extends NativeOptInAvailable { self: CometExpressionSerde[_] => } diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 40d8e39dbf..5093bdaa8d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -767,8 +767,19 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { } handler.getSupportLevel(expr) match { case Unsupported(notes) => - withFallbackReason(expr, notes.getOrElse("")) - None + // `CodegenDispatchFallback` serdes have no native path for these cases either, but the + // dispatcher can still run Spark's own `doGenCode` inside the Comet pipeline. Try that + // before falling the projection back to Spark. No `[COMET-INFO]` hint here: unlike + // `Incompatible`, there is no native opt-in for the user to flip. 
+ val dispatched = handler match { + case _: CodegenDispatchFallback => + CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) + case _ => None + } + dispatched.orElse { + withFallbackReason(expr, notes.getOrElse("")) + None + } case Incompatible(notes) => val exprAllowIncompat = CometConf.isExprAllowIncompat(exprConfName) if (exprAllowIncompat) { diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index eaecd1b49a..78069642c6 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -121,8 +121,7 @@ object CometSortArray extends CometExpressionSerde[SortArray] with CodegenDispat " floating-point types is not 100% compatible with Spark") override def getUnsupportedReasons(): Seq[String] = Seq( - "Nested arrays with `Struct` or `Null` child values are not supported natively and will" + - " fall back to Spark.") + "Nested arrays with `Struct` or `Null` child values are not supported natively") private def supportedSortArrayElementType( dt: DataType, From b9c46e98f6ddbaf86d8178fc35d7e5cf01531e28 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 25 Jun 2026 12:01:47 -0600 Subject: [PATCH 2/3] refactor: extract dispatchIfFallback helper for shared dispatch routing The Unsupported and Incompatible arms both pattern-match on CodegenDispatchFallback and call emitJvmCodegenDispatch. Lift that pattern into a single helper that returns the matched handler alongside the dispatched expression, so the Incompatible arm can reach nativeOptInConfigKeyOverride for its [COMET-INFO] hint without re-matching the same value. 
--- .../apache/comet/serde/QueryPlanSerde.scala | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 5093bdaa8d..8f49f4173d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -771,12 +771,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { // dispatcher can still run Spark's own `doGenCode` inside the Comet pipeline. Try that // before falling the projection back to Spark. No `[COMET-INFO]` hint here: unlike // `Incompatible`, there is no native opt-in for the user to flip. - val dispatched = handler match { - case _: CodegenDispatchFallback => - CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) - case _ => None - } - dispatched.orElse { + dispatchIfFallback(handler, expr, inputs, binding).map(_._2).orElse { withFallbackReason(expr, notes.getOrElse("")) None } @@ -796,15 +791,12 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { // pipeline) so the projection stays native while still matching Spark. Everything else // falls back to Spark. Falling back is also the result when the dispatcher cannot // handle the expression. 
- val dispatched = handler match { - case h: CodegenDispatchFallback => - CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding).map { proto => - val key = h.nativeOptInConfigKeyOverride - .getOrElse(CometConf.getExprAllowIncompatConfigKey(exprConfName)) - withInfo(expr, NativeOptIn.message(exprConfName, key)) - proto - } - case _ => None + val dispatched = dispatchIfFallback(handler, expr, inputs, binding).map { + case (h, proto) => + val key = h.nativeOptInConfigKeyOverride + .getOrElse(CometConf.getExprAllowIncompatConfigKey(exprConfName)) + withInfo(expr, NativeOptIn.message(exprConfName, key)) + proto } dispatched.orElse { val optionalNotes = notes.map(str => s" ($str)").getOrElse("") @@ -1047,6 +1039,23 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { } + /** + * If `handler` is a `CodegenDispatchFallback`, run `expr` through the JVM codegen dispatcher + * and return `Some((handler, proto))` on success; otherwise return `None`. Shared by the + * `Unsupported` and (non-opt-in) `Incompatible` arms of `exprToProtoInternal` so they don't + * each inline the same pattern match. Returning the matched handler lets the `Incompatible` arm + * reach `nativeOptInConfigKeyOverride` without re-pattern-matching the same value. 
+ */ + private def dispatchIfFallback( + handler: CometExpressionSerde[_], + expr: Expression, + inputs: Seq[Attribute], + binding: Boolean): Option[(CodegenDispatchFallback, Expr)] = handler match { + case h: CodegenDispatchFallback => + CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding).map(h -> _) + case _ => None + } + // scalastyle:off /** * Align w/ Arrow's From 6095e4cd14612ad6507e3adc551d53fa550ab03b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 25 Jun 2026 14:42:08 -0600 Subject: [PATCH 3/3] test: update stale fallback expectations for serdes now routed through codegen dispatch Concat, SortArray, TruncDate, and TruncTimestamp now route their previously Unsupported cases through the JVM codegen dispatcher and stay native instead of falling back to Spark. Update the test expectations to assert native execution and a matching answer rather than a fallback reason. --- .../expressions/array/array_concat.sql | 13 +++++---- .../expressions/array/sort_array.sql | 12 ++++++-- .../sql-tests/expressions/string/concat.sql | 13 +++++---- .../comet/CometTemporalExpressionSuite.scala | 28 +++++++++++-------- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/spark/src/test/resources/sql-tests/expressions/array/array_concat.sql b/spark/src/test/resources/sql-tests/expressions/array/array_concat.sql index 21d83a7afb..95fc3bcbad 100644 --- a/spark/src/test/resources/sql-tests/expressions/array/array_concat.sql +++ b/spark/src/test/resources/sql-tests/expressions/array/array_concat.sql @@ -24,17 +24,20 @@ CREATE TABLE test_array_concat(c1 array, c2 array, c3 array, c4 a statement INSERT INTO test_array_concat VALUES (array(0, 1), array(2, 3), array(), array(null), null), (array(1, 2), array(3, 4), array(), array(null), null), (array(2, 3), array(4, 5), array(), array(null), null) -query expect_fallback(CONCAT supports only string input parameters) +-- Concat mixes in CodegenDispatchFallback, so non-string (array) children have no native path +-- 
but route through the JVM codegen dispatcher (Spark's own Concat.doGenCode inside the Comet +-- pipeline) and stay native while matching Spark exactly. +query SELECT concat(c1, c2) AS x FROM test_array_concat -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c1) AS x FROM test_array_concat -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c2, c3) AS x FROM test_array_concat -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c2, c3, c5) AS x FROM test_array_concat -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(concat(c1, c2, c3), concat(c1, c3)) AS x FROM test_array_concat diff --git a/spark/src/test/resources/sql-tests/expressions/array/sort_array.sql b/spark/src/test/resources/sql-tests/expressions/array/sort_array.sql index 1ced53394d..69b4472dbf 100644 --- a/spark/src/test/resources/sql-tests/expressions/array/sort_array.sql +++ b/spark/src/test/resources/sql-tests/expressions/array/sort_array.sql @@ -295,10 +295,13 @@ INSERT INTO test_sort_array_nested_struct VALUES (array()), (NULL) -query expect_fallback(Sort on array element type ArrayType(StructType(StructField(a,IntegerType) +-- SortArray mixes in CodegenDispatchFallback, so nested arrays with Struct children have no +-- native path but route through the JVM codegen dispatcher (Spark's own SortArray.doGenCode +-- inside the Comet pipeline) and stay native while matching Spark exactly. 
+query SELECT sort_array(arr) FROM test_sort_array_nested_struct -query expect_fallback(Sort on array element type ArrayType(StructType(StructField(a,IntegerType) +query SELECT sort_array(arr, false) FROM test_sort_array_nested_struct -- literal arguments @@ -391,7 +394,10 @@ SELECT sort_array(array(NULL, NULL)), sort_array(cast(NULL as array)) -query expect_fallback(Sort on array element type ArrayType(StructType(StructField(a,IntegerType) +-- nested arrays with Struct children have no native path but route through the JVM codegen +-- dispatcher (Spark's own SortArray.doGenCode inside the Comet pipeline) and stay native while +-- matching Spark exactly. +query SELECT sort_array( array( array(named_struct('a', 2)), diff --git a/spark/src/test/resources/sql-tests/expressions/string/concat.sql b/spark/src/test/resources/sql-tests/expressions/string/concat.sql index 3fa2bb65d0..0f299f66a8 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/concat.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/concat.sql @@ -57,17 +57,20 @@ SELECT concat('hello', ' ', 'world'), concat('', '', ''), concat(NULL, 'b', 'c') statement CREATE TABLE test_concat_binary USING parquet AS SELECT cast(uuid() as binary) c1, cast(uuid() as binary) c2, cast(uuid() as binary) c3, cast(uuid() as binary) c4, cast(null as binary) c5 FROM range(10) -query expect_fallback(CONCAT supports only string input parameters) +-- Concat mixes in CodegenDispatchFallback, so non-string (binary) children have no native path +-- but route through the JVM codegen dispatcher (Spark's own Concat.doGenCode inside the Comet +-- pipeline) and stay native while matching Spark exactly. 
+query SELECT concat(c1, c2) AS x FROM test_concat_binary -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c1) AS x FROM test_concat_binary -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c2, c3) AS x FROM test_concat_binary -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(c1, c2, c3, c5) AS x FROM test_concat_binary -query expect_fallback(CONCAT supports only string input parameters) +query SELECT concat(concat(c1, c2, c3), concat(c1, c3)) AS x FROM test_concat_binary diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index c8410c840d..b42aa9dd84 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -56,10 +56,11 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH checkSparkAnswerAndOperator(s"SELECT c0, trunc(c0, '$format') from tbl order by c0, c1") } for (format <- unsupportedFormats) { - // Comet should fall back to Spark for unsupported or invalid formats - checkSparkAnswerAndFallbackReason( - s"SELECT c0, trunc(c0, '$format') from tbl order by c0, c1", - s"Format $format is not supported") + // Formats outside the native set have no native path, but TruncDate mixes in + // CodegenDispatchFallback, so they route through the JVM codegen dispatcher (Spark's own + // TruncDate.doGenCode inside the Comet pipeline) and stay native while matching Spark + // exactly (an invalid format yields null in both engines). 
+ checkSparkAnswerAndOperator(s"SELECT c0, trunc(c0, '$format') from tbl order by c0, c1") } // Non-literal format strings are Incompatible on the native path, so Comet routes them @@ -80,10 +81,11 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH checkSparkAnswerAndOperator(s"SELECT c0, date_trunc('$format', c0) from tbl order by c0") } for (format <- unsupportedFormats) { - // Comet should fall back to Spark for unsupported or invalid formats - checkSparkAnswerAndFallbackReason( - s"SELECT c0, date_trunc('$format', c0) from tbl order by c0", - s"Format $format is not supported") + // Formats outside the native set have no native path, but TruncTimestamp mixes in + // CodegenDispatchFallback, so they route through the JVM codegen dispatcher (Spark's own + // TruncTimestamp.doGenCode inside the Comet pipeline) and stay native while matching + // Spark exactly (an invalid format yields null in both engines). + checkSparkAnswerAndOperator(s"SELECT c0, date_trunc('$format', c0) from tbl order by c0") } // Non-literal format strings are Incompatible on the native path, so Comet routes them // through the codegen dispatcher and still executes natively. @@ -107,10 +109,12 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH s"SELECT c0, date_trunc('$format', c0) from tbl order by c0") } for (format <- unsupportedFormats) { - // Comet should fall back to Spark for unsupported or invalid formats - checkSparkAnswerAndFallbackReason( - s"SELECT c0, date_trunc('$format', c0) from tbl order by c0", - s"Format $format is not supported") + // Formats outside the native set have no native path, but TruncTimestamp mixes in + // CodegenDispatchFallback, so they route through the JVM codegen dispatcher (Spark's + // own TruncTimestamp.doGenCode inside the Comet pipeline) and stay native while + // matching Spark exactly (an invalid format yields null in both engines). 
+ checkSparkAnswerAndOperator( + s"SELECT c0, date_trunc('$format', c0) from tbl order by c0") } // Non-literal format strings are Incompatible on the native path, so Comet routes them // through the codegen dispatcher and still executes natively.