From bf5d1622f8bc297d9e63e5a271f6cd8080cf8147 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 17 Jun 2026 21:56:23 +0800 Subject: [PATCH 1/2] [SPARK-57506][SQL] Fix RTRIM-collation trimRight dropping trailing spaces for strings with supplementary characters ### What changes were proposed in this pull request? `CollationAwareUTF8String.trimRight` (the ICU path used by RTRIM-modifier collations) compared a Java-String (UTF-16) index against a Unicode code-point count: `lastNonSpacePosition == srcString.numChars()`. `lastNonSpacePosition` is a UTF-16 index into `src = srcString.toValidString()` (initialized to `src.length()` and decremented via `src.charAt`), so the sentinel must be compared against `src.length()`. This changes that one comparison to `src.length()`. ### Why are the changes needed? For RTRIM-style collations, trailing spaces are ignored while matching the trim characters but must be re-appended to the result. With the wrong unit, the "no trailing spaces were skipped" sentinel fired spuriously when the number of supplementary characters equaled the number of trailing spaces, dropping the preserved trailing spaces. For example, under UNICODE_RTRIM, right-trimming a supplementary character (such as U+1D538) followed by a single space returned an empty string instead of the preserved space. ### Does this PR introduce _any_ user-facing change? Yes, it fixes incorrect results. Under RTRIM-modifier collations, right-trim on a string that contains supplementary characters followed by trailing spaces (when the count of supplementary characters equals the count of trailing spaces) no longer drops the preserved trailing spaces. Only previously-incorrect results are affected. ### How was this patch tested? Added cases to `CollationSupportSuite#testStringTrimRight` for RTRIM-modifier collations with supplementary characters and trailing spaces: the two coincidence cases that previously returned the wrong value, plus a BMP control, a non-coincidence control, and the all-spaces early-return path. The coincidence cases fail on the old code and pass with the fix. --- .../sql/catalyst/util/CollationAwareUTF8String.java | 2 +- .../apache/spark/unsafe/types/CollationSupportSuite.java | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 2b9457c58560f..c9fee02125fd8 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -1519,7 +1519,7 @@ public static UTF8String trimRight( if (charIndex == src.length()) { return srcString; } - if (lastNonSpacePosition == srcString.numChars()) { + if (lastNonSpacePosition == src.length()) { return UTF8String.fromString(src.substring(0, charIndex)); } return UTF8String.fromString( diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 1db163c1c822d..d4732efe15bf8 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -3647,6 +3647,15 @@ public void testStringTrimRight() throws SparkException { assertStringTrimRight(UTF8_LCASE, "𝔸", "a", "𝔸"); assertStringTrimRight(UNICODE, "𝔸", "a", "𝔸"); assertStringTrimRight(UNICODE_CI, "𝔸", "a", ""); + // RTRIM-modifier collations (ICU path): trailing spaces are ignored while matching but must + // be re-appended afterwards. When the number of trailing spaces equals the number of + // supplementary code points, a Java-char-index vs code-point-count comparison previously + // dropped the preserved spaces. + assertStringTrimRight("UNICODE_RTRIM", "x ", "x", " "); + assertStringTrimRight("UNICODE_RTRIM", " ", "x", " "); + assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UNICODE_RTRIM", "𝔸𝔸 ", "𝔸", " "); } /** From 1dad854cb3c1a08b067a126c65a659933011805b Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 18 Jun 2026 13:58:01 +0800 Subject: [PATCH 2/2] [SPARK-57506][SQL][TESTS] Cover UTF8_BINARY_RTRIM and UTF8_LCASE_RTRIM in testStringTrimRight Per review feedback, add the UTF8_BINARY_RTRIM and UTF8_LCASE_RTRIM RTRIM collations alongside UNICODE_RTRIM for each input, grouped per input. These dispatch to binaryTrimRight / lowercaseTrimRight, which index by code point throughout and were already correct. Also add a case-folding case where UTF8_LCASE diverges from binary/ICU, so the lcase path can't silently regress. --- .../unsafe/types/CollationSupportSuite.java | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index d4732efe15bf8..6372d7e4663c1 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -3647,15 +3647,32 @@ public void testStringTrimRight() throws SparkException { assertStringTrimRight(UTF8_LCASE, "𝔸", "a", "𝔸"); assertStringTrimRight(UNICODE, "𝔸", "a", "𝔸"); assertStringTrimRight(UNICODE_CI, "𝔸", "a", ""); - // RTRIM-modifier collations (ICU path): trailing spaces are ignored while matching but must - // be re-appended afterwards. When the number of trailing spaces equals the number of - // supplementary code points, a Java-char-index vs code-point-count comparison previously - // dropped the preserved spaces. + // RTRIM-modifier collations ignore trailing spaces while matching the trim characters, then + // re-append them. The behaviour must agree across the UTF8_BINARY (binaryTrimRight), + // UTF8_LCASE (lowercaseTrimRight), and ICU (trimRight) paths. The supplementary-character + // cases below (trailing-space count == supplementary code-point count) regressed on the ICU + // path before SPARK-57506, which compared a Java-char index against a code-point count. + assertStringTrimRight("UTF8_BINARY_RTRIM", "x ", "x", " "); + assertStringTrimRight("UTF8_LCASE_RTRIM", "x ", "x", " "); assertStringTrimRight("UNICODE_RTRIM", "x ", "x", " "); + assertStringTrimRight("UTF8_BINARY_RTRIM", " ", "x", " "); + assertStringTrimRight("UTF8_LCASE_RTRIM", " ", "x", " "); assertStringTrimRight("UNICODE_RTRIM", " ", "x", " "); + assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸 ", "𝔸", " "); assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸 ", "𝔸", " "); assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " "); + assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸𝔸 ", "𝔸", " "); + assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸𝔸 ", "𝔸", " "); assertStringTrimRight("UNICODE_RTRIM", "𝔸𝔸 ", "𝔸", " "); + // Case-folding interacts with space preservation per path: only UTF8_LCASE folds B to b, so + // only it trims the trailing 'B' and re-appends the space; binary and (case-sensitive) ICU + // leave the input unchanged. This exercises the lcase space-preservation branch on its own. + assertStringTrimRight("UTF8_BINARY_RTRIM", "xB ", "b", "xB "); + assertStringTrimRight("UTF8_LCASE_RTRIM", "xB ", "b", "x "); + assertStringTrimRight("UNICODE_RTRIM", "xB ", "b", "xB "); } /**