From bf5d1622f8bc297d9e63e5a271f6cd8080cf8147 Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Wed, 17 Jun 2026 21:56:23 +0800
Subject: [PATCH 1/2] [SPARK-57506][SQL] Fix RTRIM-collation trimRight dropping
 trailing spaces for strings with supplementary characters

### What changes were proposed in this pull request?

`CollationAwareUTF8String.trimRight` (the ICU path used by RTRIM-modifier
collations) compared a Java-String (UTF-16) index against a Unicode code-point
count: `lastNonSpacePosition == srcString.numChars()`. `lastNonSpacePosition` is a
UTF-16 index into `src = srcString.toValidString()` (initialized to `src.length()`
and decremented via `src.charAt`), so the sentinel must be compared against
`src.length()`. This changes that one comparison to `src.length()`.

### Why are the changes needed?

For RTRIM-style collations, trailing spaces are ignored while matching the trim
characters but must be re-appended to the result. With the wrong unit, the "no
trailing spaces were skipped" sentinel fired spuriously when the number of
supplementary characters equaled the number of trailing spaces, dropping the
preserved trailing spaces. For example, under UNICODE_RTRIM, right-trimming a
supplementary character (such as U+1D538) followed by a single space returned an
empty string instead of the preserved space.

### Does this PR introduce _any_ user-facing change?

Yes, it fixes incorrect results. Under RTRIM-modifier collations, right-trim on a
string that contains supplementary characters followed by trailing spaces (when
the count of supplementary characters equals the count of trailing spaces) no
longer drops the preserved trailing spaces. Only previously-incorrect results are
affected.

### How was this patch tested?

Added cases to `CollationSupportSuite#testStringTrimRight` for RTRIM-modifier
collations with supplementary characters and trailing spaces: the two coincidence
cases that previously returned the wrong value, plus a BMP control, a
non-coincidence control, and the all-spaces early-return path. The coincidence
cases fail on the old code and pass with the fix.
---
 .../sql/catalyst/util/CollationAwareUTF8String.java      | 2 +-
 .../apache/spark/unsafe/types/CollationSupportSuite.java | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 2b9457c58560f..c9fee02125fd8 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -1519,7 +1519,7 @@ public static UTF8String trimRight(
     if (charIndex == src.length()) {
       return srcString;
     }
-    if (lastNonSpacePosition == srcString.numChars()) {
+    if (lastNonSpacePosition == src.length()) {
       return UTF8String.fromString(src.substring(0, charIndex));
     }
     return UTF8String.fromString(
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 1db163c1c822d..d4732efe15bf8 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -3647,6 +3647,15 @@ public void testStringTrimRight() throws SparkException {
     assertStringTrimRight(UTF8_LCASE, "𝔸", "a", "𝔸");
     assertStringTrimRight(UNICODE, "𝔸", "a", "𝔸");
     assertStringTrimRight(UNICODE_CI, "𝔸", "a", "");
+    // RTRIM-modifier collations (ICU path): trailing spaces are ignored while matching but must
+    // be re-appended afterwards. When the number of trailing spaces equals the number of
+    // supplementary code points, a Java-char-index vs code-point-count comparison previously
+    // dropped the preserved spaces.
+    assertStringTrimRight("UNICODE_RTRIM", "x ", "x", " ");
+    assertStringTrimRight("UNICODE_RTRIM", "   ", "x", "   ");
+    assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " ");
+    assertStringTrimRight("UNICODE_RTRIM", "𝔸  ", "𝔸", "  ");
+    assertStringTrimRight("UNICODE_RTRIM", "𝔸𝔸  ", "𝔸", "  ");
   }
 
   /**

From 1dad854cb3c1a08b067a126c65a659933011805b Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Thu, 18 Jun 2026 13:58:01 +0800
Subject: [PATCH 2/2] [SPARK-57506][SQL][TESTS] Cover UTF8_BINARY_RTRIM and
 UTF8_LCASE_RTRIM in testStringTrimRight

Per review feedback, add the UTF8_BINARY_RTRIM and UTF8_LCASE_RTRIM RTRIM
collations alongside UNICODE_RTRIM for each input, grouped per input. These
dispatch to binaryTrimRight / lowercaseTrimRight, which index by code point
throughout and were already correct. Also add a case-folding case where
UTF8_LCASE diverges from binary/ICU, so the lcase path can't silently regress.
---
 .../unsafe/types/CollationSupportSuite.java   | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index d4732efe15bf8..6372d7e4663c1 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -3647,15 +3647,32 @@ public void testStringTrimRight() throws SparkException {
     assertStringTrimRight(UTF8_LCASE, "𝔸", "a", "𝔸");
     assertStringTrimRight(UNICODE, "𝔸", "a", "𝔸");
     assertStringTrimRight(UNICODE_CI, "𝔸", "a", "");
-    // RTRIM-modifier collations (ICU path): trailing spaces are ignored while matching but must
-    // be re-appended afterwards. When the number of trailing spaces equals the number of
-    // supplementary code points, a Java-char-index vs code-point-count comparison previously
-    // dropped the preserved spaces.
+    // RTRIM-modifier collations ignore trailing spaces while matching the trim characters, then
+    // re-append them. The behaviour must agree across the UTF8_BINARY (binaryTrimRight),
+    // UTF8_LCASE (lowercaseTrimRight), and ICU (trimRight) paths. The supplementary-character
+    // cases below (trailing-space count == supplementary code-point count) regressed on the ICU
+    // path before SPARK-57506, which compared a Java-char index against a code-point count.
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "x ", "x", " ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "x ", "x", " ");
     assertStringTrimRight("UNICODE_RTRIM", "x ", "x", " ");
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "   ", "x", "   ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "   ", "x", "   ");
     assertStringTrimRight("UNICODE_RTRIM", "   ", "x", "   ");
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸 ", "𝔸", " ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸 ", "𝔸", " ");
     assertStringTrimRight("UNICODE_RTRIM", "𝔸 ", "𝔸", " ");
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸  ", "𝔸", "  ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸  ", "𝔸", "  ");
     assertStringTrimRight("UNICODE_RTRIM", "𝔸  ", "𝔸", "  ");
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "𝔸𝔸  ", "𝔸", "  ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "𝔸𝔸  ", "𝔸", "  ");
     assertStringTrimRight("UNICODE_RTRIM", "𝔸𝔸  ", "𝔸", "  ");
+    // Case-folding interacts with space preservation per path: only UTF8_LCASE folds B to b, so
+    // only it trims the trailing 'B' and re-appends the space; binary and (case-sensitive) ICU
+    // leave the input unchanged. This exercises the lcase space-preservation branch on its own.
+    assertStringTrimRight("UTF8_BINARY_RTRIM", "xB ", "b", "xB ");
+    assertStringTrimRight("UTF8_LCASE_RTRIM", "xB ", "b", "x ");
+    assertStringTrimRight("UNICODE_RTRIM", "xB ", "b", "xB ");
   }
 
   /**