From 5bea219b6a883fa70fd7eab56f0f4a1cb973f89c Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Wed, 17 Jun 2026 22:54:07 +0800
Subject: [PATCH] [SPARK-57507][SQL] Fix UTF8String.reverse reading past the
 end of a truncated trailing UTF-8 sequence

### What changes were proposed in this pull request?

`UTF8String.reverse()` reverses a string one UTF-8 character at a time, using
`numBytesForFirstByte(getByte(i))` to determine the width of the character at byte
position `i`. The per-character copy length was clamped to `numBytes` (the total
length) instead of `numBytes - i` (the bytes that actually remain). When the last
bytes of the string are a truncated multi-byte sequence (a leader byte whose
declared width exceeds the remaining bytes), `copyMemory` read past the end of the
string into adjacent memory. This changes the clamp to `numBytes - i`.

### Why are the changes needed?

`UTF8String` can hold malformed UTF-8 (for example, bytes produced by binary
coercion or truncated input). For such a string ending in an incomplete multi-byte
sequence, `reverse()` performed an out-of-bounds read and produced a wrong result.
Every other byte scan in the class already bounds by the remaining bytes; this
brings `reverse()` in line. Well-formed UTF-8 is unaffected, since a complete
sequence never exceeds the remaining bytes.

### Does this PR introduce _any_ user-facing change?

Yes, it fixes incorrect results. The SQL `reverse()` function on a string value
that contains malformed UTF-8 ending in a truncated multi-byte sequence no longer
reads past the end of the value; only previously-incorrect results change.

### How was this patch tested?

Added cases to `UTF8StringSuite#reverse()` for truncated trailing 2-, 3-, and
4-byte leaders, and for a complete multi-byte character followed by an orphan
leader. Each uses a sliced backing array with a trailing sentinel byte so the
previous over-read produces a deterministically wrong value; the cases fail on the
old code and pass with the fix.
---
 .../apache/spark/unsafe/types/UTF8String.java |  2 +-
 .../spark/unsafe/types/UTF8StringSuite.java   | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index ab57e308ee4d7..06ec4af7a5f27 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -1157,7 +1157,7 @@ public UTF8String reverse() {
 
     int i = 0; // position in byte
     while (i < numBytes) {
-      int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes);
+      int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes - i);
       int targetOffset = Math.max(result.length - i - len, 0);
       copyMemory(this.base, this.offset + i, result,
         BYTE_ARRAY_OFFSET + targetOffset, len);
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 26b96155377e8..a9f365173be64 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -325,6 +325,30 @@ public void reverse() {
     assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
     assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
     assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
+    // Malformed UTF-8: a truncated trailing multi-byte sequence must be reversed as orphan
+    // bytes without reading past the end of the string. The backing arrays carry an extra
+    // trailing byte so a regression that over-reads would produce a deterministically wrong
+    // result rather than reading uninitialized memory.
+    // 'A' followed by an incomplete 2-byte leader (0xCE).
+    byte[] truncated2 = new byte[]{0x41, (byte) 0xCE, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xCE, 0x41}),
+      fromBytes(truncated2, 0, 2).reverse());
+    // 'A' followed by an incomplete 3-byte leader (0xE4 0xB8).
+    byte[] truncated3 = new byte[]{0x41, (byte) 0xE4, (byte) 0xB8, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xE4, (byte) 0xB8, 0x41}),
+      fromBytes(truncated3, 0, 3).reverse());
+    // 'A' followed by an incomplete 4-byte leader (0xF0 0x90).
+    byte[] truncated4 = new byte[]{0x41, (byte) 0xF0, (byte) 0x90, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xF0, (byte) 0x90, 0x41}),
+      fromBytes(truncated4, 0, 3).reverse());
+    // A complete 3-byte character (U+4E16) followed by an incomplete 2-byte leader (0xCE).
+    byte[] truncatedMid = new byte[]{(byte) 0xE4, (byte) 0xB8, (byte) 0x96, (byte) 0xCE, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xCE, (byte) 0xE4, (byte) 0xB8, (byte) 0x96}),
+      fromBytes(truncatedMid, 0, 4).reverse());
   }
 
   @Test