diff --git a/graalpython/com.oracle.graal.python.cext/src/unicodeobject.c b/graalpython/com.oracle.graal.python.cext/src/unicodeobject.c index 07b67dff80..36be194195 100644 --- a/graalpython/com.oracle.graal.python.cext/src/unicodeobject.c +++ b/graalpython/com.oracle.graal.python.cext/src/unicodeobject.c @@ -3709,8 +3709,7 @@ PyObject* PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) { // GraalPy change: different implementation - // TODO: this implementation does not honor Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors - return GraalPyPrivate_Unicode_FromUTF((void*) s, size, 1); + return GraalPyPrivate_Unicode_DecodeFSDefaultAndSize((void*) s, size); } diff --git a/graalpython/com.oracle.graal.python.test/src/tests/cpyext/test_unicode.py b/graalpython/com.oracle.graal.python.test/src/tests/cpyext/test_unicode.py index ed0cb4722d..d3f634b272 100644 --- a/graalpython/com.oracle.graal.python.test/src/tests/cpyext/test_unicode.py +++ b/graalpython/com.oracle.graal.python.test/src/tests/cpyext/test_unicode.py @@ -1069,9 +1069,10 @@ class TestPyUnicode(CPyExtTestCase): ) test_PyUnicode_FSDecoder = CPyExtFunction( - lambda args: str(args[0]), + lambda args: os.fsdecode(args[0]), lambda: ( (os.path.realpath(os_helper.TESTFN),), + (b"name-\xff",), ), code='''PyObject* wrap_PyUnicode_FSDecoder(PyObject* path) { PyObject* res; @@ -1089,6 +1090,51 @@ class TestPyUnicode(CPyExtTestCase): cmpfunc=unhandled_error_compare ) + test_PyUnicode_DecodeFSDefault = CPyExtFunction( + lambda args: os.fsdecode(args[0]), + lambda: ( + (b"name",), + (b"name-\xff",), + ), + code='''PyObject* wrap_PyUnicode_DecodeFSDefault(PyObject* path) { + char* data; + Py_ssize_t size; + if (PyBytes_AsStringAndSize(path, &data, &size) < 0) { + return NULL; + } + return PyUnicode_DecodeFSDefault(data); + } + ''', + resultspec="O", + argspec='O', + arguments=["PyObject* path"], + callfunction="wrap_PyUnicode_DecodeFSDefault", + cmpfunc=unhandled_error_compare + ) + + test_PyUnicode_DecodeFSDefaultAndSize = CPyExtFunction( + lambda args: os.fsdecode(args[0][:args[1]]), + lambda: ( + (b"name", 4), + (b"name-\xff", 6), + (b"name-\xff-suffix", 6), + ), + code='''PyObject* wrap_PyUnicode_DecodeFSDefaultAndSize(PyObject* path, Py_ssize_t size) { + char* data; + Py_ssize_t path_size; + if (PyBytes_AsStringAndSize(path, &data, &path_size) < 0) { + return NULL; + } + return PyUnicode_DecodeFSDefaultAndSize(data, size); + } + ''', + resultspec="O", + argspec='On', + arguments=["PyObject* path", "Py_ssize_t size"], + callfunction="wrap_PyUnicode_DecodeFSDefaultAndSize", + cmpfunc=unhandled_error_compare + ) + class TestUnicodeObject(unittest.TestCase): def test_intern(self): diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java index 4a86ec37ff..d3ff63eabb 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java @@ -74,6 +74,7 @@ import static com.oracle.graal.python.builtins.objects.cext.common.CExtCommonNodes.getByteArray; import static com.oracle.graal.python.builtins.objects.cext.structs.CStructAccess.writeLongField; import static com.oracle.graal.python.builtins.objects.cext.structs.CStructAccess.writePtrField; +import static com.oracle.graal.python.lib.PyUnicodeFSDecoderNode.SURROGATE_ESCAPE_FROM_UTF8_TRANSCODING_ERROR_HANDLER; import static com.oracle.graal.python.nodes.ErrorMessages.BAD_ARG_TYPE_FOR_BUILTIN_OP; import static com.oracle.graal.python.nodes.ErrorMessages.PRECISION_TOO_LARGE; import static com.oracle.graal.python.nodes.ErrorMessages.SEPARATOR_EXPECTED_STR_INSTANCE_P_FOUND; @@ -912,6 +913,25 @@ static Object fsDecoder(Object arg, } } + @CApiBuiltin(ret = PyObjectTransfer, args = {ConstCharPtr, Py_ssize_t}, call = Ignored, acquireGil = false) + static long GraalPyPrivate_Unicode_DecodeFSDefaultAndSize(long s, long lsize) { + // TODO: this implementation does not honor Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors + try { + int size = PInt.intValueExact(lsize); + TruffleString candidate = TruffleString.fromNativePointerUncached(s, 0, size, UTF_8, true); + TruffleString str; + if (candidate.isValidUncached(UTF_8)) { + str = candidate.switchEncodingUncached(TS_ENCODING); + } else { + str = candidate.switchEncodingUncached(TS_ENCODING, SURROGATE_ESCAPE_FROM_UTF8_TRANSCODING_ERROR_HANDLER); + } + // implicitly promotes TruffleString to PString + return PythonToNativeInternalNode.executeNewRefUncached(str); + } catch (OverflowException e) { + throw PRaiseNode.raiseStatic(null, MemoryError); + } + } + @CApiBuiltin(ret = PyObjectTransfer, args = {Pointer, Py_ssize_t, Int}, call = Ignored) abstract static class GraalPyPrivate_Unicode_FromUTF extends CApiTernaryBuiltinNode { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyUnicodeFSDecoderNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyUnicodeFSDecoderNode.java index 46ff74669c..4a9223cb65 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyUnicodeFSDecoderNode.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyUnicodeFSDecoderNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -42,6 +42,7 @@ import static com.oracle.graal.python.builtins.PythonBuiltinClassType.ValueError; import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; import com.oracle.graal.python.builtins.objects.bytes.PBytes; @@ -61,6 +62,8 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.api.strings.AbstractTruffleString; +import com.oracle.truffle.api.strings.TranscodingErrorHandler; import com.oracle.truffle.api.strings.TruffleString; import com.oracle.truffle.api.strings.TruffleString.Encoding; @@ -71,6 +74,9 @@ @GenerateUncached @GenerateInline(false) public abstract class PyUnicodeFSDecoderNode extends PNodeWithContext { + public static final TranscodingErrorHandler SURROGATE_ESCAPE_FROM_UTF8_TRANSCODING_ERROR_HANDLER = PyUnicodeFSDecoderNode::surrogateEscapeTranscodingError; + public static final TranscodingErrorHandler SURROGATE_ESCAPE_TO_UTF8_TRANSCODING_ERROR_HANDLER = PyUnicodeFSDecoderNode::surrogateEscapeToUTF8Handler; + public abstract TruffleString execute(Frame frame, Object object); @Specialization @@ -91,11 +97,17 @@ static TruffleString doPString(PString object, TruffleString doBytes(PBytes object, @CachedLibrary("object") PythonBufferAccessLibrary bufferLib, @Cached TruffleString.FromByteArrayNode fromByteArrayNode, + @Cached TruffleString.IsValidNode isValidNode, @Cached TruffleString.SwitchEncodingNode switchEncodingNode, @Shared("byteIndexOfCP") @Cached TruffleString.ByteIndexOfCodePointNode byteIndexOfCodePointNode) { - // TODO PyUnicode_DecodeFSDefault - TruffleString utf8 = fromByteArrayNode.execute(bufferLib.getCopiedByteArray(object), Encoding.UTF_8, false); - return checkString(this, switchEncodingNode.execute(utf8, TS_ENCODING), byteIndexOfCodePointNode); + TruffleString utf8 = fromByteArrayNode.execute(bufferLib.getCopiedByteArray(object), UTF_8, false); + TruffleString str; + if (isValidNode.execute(utf8, UTF_8)) { + str = switchEncodingNode.execute(utf8, TS_ENCODING); + } else { + str = switchEncodingNode.execute(utf8, TS_ENCODING, SURROGATE_ESCAPE_FROM_UTF8_TRANSCODING_ERROR_HANDLER); + } + return checkString(this, str, byteIndexOfCodePointNode); } @Fallback @@ -114,4 +126,19 @@ private static TruffleString checkString(Node raisingNode, TruffleString str, Tr } return str; } + + private static TranscodingErrorHandler.ReplacementString surrogateEscapeTranscodingError(AbstractTruffleString sourceString, int byteIndex, int estimatedByteLength, + Encoding sourceEncoding, Encoding targetEncoding) { + assert sourceEncoding == UTF_8 && targetEncoding == TS_ENCODING; + int b = sourceString.readByteUncached(byteIndex, UTF_8); + assert b >= 0x80; + return new TranscodingErrorHandler.ReplacementString(TruffleString.fromCodePointUncached(0xdc00 | b, TS_ENCODING, true), 1); + } + + private static TranscodingErrorHandler.ReplacementString surrogateEscapeToUTF8Handler(AbstractTruffleString sourceString, int byteIndex, + @SuppressWarnings("unused") int estimatedByteLength, Encoding sourceEncoding, Encoding targetEncoding) { + assert sourceEncoding == TS_ENCODING && targetEncoding == UTF_8; + int codepoint = sourceString.codePointAtByteIndexUncached(byteIndex, TS_ENCODING); + return new TranscodingErrorHandler.ReplacementString(TruffleString.fromByteArrayUncached(new byte[]{(byte) codepoint}, TruffleString.Encoding.UTF_8), 4); + } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NativePosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NativePosixSupport.java index d7b63a16b4..6b651371fb 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NativePosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NativePosixSupport.java @@ -45,6 +45,7 @@ import static com.oracle.graal.python.annotations.NativeSimpleType.SINT32; import static com.oracle.graal.python.annotations.NativeSimpleType.SINT64; import static com.oracle.graal.python.annotations.NativeSimpleType.VOID; +import static com.oracle.graal.python.lib.PyUnicodeFSDecoderNode.SURROGATE_ESCAPE_TO_UTF8_TRANSCODING_ERROR_HANDLER; import static com.oracle.graal.python.nodes.StringLiterals.T_NATIVE; import static com.oracle.graal.python.runtime.NativePosixConstants.OFFSETOF_STRUCT_IN6_ADDR_S6_ADDR; import static com.oracle.graal.python.runtime.NativePosixConstants.OFFSETOF_STRUCT_IN_ADDR_S_ADDR; @@ -94,6 +95,7 @@ import com.oracle.graal.python.annotations.PythonOS; import com.oracle.graal.python.builtins.PythonBuiltinClassType; import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; +import com.oracle.graal.python.lib.PyUnicodeFSDecoderNode; import com.oracle.graal.python.nodes.ErrorMessages; import com.oracle.graal.python.nodes.PRaiseNode; import com.oracle.graal.python.runtime.PosixSupportLibrary.AcceptResult; @@ -131,6 +133,7 @@ import com.oracle.truffle.api.TruffleSafepoint; import com.oracle.truffle.api.dsl.Bind; import com.oracle.truffle.api.dsl.Cached; +import com.oracle.truffle.api.dsl.Cached.Exclusive; import com.oracle.truffle.api.dsl.Cached.Shared; import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.library.CachedLibrary; @@ -2540,9 +2543,9 @@ public AddrInfoCursor getaddrinfo(Object node, Object service, int family, int s @ExportMessage public TruffleString crypt(TruffleString word, TruffleString salt, @Bind Node raisingNode, - @Shared("toUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingToUtf8Node, - @Shared("tsCopyBytes") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, - @Shared("cString") @Cached NativeMemory.ZeroTerminatedUtf8ToTruffleStringNode zeroTerminatedUtf8ToTruffleStringNode) throws PosixException { + @Exclusive @Cached TruffleString.SwitchEncodingNode switchEncodingToUtf8Node, + @Exclusive @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, + @Exclusive @Cached NativeMemory.ZeroTerminatedUtf8ToTruffleStringNode zeroTerminatedUtf8ToTruffleStringNode) throws PosixException { /* * From the manpage: Upon successful completion, crypt returns a pointer to a string which * encodes both the hashed passphrase, and the settings that were used to encode it. See @@ -3242,9 +3245,10 @@ private int getSysConfPwdSizeMax() throws PosixException { @ExportMessage @SuppressWarnings("static-method") public Object createPathFromString(TruffleString path, - @Shared("toUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingNode, - @Shared("tsCopyBytes") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode) { - return checkPath(getStringBytes(path, switchEncodingNode, copyToByteArrayNode)); + @Exclusive @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Exclusive @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode) { + TruffleString utf8 = switchEncodingNode.execute(path, UTF_8, SURROGATE_ESCAPE_TO_UTF8_TRANSCODING_ERROR_HANDLER); + return checkPath(copyToByteArrayNode.execute(utf8, UTF_8)); } @ExportMessage @@ -3256,15 +3260,21 @@ public Object createPathFromBytes(byte[] path) { @ExportMessage @SuppressWarnings("static-method") public TruffleString getPathAsString(Object path, - @Shared("tsFromBytes") @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Shared("fromUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { + @Exclusive @Cached TruffleString.FromByteArrayNode fromByteArrayNode, + @Exclusive @Cached TruffleString.IsValidNode isValidNode, + @Exclusive @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { Buffer result = (Buffer) path; if (result.length > Integer.MAX_VALUE) { // sanity check that it is safe to cast result.length to int, to be removed once // we support large arrays throw CompilerDirectives.shouldNotReachHere("Posix path cannot fit into a Java array"); } - return createString(result.data, 0, (int) result.length, true, fromByteArrayNode, switchEncodingNode); + int length = (int) result.length; + TruffleString utf8 = fromByteArrayNode.execute(result.data, 0, length, UTF_8, true); + if (isValidNode.execute(utf8, UTF_8)) { + return switchEncodingNode.execute(utf8, TS_ENCODING); + } + return switchEncodingNode.execute(utf8, TS_ENCODING, PyUnicodeFSDecoderNode.SURROGATE_ESCAPE_FROM_UTF8_TRANSCODING_ERROR_HANDLER); } @ExportMessage @@ -3275,13 +3285,11 @@ public Buffer getPathAsBytes(Object path) { private static TruffleString createString(byte[] src, int offset, int length, boolean copy, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.SwitchEncodingNode switchEncodingNode) { - // TODO PyUnicode_DecodeFSDefault TruffleString utf8 = fromByteArrayNode.execute(src, offset, length, UTF_8, copy); return switchEncodingNode.execute(utf8, TS_ENCODING); } private static byte[] getStringBytes(TruffleString str, TruffleString.SwitchEncodingNode switchEncodingNode, TruffleString.CopyToByteArrayNode copyToByteArrayNode) { - // TODO replace getBytes with PyUnicode_FSConverter equivalent TruffleString utf8 = switchEncodingNode.execute(str, UTF_8); byte[] bytes = new byte[utf8.byteLength(UTF_8)]; copyToByteArrayNode.execute(utf8, 0, bytes, 0, bytes.length, UTF_8); diff --git a/graalpython/lib-python/3/test/test_import/__init__.py b/graalpython/lib-python/3/test/test_import/__init__.py index aaa1bc0f1d..9f8063ccbe 100644 --- a/graalpython/lib-python/3/test/test_import/__init__.py +++ b/graalpython/lib-python/3/test/test_import/__init__.py @@ -1593,7 +1593,6 @@ def exec_module(*args): else: importlib.SourceLoader.exec_module = old_exec_module - @impl_detail("[GR-27024] [GR-23324] posix native support", graalpy=False) @unittest.skipUnless(TESTFN_UNENCODABLE, 'need TESTFN_UNENCODABLE') def test_unencodable_filename(self): # Issue #11619: The Python parser and the import machinery must not diff --git a/graalpython/lib-python/3/test/test_unicode_file.py b/graalpython/lib-python/3/test/test_unicode_file.py index 63513263e0..9025895513 100644 --- a/graalpython/lib-python/3/test/test_unicode_file.py +++ b/graalpython/lib-python/3/test/test_unicode_file.py @@ -120,13 +120,11 @@ def _test_single(self, filename): # The 'test' functions are unittest entry points, and simply call our # _test functions with each of the filename combinations we wish to test - @impl_detail("[GR-27024] [GR-23324] posix native support", graalpy=False) def test_single_files(self): self._test_single(TESTFN_UNICODE) if TESTFN_UNENCODABLE is not None: self._test_single(TESTFN_UNENCODABLE) - @impl_detail("[GR-27024] [GR-23324] posix native support", graalpy=False) def test_directories(self): # For all 'equivalent' combinations: # Make dir with encoded, chdir with unicode, checkdir with encoded