diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DataWriteContextImpl.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DataWriteContextImpl.java
index 02f5cebeeb9..0d0cef56137 100644
--- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DataWriteContextImpl.java
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DataWriteContextImpl.java
@@ -24,6 +24,8 @@
 import io.delta.kernel.expressions.Column;
 import io.delta.kernel.expressions.Literal;
 
+import io.delta.kernel.internal.fs.Path;
+
 /**
  * Implements the {@link DataWriteContext} interface. In addition to the data needed for the
  * interface, it also contains the partition values of the targeted partition. In case of
@@ -59,7 +61,9 @@ public DataWriteContextImpl(
      * @return fully qualified path of the target directory
      */
    public String getTargetDirectory() {
-        return targetDirectory;
+        // TODO: this is temporary until paths are uniform (i.e. they are actually file system paths
+        // or URIs everywhere, but not a combination of the two).
+        return new Path(targetDirectory).toUri().toString();
    }
 
    /**
diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddFile.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddFile.java
index 603ccaf357e..cdfa6eb93a8 100644
--- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddFile.java
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddFile.java
@@ -86,7 +86,7 @@ public static Row convertDataFileStatus(
         Path filePath = new Path(dataFileStatus.getPath());
         Map<Integer, Object> valueMap = new HashMap<>();
         valueMap.put(COL_NAME_TO_ORDINAL.get("path"),
-            relativizePath(filePath, tableRoot).toString());
+            relativizePath(filePath, tableRoot).toUri().toString());
         valueMap.put(COL_NAME_TO_ORDINAL.get("partitionValues"),
             serializePartitionMap(partitionValues));
         valueMap.put(COL_NAME_TO_ORDINAL.get("size"), dataFileStatus.getSize());
diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/PartitionUtils.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/PartitionUtils.java
index b0337565334..5c39171e58f 100644
--- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/PartitionUtils.java
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/PartitionUtils.java
@@ -15,9 +15,7 @@
  */
 package io.delta.kernel.internal.util;
 
-import java.io.UnsupportedEncodingException;
 import java.math.BigDecimal;
-import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.sql.Date;
 import java.sql.Timestamp;
@@ -360,12 +358,7 @@ public static String getTargetDirectory(
                 // Follow the delta-spark behavior to use "__HIVE_DEFAULT_PARTITION__" for null
                 serializedValue = "__HIVE_DEFAULT_PARTITION__";
             } else {
-                try {
-                    serializedValue = URLEncoder.encode(serializedValue, "UTF-8");
-                } catch (UnsupportedEncodingException e) {
-                    throw new RuntimeException(
-                        "Failed to encode partition value: " + serializedValue, e);
-                }
+                serializedValue = escapePartitionValue(serializedValue);
             }
             String partitionDirectory = partitionColName + "=" + serializedValue;
             targetDirectory = new Path(targetDirectory, partitionDirectory);
@@ -509,4 +502,49 @@ protected static String serializePartitionValue(Literal literal) {
         }
         throw new UnsupportedOperationException("Unsupported partition column type: " + dataType);
     }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // The following string escaping code is mainly copied from Spark                            //
+    // (org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils) which is copied from         //
+    // Hive (o.a.h.h.common.FileUtils).                                                          //
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    private static final BitSet CHARS_TO_ESCAPE = new BitSet(128);
+
+    static {
+        // ASCII 01-1F are HTTP control characters that need to be escaped.
+        char[] controlChars = new char[] {
+            '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\b',
+            '\t', '\n', '\u000B', '\f', '\r', '\u000E', '\u000F', '\u0010', '\u0011',
+            '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019',
+            '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'',
+            '*', '/', ':', '=', '?', '\\', '\u007F', '{', '[', ']', '^'
+        };
+
+        for (char c : controlChars) {
+            CHARS_TO_ESCAPE.set(c);
+        }
+    }
+
+    /**
+     * Escapes the given string so that it can be used as a partition value in a path. It escapes:
+     * - characters that can't appear in a file path, e.g. `a\nb` is escaped to `a%0Ab`;
+     * - characters that cause ambiguity when parsing partition values, e.g. for partition
+     *   column `a` with value `b=c`, the path segment should be `a=b%3Dc`.
+     *
+     * @param value The partition value to escape.
+     * @return The escaped partition value.
+     */
+    private static String escapePartitionValue(String value) {
+        StringBuilder escaped = new StringBuilder(value.length());
+        for (int i = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            if (c >= 0 && c < CHARS_TO_ESCAPE.size() && CHARS_TO_ESCAPE.get(c)) {
+                escaped.append('%');
+                escaped.append(String.format("%02X", (int) c));
+            } else {
+                escaped.append(c);
+            }
+        }
+        return escaped.toString();
+    }
 }
diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/util/PartitionUtilsSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/util/PartitionUtilsSuite.scala
index 32ff85426b5..dd44c0a6bf8 100644
--- a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/util/PartitionUtilsSuite.scala
+++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/util/PartitionUtilsSuite.scala
@@ -228,6 +228,8 @@ class PartitionUtilsSuite extends AnyFunSuite {
     ofDouble(23423.422233d) -> ("23423.422233", "23423.422233"),
     ofNull(DoubleType.DOUBLE) -> (null, nullFileName),
     ofString("string_val") -> ("string_val", "string_val"),
+    ofString("string_\nval") -> ("string_\nval", "string_%0Aval"),
+    ofString("str=ing_\u0001val") -> ("str=ing_\u0001val", "str%3Ding_%01val"),
     ofNull(StringType.STRING) -> (null, nullFileName),
     ofDecimal(new java.math.BigDecimal("23423.234234"), 15, 7) ->
       ("23423.2342340", "23423.2342340"),
@@ -237,10 +239,10 @@
     ofDate(4234) -> ("1981-08-05", "1981-08-05"),
     ofNull(DateType.DATE) -> (null, nullFileName),
     ofTimestamp(2342342342232L) ->
-      ("1970-01-28 02:39:02.342232", "1970-01-28+02%3A39%3A02.342232"),
+      ("1970-01-28 02:39:02.342232", "1970-01-28 02%3A39%3A02.342232"),
     ofNull(TimestampType.TIMESTAMP) -> (null, nullFileName),
     ofTimestampNtz(-2342342342L) ->
-      ("1969-12-31 23:20:58.657658", "1969-12-31+23%3A20%3A58.657658"),
+      ("1969-12-31 23:20:58.657658", "1969-12-31 23%3A20%3A58.657658"),
     ofNull(TimestampNTZType.TIMESTAMP_NTZ) -> (null, nullFileName)
   ).foreach { case (literal, (expSerializedValue, expFileName)) =>
     test(s"serialize partition value literal as string: ${literal.getDataType}($literal)") {
@@ -264,7 +266,7 @@
       Map("part1" -> ofInt(12),
         "part3" -> ofTimestamp(234234234L),
         "part2" -> ofString("sss")).asJava)
-    assert(result === "/tmp/root/part1=12/part2=sss/part3=1970-01-01+00%3A03%3A54.234234")
+    assert(result === "/tmp/root/part1=12/part2=sss/part3=1970-01-01 00%3A03%3A54.234234")
   }
 
   private def col(names: String*): Column = {