Skip to content

Commit

Permalink
[Kernel] Fix the partition path construction (#3290)
Browse files Browse the repository at this point in the history
## Description
The current code does not escape the control + special characters in
partition values when constructing a path for writing the data related
to the partition. Not escaping these characters could cause invalid path
issues.

The escaping logic is similar to what Spark and Hive do.

## How was this patch tested?
Unit tests.
  • Loading branch information
vkorukanti authored Jun 24, 2024
1 parent 864b5e3 commit 5ea073b
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import io.delta.kernel.expressions.Column;
import io.delta.kernel.expressions.Literal;

import io.delta.kernel.internal.fs.Path;

/**
* Implements the {@link DataWriteContext} interface. In addition to the data needed for the
* interface, it also contains the partition values of the targeted partition. In case of
Expand Down Expand Up @@ -59,7 +61,9 @@ public DataWriteContextImpl(
* @return fully qualified path of the target directory
*/
public String getTargetDirectory() {
return targetDirectory;
// TODO: this is temporary until paths are uniform (i.e. they are actually file system paths
// or URIs everywhere, but not a combination of the two).
return new Path(targetDirectory).toUri().toString();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public static Row convertDataFileStatus(
Path filePath = new Path(dataFileStatus.getPath());
Map<Integer, Object> valueMap = new HashMap<>();
valueMap.put(COL_NAME_TO_ORDINAL.get("path"),
relativizePath(filePath, tableRoot).toString());
relativizePath(filePath, tableRoot).toUri().toString());
valueMap.put(COL_NAME_TO_ORDINAL.get("partitionValues"),
serializePartitionMap(partitionValues));
valueMap.put(COL_NAME_TO_ORDINAL.get("size"), dataFileStatus.getSize());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
*/
package io.delta.kernel.internal.util;

import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.sql.Timestamp;
Expand Down Expand Up @@ -360,12 +358,7 @@ public static String getTargetDirectory(
// Follow the delta-spark behavior to use "__HIVE_DEFAULT_PARTITION__" for null
serializedValue = "__HIVE_DEFAULT_PARTITION__";
} else {
try {
serializedValue = URLEncoder.encode(serializedValue, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(
"Failed to encode partition value: " + serializedValue, e);
}
serializedValue = escapePartitionValue(serializedValue);
}
String partitionDirectory = partitionColName + "=" + serializedValue;
targetDirectory = new Path(targetDirectory, partitionDirectory);
Expand Down Expand Up @@ -509,4 +502,49 @@ protected static String serializePartitionValue(Literal literal) {
}
throw new UnsupportedOperationException("Unsupported partition column type: " + dataType);
}

////////////////////////////////////////////////////////////////////////////////////////////////
// The following string escaping code is mainly copied from Spark //
// (org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils) which is copied from //
// Hive (o.a.h.h.common.FileUtils). //
////////////////////////////////////////////////////////////////////////////////////////////////
private static final BitSet CHARS_TO_ESCAPE = new BitSet(128);

static {
// ASCII 01-1F are HTTP control characters that need to be escaped.
char[] controlChars = new char[] {
'\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\b',
'\t', '\n', '\u000B', '\f', '\r', '\u000E', '\u000F', '\u0010', '\u0011',
'\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019',
'\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'',
'*', '/', ':', '=', '?', '\\', '\u007F', '{', '[', ']', '^'
};

for (char c : controlChars) {
CHARS_TO_ESCAPE.set(c);
}
}

/**
* Escapes the given string to be used as a partition value in the path. Basically this escapes
* - characters that can't be in a file path. E.g. `a\nb` will be escaped to `a%0Ab`. -
* character that are cause ambiguity in partition value parsing. E.g. For partition column `a`
* having value `b=c`, the path should be `a=b%3Dc`.
*
* @param value The partition value to escape.
* @return The escaped partition value.
*/
private static String escapePartitionValue(String value) {
StringBuilder escaped = new StringBuilder(value.length());
for (int i = 0; i < value.length(); i++) {
char c = value.charAt(i);
if (c >= 0 && c < CHARS_TO_ESCAPE.size() && CHARS_TO_ESCAPE.get(c)) {
escaped.append('%');
escaped.append(String.format("%02X", (int) c));
} else {
escaped.append(c);
}
}
return escaped.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ class PartitionUtilsSuite extends AnyFunSuite {
ofDouble(23423.422233d) -> ("23423.422233", "23423.422233"),
ofNull(DoubleType.DOUBLE) -> (null, nullFileName),
ofString("string_val") -> ("string_val", "string_val"),
ofString("string_\nval") -> ("string_\nval", "string_%0Aval"),
ofString("str=ing_\u0001val") -> ("str=ing_\u0001val", "str%3Ding_%01val"),
ofNull(StringType.STRING) -> (null, nullFileName),
ofDecimal(new java.math.BigDecimal("23423.234234"), 15, 7) ->
("23423.2342340", "23423.2342340"),
Expand All @@ -237,10 +239,10 @@ class PartitionUtilsSuite extends AnyFunSuite {
ofDate(4234) -> ("1981-08-05", "1981-08-05"),
ofNull(DateType.DATE) -> (null, nullFileName),
ofTimestamp(2342342342232L) ->
("1970-01-28 02:39:02.342232", "1970-01-28+02%3A39%3A02.342232"),
("1970-01-28 02:39:02.342232", "1970-01-28 02%3A39%3A02.342232"),
ofNull(TimestampType.TIMESTAMP) -> (null, nullFileName),
ofTimestampNtz(-2342342342L) ->
("1969-12-31 23:20:58.657658", "1969-12-31+23%3A20%3A58.657658"),
("1969-12-31 23:20:58.657658", "1969-12-31 23%3A20%3A58.657658"),
ofNull(TimestampNTZType.TIMESTAMP_NTZ) -> (null, nullFileName)
).foreach { case (literal, (expSerializedValue, expFileName)) =>
test(s"serialize partition value literal as string: ${literal.getDataType}($literal)") {
Expand All @@ -264,7 +266,7 @@ class PartitionUtilsSuite extends AnyFunSuite {
Map("part1" -> ofInt(12),
"part3" -> ofTimestamp(234234234L),
"part2" -> ofString("sss")).asJava)
assert(result === "/tmp/root/part1=12/part2=sss/part3=1970-01-01+00%3A03%3A54.234234")
assert(result === "/tmp/root/part1=12/part2=sss/part3=1970-01-01 00%3A03%3A54.234234")
}

private def col(names: String*): Column = {
Expand Down

0 comments on commit 5ea073b

Please sign in to comment.