
Commit 207d8d2

[Hudi] Fix duplicate record naming in schema conversion (#3310)
#### Which Delta project/connector is this regarding?

- [ ] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [x] Other (Hudi)

## Description

This PR fixes a bug in the Delta->Hudi schema conversion for structs. Previously, any table conversion whose schema contained at least one struct would fail, because the Avro RecordSchemas were named incorrectly. There were two problems:

1. No namespace was added to each record. For a column with the schema `(myName STRUCT<myName: STRUCT<field1: INT>>)`, the nested struct was placed in the same namespace as its parent, and since **Avro does not allow record schemas with the same name under the same namespace**, Avro schema creation failed even though the Delta schema was valid.
2. Each record was named after its data type instead of its own field name.

Since we represent the Delta schema itself as a struct, any table with at least one struct column produces a nested struct, even with no nesting inside the table. Both records would then be named "struct" under the same namespace (due to problem 1), so even a single struct of ints triggered the duplication error. For example, for a table defined as `CREATE TABLE myTable (col1 STRUCT<field1: INT, field2: STRING>)`, the previous code failed because the overall schema became a record named "struct" containing a nested record also named "struct" in the same namespace.

The conversion now uses namespaces and names each record after its column rather than its column type, which makes it work and keeps it compatible with Spark+Hudi. For this example, the generated Avro schema looks like:

```json
{
  "type": "record",
  "name": "table",
  "fields": [
    {
      "name": "col1",
      "type": [
        "null",
        {
          "type": "record",
          "name": "col1",
          "namespace": "table",
          "fields": [
            { "name": "field1", "type": ["null", "int"] },
            { "name": "field2", "type": ["null", "string"] }
          ]
        }
      ]
    }
  ]
}
```

## How was this patch tested?

Unit test and manually tested with the Hudi SparkSession reader.

## Does this PR introduce _any_ user-facing changes?

No
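The path-based naming idea behind the fix can be sketched without Spark or Avro. The following Python snippet is illustrative only: records are plain dicts rather than Avro `Schema` objects, and `to_avro` and the toy schema representation are hypothetical, but the recursion mirrors how threading `currentPath` through `transform` gives every nested record a unique fully qualified name.

```python
# Illustrative sketch (not the Avro API): names are made unique by
# threading the current path through the recursion, mirroring
# transform(elem, isNullable, currentPath) in HudiSchemaUtils.scala.

def to_avro(schema, current_path):
    """Convert a toy struct schema (dict of field name -> type) into an
    Avro-like record dict; primitives are plain type-name strings."""
    if isinstance(schema, dict):
        # The record is named after the last path segment (the column
        # name), and everything before it becomes the namespace.
        name = current_path.rsplit(".", 1)[-1]
        record = {
            "type": "record",
            "name": name,
            "fields": [
                {"name": f, "type": ["null", to_avro(t, f"{current_path}.{f}")]}
                for f, t in schema.items()
            ],
        }
        if "." in current_path:
            record["namespace"] = current_path.rsplit(".", 1)[0]
        return record
    return schema  # primitive, e.g. "int" or "string"

# A table with one struct column: before the fix, both records were
# named "struct" in the same namespace and Avro rejected the schema.
table = {"col1": {"field1": "int", "field2": "string"}}
avro = to_avro(table, "table")
nested = avro["fields"][0]["type"][1]
print(avro["name"], nested["name"], nested.get("namespace"))
# prints: table col1 table
```

With path-based names the outer record ("table") and the nested record ("col1" in namespace "table") can never collide, no matter how deeply structs are nested or how their fields are named.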
1 parent 87f0685 commit 207d8d2

File tree

2 files changed (+9, -6)


hudi/src/main/scala/org/apache/spark/sql/delta/hudi/HudiSchemaUtils.scala (+5, -4)

```diff
@@ -33,16 +33,17 @@ object HudiSchemaUtils extends DeltaLogging {
    * Recursively (i.e. for all nested elements) transforms the delta DataType `elem` into its
    * corresponding Avro type.
    */
-  def transform[E <: DataType](elem: E, isNullable: Boolean): Schema = elem match {
+  def transform[E <: DataType](elem: E, isNullable: Boolean, currentPath: String): Schema =
+    elem match {
     case StructType(fields) =>

       val avroFields: util.List[Schema.Field] = fields.map(f =>
         new Schema.Field(
           f.name,
-          transform(f.dataType, f.nullable),
+          transform(f.dataType, f.nullable, s"$currentPath.${f.name}"),
           f.getComment().orNull)).toList.asJava
       finalizeSchema(
-        Schema.createRecord(elem.typeName, null, null, false, avroFields),
+        Schema.createRecord(currentPath, null, null, false, avroFields),
         isNullable)
     // TODO: Add List and Map support: https://github.com/delta-io/delta/issues/2738
     case ArrayType(elementType, containsNull) =>
@@ -57,7 +58,7 @@ object HudiSchemaUtils extends DeltaLogging {
       throw new UnsupportedOperationException(s"Cannot convert Delta type $other to Hudi")
   }

-    transform(deltaSchema, false)
+    transform(deltaSchema, false, "root")
   }

   private def finalizeSchema(targetSchema: Schema, isNullable: Boolean): Schema = {
```

hudi/src/test/scala/org/apache/spark/sql/delta/hudi/ConvertToHudiSuite.scala (+4, -2)

```diff
@@ -203,7 +203,8 @@ class ConvertToHudiSuite extends QueryTest with Eventually {
   test("validate various data types") {
     _sparkSession.sql(
       s"""CREATE TABLE `$testTableName` (col1 BIGINT, col2 BOOLEAN, col3 DATE,
-         | col4 DOUBLE, col5 FLOAT, col6 INT, col7 STRING, col8 TIMESTAMP)
+         | col4 DOUBLE, col5 FLOAT, col6 INT, col7 STRING, col8 TIMESTAMP,
+         | col9 STRUCT<field1: INT, field2: STRING>)
          | USING DELTA
          |LOCATION '$testTablePath'
          |TBLPROPERTIES (
@@ -212,7 +213,8 @@ class ConvertToHudiSuite extends QueryTest with Eventually {
     val nowSeconds = Instant.now().getEpochSecond
     _sparkSession.sql(s"INSERT INTO `$testTableName` VALUES (123, true, "
       + s"date(from_unixtime($nowSeconds)), 32.1, 1.23, 456, 'hello world', "
-      + s"timestamp(from_unixtime($nowSeconds)))")
+      + s"timestamp(from_unixtime($nowSeconds)), "
+      + s"named_struct('field1', 789, 'field2', 'hello'))")
     verifyFilesAndSchemaMatch()
   }
```
0 commit comments
