Commit f5c35cd

Merge pull request delta-io#15 from delta-io/master
update fork
2 parents (5717ee3 + caa7a39), commit f5c35cd

14 files changed, +160 -53 lines
.github/workflows/test.yaml (-1)

@@ -28,7 +28,6 @@ jobs:
           # cache new stuff.
           key: delta-sbt-cache-spark3.2-scala${{ matrix.scala }}
       - name: Install Job dependencies
-        shell: bash -l {0}
        run: |
          sudo apt-get update
          sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git

PROTOCOL.md (+12 -10)

@@ -164,18 +164,20 @@ Subsequent `metaData` actions completely overwrite the current metadata of the table.
 
 There can be at most one metadata action in a given version of the table.
 
+Every metadata action **must** include required fields at a minimum.
+
 The schema of the `metaData` action is as follows:
 
-Field Name | Data Type | Description
--|-|-
-id|`GUID`|Unique identifier for this table
-name|`String`| User-provided identifier for this table
-description|`String`| User-provided description for this table
-format|[Format Struct](#Format-Specification)| Specification of the encoding for the files stored in the table
-schemaString|[Schema Struct](#Schema-Serialization-Format)| Schema of the table
-partitionColumns|`Array[String]`| An array containing the names of columns by which the data should be partitioned
-createdTime|`Option[Long]`| The time when this metadata action is created, in milliseconds since the Unix epoch
-configuration|`Map[String, String]`| A map containing configuration options for the metadata action
+Field Name | Data Type | Description | optional/required
+-|-|-|-
+id|`GUID`|Unique identifier for this table | required
+name|`String`| User-provided identifier for this table | optional
+description|`String`| User-provided description for this table | optional
+format|[Format Struct](#Format-Specification)| Specification of the encoding for the files stored in the table | required
+schemaString|[Schema Struct](#Schema-Serialization-Format)| Schema of the table | required
+partitionColumns|`Array[String]`| An array containing the names of columns by which the data should be partitioned | required
+createdTime|`Option[Long]`| The time when this metadata action is created, in milliseconds since the Unix epoch | optional
+configuration|`Map[String, String]`| A map containing configuration options for the metadata action | required
 
 #### Format Specification
 Field Name | Data Type | Description
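
To make the required/optional split concrete, here is a hedged sketch (not taken from the spec or from this commit) of a metaData action carrying only the required fields, written as a small self-contained Scala model. The field names follow the table above; the case-class defaults and the example schema string are illustrative assumptions, not the Delta classes themselves.

import java.util.UUID

// Illustrative stand-ins for the protocol structs described above (not the Delta classes).
case class Format(provider: String = "parquet", options: Map[String, String] = Map.empty)

case class Metadata(
    id: String,                                      // required
    name: Option[String] = None,                     // optional
    description: Option[String] = None,              // optional
    format: Format = Format(),                       // required
    schemaString: String,                            // required
    partitionColumns: Seq[String] = Nil,             // required (may be empty)
    createdTime: Option[Long] = None,                // optional
    configuration: Map[String, String] = Map.empty)  // required (may be empty)

// A minimal metaData payload: only the required fields carry meaningful values.
val minimalMetadata = Metadata(
  id = UUID.randomUUID().toString,
  schemaString = """{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}""")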

core/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala (+34 -11)

@@ -41,6 +41,7 @@ import org.apache.spark.sql.functions.{col, struct, when}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.util.Utils
 
 /**
  * Records information about a checkpoint.
@@ -129,21 +130,43 @@ trait Checkpoints extends DeltaLogging {
 
   /**
    * Creates a checkpoint using snapshotToCheckpoint. By default it uses the current log version.
+   * Note that this function captures and logs all exceptions, since the checkpoint shouldn't fail
+   * the overall commit operation.
    */
   def checkpoint(snapshotToCheckpoint: Snapshot): Unit =
     recordDeltaOperation(this, "delta.checkpoint") {
-      if (snapshotToCheckpoint.version < 0) {
-        throw DeltaErrors.checkpointNonExistTable(dataPath)
+      try {
+        if (snapshotToCheckpoint.version < 0) {
+          throw DeltaErrors.checkpointNonExistTable(dataPath)
+        }
+        val checkpointMetaData = writeCheckpointFiles(snapshotToCheckpoint)
+        val json = JsonUtils.toJson(checkpointMetaData)
+        store.write(
+          LAST_CHECKPOINT,
+          Iterator(json),
+          overwrite = true,
+          newDeltaHadoopConf())
+
+        doLogCleanup()
+      } catch {
+        // Catch all non-fatal exceptions, since the checkpoint is written after the commit
+        // has completed. From the perspective of the user, the commit completed successfully.
+        // However, throw if this is in a testing environment - that way any breaking changes
+        // can be caught in unit tests.
+        case NonFatal(e) =>
+          recordDeltaEvent(
+            snapshotToCheckpoint.deltaLog,
+            "delta.checkpoint.sync.error",
+            data = Map(
+              "exception" -> e.getMessage(),
+              "stackTrace" -> e.getStackTrace()
+            )
+          )
+          logWarning(s"Error when writing checkpoint synchronously", e)
+          if (Utils.isTesting) {
+            throw e
+          }
       }
-      val checkpointMetaData = writeCheckpointFiles(snapshotToCheckpoint)
-      val json = JsonUtils.toJson(checkpointMetaData)
-      store.write(
-        LAST_CHECKPOINT,
-        Iterator(json),
-        overwrite = true,
-        newDeltaHadoopConf())
-
-      doLogCleanup()
     }
 
   protected def writeCheckpointFiles(snapshotToCheckpoint: Snapshot): CheckpointMetaData = {
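
The pattern introduced here (swallow non-fatal errors after a successful commit, but rethrow under test so regressions still fail CI) can be read in isolation. Below is a minimal, self-contained sketch of the same idea; `bestEffort` and the checkpoint placeholder are hypothetical names, and `isTesting` is a stand-in for what `org.apache.spark.util.Utils.isTesting` roughly checks.

import scala.util.control.NonFatal

object BestEffortExample extends App {
  // Stand-in for org.apache.spark.util.Utils.isTesting (assumption: it keys off the
  // SPARK_TESTING environment variable / the spark.testing system property).
  def isTesting: Boolean =
    sys.env.contains("SPARK_TESTING") || sys.props.contains("spark.testing")

  // Run a post-commit step on a best-effort basis: log and move on in production,
  // rethrow in tests so breaking changes are still caught by unit tests.
  def bestEffort(stepName: String)(body: => Unit): Unit = {
    try {
      body
    } catch {
      case NonFatal(e) =>
        Console.err.println(s"Best-effort step '$stepName' failed: ${e.getMessage}")
        if (isTesting) throw e
    }
  }

  // The checkpoint write happens after the commit has already succeeded,
  // so a failure here should not fail the user's write.
  bestEffort("checkpoint") {
    throw new IllegalStateException("simulated checkpoint failure") // logged, not rethrown
  }
}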

core/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala (+32 -8)

@@ -975,29 +975,53 @@ object DeltaErrors
     new AnalysisException("Cannot describe the history of a view.")
   }
 
-  def copyIntoEncryptionOnlyS3(scheme: String): Throwable = {
+  def copyIntoEncryptionNotAllowedOn(scheme: String): Throwable = {
+    // TODO: add `wasbs` once supported
     new IllegalArgumentException(
-      s"Invalid scheme $scheme. COPY INTO source encryption is only supported for S3 paths.")
+      s"Invalid scheme $scheme. " +
+        s"COPY INTO source encryption currently only supports s3/s3n/s3a/abfss.")
   }
 
   def copyIntoEncryptionSseCRequired(): Throwable = {
     new IllegalArgumentException(
-      s"Invalid encryption type. COPY INTO source encryption must specify 'type' = 'SSE-C'.")
+      s"Invalid encryption type. COPY INTO source encryption must specify 'TYPE' = 'AWS_SSE_C'.")
   }
 
   def copyIntoEncryptionMasterKeyRequired(): Throwable = {
     new IllegalArgumentException(
-      s"Invalid encryption arguments. COPY INTO source encryption must specify a masterKey.")
+      s"Invalid encryption arguments. COPY INTO source encryption must specify a MASTER_KEY.")
   }
 
-  def copyIntoCredentialsOnlyS3(scheme: String): Throwable = {
+  def copyIntoCredentialsNotAllowedOn(scheme: String): Throwable = {
+    new IllegalArgumentException(
+      s"Invalid scheme $scheme. " +
+        s"COPY INTO source encryption currently only supports s3/s3n/s3a/wasbs/abfss.")
+  }
+
+  def copyIntoCredentialsAllRequiredForS3(cause: Throwable): Throwable = {
+    new IllegalArgumentException(
+      "COPY INTO credentials must include AWS_ACCESS_KEY, AWS_SECRET_KEY, and AWS_SESSION_TOKEN.",
+      cause)
+  }
+
+  def copyIntoEncryptionRequiredForAzure(key: String, value: Option[String] = None): Throwable = {
     new IllegalArgumentException(
-      s"Invalid scheme $scheme. COPY INTO source credentials are only supported for S3 paths.")
+      if (value.nonEmpty) {
+        s"Invalid encryption option $key. " +
+          s"COPY INTO source encryption must specify '$key' = '${value.get}'."
+      } else {
+        s"COPY INTO source encryption must specify '$key'."
+      }
+    )
   }
 
-  def copyIntoCredentialsAllRequired(cause: Throwable): Throwable = {
+  def copyIntoEncryptionNotSupportedForAzure: Throwable = {
     new IllegalArgumentException(
-      "COPY INTO credentials must include awsKeyId, awsSecretKey, and awsSessionToken.", cause)
+      "COPY INTO encryption only supports ADLS Gen2, or abfss:// file scheme")
+  }
+
+  def copyIntoCredentialsRequiredForAzure(key: String): Throwable = {
+    new IllegalArgumentException(s"COPY INTO source credentials must specify '$key'.")
   }
 
   def postCommitHookFailedException(
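
These helpers now cover Azure (abfss/wasbs) schemes as well as S3. The call sites are not part of this diff, so the following is only a hedged sketch of how a caller might map a source URI scheme onto the new error helpers; the object and method names here are assumptions, while the scheme lists mirror the error messages above.

import org.apache.spark.sql.delta.DeltaErrors

// Hypothetical validation helpers: reject schemes that the error messages above
// describe as unsupported for COPY INTO credentials / encryption.
object CopyIntoSchemeChecks {
  private val credentialSchemes = Set("s3", "s3n", "s3a", "wasbs", "abfss")
  private val encryptionSchemes = Set("s3", "s3n", "s3a", "abfss") // wasbs is a TODO per the diff

  def checkCredentialScheme(scheme: String): Unit =
    if (!credentialSchemes.contains(scheme)) {
      throw DeltaErrors.copyIntoCredentialsNotAllowedOn(scheme)
    }

  def checkEncryptionScheme(scheme: String): Unit =
    if (!encryptionSchemes.contains(scheme)) {
      throw DeltaErrors.copyIntoEncryptionNotAllowedOn(scheme)
    }
}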

core/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala (+7 -10)

@@ -69,7 +69,8 @@ case class CommitStats(
   isolationLevel: String,
   fileSizeHistogram: Option[FileSizeHistogram] = None,
   addFilesHistogram: Option[FileSizeHistogram] = None,
-  removeFilesHistogram: Option[FileSizeHistogram] = None
+  removeFilesHistogram: Option[FileSizeHistogram] = None,
+  txnId: Option[String] = None
 )
 
 /**
@@ -638,14 +639,9 @@ trait OptimisticTransactionImpl extends TransactionalWrite with SQLMetricsReport
   protected def postCommit(commitVersion: Long): Unit = {
     committed = true
     if (shouldCheckpoint(commitVersion)) {
-      try {
-        // We checkpoint the version to be committed to so that no two transactions will checkpoint
-        // the same version.
-        deltaLog.checkpoint(deltaLog.getSnapshotAt(commitVersion))
-      } catch {
-        case e: IllegalStateException =>
-          logWarning("Failed to checkpoint table state.", e)
-      }
+      // We checkpoint the version to be committed to so that no two transactions will checkpoint
+      // the same version.
+      deltaLog.checkpoint(deltaLog.getSnapshotAt(commitVersion))
    }
  }
 
@@ -785,7 +781,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite with SQLMetricsReport
       newMetadata = newMetadata,
       numAbsolutePathsInAdd = numAbsolutePaths,
       numDistinctPartitionsInAdd = distinctPartitions.size,
-      isolationLevel = isolationLevel.toString)
+      isolationLevel = isolationLevel.toString,
+      txnId = Some(txnId))
     recordDeltaEvent(deltaLog, "delta.commit.stats", data = stats)
 
     attemptVersion

core/src/main/scala/org/apache/spark/sql/delta/PreprocessTableMerge.scala (+1)

@@ -177,6 +177,7 @@ case class PreprocessTableMerge(override val conf: SQLConf)
       }
     }
 
+
     val targetColNames = m.resolvedActions.map(_.targetColNameParts.head)
     if (targetColNames.distinct.size < targetColNames.size) {
       throw new AnalysisException(s"Duplicate column names in INSERT clause")

core/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala (+14)

@@ -254,6 +254,9 @@ case class AddFile(
       .getOrElse(TimeUnit.MICROSECONDS.convert(modificationTime, TimeUnit.MILLISECONDS).toString)
       .toLong
 
+  @JsonIgnore
+  lazy val numAutoCompactions: Int = tag(AddFile.Tags.NUM_AUTO_COMPACTIONS).getOrElse("0").toInt
+
   def tag(tag: AddFile.Tags.KeyType): Option[String] =
     Option(tags).getOrElse(Map.empty).get(tag.name)
 
@@ -297,6 +300,17 @@ object AddFile {
 
     /** [[OPTIMIZE_TARGET_SIZE]]: target file size the file was optimized to. */
     object OPTIMIZE_TARGET_SIZE extends AddFile.Tags.KeyType("OPTIMIZE_TARGET_SIZE")
+
+    /**
+     * [[NUM_AUTO_COMPACTIONS]]: the number of times Auto Compaction has been applied to the
+     * contents of a file.
+     *
+     * Note: the externally visible tag value is 'NUM_AUTO_OPTIMIZES', since Compaction is one
+     * kind of Optimize command; the generic name hides that detail and can cover optimizations
+     * other than Compaction. 'NUM_AUTO_COMPACTIONS' is the internal key name for now, since
+     * only Auto Compaction currently uses it.
+     */
+    object NUM_AUTO_COMPACTIONS extends AddFile.Tags.KeyType("NUM_AUTO_OPTIMIZES")
   }
 
   /** Convert a [[Tags.KeyType]] to a string to be used in the AddMap.tags Map[String, String]. */
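
A hedged usage sketch of the new tag follows. `tag` and `numAutoCompactions` are exactly the members shown in the diff; the `AddFile` constructor arguments are written from memory of the case class and are illustrative only.

import org.apache.spark.sql.delta.actions.AddFile

// Build an AddFile carrying the external tag key ("NUM_AUTO_OPTIMIZES", per the
// NUM_AUTO_COMPACTIONS definition above). Constructor field names are assumed.
val file = AddFile(
  path = "part-00000-abc.snappy.parquet",
  partitionValues = Map.empty,
  size = 1024L,
  modificationTime = System.currentTimeMillis(),
  dataChange = true,
  tags = Map("NUM_AUTO_OPTIMIZES" -> "2"))

// Both accessors read the same tag; the lazy val falls back to 0 when the tag is absent.
assert(file.tag(AddFile.Tags.NUM_AUTO_COMPACTIONS) == Some("2"))
assert(file.numAutoCompactions == 2)
assert(file.copy(tags = null).numAutoCompactions == 0)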

core/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala (+2)

@@ -18,6 +18,7 @@ package org.apache.spark.sql.delta.catalog
 
 import java.util
 import java.util.Locale
+
 // scalastyle:off import.ordering.noEmptyLine
 
 import scala.collection.JavaConverters._
@@ -43,6 +44,7 @@ import org.apache.spark.sql.connector.catalog.TableCapability._
 import org.apache.spark.sql.connector.catalog.TableChange._
 import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, Transform}
 import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1Write, WriteBuilder}
+import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter
 import org.apache.spark.sql.internal.SQLConf

core/src/main/scala/org/apache/spark/sql/delta/commands/DeltaCommand.scala (+2 -6)

@@ -29,6 +29,7 @@ import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf}
 import org.apache.spark.sql.delta.stats.FileSizeHistogram
 import org.apache.spark.sql.delta.util.DeltaFileOperations
 import org.apache.spark.sql.delta.util.FileNames.deltaFile
+
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.{AnalysisException, SparkSession}
@@ -232,12 +233,7 @@ trait DeltaCommand extends DeltaLogging {
 
     logInfo(s"Committed delta #$attemptVersion to ${deltaLog.logPath}. Wrote $commitSize actions.")
 
-    try {
-      deltaLog.checkpoint(currentSnapshot)
-    } catch {
-      case e: IllegalStateException =>
-        logWarning("Failed to checkpoint table state.", e)
-    }
+    deltaLog.checkpoint(currentSnapshot)
   }
 
   /**

core/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDelta.scala (+2 -1)

@@ -117,7 +117,8 @@ case class WriteIntoDelta(
     // change the actual behavior, but makes DESC TABLE to show varchar instead of char.
     val dataSchema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(
       replaceCharWithVarchar(CharVarcharUtils.getRawSchema(data.schema)).asInstanceOf[StructType])
-    updateMetadata(data.sparkSession, txn, schemaInCatalog.getOrElse(dataSchema),
+    var finalSchema = schemaInCatalog.getOrElse(dataSchema)
+    updateMetadata(data.sparkSession, txn, finalSchema,
       partitionColumns, configuration, isOverwriteOperation, rearrangeOnly)
 
     val replaceOnDataColsEnabled =

core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSink.scala (+2)

@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkContext
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SQLExecution
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
@@ -64,6 +65,7 @@ class DeltaSink(
       throw DeltaErrors.streamWriteNullTypeException
     }
 
+
     // If the batch reads the same Delta table as this sink is going to write to, then this
     // write has dependencies. Then make sure that this commit set hasDependencies to true
     // by injecting a read on the whole table. This needs to be done explicitly because

core/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingTestUtils.scala (+45 -3)

@@ -19,6 +19,7 @@ package org.apache.spark.sql.delta
 import java.io.File
 
 import org.apache.spark.sql.delta.schema.SchemaUtils
+import io.delta.tables.{DeltaTable => OSSDeltaTable}
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkConf
@@ -341,23 +342,64 @@ trait DeltaColumnMappingTestUtilsBase extends SharedSparkSession {
     }
   }
 
+  /**
+   * Standard CONVERT TO DELTA
+   * @param tableOrPath String
+   */
+  protected def convertToDelta(tableOrPath: String): Unit = {
+    sql(s"CONVERT TO DELTA $tableOrPath")
+  }
+
 }
 
 trait DeltaColumnMappingTestUtils extends DeltaColumnMappingTestUtilsBase
 
-
 /**
  * Include this trait to enable Id column mapping mode for a suite
 */
-trait DeltaColumnMappingEnableIdMode extends SharedSparkSession {
+trait DeltaColumnMappingEnableIdMode extends SharedSparkSession
+  with DeltaColumnMappingTestUtils {
   protected override def sparkConf: SparkConf =
     super.sparkConf.set(DeltaConfigs.COLUMN_MAPPING_MODE.defaultTablePropertyKey, "id")
+
+  /**
+   * CONVERT TO DELTA blocked in id mode
+   */
+  protected override def convertToDelta(tableOrPath: String): Unit =
+    throw DeltaErrors.convertToDeltaWithColumnMappingNotSupported(
+      DeltaColumnMappingMode(columnMappingModeString)
+    )
 }
 
 /**
  * Include this trait to enable Name column mapping mode for a suite
 */
-trait DeltaColumnMappingEnableNameMode extends SharedSparkSession {
+trait DeltaColumnMappingEnableNameMode extends SharedSparkSession
+  with DeltaColumnMappingTestUtils {
+
   protected override def sparkConf: SparkConf =
     super.sparkConf.set(DeltaConfigs.COLUMN_MAPPING_MODE.defaultTablePropertyKey, "name")
+
+  /**
+   * CONVERT TO DELTA can be possible under name mode in tests
+   */
+  protected override def convertToDelta(tableOrPath: String): Unit = {
+    withColumnMappingConf("none") {
+      super.convertToDelta(tableOrPath)
+    }
+
+    val deltaPath = if (tableOrPath.contains("parquet") && tableOrPath.contains("`")) {
+      // parquet.`PATH`
+      s"""delta.${tableOrPath.split('.').last}"""
+    } else {
+      tableOrPath
+    }
+
+    sql(s"""ALTER TABLE $deltaPath SET TBLPROPERTIES (
+           |${DeltaConfigs.COLUMN_MAPPING_MODE.key} = 'name',
+           |${DeltaConfigs.MIN_READER_VERSION.key} = '2',
+           |${DeltaConfigs.MIN_WRITER_VERSION.key} = '5'
+           |)""".stripMargin)
+  }
+
 }
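
A hedged sketch of how a suite might pick up the new helper; the suite name and test body are illustrative, and the imports assume the usual test scaffolding in this package.

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.delta.DeltaColumnMappingEnableNameMode
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical suite: under name column mapping mode, convertToDelta issues CONVERT TO DELTA
// followed by ALTER TABLE ... SET TBLPROPERTIES (two commits); under
// DeltaColumnMappingEnableIdMode the same call would throw instead.
class ConvertWithColumnMappingSuite extends QueryTest
  with SharedSparkSession
  with DeltaColumnMappingEnableNameMode {

  test("convert a parquet directory under name mode") {
    withTempDir { dir =>
      val path = dir.getCanonicalPath
      spark.range(5).write.format("parquet").save(path)
      convertToDelta(s"parquet.`$path`")
      checkAnswer(spark.read.format("delta").load(path), spark.range(5).toDF())
    }
  }
}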

core/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala (+6 -2)

@@ -1392,9 +1392,13 @@ class DeltaSuite extends QueryTest
     withTempDir { tempDir =>
       val path = tempDir.getCanonicalPath + "/table"
       spark.range(10).write.format("parquet").save(path)
-      sql(s"CONVERT TO DELTA parquet.`$path`")
+      convertToDelta(s"parquet.`$path`")
 
-      assert(spark.conf.get(DeltaSQLConf.DELTA_LAST_COMMIT_VERSION_IN_SESSION) === Some(0))
+      // In column mapping (name mode), we perform convertToDelta with a CONVERT and an ALTER,
+      // so the version has been updated
+      val commitVersion = if (columnMappingEnabled) 1 else 0
+      assert(spark.conf.get(DeltaSQLConf.DELTA_LAST_COMMIT_VERSION_IN_SESSION) ===
+        Some(commitVersion))
     }
   }
 
version.sbt (+1 -1)

@@ -1 +1 @@
-ThisBuild / version := "1.1.0-SNAPSHOT"
+ThisBuild / version := "1.2.0-SNAPSHOT"
