
Commit 16ddfcb

Avoid non-deterministic UDF to filter deleted rows
1 parent 13739bf commit 16ddfcb

File tree

4 files changed (+51 -9 lines)


spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala

+3
@@ -132,6 +132,9 @@ class DeltaSparkSessionExtension extends (SparkSessionExtensions => Unit) {
       new PrepareDeltaScan(session)
     }
 
+    // Add skip row column and filter.
+    extensions.injectPlannerStrategy(PreprocessTableWithDVsStrategy)
+
     // Tries to load PrepareDeltaSharingScan class with class reflection, when delta-sharing-spark
     // 3.1+ package is installed, this will be loaded and delta sharing batch queries with
     // DeltaSharingFileIndex will be handled by the rule.
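
Context for this hunk: injectPlannerStrategy registers the new strategy with every session that loads the Delta extension, so no further wiring is needed on the user side. A minimal sketch of how such a session is typically created (the application name and table path below are illustrative, not part of this commit):

import org.apache.spark.sql.SparkSession

// Once the Delta extension is registered, every rule and planner strategy it
// injects (now including PreprocessTableWithDVsStrategy) takes part in planning.
val spark = SparkSession.builder()
  .appName("dv-filter-demo")
  .master("local[*]")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate()

// Reading a Delta table with deletion vectors now plans the skip-row filter
// through the injected strategy; the path is made up for illustration.
spark.read.format("delta").load("/tmp/dv_table").show()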

spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableWithDVs.scala

+3 -7
@@ -26,7 +26,7 @@ import org.apache.spark.sql.delta.util.DeltaFileOperations.absolutePath
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.{Column, SparkSession}
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
 import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
@@ -44,7 +44,7 @@ import org.apache.spark.util.SerializableConfiguration
  * After rule:
  *    <Parent Node> ->
  *      Project(key, value) ->
- *        Filter (udf(__skip_row == 0) ->
+ *        Filter (__skip_row == 0) ->
 *          Delta Scan (key, value, __skip_row)
 *   - Here we insert a new column `__skip_row` in Delta scan. This value is populated by the
 *     Parquet reader using the DV corresponding to the Parquet file read
@@ -160,11 +160,7 @@ object ScanWithDeletionVectors {
       s"Expected only one column with name=$IS_ROW_DELETED_COLUMN_NAME")
     val skipRowColumnRef = skipRowColumnRefs.head
 
-    val keepRow = DeltaUDF.booleanFromByte( _ == RowIndexFilter.KEEP_ROW_VALUE)
-      .asNondeterministic() // To avoid constant folding the filter based on stats.
-
-    val filterExp = keepRow(new Column(skipRowColumnRef)).expr
-    Filter(filterExp, newScan)
+    Filter(EqualTo(skipRowColumnRef, Literal(RowIndexFilter.KEEP_ROW_VALUE)), newScan)
   }
 
   private def createBroadcastDVMap(
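
The heart of the change is the last hunk above: the row-keeping predicate is now a plain Catalyst EqualTo between the skip-row column and RowIndexFilter.KEEP_ROW_VALUE, instead of a byte-to-boolean UDF that had to be marked non-deterministic to dodge constant folding. Because the filter is now produced during planning (see the new strategy below) and sits directly above the scan, the deterministic form is safe and stays visible to the optimizer. A self-contained sketch of the same filtering pattern at the DataFrame level, assuming a keep-row value of byte 0 (an assumption mirroring RowIndexFilter.KEEP_ROW_VALUE, not taken from this diff):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

val spark = SparkSession.builder().master("local[*]").appName("skip-row-sketch").getOrCreate()
import spark.implicits._

// Toy stand-in for a scan that exposes a byte-typed skip-row column:
// 0 = keep the row, anything else = row is deleted via a deletion vector.
val scanned = Seq(("a", 0.toByte), ("b", 1.toByte), ("c", 0.toByte))
  .toDF("key", "__skip_row")

// Deterministic equality, the DataFrame-level analogue of
// EqualTo(skipRowColumnRef, Literal(RowIndexFilter.KEEP_ROW_VALUE)).
val kept = scanned.filter(col("__skip_row") === lit(0.toByte)).drop("__skip_row")
kept.show()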

spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableWithDVsStrategy.scala

+43

@@ -0,0 +1,43 @@
+/*
+ * Copyright (2021) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta
+
+import org.apache.spark.sql.{SparkSession, Strategy}
+import org.apache.spark.sql.catalyst.planning.ScanOperation
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.datasources.{FileSourceStrategy, HadoopFsRelation, LogicalRelation}
+
+/**
+ * Strategy to process tables with DVs and add the skip row column and filters.
+ *
+ * This strategy will apply all transformations needed to tables with DVs and delegate to
+ * [[FileSourceStrategy]] to create the final plan. The DV filter will be the bottom-most filter in
+ * the plan and so it will be pushed down to the FileSourceScanExec at the beginning of the filter
+ * list.
+ */
+case class PreprocessTableWithDVsStrategy(session: SparkSession)
+  extends Strategy
+  with PreprocessTableWithDVs {
+
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case ScanOperation(_, _, _, _ @ LogicalRelation(_: HadoopFsRelation, _, _, _)) =>
+      val updatedPlan = preprocessTablesWithDVs(plan)
+      FileSourceStrategy(updatedPlan)
+    case _ => Nil
+  }
+}
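
For readers new to this extension point: a Strategy receives each logical plan fragment during physical planning, returns candidate SparkPlans for the fragments it understands, and returns Nil to defer, exactly as the case _ => Nil branch above defers non-file-source plans to the built-in strategies. A toy, purely illustrative strategy (not part of this commit) registered through the experimental hook shows that contract:

import org.apache.spark.sql.{SparkSession, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

// Hypothetical no-op strategy: it only observes the plans offered to it and
// always returns Nil, leaving planning to the default strategies.
object LoggingStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
    println(s"planner offered: ${plan.nodeName}")
    Nil
  }
}

val spark = SparkSession.builder().master("local[*]").appName("strategy-contract").getOrCreate()
// extraStrategies is the lightest way to try out a Strategy without writing a session extension.
spark.experimental.extraStrategies = Seq(LoggingStrategy)
spark.range(5).filter("id > 2").collect()  // triggers planning, printing the node names seen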

spark/src/main/scala/org/apache/spark/sql/delta/stats/PrepareDeltaScan.scala

+2 -2
@@ -52,7 +52,7 @@ trait PrepareDeltaScanBase extends Rule[LogicalPlan]
   with PredicateHelper
   with DeltaLogging
   with OptimizeMetadataOnlyDeltaQuery
-  with PreprocessTableWithDVs { self: PrepareDeltaScan =>
+  with SubqueryTransformerHelper { self: PrepareDeltaScan =>
 
   /**
    * Tracks the first-access snapshots of other logs planned by this rule. The snapshots are
@@ -204,7 +204,7 @@ trait PrepareDeltaScanBase extends Rule[LogicalPlan]
     } else {
       prepareDeltaScanWithoutFileSkipping(plan)
     }
-    preprocessTablesWithDVs(updatedPlan)
+    updatedPlan
   }
 
   protected def prepareDeltaScanWithoutFileSkipping(plan: LogicalPlan): LogicalPlan = {
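
Net effect of this last file: PrepareDeltaScan no longer rewrites DV tables itself; that work moves into the planner strategy, so the skip-row predicate is attached immediately before physical planning and ends up as the bottom-most filter on the file scan. A quick way to eyeball the result on a DV-enabled table, reusing the hypothetical session and table path from the first sketch (exact plan text varies across Spark and Delta versions):

// The formatted plan should show the file scan emitting the internal
// skip-row column with a deterministic equality filter directly above or
// pushed into it, and no non-deterministic UDF anywhere in the plan.
val df = spark.read.format("delta").load("/tmp/dv_table")
df.explain("formatted")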
