
Commit e8e2d83

Avoid non-deterministic UDF to filter deleted rows
1 parent 13739bf commit e8e2d83
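
As the diff below shows, the row-level filter for deletion vectors becomes a plain EqualTo comparison on the __skip_row column instead of a non-deterministic UDF, and the DV preprocessing moves out of the PrepareDeltaScan rule into a planner strategy (PreprocessTableWithDVsStrategy) injected by the session extension.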

3 files changed: +8 -9 lines changed

spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala (+3)

@@ -132,6 +132,9 @@ class DeltaSparkSessionExtension extends (SparkSessionExtensions => Unit) {
       new PrepareDeltaScan(session)
     }
 
+    // Add skip row column and filter.
+    extensions.injectPlannerStrategy(PreprocessTableWithDVsStrategy)
+
     // Tries to load PrepareDeltaSharingScan class with class reflection, when delta-sharing-spark
     // 3.1+ package is installed, this will be loaded and delta sharing batch queries with
     // DeltaSharingFileIndex will be handled by the rule.
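
For context, a minimal sketch of the injection pattern used above; every name other than injectPlannerStrategy itself is hypothetical. A planner strategy builder has type SparkSession => Strategy, which is why a case-class companion object such as PreprocessTableWithDVsStrategy can be passed directly (assuming it is a case class taking a SparkSession).

import org.apache.spark.sql.{SparkSession, SparkSessionExtensions, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

// Hypothetical strategy for illustration only.
case class ExampleStrategy(session: SparkSession) extends Strategy {
  // A real strategy pattern-matches logical plan nodes and returns physical
  // plans; returning Nil defers to the other registered strategies.
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
}

class ExampleExtension extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    // The companion object ExampleStrategy conforms to SparkSession => Strategy.
    extensions.injectPlannerStrategy(ExampleStrategy)
  }
}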

spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableWithDVs.scala (+3 -7)

@@ -26,7 +26,7 @@ import org.apache.spark.sql.delta.util.DeltaFileOperations.absolutePath
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.{Column, SparkSession}
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
 import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
@@ -44,7 +44,7 @@ import org.apache.spark.util.SerializableConfiguration
 * After rule:
 *    <Parent Node> ->
 *      Project(key, value) ->
-*        Filter (udf(__skip_row == 0) ->
+*        Filter (__skip_row == 0) ->
 *          Delta Scan (key, value, __skip_row)
 * - Here we insert a new column `__skip_row` in Delta scan. This value is populated by the
 *   Parquet reader using the DV corresponding to the Parquet file read
@@ -160,11 +160,7 @@ object ScanWithDeletionVectors {
       s"Expected only one column with name=$IS_ROW_DELETED_COLUMN_NAME")
     val skipRowColumnRef = skipRowColumnRefs.head
 
-    val keepRow = DeltaUDF.booleanFromByte( _ == RowIndexFilter.KEEP_ROW_VALUE)
-      .asNondeterministic() // To avoid constant folding the filter based on stats.
-
-    val filterExp = keepRow(new Column(skipRowColumnRef)).expr
-    Filter(filterExp, newScan)
+    Filter(EqualTo(skipRowColumnRef, Literal(RowIndexFilter.KEEP_ROW_VALUE)), newScan)
   }
 
   private def createBroadcastDVMap(
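
This hunk is the core of the change: the keep-row predicate is now an ordinary deterministic Catalyst EqualTo instead of a UDF marked non-deterministic. A minimal sketch of the resulting expression shape, assuming __skip_row is a byte column and RowIndexFilter.KEEP_ROW_VALUE is 0, as the plan comment `Filter (__skip_row == 0)` above suggests; the relation is only a placeholder for the Delta scan.

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation}
import org.apache.spark.sql.types.ByteType

// Stand-in for the __skip_row column populated by the Parquet reader.
val skipRow = AttributeReference("__skip_row", ByteType, nullable = false)()

// Deterministic keep-row predicate: __skip_row == 0. Unlike an opaque UDF,
// the optimizer can analyze this comparison directly.
val keepRow = EqualTo(skipRow, Literal(0.toByte))

// Filter node over a placeholder relation standing in for the Delta scan.
val filtered = Filter(keepRow, LocalRelation(skipRow))

The removed code had to call .asNondeterministic() to stop the optimizer from constant-folding the UDF-based filter using stats; a plain attribute comparison avoids the opaque UDF while keeping the predicate visible to the optimizer.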

spark/src/main/scala/org/apache/spark/sql/delta/stats/PrepareDeltaScan.scala (+2 -2)

@@ -52,7 +52,7 @@ trait PrepareDeltaScanBase extends Rule[LogicalPlan]
   with PredicateHelper
   with DeltaLogging
   with OptimizeMetadataOnlyDeltaQuery
-  with PreprocessTableWithDVs { self: PrepareDeltaScan =>
+  with SubqueryTransformerHelper { self: PrepareDeltaScan =>
 
   /**
    * Tracks the first-access snapshots of other logs planned by this rule. The snapshots are
@@ -204,7 +204,7 @@ trait PrepareDeltaScanBase extends Rule[LogicalPlan]
     } else {
       prepareDeltaScanWithoutFileSkipping(plan)
     }
-    preprocessTablesWithDVs(updatedPlan)
+    updatedPlan
   }
 
   protected def prepareDeltaScanWithoutFileSkipping(plan: LogicalPlan): LogicalPlan = {
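
Net effect, as far as these three files show: PrepareDeltaScanBase no longer mixes in PreprocessTableWithDVs or post-processes its result with preprocessTablesWithDVs; the skip-row column and its deterministic filter are instead attached during physical planning by the injected PreprocessTableWithDVsStrategy.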
