@@ -30,6 +30,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
 import java.sql.Date
+import java.util.Locale
 
 /** Optimize COUNT, MIN and MAX expressions on Delta tables.
  * This optimization is only applied when the following conditions are met:
@@ -57,7 +58,8 @@ trait OptimizeMetadataOnlyDeltaQuery {
     val rowCount = extractGlobalCount(tahoeLogFileIndex)
 
     if (rowCount.isDefined) {
-      lazy val columnStats = extractMinMaxFromDeltaLog(tahoeLogFileIndex)
+      val aggColumnsNames = Set(extractMinMaxFieldNames(plan).map(_.toLowerCase(Locale.ROOT)): _*)
+      val columnStats = extractMinMaxFromDeltaLog(tahoeLogFileIndex, aggColumnsNames)
 
       def checkStatsExists(attrRef: AttributeReference): Boolean = {
         columnStats.contains(attrRef.name) &&
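
For context (not part of the patch): the new `aggColumnsNames` set normalizes the MIN/MAX column names with `Locale.ROOT` before they are matched against the table schema. A minimal, self-contained sketch with hypothetical column names:

```scala
import java.util.Locale

// Hypothetical MIN/MAX target columns, as extractMinMaxFieldNames might return them.
val fieldNames = Seq("Id", "eventDate")

// Same normalization as the patch: lowercase with a locale-insensitive rule,
// then splat the Seq into a Set for membership checks.
val aggColumnsNames = Set(fieldNames.map(_.toLowerCase(Locale.ROOT)): _*)

assert(aggColumnsNames == Set("id", "eventdate"))
```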
@@ -117,6 +119,23 @@ trait OptimizeMetadataOnlyDeltaQuery {
     }
   }
 
+  private def extractMinMaxFieldNames(plan: Aggregate): Seq[String] = {
+    plan.aggregateExpressions.collect {
+      case Alias(AggregateExpression(
+        Min(minReference: AttributeReference), _, _, _, _), _) =>
+        minReference.name
+      case Alias(AggregateExpression(
+        Max(maxReference: AttributeReference), _, _, _, _), _) =>
+        maxReference.name
+      case Alias(ToPrettyString(AggregateExpression(
+        Min(minReference: AttributeReference), _, _, _, _), _), _) =>
+        minReference.name
+      case Alias(ToPrettyString(AggregateExpression(
+        Max(maxReference: AttributeReference), _, _, _, _), _), _) =>
+        maxReference.name
+    }
+  }
+
   /** Return the number of rows in the table or `None` if we cannot calculate it from stats */
   private def extractGlobalCount(tahoeLogFileIndex: TahoeLogFileIndex): Option[Long] = {
     // account for deleted rows according to deletion vectors
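
The new `extractMinMaxFieldNames` relies on `collect` with a partial function, so aggregate expressions that match none of the MIN/MAX patterns are simply skipped. A simplified stand-in (not the Catalyst types, just the shape of the match) illustrating that behaviour:

```scala
// Simplified stand-in for the Catalyst patterns above; these case classes are
// illustrative, not Spark's. collect applies a partial function, so anything
// that matches no case (e.g. a plain COUNT) is dropped rather than failing.
sealed trait Agg
case class MinOf(column: String) extends Agg
case class MaxOf(column: String) extends Agg
case object CountStar extends Agg

def fieldNames(aggs: Seq[Agg]): Seq[String] = aggs.collect {
  case MinOf(c) => c
  case MaxOf(c) => c
}

assert(fieldNames(Seq(MinOf("id"), CountStar, MaxOf("Id"))) == Seq("id", "Id"))
```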
@@ -141,12 +160,15 @@ trait OptimizeMetadataOnlyDeltaQuery {
    */
   case class DeltaColumnStat(min: Any, max: Any)
 
-  private def extractMinMaxFromStats(deltaScanGenerator: DeltaScanGenerator):
-    Map[String, DeltaColumnStat] = {
+  private def extractMinMaxFromStats(
+      deltaScanGenerator: DeltaScanGenerator,
+      lowerCaseColumnNames: Set[String]): Map[String, DeltaColumnStat] = {
+
     // TODO Update this to work with DV (https://github.com/delta-io/delta/issues/1485)
     val snapshot = deltaScanGenerator.snapshotToScan
-    val dataColumns = snapshot.statCollectionPhysicalSchema
-      .filter(col => AggregateDeltaTable.isSupportedDataType(col.dataType))
+    val dataColumns = snapshot.statCollectionPhysicalSchema.filter(col =>
+      AggregateDeltaTable.isSupportedDataType(col.dataType) &&
+      lowerCaseColumnNames.contains(col.name.toLowerCase(Locale.ROOT)))
 
     // Validate all the files has stats
     lazy val filesStatsCount = deltaScanGenerator.filesWithStatsForScan(Nil).select(
@@ -232,12 +254,14 @@ trait OptimizeMetadataOnlyDeltaQuery {
     }
   }
 
-  private def extractMinMaxFromPartitionValue(snapshot: Snapshot):
-    Map[String, DeltaColumnStat] = {
+  private def extractMinMaxFromPartitionValue(
+      snapshot: Snapshot,
+      lowerCaseColumnNames: Set[String]): Map[String, DeltaColumnStat] = {
 
     val partitionedColumns = snapshot.metadata.partitionSchema
-      .filter(x => AggregateDeltaTable.isSupportedDataType(x.dataType))
-      .map(x => (x, DeltaColumnMapping.getPhysicalName(x)))
+      .filter(col => AggregateDeltaTable.isSupportedDataType(col.dataType) &&
+        lowerCaseColumnNames.contains(col.name.toLowerCase(Locale.ROOT)))
+      .map(col => (col, DeltaColumnMapping.getPhysicalName(col)))
 
     if (partitionedColumns.isEmpty) {
       Map.empty
@@ -271,14 +295,21 @@ trait OptimizeMetadataOnlyDeltaQuery {
     }
   }
 
-  private def extractMinMaxFromDeltaLog(tahoeLogFileIndex: TahoeLogFileIndex):
+  private def extractMinMaxFromDeltaLog(
+      tahoeLogFileIndex: TahoeLogFileIndex,
+      lowerCaseColumnNames: Set[String]):
     CaseInsensitiveMap[DeltaColumnStat] = {
     val deltaScanGenerator = getDeltaScanGenerator(tahoeLogFileIndex)
     val snapshot = deltaScanGenerator.snapshotToScan
+    val columnFromStats = extractMinMaxFromStats(deltaScanGenerator, lowerCaseColumnNames)
 
+    if (lowerCaseColumnNames.equals(columnFromStats.keySet)) {
+      CaseInsensitiveMap(columnFromStats)
+    } else {
     CaseInsensitiveMap(
-      extractMinMaxFromStats(deltaScanGenerator).++
-        (extractMinMaxFromPartitionValue(snapshot)))
+      columnFromStats.++
+        (extractMinMaxFromPartitionValue(snapshot, lowerCaseColumnNames)))
+    }
   }
 
   object AggregateDeltaTable {
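
The short-circuit added here only falls back to partition values when the file-level stats did not cover every requested column. A minimal sketch of that control flow, using plain `Map`/`Set` instead of `CaseInsensitiveMap` and a by-name argument to stand in for the conditional call:

```scala
// Sketch of the stats-vs-partition-values decision, with plain collections.
// fromPartitions is by-name so it is only evaluated when actually needed,
// mirroring the fact that extractMinMaxFromPartitionValue is only called
// in the else branch above.
def combine[V](
    requested: Set[String],
    fromStats: Map[String, V],
    fromPartitions: => Map[String, V]): Map[String, V] = {
  if (requested == fromStats.keySet) fromStats
  else fromStats ++ fromPartitions
}

assert(combine(Set("a"), Map("a" -> 1), sys.error("not needed")) == Map("a" -> 1))
```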
@@ -291,20 +322,25 @@ trait OptimizeMetadataOnlyDeltaQuery {
         dataType.isInstanceOf[DateType]
     }
 
-    private def isAggExprOptimizable(aggExpr: AggregateExpression): Boolean = aggExpr match {
-      case AggregateExpression(
-        Count(Seq(Literal(1, _))), Complete, false, None, _) => true
-      case AggregateExpression(
-        Min(min), Complete, false, None, _) => isSupportedDataType(min.dataType)
-      case AggregateExpression(
-        Max(max), Complete, false, None, _) => isSupportedDataType(max.dataType)
-      case _ => false
+    def getAggFunctionOptimizable(aggExpr: AggregateExpression): Option[DeclarativeAggregate] = {
+      aggExpr match {
+        case AggregateExpression(
+          c @ Count(Seq(Literal(1, _))), Complete, false, None, _) =>
+            Some(c)
+        case AggregateExpression(
+          min @ Min(minExpr), Complete, false, None, _) if isSupportedDataType(minExpr.dataType) =>
+            Some(min)
+        case AggregateExpression(
+          max @ Max(maxExpr), Complete, false, None, _) if isSupportedDataType(maxExpr.dataType) =>
+            Some(max)
+        case _ => None
+      }
     }
 
     private def isStatsOptimizable(aggExpr: Seq[Alias]): Boolean = aggExpr.forall {
-      case Alias(aggExpr: AggregateExpression, _) => isAggExprOptimizable(aggExpr)
+      case Alias(aggExpr: AggregateExpression, _) => getAggFunctionOptimizable(aggExpr).isDefined
       case Alias(ToPrettyString(aggExpr: AggregateExpression, _), _) =>
-        isAggExprOptimizable(aggExpr)
+        getAggFunctionOptimizable(aggExpr).isDefined
       case _ => false
     }
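
The rewritten matcher binds the whole aggregate with `name @ Pattern(...)` and adds a guard, returning `Option[DeclarativeAggregate]` so the old Boolean check survives as `.isDefined`. A generic sketch of that binder-plus-guard shape (the types here are illustrative, not Catalyst's):

```scala
// Generic illustration of the `name @ Pattern(...) if guard` shape used by the
// new cases; Shape/Circle are illustrative types, not Catalyst expressions.
sealed trait Shape
case class Circle(radius: Double) extends Shape
case class Square(side: Double) extends Shape

def bigCircle(shape: Shape): Option[Circle] = shape match {
  case c @ Circle(r) if r > 1.0 => Some(c) // bind the whole node, keep the guard
  case _ => None
}

assert(bigCircle(Circle(2.0)).isDefined)   // mirrors getAggFunctionOptimizable(...).isDefined
assert(bigCircle(Square(3.0)).isEmpty)
```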