From f5cb96317f05b4e13ffae49738da2ee9aaddc1b2 Mon Sep 17 00:00:00 2001
From: Jan Kaul
Date: Wed, 17 Jun 2026 12:36:38 +0200
Subject: [PATCH] fix PruneManifests::row_counts to use total live rows instead of added_rows_count

added_rows_count counts only ADDED-status files in the current snapshot.
Manifests carrying EXISTING rows (e.g. after compaction) have
added_rows_count=0, causing DataFusion's IS NOT NULL pruning
(null_count != row_count) to evaluate 0 != 0 = false and incorrectly
prune manifests that contain live data.

Use added + existing to get the actual live row count. Rows belonging to
DELETED-status files are reported separately in deleted_rows_count and
are not included in either of the other two counters, so they must not
be subtracted (doing so would undercount live rows and could again
produce a row count of 0 for a manifest that still holds live data).
Fall back to None (unknown) when either optional field is absent or the
sum overflows.

Fixes #359

Co-Authored-By: Claude Sonnet 4.6
---
 datafusion_iceberg/src/pruning_statistics.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/datafusion_iceberg/src/pruning_statistics.rs b/datafusion_iceberg/src/pruning_statistics.rs
index 7552e4d2..e0ee0cb2 100644
--- a/datafusion_iceberg/src/pruning_statistics.rs
+++ b/datafusion_iceberg/src/pruning_statistics.rs
@@ -132,13 +132,13 @@ impl PruningStatistics for PruneManifests<'_, '_> {
     }
 
     fn row_counts(&self) -> Option<ArrayRef> {
-        ScalarValue::iter_to_array(
-            self.files
-                .iter()
-                .map(|x| x.added_rows_count)
-                .map(ScalarValue::Int64),
-        )
-        .ok()
+        // Live rows = ADDED + EXISTING; DELETED-status rows are counted
+        // separately and must not be subtracted. `None` means unknown.
+        let row_counts = self.files.iter().map(|x| {
+            let (added, existing) = x.added_rows_count.zip(x.existing_rows_count)?;
+            added.checked_add(existing)
+        });
+        ScalarValue::iter_to_array(row_counts.map(ScalarValue::Int64)).ok()
     }
 }
 