|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +//! TableScan for full table scan. |
| 19 | +//! |
| 20 | +//! Reference: [pypaimon.read.table_scan.TableScan](https://github.com/apache/paimon/blob/release-1.3/paimon-python/pypaimon/read/table_scan.py) |
| 21 | +//! and [FullStartingScanner](https://github.com/apache/paimon/blob/release-1.3/paimon-python/pypaimon/read/scanner/full_starting_scanner.py). |
| 22 | +
|
| 23 | +use super::Table; |
| 24 | +use crate::io::FileIO; |
| 25 | +use crate::spec::{BinaryRow, FileKind, ManifestEntry, Snapshot}; |
| 26 | +use crate::table::source::{DataSplitBuilder, Plan}; |
| 27 | +use crate::table::SnapshotManager; |
| 28 | +use crate::Error; |
| 29 | +use std::collections::{HashMap, HashSet}; |
| 30 | + |
/// Path segment for manifest directory under table. Manifest list files and
/// manifest files both live directly under `<table_path>/manifest`.
const MANIFEST_DIR: &str = "manifest";
| 33 | + |
| 34 | +/// Reads a manifest list file (Avro) and returns manifest file metas. |
| 35 | +async fn read_manifest_list( |
| 36 | + file_io: &FileIO, |
| 37 | + table_path: &str, |
| 38 | + list_name: &str, |
| 39 | +) -> crate::Result<Vec<crate::spec::ManifestFileMeta>> { |
| 40 | + let path = format!( |
| 41 | + "{}/{}/{}", |
| 42 | + table_path.trim_end_matches('/'), |
| 43 | + MANIFEST_DIR, |
| 44 | + list_name |
| 45 | + ); |
| 46 | + let input = file_io.new_input(&path)?; |
| 47 | + if !input.exists().await? { |
| 48 | + return Ok(Vec::new()); |
| 49 | + } |
| 50 | + let bytes = input.read().await?; |
| 51 | + crate::spec::from_avro_bytes::<crate::spec::ManifestFileMeta>(&bytes) |
| 52 | +} |
| 53 | + |
| 54 | +/// Reads all manifest entries for a snapshot (base + delta manifest lists, then each manifest file). |
| 55 | +async fn read_all_manifest_entries( |
| 56 | + file_io: &FileIO, |
| 57 | + table_path: &str, |
| 58 | + snapshot: &Snapshot, |
| 59 | +) -> crate::Result<Vec<ManifestEntry>> { |
| 60 | + let mut manifest_files = |
| 61 | + read_manifest_list(file_io, table_path, snapshot.base_manifest_list()).await?; |
| 62 | + let delta = read_manifest_list(file_io, table_path, snapshot.delta_manifest_list()).await?; |
| 63 | + manifest_files.extend(delta); |
| 64 | + |
| 65 | + let manifest_path_prefix = format!("{}/{}", table_path.trim_end_matches('/'), MANIFEST_DIR); |
| 66 | + let mut all_entries = Vec::new(); |
| 67 | + // todo: consider use multiple-threads read manifest |
| 68 | + for meta in manifest_files { |
| 69 | + let path = format!("{}/{}", manifest_path_prefix, meta.file_name()); |
| 70 | + let entries = crate::spec::Manifest::read(file_io, &path).await?; |
| 71 | + all_entries.extend(entries); |
| 72 | + } |
| 73 | + Ok(all_entries) |
| 74 | +} |
| 75 | + |
| 76 | +/// Merges add/delete manifest entries: keeps only ADD entries whose (partition, bucket, file_name) is not in DELETE set. |
| 77 | +fn merge_manifest_entries(entries: Vec<ManifestEntry>) -> Vec<ManifestEntry> { |
| 78 | + let mut deleted = HashSet::new(); |
| 79 | + let mut added = Vec::new(); |
| 80 | + for e in entries { |
| 81 | + // follow python code to use partition, bucket, filename as duplicator |
| 82 | + let key = ( |
| 83 | + e.partition().to_vec(), |
| 84 | + e.bucket(), |
| 85 | + e.file().file_name.clone(), |
| 86 | + ); |
| 87 | + match e.kind() { |
| 88 | + FileKind::Add => added.push(e), |
| 89 | + FileKind::Delete => { |
| 90 | + deleted.insert(key); |
| 91 | + } |
| 92 | + } |
| 93 | + } |
| 94 | + added |
| 95 | + .into_iter() |
| 96 | + .filter(|e| { |
| 97 | + !deleted.contains(&( |
| 98 | + e.partition().to_vec(), |
| 99 | + e.bucket(), |
| 100 | + e.file().file_name.clone(), |
| 101 | + )) |
| 102 | + }) |
| 103 | + .collect() |
| 104 | +} |
| 105 | + |
/// TableScan for full table scan (no incremental, no predicate).
///
/// Reference: [pypaimon.read.table_scan.TableScan](https://github.com/apache/paimon/blob/master/paimon-python/pypaimon/read/table_scan.py)
#[derive(Debug, Clone)]
pub struct TableScan {
    /// Table being scanned; supplies the `FileIO` and the table root location
    /// used to resolve snapshot and manifest files.
    table: Table,
}
| 113 | + |
| 114 | +impl TableScan { |
| 115 | + pub fn new(table: Table) -> Self { |
| 116 | + Self { table } |
| 117 | + } |
| 118 | + |
| 119 | + /// Plan the full scan: read latest snapshot, manifest list, manifest entries, then build one DataSplit per (partition, bucket). |
| 120 | + pub async fn plan(&self) -> crate::Result<Plan> { |
| 121 | + let file_io = self.table.file_io(); |
| 122 | + let table_path = self.table.location(); |
| 123 | + let snapshot_manager = SnapshotManager::new(file_io.clone(), table_path.to_string()); |
| 124 | + |
| 125 | + let snapshot = match snapshot_manager.get_latest_snapshot().await? { |
| 126 | + Some(s) => s, |
| 127 | + None => return Ok(Plan::new(Vec::new())), |
| 128 | + }; |
| 129 | + Self::plan_snapshot(snapshot, file_io, table_path).await |
| 130 | + } |
| 131 | + |
| 132 | + pub async fn plan_snapshot( |
| 133 | + snapshot: Snapshot, |
| 134 | + file_io: &FileIO, |
| 135 | + table_path: &str, |
| 136 | + ) -> crate::Result<Plan> { |
| 137 | + let entries = read_all_manifest_entries(file_io, table_path, &snapshot).await?; |
| 138 | + let entries = merge_manifest_entries(entries); |
| 139 | + if entries.is_empty() { |
| 140 | + return Ok(Plan::new(Vec::new())); |
| 141 | + } |
| 142 | + |
| 143 | + // Group by (partition, bucket). Key = (partition_bytes, bucket). |
| 144 | + let mut groups: HashMap<(Vec<u8>, i32), Vec<ManifestEntry>> = HashMap::new(); |
| 145 | + for e in entries { |
| 146 | + let key = (e.partition().to_vec(), e.bucket()); |
| 147 | + groups.entry(key).or_default().push(e); |
| 148 | + } |
| 149 | + |
| 150 | + let snapshot_id = snapshot.id(); |
| 151 | + let base_path = table_path; |
| 152 | + let mut splits = Vec::new(); |
| 153 | + |
| 154 | + for ((_partition, bucket), group_entries) in groups { |
| 155 | + let total_buckets = group_entries |
| 156 | + .first() |
| 157 | + .map(|e| e.total_buckets()) |
| 158 | + .ok_or_else(|| Error::UnexpectedError { |
| 159 | + message: format!("Manifest entry group for bucket {bucket} is empty, cannot determine total_buckets"), |
| 160 | + source: None, |
| 161 | + })?; |
| 162 | + let mut data_files = Vec::new(); |
| 163 | + |
| 164 | + // currently, only group by splits by bucket |
| 165 | + // todo: consider use binpack to generate split |
| 166 | + for manifest_entry in group_entries { |
| 167 | + let ManifestEntry { file, .. } = manifest_entry; |
| 168 | + data_files.push(file); |
| 169 | + } |
| 170 | + |
| 171 | + // todo: consider partitioned table |
| 172 | + let bucket_path = format!("{base_path}/bucket-{bucket}"); |
| 173 | + let partition = BinaryRow::new(0); |
| 174 | + |
| 175 | + let split = DataSplitBuilder::new() |
| 176 | + .with_snapshot(snapshot_id) |
| 177 | + .with_partition(partition) |
| 178 | + .with_bucket(bucket) |
| 179 | + .with_bucket_path(bucket_path) |
| 180 | + .with_total_buckets(total_buckets) |
| 181 | + .with_data_files(data_files) |
| 182 | + .build()?; |
| 183 | + splits.push(split); |
| 184 | + } |
| 185 | + Ok(Plan::new(splits)) |
| 186 | + } |
| 187 | +} |
0 commit comments