Skip to content

Commit 4fa8d2f

Browse files
authored
fix(ipc): replace wildcard in skip_field with explicit DataType handling (#9822)
# Which issue does this PR close? - Closes #9821 . # Rationale for this change `skip_field` currently uses a wildcard match (`_`) to skip remaining `DataType` variants by assuming a two-buffer layout. This is not robust, as new variants may have different buffer layouts, leading to incorrect skipping and buffer misalignment. Replacing the wildcard with explicit handling ensures the correct number of buffers are skipped and avoids silent errors for future types. # What changes are included in this PR? * Removed the wildcard (`_`) match arm in `skip_field` * Added explicit handling for all remaining fixed-width and boolean `DataType` variants * Each of these types now explicitly skips: * null buffer * values buffer File updated: * `arrow-ipc/src/reader.rs` # Are these changes tested? Yes. * Added test: `test_projection_skip_fixed_width_types` in `arrow-ipc/src/reader.rs` * The test iterates over all fixed-width and boolean `DataType` variants covered by this change * For each type: * writes a batch with `[skipped_column(type), values_column(Int32)]` * reads only the second column (skipping the first) * verifies whether the returned column exactly matches the original `Int32` values * This directly validates that skipping each of these types consumes the correct number of buffers * Also, all existing `arrow-ipc` tests pass (`cargo test -p arrow-ipc --lib`) # Are there any user-facing changes? No.
1 parent c4b2569 commit 4fa8d2f

1 file changed

Lines changed: 123 additions & 2 deletions

File tree

arrow-ipc/src/reader.rs

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -707,8 +707,33 @@ impl<'a> RecordBatchDecoder<'a> {
707707
self.skip_field(field, variadic_count)?
708708
}
709709
}
710-
Null => {} // No buffer increases
711-
_ => {
710+
// Null has no buffers to skip
711+
Null => {}
712+
713+
// Fixed-width and boolean types: skip null buffer + values buffer
714+
Boolean
715+
| Int8
716+
| Int16
717+
| Int32
718+
| Int64
719+
| UInt8
720+
| UInt16
721+
| UInt32
722+
| UInt64
723+
| Float16
724+
| Float32
725+
| Float64
726+
| Timestamp(_, _)
727+
| Date32
728+
| Date64
729+
| Time32(_)
730+
| Time64(_)
731+
| Duration(_)
732+
| Interval(_)
733+
| Decimal32(_, _)
734+
| Decimal64(_, _)
735+
| Decimal128(_, _)
736+
| Decimal256(_, _) => {
712737
self.skip_buffer();
713738
self.skip_buffer();
714739
}
@@ -3517,4 +3542,100 @@ mod tests {
35173542
assert_eq!(read_batch.num_columns(), 1);
35183543
assert_eq!(read_batch.column(0).as_ref(), &values);
35193544
}
3545+
3546+
// Tests reading a column when preceding fixed-width and boolean columns are skipped.
3547+
// Covers all types that use the same two-buffer layout (null + values).
3548+
// Verifies that skipping these types does not affect subsequent column decoding.
3549+
#[test]
3550+
fn test_projection_skip_fixed_width_types() {
3551+
use std::sync::Arc;
3552+
3553+
use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch, make_array};
3554+
use arrow_buffer::Buffer;
3555+
use arrow_data::ArrayData;
3556+
use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
3557+
3558+
use crate::reader::FileReader;
3559+
use crate::writer::FileWriter;
3560+
3561+
// Create a minimal array for a given fixed-width or boolean type
3562+
fn make_array_for_type(data_type: DataType) -> ArrayRef {
3563+
let len = 3;
3564+
3565+
if matches!(data_type, DataType::Boolean) {
3566+
return Arc::new(BooleanArray::from(vec![true, false, true]));
3567+
}
3568+
3569+
let width = data_type.primitive_width().unwrap();
3570+
let data = ArrayData::builder(data_type)
3571+
.len(len)
3572+
.add_buffer(Buffer::from(vec![0_u8; len * width]))
3573+
.build()
3574+
.unwrap();
3575+
3576+
make_array(data)
3577+
}
3578+
3579+
// List of types that follow the same two-buffer layout (null + values)
3580+
let data_types = vec![
3581+
DataType::Boolean,
3582+
DataType::Int8,
3583+
DataType::Int16,
3584+
DataType::Int32,
3585+
DataType::Int64,
3586+
DataType::UInt8,
3587+
DataType::UInt16,
3588+
DataType::UInt32,
3589+
DataType::UInt64,
3590+
DataType::Float16,
3591+
DataType::Float32,
3592+
DataType::Float64,
3593+
DataType::Timestamp(TimeUnit::Second, None),
3594+
DataType::Date32,
3595+
DataType::Date64,
3596+
DataType::Time32(TimeUnit::Second),
3597+
DataType::Time64(TimeUnit::Microsecond),
3598+
DataType::Duration(TimeUnit::Second),
3599+
DataType::Interval(IntervalUnit::YearMonth),
3600+
DataType::Interval(IntervalUnit::DayTime),
3601+
DataType::Interval(IntervalUnit::MonthDayNano),
3602+
DataType::Decimal32(9, 2),
3603+
DataType::Decimal64(18, 2),
3604+
DataType::Decimal128(38, 2),
3605+
DataType::Decimal256(76, 2),
3606+
];
3607+
3608+
// For each type:
3609+
// - write a batch with [skipped_column, values]
3610+
// - read only the second column
3611+
// - verify the result is correct
3612+
for data_type in data_types {
3613+
let skipped = make_array_for_type(data_type.clone());
3614+
let values = Int32Array::from(vec![10, 20, 30]);
3615+
3616+
let schema = Arc::new(Schema::new(vec![
3617+
Field::new("skipped", data_type, false),
3618+
Field::new("values", DataType::Int32, false),
3619+
]));
3620+
3621+
let batch =
3622+
RecordBatch::try_new(schema, vec![skipped, Arc::new(values.clone())]).unwrap();
3623+
3624+
// Serialize the batch into IPC format
3625+
let mut buf = Vec::new();
3626+
{
3627+
let mut writer = FileWriter::try_new(&mut buf, &batch.schema()).unwrap();
3628+
writer.write(&batch).unwrap();
3629+
writer.finish().unwrap();
3630+
}
3631+
3632+
// Read back only the second column (skip the first)
3633+
let mut reader = FileReader::try_new(std::io::Cursor::new(buf), Some(vec![1])).unwrap();
3634+
let read_batch = reader.next().unwrap().unwrap();
3635+
3636+
// Verify that the returned column matches the original values column
3637+
assert_eq!(read_batch.num_columns(), 1);
3638+
assert_eq!(read_batch.column(0).as_ref(), &values);
3639+
}
3640+
}
35203641
}

0 commit comments

Comments
 (0)