-
Notifications
You must be signed in to change notification settings - Fork 160
feat: add ListLayout
#8071
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
feat: add ListLayout
#8071
Changes from all commits
27af946
01a4afd
67490bd
5325ac0
eae0ef9
a058ed3
4d12459
39b62a9
3f6fdd5
0ddfd9e
0fb4f94
23d8023
38c9ccf
bac5b73
9ceb746
be66bab
6a1a3da
b267f99
05d0fc0
58c5be5
4256223
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,271 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| mod reader; | ||
| pub mod writer; | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| use reader::ListReader; | ||
| use vortex_array::DeserializeMetadata; | ||
| use vortex_array::ProstMetadata; | ||
| use vortex_array::dtype::DType; | ||
| use vortex_array::dtype::Nullability; | ||
| use vortex_array::dtype::PType; | ||
| use vortex_error::VortexExpect; | ||
| use vortex_error::VortexResult; | ||
| use vortex_error::vortex_bail; | ||
| use vortex_error::vortex_ensure; | ||
| use vortex_error::vortex_err; | ||
| use vortex_error::vortex_panic; | ||
| use vortex_session::VortexSession; | ||
| use vortex_session::registry::ReadContext; | ||
|
|
||
| use crate::LayoutChildType; | ||
| use crate::LayoutEncodingRef; | ||
| use crate::LayoutId; | ||
| use crate::LayoutReaderRef; | ||
| use crate::LayoutRef; | ||
| use crate::VTable; | ||
| use crate::children::LayoutChildren; | ||
| use crate::segments::SegmentId; | ||
| use crate::segments::SegmentSource; | ||
| use crate::vtable; | ||
|
|
||
/// Position of the `elements` layout among a list layout's children.
pub const ELEMENTS_CHILD_INDEX: usize = 0;
/// Position of the `offsets` layout among a list layout's children.
pub const OFFSETS_CHILD_INDEX: usize = 1;
/// Position of the `validity` layout; this child exists only for a nullable list dtype.
pub const VALIDITY_CHILD_INDEX: usize = 2;

/// Child count for a non-nullable list dtype (`elements` and `offsets`).
pub const NUM_CHILDREN_NON_NULLABLE: usize = 2;
|
|
||
| vtable!(List); | ||
|
|
||
| impl VTable for List { | ||
| type Layout = ListLayout; | ||
| type Encoding = ListLayoutEncoding; | ||
| type Metadata = ProstMetadata<ListLayoutMetadata>; | ||
|
|
||
| fn id(_encoding: &Self::Encoding) -> LayoutId { | ||
| LayoutId::new("vortex.list") | ||
| } | ||
|
|
||
| fn encoding(_layout: &Self::Layout) -> LayoutEncodingRef { | ||
| LayoutEncodingRef::new_ref(ListLayoutEncoding.as_ref()) | ||
| } | ||
|
|
||
| fn row_count(layout: &Self::Layout) -> u64 { | ||
| layout.row_count() | ||
| } | ||
|
|
||
| fn dtype(layout: &Self::Layout) -> &DType { | ||
| &layout.dtype | ||
| } | ||
|
|
||
| fn metadata(layout: &Self::Layout) -> Self::Metadata { | ||
| ProstMetadata(ListLayoutMetadata::new(layout.offsets_ptype())) | ||
| } | ||
|
|
||
| fn segment_ids(_layout: &Self::Layout) -> Vec<SegmentId> { | ||
| vec![] | ||
| } | ||
|
|
||
| fn nchildren(layout: &Self::Layout) -> usize { | ||
| let mut n = NUM_CHILDREN_NON_NULLABLE; | ||
| if layout.dtype.is_nullable() { | ||
| n += 1; | ||
| } | ||
|
|
||
| n | ||
| } | ||
|
|
||
| fn child(layout: &Self::Layout, idx: usize) -> VortexResult<LayoutRef> { | ||
| match (idx, layout.validity.as_ref()) { | ||
| (ELEMENTS_CHILD_INDEX, _) => Ok(Arc::clone(&layout.elements)), | ||
| (OFFSETS_CHILD_INDEX, _) => Ok(Arc::clone(&layout.offsets)), | ||
| (VALIDITY_CHILD_INDEX, Some(validity)) => Ok(Arc::clone(validity)), | ||
| _ => vortex_bail!("Invalid child index {idx} for ListLayout"), | ||
| } | ||
| } | ||
|
|
||
| fn child_type(layout: &Self::Layout, idx: usize) -> LayoutChildType { | ||
| match (idx, layout.validity.is_some()) { | ||
| (ELEMENTS_CHILD_INDEX, _) => LayoutChildType::Auxiliary("elements".into()), | ||
| (OFFSETS_CHILD_INDEX, _) => LayoutChildType::Auxiliary("offsets".into()), | ||
| (VALIDITY_CHILD_INDEX, true) => LayoutChildType::Auxiliary("validity".into()), | ||
| _ => vortex_panic!("Invalid child index {idx} for ListLayout"), | ||
| } | ||
| } | ||
|
|
||
| fn new_reader( | ||
| layout: &Self::Layout, | ||
| name: Arc<str>, | ||
| segment_source: Arc<dyn SegmentSource>, | ||
| session: &VortexSession, | ||
| ) -> VortexResult<LayoutReaderRef> { | ||
| Ok(Arc::new(ListReader::try_new( | ||
| layout.clone(), | ||
| name, | ||
| segment_source, | ||
| session.clone(), | ||
| )?)) | ||
| } | ||
|
|
||
| fn build( | ||
| _encoding: &Self::Encoding, | ||
| dtype: &DType, | ||
| _row_count: u64, | ||
| metadata: &<Self::Metadata as DeserializeMetadata>::Output, | ||
| _segment_ids: Vec<SegmentId>, | ||
| children: &dyn LayoutChildren, | ||
| _ctx: &ReadContext, | ||
| ) -> VortexResult<Self::Layout> { | ||
| validate_children(dtype, children.nchildren())?; | ||
|
|
||
| let elements_dtype = dtype | ||
| .as_list_element_opt() | ||
| .ok_or_else(|| vortex_err!("ListLayout requires a List dtype, got {dtype}"))?; | ||
| let elements = children.child(ELEMENTS_CHILD_INDEX, elements_dtype.as_ref())?; | ||
|
|
||
| let offsets_dtype = DType::Primitive(metadata.offsets_ptype(), Nullability::NonNullable); | ||
| let offsets = children.child(OFFSETS_CHILD_INDEX, &offsets_dtype)?; | ||
|
|
||
| let validity = dtype | ||
| .is_nullable() | ||
| .then(|| children.child(VALIDITY_CHILD_INDEX, &DType::Bool(Nullability::NonNullable))) | ||
| .transpose()?; | ||
|
|
||
| Ok(ListLayout { | ||
| dtype: dtype.clone(), | ||
| elements, | ||
| offsets, | ||
| validity, | ||
| }) | ||
| } | ||
|
|
||
| fn with_children(layout: &mut Self::Layout, children: Vec<LayoutRef>) -> VortexResult<()> { | ||
| validate_children(layout.dtype(), children.len())?; | ||
|
|
||
| let mut iter = children.into_iter(); | ||
| layout.elements = iter | ||
| .next() | ||
| .ok_or_else(|| vortex_err!("missing elements child"))?; | ||
| layout.offsets = iter | ||
| .next() | ||
| .ok_or_else(|| vortex_err!("missing offsets child"))?; | ||
| layout.validity = layout | ||
| .dtype | ||
| .is_nullable() | ||
| .then(|| { | ||
| iter.next() | ||
| .ok_or_else(|| vortex_err!("missing validity child")) | ||
| }) | ||
| .transpose()?; | ||
| Ok(()) | ||
| } | ||
| } | ||
|
|
||
| /// Validates expected number of children based on `dtype` | ||
| #[inline] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We generally dont want to inline functions with branches, and also because this is a private function the compiler will likely inline this anyways. inline is only useful for public functions that are small |
||
| fn validate_children(dtype: &DType, n_children: usize) -> VortexResult<()> { | ||
| let mut expected = NUM_CHILDREN_NON_NULLABLE; | ||
|
|
||
| if dtype.is_nullable() { | ||
| expected += 1; | ||
| }; | ||
|
|
||
| vortex_ensure!( | ||
| n_children == expected, | ||
|
Comment on lines
+180
to
+181
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. vortex_ensure_eq! |
||
| "ListLayout expects {expected} children, got {n_children}", | ||
| ); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
/// Unit type used as the `Encoding` of the list layout's `VTable` implementation.
#[derive(Debug)]
pub struct ListLayoutEncoding;
|
|
||
| /// Stores a list-typed array by shredding `elements`, `offsets`, and optional `validity` children. | ||
| #[derive(Clone, Debug)] | ||
| pub struct ListLayout { | ||
| dtype: DType, | ||
| elements: LayoutRef, | ||
| offsets: LayoutRef, | ||
| validity: Option<LayoutRef>, | ||
| } | ||
|
|
||
| impl ListLayout { | ||
| /// Construct a new `ListLayout` from its components. | ||
| /// | ||
| /// # Invariants | ||
| /// | ||
| /// - `dtype` must be a [`DType::List`]. | ||
| /// - `validity` must be `Some` iff `dtype.is_nullable()`. | ||
| /// - `offsets.dtype()` must be a non-nullable integer. | ||
| /// - `offsets.row_count()` is the Arrow-canonical `n+1` for `n` lists (or `0` for empty). | ||
| /// - When present, `validity.row_count() == offsets.row_count().saturating_sub(1)`. | ||
| pub fn new( | ||
| dtype: DType, | ||
| elements: LayoutRef, | ||
| offsets: LayoutRef, | ||
| validity: Option<LayoutRef>, | ||
| ) -> Self { | ||
| Self { | ||
| dtype, | ||
| elements, | ||
| offsets, | ||
| validity, | ||
| } | ||
| } | ||
|
|
||
| /// Number of lists in this layout. | ||
| #[inline] | ||
| pub fn row_count(&self) -> u64 { | ||
| self.offsets.row_count().saturating_sub(1) | ||
| } | ||
|
|
||
| #[inline] | ||
| pub fn elements(&self) -> &LayoutRef { | ||
| &self.elements | ||
| } | ||
|
|
||
| #[inline] | ||
| pub fn offsets(&self) -> &LayoutRef { | ||
| &self.offsets | ||
| } | ||
|
|
||
| #[inline] | ||
| pub fn validity(&self) -> Option<&LayoutRef> { | ||
| self.validity.as_ref() | ||
| } | ||
|
|
||
| /// The integer type used for the `offsets` child layout. | ||
| #[inline] | ||
| pub fn offsets_ptype(&self) -> PType { | ||
| self.offsets.dtype().as_ptype() | ||
| } | ||
|
|
||
| /// The dtype of the inner elements column. | ||
| pub fn elements_dtype(&self) -> &DType { | ||
| self.dtype | ||
| .as_list_element_opt() | ||
| .vortex_expect("ListLayout dtype must be a List") | ||
| } | ||
| } | ||
|
|
||
| #[derive(prost::Message)] | ||
| pub struct ListLayoutMetadata { | ||
| #[prost(enumeration = "PType", tag = "1")] | ||
| offsets_ptype: i32, | ||
| } | ||
|
|
||
| impl ListLayoutMetadata { | ||
| pub fn new(offsets_ptype: PType) -> Self { | ||
| let mut metadata = Self::default(); | ||
| metadata.set_offsets_ptype(offsets_ptype); | ||
| metadata | ||
| } | ||
|
Comment on lines
+266
to
+270
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a strange constructor? Why not just do |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like I would prefer reading just an if branch and then return either
`NUM_CHILDREN_NON_NULLABLE` or `NUM_CHILDREN_NON_NULLABLE + 1`?