Skip to content

Commit

Permalink
Improve arrow-ipc documentation (#6983)
Browse files Browse the repository at this point in the history
* Improve `arrow-ipc` documentation

* Improve, reduce emphasis on Read/Write/Seek

* Apply suggestions from code review

Co-authored-by: Raphael Taylor-Davies <[email protected]>

---------

Co-authored-by: Raphael Taylor-Davies <[email protected]>
  • Loading branch information
alamb and tustvold authored Jan 22, 2025
1 parent ffeda12 commit 7bb96c5
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 12 deletions.
19 changes: 19 additions & 0 deletions arrow-ipc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,26 @@

//! Support for the [Arrow IPC Format]
//!
//! The Arrow IPC format defines how to read and write [`RecordBatch`]es to/from
//! a file or stream of bytes. This format can be used to serialize and deserialize
//! data to files and over the network.
//!
//! There are two variants of the IPC format:
//! 1. [IPC Streaming Format]: Supports streaming data sources, implemented by
//! [StreamReader] and [StreamWriter]
//!
//! 2. [IPC File Format]: Supports random access, implemented by [FileReader] and
//! [FileWriter].
//!
//! See the [`reader`] and [`writer`] modules for more information.
//!
//! [Arrow IPC Format]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc
//! [IPC Streaming Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
//! [StreamReader]: reader::StreamReader
//! [StreamWriter]: writer::StreamWriter
//! [IPC File Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
//! [FileReader]: reader::FileReader
//! [FileWriter]: writer::FileWriter
#![warn(missing_docs)]
pub mod convert;
Expand Down
87 changes: 81 additions & 6 deletions arrow-ipc/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@

//! Arrow IPC File and Stream Readers
//!
//! The `FileReader` and `StreamReader` have similar interfaces,
//! however the `FileReader` expects a reader that supports `Seek`ing
//! # Notes
//!
//! The [`FileReader`] and [`StreamReader`] have similar interfaces,
//! however the [`FileReader`] expects a reader that supports [`Seek`]ing
//!
//! [`Seek`]: std::io::Seek
mod stream;

Expand Down Expand Up @@ -997,10 +1001,49 @@ impl FileReaderBuilder {
}
}

/// Arrow File reader
/// Arrow File Reader
///
/// Reads Arrow [`RecordBatch`]es from bytes in the [IPC File Format],
/// providing random access to the record batches.
///
/// # See Also
///
/// * [`Self::set_index`] for random access
/// * [`StreamReader`] for reading streaming data
///
/// # Example: Reading from a `File`
/// ```
/// # use std::io::Cursor;
/// use arrow_array::record_batch;
/// # use arrow_ipc::reader::FileReader;
/// # use arrow_ipc::writer::FileWriter;
/// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
/// # let mut file = vec![]; // mimic a file for the example
/// # {
/// # let mut writer = FileWriter::try_new(&mut file, &batch.schema()).unwrap();
/// # writer.write(&batch).unwrap();
/// # writer.write(&batch).unwrap();
/// # writer.finish().unwrap();
/// # }
/// # let mut file = Cursor::new(&file);
/// let projection = None; // read all columns
/// let mut reader = FileReader::try_new(&mut file, projection).unwrap();
/// // Position the reader to the second batch
/// reader.set_index(1).unwrap();
/// // read batches from the reader using the Iterator trait
/// let mut num_rows = 0;
/// for batch in reader {
/// let batch = batch.unwrap();
/// num_rows += batch.num_rows();
/// }
/// assert_eq!(num_rows, 3);
/// ```
/// # Example: Reading from `mmap`ed file
///
/// For an example creating Arrays with memory mapped (`mmap`) files see the [`zero_copy_ipc`] example.
/// For an example creating Arrays without copying using memory mapped (`mmap`)
/// files see the [`zero_copy_ipc`] example.
///
/// [IPC File Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
/// [`zero_copy_ipc`]: https://github.com/apache/arrow-rs/blob/main/arrow/examples/zero_copy_ipc.rs
pub struct FileReader<R> {
/// File reader that supports reading and seeking
Expand Down Expand Up @@ -1078,7 +1121,7 @@ impl<R: Read + Seek> FileReader<R> {
self.decoder.schema.clone()
}

/// Read a specific record batch
/// Seek to a specific [`RecordBatch`]
///
/// Sets the current block to the index, allowing random reads
pub fn set_index(&mut self, index: usize) -> Result<(), ArrowError> {
Expand Down Expand Up @@ -1136,7 +1179,39 @@ impl<R: Read + Seek> RecordBatchReader for FileReader<R> {
}
}

/// Arrow Stream reader
/// Arrow Stream Reader
///
/// Reads Arrow [`RecordBatch`]es from bytes in the [IPC Streaming Format].
///
/// # See Also
///
/// * [`FileReader`] for random access.
///
/// # Example
/// ```
/// # use arrow_array::record_batch;
/// # use arrow_ipc::reader::StreamReader;
/// # use arrow_ipc::writer::StreamWriter;
/// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
/// # let mut stream = vec![]; // mimic a stream for the example
/// # {
/// # let mut writer = StreamWriter::try_new(&mut stream, &batch.schema()).unwrap();
/// # writer.write(&batch).unwrap();
/// # writer.finish().unwrap();
/// # }
/// # let stream = stream.as_slice();
/// let projection = None; // read all columns
/// let mut reader = StreamReader::try_new(stream, projection).unwrap();
/// // read batches from the reader using the Iterator trait
/// let mut num_rows = 0;
/// for batch in reader {
/// let batch = batch.unwrap();
/// num_rows += batch.num_rows();
/// }
/// assert_eq!(num_rows, 3);
/// ```
///
/// [IPC Streaming Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
pub struct StreamReader<R> {
/// Stream reader
reader: R,
Expand Down
59 changes: 53 additions & 6 deletions arrow-ipc/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@

//! Arrow IPC File and Stream Writers
//!
//! The `FileWriter` and `StreamWriter` have similar interfaces,
//! however the `FileWriter` expects a reader that supports `Seek`ing
//! # Notes
//!
//! [`FileWriter`] and [`StreamWriter`] have similar interfaces,
//! however the [`FileWriter`] expects a writer that supports [`Seek`]ing
//!
//! [`Seek`]: std::io::Seek
use std::cmp::min;
use std::collections::HashMap;
Expand Down Expand Up @@ -188,7 +192,7 @@ impl Default for IpcWriteOptions {
/// Handles low level details of encoding [`Array`] and [`Schema`] into the
/// [Arrow IPC Format].
///
/// # Example:
/// # Example
/// ```
/// # fn run() {
/// # use std::sync::Arc;
Expand Down Expand Up @@ -905,7 +909,28 @@ impl DictionaryTracker {
}
}

/// Writer for an IPC file
/// Arrow File Writer
///
/// Writes Arrow [`RecordBatch`]es in the [IPC File Format].
///
/// # See Also
///
/// * [`StreamWriter`] for writing IPC Streams
///
/// # Example
/// ```
/// # use arrow_array::record_batch;
/// # use arrow_ipc::writer::FileWriter;
/// # let mut file = vec![]; // mimic a file for the example
/// let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
/// // create a new writer, the schema must be known in advance
/// let mut writer = FileWriter::try_new(&mut file, &batch.schema()).unwrap();
/// // write each batch to the underlying writer
/// writer.write(&batch).unwrap();
/// // When all batches are written, call finish to flush all buffers
/// writer.finish().unwrap();
/// ```
/// [IPC File Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
pub struct FileWriter<W> {
/// The object to write to
writer: W,
Expand Down Expand Up @@ -1108,7 +1133,7 @@ impl<W: Write> FileWriter<W> {
Ok(())
}

/// Unwraps the the underlying writer.
/// Unwraps the underlying writer.
///
/// The writer is flushed and the FileWriter is finished before returning.
///
Expand All @@ -1135,7 +1160,29 @@ impl<W: Write> RecordBatchWriter for FileWriter<W> {
}
}

/// Writer for an IPC stream
/// Arrow Stream Writer
///
/// Writes Arrow [`RecordBatch`]es to bytes using the [IPC Streaming Format].
///
/// # See Also
///
/// * [`FileWriter`] for writing IPC Files
///
/// # Example
/// ```
/// # use arrow_array::record_batch;
/// # use arrow_ipc::writer::StreamWriter;
/// # let mut stream = vec![]; // mimic a stream for the example
/// let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
/// // create a new writer, the schema must be known in advance
/// let mut writer = StreamWriter::try_new(&mut stream, &batch.schema()).unwrap();
/// // write each batch to the underlying stream
/// writer.write(&batch).unwrap();
/// // When all batches are written, call finish to flush all buffers
/// writer.finish().unwrap();
/// ```
///
/// [IPC Streaming Format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
pub struct StreamWriter<W> {
/// The object to write to
writer: W,
Expand Down

0 comments on commit 7bb96c5

Please sign in to comment.