Skip to content

Commit

Permalink
StringView support in arrow-csv (#6062)
Browse files Browse the repository at this point in the history
* StringView support in arrow-csv

* review and micro-benches
  • Loading branch information
2010YOUY01 authored Jul 16, 2024
1 parent 66390ff commit b2458bd
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 8 deletions.
94 changes: 86 additions & 8 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,14 @@ fn parse(
})
.collect::<StringArray>(),
) as ArrayRef),
DataType::Utf8View => Ok(Arc::new(
rows.iter()
.map(|row| {
let s = row.get(i);
(!null_regex.is_null(s)).then_some(s)
})
.collect::<StringViewArray>(),
) as ArrayRef),
DataType::Dictionary(key_type, value_type)
if value_type.as_ref() == &DataType::Utf8 =>
{
Expand Down Expand Up @@ -2380,17 +2388,27 @@ mod tests {
}

fn err_test(csv: &[u8], expected: &str) {
let schema = Arc::new(Schema::new(vec![
fn err_test_with_schema(csv: &[u8], expected: &str, schema: Arc<Schema>) {
let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
let b = ReaderBuilder::new(schema)
.with_batch_size(2)
.build_buffered(buffer)
.unwrap();
let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
assert_eq!(err, expected)
}

let schema_utf8 = Arc::new(Schema::new(vec![
Field::new("text1", DataType::Utf8, true),
Field::new("text2", DataType::Utf8, true),
]));
let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
let b = ReaderBuilder::new(schema)
.with_batch_size(2)
.build_buffered(buffer)
.unwrap();
let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
assert_eq!(err, expected)
err_test_with_schema(csv, expected, schema_utf8);

let schema_utf8view = Arc::new(Schema::new(vec![
Field::new("text1", DataType::Utf8View, true),
Field::new("text2", DataType::Utf8View, true),
]));
err_test_with_schema(csv, expected, schema_utf8view);
}

#[test]
Expand Down Expand Up @@ -2587,4 +2605,64 @@ mod tests {
&vec![2, 22]
);
}

/// Decodes a one-column CSV into a nullable `Utf8View` column and verifies
/// that every row round-trips unchanged.
///
/// The input mixes short values with one whose name indicates it is too long
/// to be stored inline in a string view.
#[test]
fn test_parse_string_view_single_column() {
    let expected = ["foo", "something_cannot_be_inlined", "foobar"];
    let csv = expected.join("\n");

    let field = Field::new("c1", DataType::Utf8View, true);
    let schema = Arc::new(Schema::new(vec![field]));
    let mut decoder = ReaderBuilder::new(schema).build_decoder();

    // The whole input must be consumed by a single call.
    let consumed = decoder.decode(csv.as_bytes()).unwrap();
    assert_eq!(consumed, csv.len());
    // NOTE(review): the empty slice presumably marks end-of-input so the last
    // row (which has no trailing newline) is completed — confirm against the
    // decoder documentation.
    decoder.decode(&[]).unwrap();

    let batch = decoder.flush().unwrap().unwrap();
    assert_eq!(batch.num_columns(), 1);
    assert_eq!(batch.num_rows(), 3);

    let col = batch.column(0).as_string_view();
    assert_eq!(col.data_type(), &DataType::Utf8View);
    for (row, want) in expected.iter().enumerate() {
        assert_eq!(col.value(row), *want);
    }
}

/// Decodes a two-column CSV into nullable `Utf8View` columns and verifies
/// both the null positions and the non-null values of each column.
///
/// Each column contains exactly one empty field; the assertions below expect
/// those empty fields to be read back as nulls.
#[test]
fn test_parse_string_view_multi_column() {
    let csv = ["foo,", ",something_cannot_be_inlined", "foobarfoobar,bar"].join("\n");

    let fields = vec![
        Field::new("c1", DataType::Utf8View, true),
        Field::new("c2", DataType::Utf8View, true),
    ];
    let mut decoder = ReaderBuilder::new(Arc::new(Schema::new(fields))).build_decoder();

    // The whole input must be consumed by a single call.
    let consumed = decoder.decode(csv.as_bytes()).unwrap();
    assert_eq!(consumed, csv.len());
    // NOTE(review): the empty slice presumably marks end-of-input so the last
    // row (which has no trailing newline) is completed — confirm against the
    // decoder documentation.
    decoder.decode(&[]).unwrap();

    let batch = decoder.flush().unwrap().unwrap();
    assert_eq!(batch.num_columns(), 2);
    assert_eq!(batch.num_rows(), 3);

    let c1 = batch.column(0).as_string_view();
    let c2 = batch.column(1).as_string_view();
    assert_eq!(c1.data_type(), &DataType::Utf8View);
    assert_eq!(c2.data_type(), &DataType::Utf8View);

    // Expected contents per column; `None` marks a row that must be null.
    let expected_c1 = [Some("foo"), None, Some("foobarfoobar")];
    let expected_c2 = [None, Some("something_cannot_be_inlined"), Some("bar")];

    for (col, expected) in [(c1, expected_c1), (c2, expected_c2)] {
        // Null-ness is checked for every row before any value is read.
        for (row, want) in expected.iter().enumerate() {
            assert_eq!(col.is_null(row), want.is_none());
        }
        // Values are compared only for the rows expected to be non-null.
        for (row, want) in expected.iter().enumerate() {
            if let Some(text) = want {
                assert_eq!(col.value(row), *text);
            }
        }
    }
}
}
42 changes: 42 additions & 0 deletions arrow/benches/csv_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ extern crate criterion;
use std::io::Cursor;
use std::sync::Arc;

use arrow::util::bench_util::create_string_view_array_with_len;
use criterion::*;
use rand::Rng;

Expand Down Expand Up @@ -59,6 +60,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
fn criterion_benchmark(c: &mut Criterion) {
let mut rng = seedable_rng();

// Single Primitive Column tests
let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024)));
let cols = vec![Arc::new(values) as ArrayRef];
do_bench(c, "4096 i32_small(0)", cols);
Expand Down Expand Up @@ -101,6 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let cols = vec![Arc::new(values) as ArrayRef];
do_bench(c, "4096 f64(0)", cols);

// Single String Column tests
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0., 10)) as ArrayRef];
do_bench(c, "4096 string(10, 0)", cols);

Expand All @@ -113,6 +116,20 @@ fn criterion_benchmark(c: &mut Criterion) {
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
do_bench(c, "4096 string(100, 0.5)", cols);

// Single StringView Column tests
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
do_bench(c, "4096 StringView(10, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
do_bench(c, "4096 StringView(30, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef];
do_bench(c, "4096 StringView(100, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
do_bench(c, "4096 StringView(100, 0.5)", cols);

// Multi-Column(with String) tests
let cols = vec![
Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 20)) as ArrayRef,
Arc::new(create_string_array_with_len::<i32>(4096, 0., 30)) as ArrayRef,
Expand All @@ -136,6 +153,31 @@ fn criterion_benchmark(c: &mut Criterion) {
"4096 string(20, 0.5), string(30, 0), f64(0), i64(0)",
cols,
);

// Multi-Column(with StringView) tests
let cols = vec![
Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef,
Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
];
do_bench(
c,
"4096 StringView(20, 0.5), StringView(30, 0), StringView(100, 0), i64(0)",
cols,
);

let cols = vec![
Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
Arc::new(create_primitive_array::<Float64Type>(4096, 0.)) as ArrayRef,
Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
];
do_bench(
c,
"4096 StringView(20, 0.5), StringView(30, 0), f64(0), i64(0)",
cols,
);
}

criterion_group!(benches, criterion_benchmark);
Expand Down

0 comments on commit b2458bd

Please sign in to comment.