-
Notifications
You must be signed in to change notification settings - Fork 591
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(connector): support local fs source (#14312)
Co-authored-by: KeXiangWang <[email protected]>
- Loading branch information
Showing
18 changed files
with
353 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import os | ||
import sys | ||
import csv | ||
import random | ||
import psycopg2 | ||
import opendal | ||
|
||
from time import sleep | ||
from io import StringIO | ||
from functools import partial | ||
|
||
def gen_data(file_num, item_num_per_file):
    """Generate synthetic rows for `file_num` files, `item_num_per_file` rows each.

    Each row is a dict with a globally unique `id`, a `name` of the form
    `<file>_<item>`, `sex` alternating 0/1, and `mark` alternating +1/-1 so
    that the marks of a whole file (and of the whole dataset) sum to zero.

    Returns a list of files, where each file is a list of row dicts.
    """
    assert item_num_per_file % 2 == 0, \
        f'item_num_per_file should be even to ensure sum(mark) == 0: {item_num_per_file}'
    files = []
    for file_id in range(file_num):
        rows = []
        for item_id in range(item_num_per_file):
            rows.append({
                'id': file_id * item_num_per_file + item_id,
                'name': f'{file_id}_{item_id}',
                'sex': item_id % 2,
                # Alternates +1, -1, +1, -1, ... so each file sums to 0.
                'mark': (-1) ** (item_id % 2),
            })
        files.append(rows)
    return files
|
||
def format_csv(data, with_header):
    """Render each file's rows (list of dicts) into one CSV string per file.

    Column order is taken from the first row's keys; a header line is
    emitted only when `with_header` is true. Returns a list of CSV strings,
    one per input file.
    """
    rendered = []
    for rows in data:
        buffer = StringIO()
        writer = csv.DictWriter(buffer, fieldnames=rows[0].keys())
        if with_header:
            writer.writeheader()
        writer.writerows(rows)
        rendered.append(buffer.getvalue())
    return rendered
|
||
|
||
def do_test(file_num, item_num_per_file, prefix, fmt):
    """End-to-end check of the `posix_fs` connector against a running cluster.

    Creates a table that ingests `/tmp/{prefix}*.{fmt}` files, polls until the
    expected row count arrives (or retries are exhausted), verifies aggregate
    invariants of the generated data, then drops the table.

    Args:
        file_num: number of source files that were written.
        item_num_per_file: rows per file (even, so sum(mark) == 0).
        prefix: filename prefix used in the connector's match_pattern.
        fmt: 'csv_with_header' or 'csv_without_header'; also used in the
            table name and to derive the ENCODE clause.

    Raises:
        AssertionError: if any aggregate does not match the expected value.

    NOTE(review): assumes a RisingWave frontend is listening on
    localhost:4566 with user 'root' and database 'dev' — confirm against the
    CI environment that runs this script.
    """
    conn = psycopg2.connect(
        host="localhost",
        port="4566",
        user="root",
        database="dev"
    )

    # Open a cursor to execute SQL statements
    cur = conn.cursor()

    def _table():
        # Table name embeds the format so concurrent/successive runs for
        # different formats do not collide.
        return f'posix_fs_test_{fmt}'

    def _encode():
        # 'without' in fmt -> without_header = true; otherwise false.
        return f"CSV (delimiter = ',', without_header = {str('without' in fmt).lower()})"

    # Create the source table; the connector scans /tmp for files matching
    # '{prefix}*.{fmt}'.
    cur.execute(f'''CREATE TABLE {_table()}(
        id int,
        name TEXT,
        sex int,
        mark int,
    ) WITH (
        connector = 'posix_fs',
        match_pattern = '{prefix}*.{fmt}',
        posix_fs.root = '/tmp',
    ) FORMAT PLAIN ENCODE {_encode()};''')

    total_rows = file_num * item_num_per_file
    # Ingestion is asynchronous: poll up to MAX_RETRIES * 30s for all rows
    # to land. If retries run out, the assertions below fail with the
    # observed counts.
    MAX_RETRIES = 40
    for retry_no in range(MAX_RETRIES):
        cur.execute(f'select count(*) from {_table()}')
        result = cur.fetchone()
        if result[0] == total_rows:
            break
        print(f"[retry {retry_no}] Now got {result[0]} rows in table, {total_rows} expected, wait 30s")
        sleep(30)

    stmt = f'select count(*), sum(id), sum(sex), sum(mark) from {_table()}'
    print(f'Execute {stmt}')
    cur.execute(stmt)
    result = cur.fetchone()

    print('Got:', result)

    def _assert_eq(field, got, expect):
        assert got == expect, f'{field} assertion failed: got {got}, expect {expect}.'

    # Invariants of gen_data's output:
    #   ids are 0..total_rows-1  -> sum = n(n-1)/2
    #   sex alternates 0/1       -> sum = total_rows/2
    #   mark alternates +1/-1    -> sum = 0
    _assert_eq('count(*)', result[0], total_rows)
    _assert_eq('sum(id)', result[1], (total_rows - 1) * total_rows / 2)
    _assert_eq('sum(sex)', result[2], total_rows / 2)
    _assert_eq('sum(mark)', result[3], 0)

    print('Test pass')

    # Clean up the table so the script can be re-run.
    cur.execute(f'drop table {_table()}')
    cur.close()
    conn.close()
|
||
|
||
if __name__ == "__main__":
    # Many small files to exercise the enumerator's listing/matching path;
    # 2 rows per file keeps total data small (8002 rows).
    FILE_NUM = 4001
    ITEM_NUM_PER_FILE = 2
    data = gen_data(FILE_NUM, ITEM_NUM_PER_FILE)

    # argv[1] selects the CSV variant: 'csv_with_header' or 'csv_without_header'.
    fmt = sys.argv[1]
    FORMATTER = {
        'csv_with_header': partial(format_csv, with_header=True),
        'csv_without_header': partial(format_csv, with_header=False),
    }
    assert fmt in FORMATTER, f"Unsupported format: {fmt}"
    formatted_files = FORMATTER[fmt](data)

    # Random run id prefixes the /tmp filenames so concurrent or repeated
    # runs do not pick up each other's files via match_pattern.
    run_id = str(random.randint(1000, 9999))
    _local = lambda idx: f'data_{idx}.{fmt}'
    _posix = lambda idx: f"{run_id}_data_{idx}.{fmt}"
    # put local files
    op = opendal.Operator("fs", root="/tmp")

    print("write file to /tmp")
    for idx, file_str in enumerate(formatted_files):
        # NOTE(review): this also writes data_{idx}.{fmt} into the current
        # working directory, which is never read by the connector (it scans
        # /tmp) and never cleaned up below — looks like leftover scaffolding;
        # confirm whether the local copy is still needed.
        with open(_local(idx), "w") as f:
            f.write(file_str)
            os.fsync(f.fileno())
        # The copy the connector actually ingests: /tmp/{run_id}_data_{idx}.{fmt}.
        file_name = _posix(idx)
        file_bytes = file_str.encode('utf-8')
        op.write(file_name, file_bytes)

    # do test
    print("do test")
    do_test(FILE_NUM, ITEM_NUM_PER_FILE, run_id, fmt)

    # clean up local files
    print("clean up local files in /tmp")
    for idx, _ in enumerate(formatted_files):
        file_name = _posix(idx)
        op.delete(file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
import os | ||
import string | ||
import json | ||
import string | ||
from time import sleep | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
carat,cut,color,depth | ||
0.25,Ideal,E,61.4 | ||
0.22,Premium,I,62.0 | ||
0.28,Good,J,63.1 | ||
0.23,Very Good,H,57.5 | ||
0.30,Fair,E,64.7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
carat,cut,color,depth | ||
1.25,Ideal,E,61.4 | ||
1.22,Premium,I,62.0 | ||
1.28,Good,J,63.1 | ||
1.23,Very Good,H,57.5 | ||
1.30,Fair,E,64.7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
statement ok | ||
SET RW_IMPLICIT_FLUSH TO true; | ||
|
||
statement ok | ||
CREATE TABLE diamonds ( | ||
carat FLOAT, | ||
cut TEXT, | ||
color TEXT, | ||
depth FLOAT, | ||
) WITH ( | ||
connector = 'posix_fs', | ||
match_pattern = 'data*.csv', | ||
posix_fs.root = 'e2e_test/source/opendal/data', | ||
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); | ||
|
||
sleep 10s | ||
|
||
query TTTT rowsort | ||
select * from diamonds; | ||
---- | ||
0.22 Premium I 62 | ||
0.23 Very Good H 57.5 | ||
0.25 Ideal E 61.4 | ||
0.28 Good J 63.1 | ||
0.3 Fair E 64.7 | ||
1.22 Premium I 62 | ||
1.23 Very Good H 57.5 | ||
1.25 Ideal E 61.4 | ||
1.28 Good J 63.1 | ||
1.3 Fair E 64.7 | ||
|
||
statement ok | ||
DROP TABLE diamonds; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
58 changes: 58 additions & 0 deletions
58
src/connector/src/source/filesystem/opendal_source/posix_fs_source.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// Copyright 2024 RisingWave Labs | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use std::marker::PhantomData; | ||
|
||
use anyhow::Context; | ||
use opendal::layers::{LoggingLayer, RetryLayer}; | ||
use opendal::services::Fs; | ||
use opendal::Operator; | ||
|
||
use super::opendal_enumerator::OpendalEnumerator; | ||
use super::{OpendalSource, PosixFsProperties}; | ||
|
||
// Posix fs source should only be used for testing. | ||
// For a single-CN cluster, the behavior is well-defined. It will read from the local file system. | ||
// For a multi-CN cluster, each CN will read from its own local file system under the given directory. | ||
|
||
impl<Src: OpendalSource> OpendalEnumerator<Src> {
    /// Creates an OpenDAL enumerator backed by the local POSIX file system,
    /// rooted at `posix_fs_properties.root`.
    ///
    /// If `match_pattern` is set, it is compiled into a glob matcher used to
    /// filter listed files; the listing prefix is deliberately left empty
    /// (see the TODO below).
    ///
    /// # Errors
    ///
    /// Returns an error if the operator cannot be built or if
    /// `match_pattern` is not a valid glob pattern.
    pub fn new_posix_fs_source(posix_fs_properties: PosixFsProperties) -> anyhow::Result<Self> {
        // Create Fs builder.
        let mut builder = Fs::default();

        builder.root(&posix_fs_properties.root);

        // Layer the operator with logging and retries so transient FS
        // errors during listing/reading are retried and observable.
        let op: Operator = Operator::new(builder)?
            .layer(LoggingLayer::default())
            .layer(RetryLayer::default())
            .finish();

        let (prefix, matcher) = if let Some(pattern) = posix_fs_properties.match_pattern.as_ref() {
            // TODO(Kexiang): Currently, the fs lister in opendal does not support a prefix. (Seems a bug in opendal)
            // So we assign prefix to empty string.
            let matcher = glob::Pattern::new(pattern)
                .with_context(|| format!("Invalid match_pattern: {}", pattern))?;
            (Some(String::new()), Some(matcher))
        } else {
            // No pattern configured: list everything under the root.
            (None, None)
        };
        Ok(Self {
            op,
            prefix,
            matcher,
            marker: PhantomData,
        })
    }
}
Oops, something went wrong.