Skip to content

Commit

Permalink
feat(source): introduce gcs source and new s3 source via OpenDAL (#13414)
Browse files Browse the repository at this point in the history

Signed-off-by: Richard Chien <[email protected]>
Co-authored-by: Richard Chien <[email protected]>
Co-authored-by: wcy-fdu <[email protected]>
  • Loading branch information
3 people authored Dec 18, 2023
1 parent fa1afbf commit a9def98
Show file tree
Hide file tree
Showing 29 changed files with 947 additions and 178 deletions.
37 changes: 35 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ci/scripts/s3-source-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ echo "--- starting risingwave cluster with connector node"
cargo make ci-start ci-1cn-1fe

echo "--- Run test"
python3 -m pip install minio psycopg2-binary
python3 -m pip install minio psycopg2-binary opendal
python3 e2e_test/s3/$script

echo "--- Kill cluster"
Expand Down
130 changes: 130 additions & 0 deletions e2e_test/s3/gcs_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import os
import sys
import csv
import json
import random
import psycopg2
import opendal

from time import sleep
from io import StringIO
from functools import partial

def gen_data(file_num, item_num_per_file):
    """Build the synthetic rows for the test, grouped per file.

    Args:
        file_num: number of files to generate.
        item_num_per_file: number of rows in each file; must be even so that
            the ``mark`` column (+1 for even item ids, -1 for odd ones) sums
            to zero.

    Returns:
        A list with ``file_num`` entries; each entry is a list of
        ``item_num_per_file`` dicts with the keys ``id``, ``name``, ``sex``
        and ``mark``. ``id`` is unique and contiguous across all files,
        starting at 0.

    Raises:
        AssertionError: if ``item_num_per_file`` is odd.
    """
    assert item_num_per_file % 2 == 0, \
        f'item_num_per_file should be even to ensure sum(mark) == 0: {item_num_per_file}'
    return [
        [{
            'id': file_id * item_num_per_file + item_id,
            'name': f'{file_id}_{item_id}',
            'sex': item_id % 2,
            'mark': (-1) ** (item_id % 2),
        } for item_id in range(item_num_per_file)]
        for file_id in range(file_num)
    ]

def format_json(data):
    """Serialize each file's rows as newline-delimited JSON.

    Args:
        data: a list of files, each file being a list of JSON-serializable
            items (the shape produced by ``gen_data``).

    Returns:
        A list of strings, one per file, where every item is dumped with
        ``json.dumps`` and the items are joined by ``'\\n'`` (no trailing
        newline). An empty file yields an empty string.
    """
    return [
        '\n'.join(json.dumps(item) for item in file_items)
        for file_items in data
    ]


def do_test(config, file_num, item_num_per_file, prefix, fmt, credential):
    """Create a GCS-backed table, wait for it to load, and verify aggregates.

    Connects to the database at localhost:4566 (user ``root``, database
    ``dev``), creates the table ``gcs_test_<fmt>`` with the ``gcs`` connector,
    polls ``count(*)`` until the expected number of rows is visible (or the
    retry budget is exhausted), then checks ``count(*)``, ``sum(id)``,
    ``sum(sex)`` and ``sum(mark)``. The table is dropped when all checks pass.

    Args:
        config: dict that provides the bucket name under ``'GCS_BUCKET'``.
        file_num: number of uploaded files.
        item_num_per_file: number of rows per uploaded file.
        prefix: object-name prefix used to build ``match_pattern``.
        fmt: file extension; also used as the table-name suffix.
        credential: value interpolated into ``gcs.credentials``.

    Raises:
        AssertionError: if any aggregate differs from the expected value.
    """
    conn = psycopg2.connect(
        host="localhost",
        port="4566",
        user="root",
        database="dev"
    )

    # Open a cursor to execute SQL statements
    cur = conn.cursor()

    table_name = f'gcs_test_{fmt}'
    encode = 'JSON'

    def _assert_eq(field, got, expect):
        assert got == expect, f'{field} assertion failed: got {got}, expect {expect}.'

    # Always release the cursor and connection, even when a query or an
    # assertion below fails.
    try:
        # Create the table that ingests the uploaded objects
        cur.execute(f'''CREATE TABLE {table_name}(
            id int,
            name TEXT,
            sex int,
            mark int,
        ) WITH (
            connector = 'gcs',
            match_pattern = '{prefix}*.{fmt}',
            gcs.bucket_name = '{config['GCS_BUCKET']}',
            gcs.credentials = '{credential}',
        ) FORMAT PLAIN ENCODE {encode};''')

        total_rows = file_num * item_num_per_file
        MAX_RETRIES = 40
        # Poll until every row is visible; if the budget runs out, fall
        # through and let the assertions below report the mismatch.
        for retry_no in range(MAX_RETRIES):
            cur.execute(f'select count(*) from {table_name}')
            result = cur.fetchone()
            if result[0] == total_rows:
                break
            print(f"[retry {retry_no}] Now got {result[0]} rows in table, {total_rows} expected, wait 30s")
            sleep(30)

        stmt = f'select count(*), sum(id), sum(sex), sum(mark) from {table_name}'
        print(f'Execute {stmt}')
        cur.execute(stmt)
        result = cur.fetchone()

        print('Got:', result)

        # Integer division keeps the expected values exact: n * (n - 1) is
        # always even, and total_rows is even because item_num_per_file is.
        _assert_eq('count(*)', result[0], total_rows)
        _assert_eq('sum(id)', result[1], (total_rows - 1) * total_rows // 2)
        _assert_eq('sum(sex)', result[2], total_rows // 2)
        _assert_eq('sum(mark)', result[3], 0)

        print('Test pass')

        cur.execute(f'drop table {table_name}')
    finally:
        cur.close()
        conn.close()


# Script entry point: generate data, upload it to GCS, run the source test,
# then delete the uploaded objects.
#
# Usage: the first CLI argument selects the file format (only 'json' is
# registered). The environment variable GCS_SOURCE_TEST_CONF must hold a JSON
# object with the keys 'GCS_BUCKET' and 'GOOGLE_APPLICATION_CREDENTIALS'.
if __name__ == "__main__":
    FILE_NUM = 4001
    ITEM_NUM_PER_FILE = 2
    data = gen_data(FILE_NUM, ITEM_NUM_PER_FILE)

    fmt = sys.argv[1]
    FORMATTER = {
        'json': format_json,
    }
    assert fmt in FORMATTER, f"Unsupported format: {fmt}"
    formatted_files = FORMATTER[fmt](data)

    config = json.loads(os.environ["GCS_SOURCE_TEST_CONF"])
    # Random prefix so objects from different runs do not match each other's pattern.
    run_id = str(random.randint(1000, 9999))
    _local = lambda idx: f'data_{idx}.{fmt}'
    _gcs = lambda idx: f"{run_id}_data_{idx}.{fmt}"
    credential_str = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS"])
    # put gcs files
    op = opendal.Operator("gcs", root="/", bucket=config["GCS_BUCKET"], credential=credential_str)

    # Keys we attempted to upload, so cleanup covers exactly those objects.
    uploaded = []
    try:
        print("upload file to gcs")
        for idx, file_str in enumerate(formatted_files):
            with open(_local(idx), "w") as f:
                f.write(file_str)
                # Flush Python's buffer first; fsync alone only syncs what
                # has already reached the file descriptor.
                f.flush()
                os.fsync(f.fileno())
            file_bytes = file_str.encode('utf-8')
            # Record the key before writing so a failed or partial upload is
            # still cleaned up.
            # NOTE(review): relies on op.delete tolerating a missing object - confirm.
            uploaded.append(_gcs(idx))
            op.write(_gcs(idx), file_bytes)

        # do test
        print("do test")
        do_test(config, FILE_NUM, ITEM_NUM_PER_FILE, run_id, fmt, credential_str)
    finally:
        # clean up gcs files, even when the upload or the test failed
        print("clean up gcs files")
        for key in uploaded:
            op.delete(key)
1 change: 1 addition & 0 deletions src/connector/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ mysql_common = { version = "0.31", default-features = false, features = [
] }
nexmark = { version = "0.2", features = ["serde"] }
num-bigint = "0.4"
opendal = { git = "https://github.com/apache/incubator-opendal", rev = "9a222e4d72b328a24d5775b1565292f4636bbe69" }
parking_lot = "0.12"
paste = "1"
prometheus = { version = "0.13", features = ["process"] }
Expand Down
3 changes: 3 additions & 0 deletions src/connector/src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ macro_rules! for_all_classified_sources {
{ Citus }
},
// other sources
// todo: file source do not nest with mq source.
{
{ Kafka, $crate::source::kafka::KafkaProperties, $crate::source::kafka::KafkaSplit },
{ Pulsar, $crate::source::pulsar::PulsarProperties, $crate::source::pulsar::PulsarSplit },
Expand All @@ -32,6 +33,8 @@ macro_rules! for_all_classified_sources {
{ GooglePubsub, $crate::source::google_pubsub::PubsubProperties, $crate::source::google_pubsub::PubsubSplit },
{ Nats, $crate::source::nats::NatsProperties, $crate::source::nats::split::NatsSplit },
{ S3, $crate::source::filesystem::S3Properties, $crate::source::filesystem::FsSplit },
{ Gcs, $crate::source::filesystem::opendal_source::GcsProperties , $crate::source::filesystem::OpendalFsSplit<$crate::source::filesystem::opendal_source::OpendalGcs> },
{ OpendalS3, $crate::source::filesystem::opendal_source::OpendalS3Properties, $crate::source::filesystem::OpendalFsSplit<$crate::source::filesystem::opendal_source::OpendalS3> },
{ Test, $crate::source::test_source::TestSourceProperties, $crate::source::test_source::TestSourceSplit}
}
$(
Expand Down
52 changes: 26 additions & 26 deletions src/connector/src/source/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ use super::google_pubsub::GooglePubsubMeta;
use super::kafka::KafkaMeta;
use super::monitor::SourceMetrics;
use super::nexmark::source::message::NexmarkMeta;
use super::OPENDAL_S3_CONNECTOR;
use crate::parser::ParserConfig;
pub(crate) use crate::source::common::CommonSplitReader;
use crate::source::filesystem::{FsPageItem, S3Properties, S3_V2_CONNECTOR};
use crate::source::filesystem::opendal_source::OpendalS3Properties;
use crate::source::filesystem::{FsPageItem, GcsProperties, S3Properties};
use crate::source::monitor::EnumeratorMetrics;
use crate::source::S3_CONNECTOR;
use crate::{
dispatch_source_prop, dispatch_split_impl, for_all_sources, impl_connector_properties,
impl_split, match_source_name_str,
Expand Down Expand Up @@ -78,7 +79,7 @@ impl<P: DeserializeOwned> TryFromHashmap for P {
}
}

pub async fn create_split_reader<P: SourceProperties>(
pub async fn create_split_reader<P: SourceProperties + std::fmt::Debug>(
prop: P,
splits: Vec<SplitImpl>,
parser_config: ParserConfig,
Expand Down Expand Up @@ -367,44 +368,43 @@ impl ConnectorProperties {
pub fn is_new_fs_connector_b_tree_map(with_properties: &BTreeMap<String, String>) -> bool {
with_properties
.get(UPSTREAM_SOURCE_KEY)
.map(|s| s.eq_ignore_ascii_case(S3_V2_CONNECTOR))
.map(|s| s.eq_ignore_ascii_case(OPENDAL_S3_CONNECTOR))
.unwrap_or(false)
}

pub fn is_new_fs_connector_hash_map(with_properties: &HashMap<String, String>) -> bool {
with_properties
.get(UPSTREAM_SOURCE_KEY)
.map(|s| s.eq_ignore_ascii_case(S3_V2_CONNECTOR))
.map(|s| s.eq_ignore_ascii_case(OPENDAL_S3_CONNECTOR))
.unwrap_or(false)
}

pub fn rewrite_upstream_source_key_hash_map(with_properties: &mut HashMap<String, String>) {
let connector = with_properties.remove(UPSTREAM_SOURCE_KEY).unwrap();
match connector.as_str() {
S3_V2_CONNECTOR => {
tracing::info!(
"using new fs source, rewrite connector from '{}' to '{}'",
S3_V2_CONNECTOR,
S3_CONNECTOR
);
with_properties.insert(UPSTREAM_SOURCE_KEY.to_string(), S3_CONNECTOR.to_string());
}
_ => {
with_properties.insert(UPSTREAM_SOURCE_KEY.to_string(), connector);
}
}
}
}

impl ConnectorProperties {
pub fn extract(mut with_properties: HashMap<String, String>) -> Result<Self> {
if Self::is_new_fs_connector_hash_map(&with_properties) {
_ = with_properties
let connector = with_properties
.remove(UPSTREAM_SOURCE_KEY)
.ok_or_else(|| anyhow!("Must specify 'connector' in WITH clause"))?;
return Ok(ConnectorProperties::S3(Box::new(
S3Properties::try_from_hashmap(with_properties)?,
)));
match connector.as_str() {
"s3_v2" => {
let assume_role = with_properties.get("s3.assume_role").cloned();
return Ok(ConnectorProperties::OpendalS3(Box::new(
OpendalS3Properties {
s3_properties: S3Properties::try_from_hashmap(with_properties)?,
assume_role,
},
)));
}
"gcs" => {
return Ok(ConnectorProperties::Gcs(Box::new(
GcsProperties::try_from_hashmap(with_properties)?,
)));
}
_ => {
unreachable!()
}
}
}

let connector = with_properties
Expand Down
Loading

0 comments on commit a9def98

Please sign in to comment.